aboutsummaryrefslogtreecommitdiff
path: root/src/link/tapi/Tokenizer.zig
diff options
context:
space:
mode:
authorJakub Konka <kubkon@jakubkonka.com>2021-06-19 10:45:56 +0200
committerJakub Konka <kubkon@jakubkonka.com>2021-06-24 14:45:45 +0200
commitfbdc5154184b0752175dcafc5bfdc4ea6a0cfebf (patch)
tree0689ba18a8ed01dea95cfe242a04646a05d9107e /src/link/tapi/Tokenizer.zig
parent31c49ad64ded02b9dde57f5d3ef102a771fa5cf7 (diff)
downloadzig-fbdc5154184b0752175dcafc5bfdc4ea6a0cfebf.tar.gz
zig-fbdc5154184b0752175dcafc5bfdc4ea6a0cfebf.zip
link: add basic TAPI parser for linkers
Parser uses kubkon/zig-yaml gitrev c3eae1e40a02aedd44ad1171e5c8b259896cbda0
Diffstat (limited to 'src/link/tapi/Tokenizer.zig')
-rw-r--r--src/link/tapi/Tokenizer.zig439
1 files changed, 439 insertions, 0 deletions
diff --git a/src/link/tapi/Tokenizer.zig b/src/link/tapi/Tokenizer.zig
new file mode 100644
index 0000000000..37fcedbfce
--- /dev/null
+++ b/src/link/tapi/Tokenizer.zig
@@ -0,0 +1,439 @@
+const Tokenizer = @This();
+
+const std = @import("std");
+const log = std.log.scoped(.tapi);
+const testing = std.testing;
+
+buffer: []const u8,
+index: usize = 0,
+
+pub const Token = struct {
+ id: Id,
+ start: usize,
+ end: usize,
+ // Count of spaces/tabs.
+ // Only active for .Space and .Tab tokens.
+ count: ?usize = null,
+
+ pub const Id = enum {
+ Eof,
+
+ NewLine,
+ DocStart, // ---
+ DocEnd, // ...
+ SeqItemInd, // -
+ MapValueInd, // :
+ FlowMapStart, // {
+ FlowMapEnd, // }
+ FlowSeqStart, // [
+ FlowSeqEnd, // ]
+
+ Comma,
+ Space,
+ Tab,
+ Comment, // #
+ Alias, // *
+ Anchor, // &
+ Tag, // !
+ SingleQuote, // '
+ DoubleQuote, // "
+
+ Literal,
+ };
+};
+
+pub const TokenIndex = usize;
+
+pub const TokenIterator = struct {
+ buffer: []const Token,
+ pos: TokenIndex = 0,
+
+ pub fn next(self: *TokenIterator) Token {
+ const token = self.buffer[self.pos];
+ self.pos += 1;
+ return token;
+ }
+
+ pub fn peek(self: TokenIterator) ?Token {
+ if (self.pos >= self.buffer.len) return null;
+ return self.buffer[self.pos];
+ }
+
+ pub fn reset(self: *TokenIterator) void {
+ self.pos = 0;
+ }
+
+ pub fn seekTo(self: *TokenIterator, pos: TokenIndex) void {
+ self.pos = pos;
+ }
+
+ pub fn seekBy(self: *TokenIterator, offset: isize) void {
+ const new_pos = @bitCast(isize, self.pos) + offset;
+ if (new_pos < 0) {
+ self.pos = 0;
+ } else {
+ self.pos = @intCast(usize, new_pos);
+ }
+ }
+};
+
+pub fn next(self: *Tokenizer) Token {
+ var result = Token{
+ .id = .Eof,
+ .start = self.index,
+ .end = undefined,
+ };
+
+ var state: union(enum) {
+ Start,
+ NewLine,
+ Space: usize,
+ Tab: usize,
+ Hyphen: usize,
+ Dot: usize,
+ Literal,
+ } = .Start;
+
+ while (self.index < self.buffer.len) : (self.index += 1) {
+ const c = self.buffer[self.index];
+ switch (state) {
+ .Start => switch (c) {
+ ' ' => {
+ state = .{ .Space = 1 };
+ },
+ '\t' => {
+ state = .{ .Tab = 1 };
+ },
+ '\n' => {
+ result.id = .NewLine;
+ self.index += 1;
+ break;
+ },
+ '\r' => {
+ state = .NewLine;
+ },
+ '-' => {
+ state = .{ .Hyphen = 1 };
+ },
+ '.' => {
+ state = .{ .Dot = 1 };
+ },
+ ',' => {
+ result.id = .Comma;
+ self.index += 1;
+ break;
+ },
+ '#' => {
+ result.id = .Comment;
+ self.index += 1;
+ break;
+ },
+ '*' => {
+ result.id = .Alias;
+ self.index += 1;
+ break;
+ },
+ '&' => {
+ result.id = .Anchor;
+ self.index += 1;
+ break;
+ },
+ '!' => {
+ result.id = .Tag;
+ self.index += 1;
+ break;
+ },
+ '\'' => {
+ result.id = .SingleQuote;
+ self.index += 1;
+ break;
+ },
+ '"' => {
+ result.id = .DoubleQuote;
+ self.index += 1;
+ break;
+ },
+ '[' => {
+ result.id = .FlowSeqStart;
+ self.index += 1;
+ break;
+ },
+ ']' => {
+ result.id = .FlowSeqEnd;
+ self.index += 1;
+ break;
+ },
+ ':' => {
+ result.id = .MapValueInd;
+ self.index += 1;
+ break;
+ },
+ '{' => {
+ result.id = .FlowMapStart;
+ self.index += 1;
+ break;
+ },
+ '}' => {
+ result.id = .FlowMapEnd;
+ self.index += 1;
+ break;
+ },
+ else => {
+ state = .Literal;
+ },
+ },
+ .Space => |*count| switch (c) {
+ ' ' => {
+ count.* += 1;
+ },
+ else => {
+ result.id = .Space;
+ result.count = count.*;
+ break;
+ },
+ },
+ .Tab => |*count| switch (c) {
+ ' ' => {
+ count.* += 1;
+ },
+ else => {
+ result.id = .Tab;
+ result.count = count.*;
+ break;
+ },
+ },
+ .NewLine => switch (c) {
+ '\n' => {
+ result.id = .NewLine;
+ self.index += 1;
+ break;
+ },
+ else => {}, // TODO this should be an error condition
+ },
+ .Hyphen => |*count| switch (c) {
+ ' ' => {
+ result.id = .SeqItemInd;
+ self.index += 1;
+ break;
+ },
+ '-' => {
+ count.* += 1;
+
+ if (count.* == 3) {
+ result.id = .DocStart;
+ self.index += 1;
+ break;
+ }
+ },
+ else => {
+ state = .Literal;
+ },
+ },
+ .Dot => |*count| switch (c) {
+ '.' => {
+ count.* += 1;
+
+ if (count.* == 3) {
+ result.id = .DocEnd;
+ self.index += 1;
+ break;
+ }
+ },
+ else => {
+ state = .Literal;
+ },
+ },
+ .Literal => switch (c) {
+ '\r', '\n', ' ', '\'', '"', ',', ':', ']', '}' => {
+ result.id = .Literal;
+ break;
+ },
+ else => {
+ result.id = .Literal;
+ },
+ },
+ }
+ }
+
+ if (state == .Literal and result.id == .Eof) {
+ result.id = .Literal;
+ }
+
+ result.end = self.index;
+
+ log.debug("{any}", .{result});
+ log.debug(" | {s}", .{self.buffer[result.start..result.end]});
+
+ return result;
+}
+
+fn testExpected(source: []const u8, expected: []const Token.Id) !void {
+ var tokenizer = Tokenizer{
+ .buffer = source,
+ };
+
+ for (expected) |exp| {
+ const token = tokenizer.next();
+ try testing.expectEqual(exp, token.id);
+ }
+}
+
+test "empty doc" {
+ try testExpected("", &[_]Token.Id{.Eof});
+}
+
+test "empty doc with explicit markers" {
+ try testExpected(
+ \\---
+ \\...
+ , &[_]Token.Id{
+ .DocStart, .NewLine, .DocEnd, .Eof,
+ });
+}
+
+test "sequence of values" {
+ try testExpected(
+ \\- 0
+ \\- 1
+ \\- 2
+ , &[_]Token.Id{
+ .SeqItemInd,
+ .Literal,
+ .NewLine,
+ .SeqItemInd,
+ .Literal,
+ .NewLine,
+ .SeqItemInd,
+ .Literal,
+ .Eof,
+ });
+}
+
+test "sequence of sequences" {
+ try testExpected(
+ \\- [ val1, val2]
+ \\- [val3, val4 ]
+ , &[_]Token.Id{
+ .SeqItemInd,
+ .FlowSeqStart,
+ .Space,
+ .Literal,
+ .Comma,
+ .Space,
+ .Literal,
+ .FlowSeqEnd,
+ .NewLine,
+ .SeqItemInd,
+ .FlowSeqStart,
+ .Literal,
+ .Comma,
+ .Space,
+ .Literal,
+ .Space,
+ .FlowSeqEnd,
+ .Eof,
+ });
+}
+
+test "mappings" {
+ try testExpected(
+ \\key1: value1
+ \\key2: value2
+ , &[_]Token.Id{
+ .Literal,
+ .MapValueInd,
+ .Space,
+ .Literal,
+ .NewLine,
+ .Literal,
+ .MapValueInd,
+ .Space,
+ .Literal,
+ .Eof,
+ });
+}
+
+test "inline mapped sequence of values" {
+ try testExpected(
+ \\key : [ val1,
+ \\ val2 ]
+ , &[_]Token.Id{
+ .Literal,
+ .Space,
+ .MapValueInd,
+ .Space,
+ .FlowSeqStart,
+ .Space,
+ .Literal,
+ .Comma,
+ .Space,
+ .NewLine,
+ .Space,
+ .Literal,
+ .Space,
+ .FlowSeqEnd,
+ .Eof,
+ });
+}
+
+test "part of tdb" {
+ try testExpected(
+ \\--- !tapi-tbd
+ \\tbd-version: 4
+ \\targets: [ x86_64-macos ]
+ \\
+ \\uuids:
+ \\ - target: x86_64-macos
+ \\ value: F86CC732-D5E4-30B5-AA7D-167DF5EC2708
+ \\
+ \\install-name: '/usr/lib/libSystem.B.dylib'
+ \\...
+ , &[_]Token.Id{
+ .DocStart,
+ .Space,
+ .Tag,
+ .Literal,
+ .NewLine,
+ .Literal,
+ .MapValueInd,
+ .Space,
+ .Literal,
+ .NewLine,
+ .Literal,
+ .MapValueInd,
+ .Space,
+ .FlowSeqStart,
+ .Space,
+ .Literal,
+ .Space,
+ .FlowSeqEnd,
+ .NewLine,
+ .NewLine,
+ .Literal,
+ .MapValueInd,
+ .NewLine,
+ .Space,
+ .SeqItemInd,
+ .Literal,
+ .MapValueInd,
+ .Space,
+ .Literal,
+ .NewLine,
+ .Space,
+ .Literal,
+ .MapValueInd,
+ .Space,
+ .Literal,
+ .NewLine,
+ .NewLine,
+ .Literal,
+ .MapValueInd,
+ .Space,
+ .SingleQuote,
+ .Literal,
+ .SingleQuote,
+ .NewLine,
+ .DocEnd,
+ .Eof,
+ });
+}