struct Tokenizer [src]

Alias for std.zig.tokenizer.Tokenizer

Fields

buffer: [:0]const u8
index: usize

Members

Source

/// Splits a sentinel-terminated byte buffer into `Token`s, one per call to `next`.
///
/// The tokenizer holds no state other than the buffer and the current byte
/// offset, so it performs no allocation in the code shown here.
pub const Tokenizer = struct {
    /// The bytes being tokenized. The 0 sentinel at `buffer[buffer.len]` is what
    /// `next` uses to detect the end of input.
    buffer: [:0]const u8,
    /// Offset into `buffer` of the next byte `next` will examine.
    index: usize,

    /// For debugging purposes.
    /// Prints the token's tag name followed by the quoted slice of `buffer`
    /// that the token's `loc` covers, via `std.debug.print`.
    pub fn dump(self: *Tokenizer, token: *const Token) void {
        std.debug.print("{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.start..token.loc.end] });
    }

    /// Creates a tokenizer over `buffer`.
    /// The starting offset is 3 when `buffer` begins with the bytes EF BB BF,
    /// and 0 otherwise.
    pub fn init(buffer: [:0]const u8) Tokenizer {
        // Skip the UTF-8 BOM if present.
        return .{
            .buffer = buffer,
            .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
    }

    // States of the state machine driven by the labeled switch in `next`.
    // Each member names what has been seen so far for the token in progress.
    const State = enum {
        start,
        expect_newline,
        identifier,
        builtin,
        string_literal,
        string_literal_backslash,
        multiline_string_literal_line,
        char_literal,
        char_literal_backslash,
        backslash,
        equal,
        bang,
        pipe,
        minus,
        minus_percent,
        minus_pipe,
        asterisk,
        asterisk_percent,
        asterisk_pipe,
        slash,
        line_comment_start,
        line_comment,
        doc_comment_start,
        doc_comment,
        int,
        int_exponent,
        int_period,
        float,
        float_exponent,
        ampersand,
        caret,
        percent,
        plus,
        plus_percent,
        plus_pipe,
        angle_bracket_left,
        angle_bracket_angle_bracket_left,
        angle_bracket_angle_bracket_left_pipe,
        angle_bracket_right,
        angle_bracket_angle_bracket_right,
        period,
        period_2,
        period_asterisk,
        saw_at_sign,
        invalid,
    };

    /// Returns the next token and advances `index` past it.
    ///
    /// After this returns invalid, it will reset on the next newline, returning tokens starting from there.
    /// An eof token will always be returned at the end.
    ///
    /// Whitespace and ordinary `//` line comments produce no token; they only
    /// move the start of the token being built. Once `index` equals
    /// `buffer.len`, every further call returns an `eof` token again because
    /// that path does not advance `index`.
    pub fn next(self: *Tokenizer) Token {
        // `tag` and `loc.end` are filled in before returning: `tag` by the
        // state prongs, `loc.end` by the common exit below the switch.
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .start = self.index,
                .end = undefined,
            },
        };
        // `continue :state .x` jumps to state `.x`. A prong that neither
        // continues nor returns falls out of the switch, where the token's end
        // offset is recorded and the token is returned.
        state: switch (State.start) {
            // Dispatch on the first byte of a token. Multi-byte tokens continue
            // to a state that advances `index` itself on entry; single-byte
            // tokens advance `index` here and fall out of the switch.
            .start => switch (self.buffer[self.index]) {
                0 => {
                    // A 0 at `buffer.len` is the sentinel (end of input); a 0
                    // anywhere else is an embedded NUL and is invalid.
                    if (self.index == self.buffer.len) {
                        return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                    } else {
                        continue :state .invalid;
                    }
                },
                // Skip whitespace and restart the token after it.
                ' ', '\n', '\t', '\r' => {
                    self.index += 1;
                    result.loc.start = self.index;
                    continue :state .start;
                },
                '"' => {
                    result.tag = .string_literal;
                    continue :state .string_literal;
                },
                '\'' => {
                    result.tag = .char_literal;
                    continue :state .char_literal;
                },
                'a'...'z', 'A'...'Z', '_' => {
                    result.tag = .identifier;
                    continue :state .identifier;
                },
                '@' => continue :state .saw_at_sign,
                '=' => continue :state .equal,
                '!' => continue :state .bang,
                '|' => continue :state .pipe,
                '(' => {
                    result.tag = .l_paren;
                    self.index += 1;
                },
                ')' => {
                    result.tag = .r_paren;
                    self.index += 1;
                },
                '[' => {
                    result.tag = .l_bracket;
                    self.index += 1;
                },
                ']' => {
                    result.tag = .r_bracket;
                    self.index += 1;
                },
                ';' => {
                    result.tag = .semicolon;
                    self.index += 1;
                },
                ',' => {
                    result.tag = .comma;
                    self.index += 1;
                },
                '?' => {
                    result.tag = .question_mark;
                    self.index += 1;
                },
                ':' => {
                    result.tag = .colon;
                    self.index += 1;
                },
                '%' => continue :state .percent,
                '*' => continue :state .asterisk,
                '+' => continue :state .plus,
                '<' => continue :state .angle_bracket_left,
                '>' => continue :state .angle_bracket_right,
                '^' => continue :state .caret,
                '\\' => {
                    // Tag is set optimistically; `.backslash` overrides it if a
                    // second backslash does not follow.
                    result.tag = .multiline_string_literal_line;
                    continue :state .backslash;
                },
                '{' => {
                    result.tag = .l_brace;
                    self.index += 1;
                },
                '}' => {
                    result.tag = .r_brace;
                    self.index += 1;
                },
                '~' => {
                    result.tag = .tilde;
                    self.index += 1;
                },
                '.' => continue :state .period,
                '-' => continue :state .minus,
                '/' => continue :state .slash,
                '&' => continue :state .ampersand,
                '0'...'9' => {
                    result.tag = .number_literal;
                    // Unlike most states, `.int` inspects the current byte
                    // without advancing first, so the leading digit is consumed here.
                    self.index += 1;
                    continue :state .int;
                },
                else => continue :state .invalid,
            },

            // Entered from the line-comment states when a '\r' was seen: the
            // byte after it must be '\n'.
            .expect_newline => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index == self.buffer.len) {
                            result.tag = .invalid;
                        } else {
                            continue :state .invalid;
                        }
                    },
                    '\n' => {
                        // The comment ended normally; start a fresh token after the newline.
                        self.index += 1;
                        result.loc.start = self.index;
                        continue :state .start;
                    },
                    else => continue :state .invalid,
                }
            },

            // Consume bytes until a '\n' or the end of the buffer, then emit a
            // single `.invalid` token. The '\n' itself is not consumed.
            .invalid => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => if (self.index == self.buffer.len) {
                        result.tag = .invalid;
                    } else {
                        continue :state .invalid;
                    },
                    '\n' => result.tag = .invalid,
                    else => continue :state .invalid,
                }
            },

            .saw_at_sign => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0, '\n' => result.tag = .invalid,
                    '"' => {
                        // `@` followed by `"`: tagged as an identifier but
                        // scanned with the string-literal rules.
                        result.tag = .identifier;
                        continue :state .string_literal;
                    },
                    'a'...'z', 'A'...'Z', '_' => {
                        result.tag = .builtin;
                        continue :state .builtin;
                    },
                    else => continue :state .invalid,
                }
            },

            // The operator states below share one shape: advance past the byte
            // already recognized, then either extend the operator (consuming
            // one more byte or moving to a longer-operator state) or emit the
            // shorter operator without consuming the look-ahead byte.
            .ampersand => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .ampersand_equal;
                        self.index += 1;
                    },
                    else => result.tag = .ampersand,
                }
            },

            .asterisk => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .asterisk_equal;
                        self.index += 1;
                    },
                    '*' => {
                        result.tag = .asterisk_asterisk;
                        self.index += 1;
                    },
                    '%' => continue :state .asterisk_percent,
                    '|' => continue :state .asterisk_pipe,
                    else => result.tag = .asterisk,
                }
            },

            .asterisk_percent => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .asterisk_percent_equal;
                        self.index += 1;
                    },
                    else => result.tag = .asterisk_percent,
                }
            },

            .asterisk_pipe => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .asterisk_pipe_equal;
                        self.index += 1;
                    },
                    else => result.tag = .asterisk_pipe,
                }
            },

            .percent => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .percent_equal;
                        self.index += 1;
                    },
                    else => result.tag = .percent,
                }
            },

            .plus => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .plus_equal;
                        self.index += 1;
                    },
                    '+' => {
                        result.tag = .plus_plus;
                        self.index += 1;
                    },
                    '%' => continue :state .plus_percent,
                    '|' => continue :state .plus_pipe,
                    else => result.tag = .plus,
                }
            },

            .plus_percent => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .plus_percent_equal;
                        self.index += 1;
                    },
                    else => result.tag = .plus_percent,
                }
            },

            .plus_pipe => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .plus_pipe_equal;
                        self.index += 1;
                    },
                    else => result.tag = .plus_pipe,
                }
            },

            .caret => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .caret_equal;
                        self.index += 1;
                    },
                    else => result.tag = .caret,
                }
            },

            .identifier => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => continue :state .identifier,
                    else => {
                        // End of the identifier: if `Token.getKeyword` yields a
                        // tag for this text, it replaces `.identifier`.
                        const ident = self.buffer[result.loc.start..self.index];
                        if (Token.getKeyword(ident)) |tag| {
                            result.tag = tag;
                        }
                    },
                }
            },

            .builtin => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => continue :state .builtin,
                    // Any other byte ends the token; the tag was set to
                    // `.builtin` in `.saw_at_sign`.
                    else => {},
                }
            },

            // One backslash seen in `.start`; only a second backslash starts a
            // multiline string literal line.
            .backslash => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => result.tag = .invalid,
                    '\\' => continue :state .multiline_string_literal_line,
                    '\n' => result.tag = .invalid,
                    else => continue :state .invalid,
                }
            },

            .string_literal => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        // Embedded NUL: keep consuming as invalid. Sentinel:
                        // the literal is unterminated, emit invalid now.
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else {
                            result.tag = .invalid;
                        }
                    },
                    '\n' => result.tag = .invalid,
                    '\\' => continue :state .string_literal_backslash,
                    // Closing quote: consume it and finish with the tag set earlier.
                    '"' => self.index += 1,
                    // These bytes are rejected inside the literal (0x0a is '\n', handled above).
                    0x01...0x09, 0x0b...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => continue :state .string_literal,
                }
            },

            // The byte after a backslash is skipped without further checks
            // here, unless it is a 0 or a newline.
            .string_literal_backslash => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0, '\n' => result.tag = .invalid,
                    else => continue :state .string_literal,
                }
            },

            .char_literal => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else {
                            result.tag = .invalid;
                        }
                    },
                    '\n' => result.tag = .invalid,
                    '\\' => continue :state .char_literal_backslash,
                    // Closing quote: consume it and finish.
                    '\'' => self.index += 1,
                    0x01...0x09, 0x0b...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => continue :state .char_literal,
                }
            },

            .char_literal_backslash => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else {
                            result.tag = .invalid;
                        }
                    },
                    '\n' => result.tag = .invalid,
                    0x01...0x09, 0x0b...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => continue :state .char_literal,
                }
            },

            // Runs to the end of the line. The terminating '\n' (or the '\r'
            // of a "\r\n" pair) is not consumed, so it is not part of the token.
            .multiline_string_literal_line => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    // The sentinel ends the token; an embedded NUL is invalid.
                    0 => if (self.index != self.buffer.len) {
                        continue :state .invalid;
                    },
                    '\n' => {},
                    // A '\r' is only accepted when immediately followed by '\n'.
                    '\r' => if (self.buffer[self.index + 1] != '\n') {
                        continue :state .invalid;
                    },
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => continue :state .invalid,
                    else => continue :state .multiline_string_literal_line,
                }
            },

            .bang => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .bang_equal;
                        self.index += 1;
                    },
                    else => result.tag = .bang,
                }
            },

            .pipe => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .pipe_equal;
                        self.index += 1;
                    },
                    '|' => {
                        result.tag = .pipe_pipe;
                        self.index += 1;
                    },
                    else => result.tag = .pipe,
                }
            },

            .equal => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .equal_equal;
                        self.index += 1;
                    },
                    '>' => {
                        result.tag = .equal_angle_bracket_right;
                        self.index += 1;
                    },
                    else => result.tag = .equal,
                }
            },

            .minus => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '>' => {
                        result.tag = .arrow;
                        self.index += 1;
                    },
                    '=' => {
                        result.tag = .minus_equal;
                        self.index += 1;
                    },
                    '%' => continue :state .minus_percent,
                    '|' => continue :state .minus_pipe,
                    else => result.tag = .minus,
                }
            },

            .minus_percent => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .minus_percent_equal;
                        self.index += 1;
                    },
                    else => result.tag = .minus_percent,
                }
            },

            .minus_pipe => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .minus_pipe_equal;
                        self.index += 1;
                    },
                    else => result.tag = .minus_pipe,
                }
            },

            .angle_bracket_left => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '<' => continue :state .angle_bracket_angle_bracket_left,
                    '=' => {
                        result.tag = .angle_bracket_left_equal;
                        self.index += 1;
                    },
                    else => result.tag = .angle_bracket_left,
                }
            },

            .angle_bracket_angle_bracket_left => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .angle_bracket_angle_bracket_left_equal;
                        self.index += 1;
                    },
                    '|' => continue :state .angle_bracket_angle_bracket_left_pipe,
                    else => result.tag = .angle_bracket_angle_bracket_left,
                }
            },

            .angle_bracket_angle_bracket_left_pipe => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .angle_bracket_angle_bracket_left_pipe_equal;
                        self.index += 1;
                    },
                    else => result.tag = .angle_bracket_angle_bracket_left_pipe,
                }
            },

            .angle_bracket_right => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '>' => continue :state .angle_bracket_angle_bracket_right,
                    '=' => {
                        result.tag = .angle_bracket_right_equal;
                        self.index += 1;
                    },
                    else => result.tag = .angle_bracket_right,
                }
            },

            .angle_bracket_angle_bracket_right => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .angle_bracket_angle_bracket_right_equal;
                        self.index += 1;
                    },
                    else => result.tag = .angle_bracket_angle_bracket_right,
                }
            },

            .period => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '.' => continue :state .period_2,
                    '*' => continue :state .period_asterisk,
                    else => result.tag = .period,
                }
            },

            // Two periods seen: a third makes `ellipsis3`, otherwise `ellipsis2`.
            .period_2 => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '.' => {
                        result.tag = .ellipsis3;
                        self.index += 1;
                    },
                    else => result.tag = .ellipsis2,
                }
            },

            .period_asterisk => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    // A further '*' changes the tag but is not consumed, so the
                    // token still covers only the two bytes `.*`.
                    '*' => result.tag = .invalid_periodasterisks,
                    else => result.tag = .period_asterisk,
                }
            },

            .slash => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '/' => continue :state .line_comment_start,
                    '=' => {
                        result.tag = .slash_equal;
                        self.index += 1;
                    },
                    else => result.tag = .slash,
                }
            },

            // `//` seen. The next byte decides between a container doc comment
            // (`//!`), a possible doc comment (`///`), and an ordinary comment.
            .line_comment_start => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        // A comment running to the end of the buffer yields eof
                        // directly; no token is produced for the comment.
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                    },
                    '!' => {
                        result.tag = .container_doc_comment;
                        continue :state .doc_comment;
                    },
                    '\n' => {
                        // Empty comment: discard it and start over after the newline.
                        self.index += 1;
                        result.loc.start = self.index;
                        continue :state .start;
                    },
                    '/' => continue :state .doc_comment_start,
                    '\r' => continue :state .expect_newline,
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => continue :state .line_comment,
                }
            },

            // `///` seen. A fourth '/' turns this back into an ordinary line
            // comment; anything else makes it a doc comment.
            .doc_comment_start => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0, '\n' => result.tag = .doc_comment,
                    '\r' => {
                        if (self.buffer[self.index + 1] == '\n') {
                            result.tag = .doc_comment;
                        } else {
                            continue :state .invalid;
                        }
                    },
                    '/' => continue :state .line_comment,
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => {
                        result.tag = .doc_comment;
                        continue :state .doc_comment;
                    },
                }
            },

            // Body of an ordinary line comment. No token is produced: at the
            // newline the token start is reset and scanning restarts.
            .line_comment => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                    },
                    '\n' => {
                        self.index += 1;
                        result.loc.start = self.index;
                        continue :state .start;
                    },
                    '\r' => continue :state .expect_newline,
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => continue :state .line_comment,
                }
            },

            // Body of a doc comment or container doc comment (tag already set).
            // The token ends at a 0 byte, at '\n', or at the '\r' of a "\r\n"
            // pair; the terminator itself is not consumed.
            .doc_comment => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0, '\n' => {},
                    '\r' => if (self.buffer[self.index + 1] != '\n') {
                        continue :state .invalid;
                    },
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => continue :state .doc_comment,
                }
            },

            // Number literal states. `.int` and `.float` inspect the current
            // byte without advancing on entry. Any digit, underscore, or letter
            // extends the literal; digits are not checked against a base here.
            .int => switch (self.buffer[self.index]) {
                '.' => continue :state .int_period,
                '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {
                    self.index += 1;
                    continue :state .int;
                },
                // 'e'/'E'/'p'/'P' are split out because a sign may follow them.
                'e', 'E', 'p', 'P' => {
                    continue :state .int_exponent;
                },
                else => {},
            },

            .int_exponent => {
                // Step past the exponent letter.
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '-', '+' => {
                        self.index += 1;
                        continue :state .float;
                    },
                    else => continue :state .int,
                }
            },

            .int_period => {
                // Tentatively step past the '.'.
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {
                        self.index += 1;
                        continue :state .float;
                    },
                    'e', 'E', 'p', 'P' => {
                        continue :state .float_exponent;
                    },
                    // The '.' is not part of this number: step back so it is
                    // tokenized on the next call.
                    else => self.index -= 1,
                }
            },

            .float => switch (self.buffer[self.index]) {
                '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {
                    self.index += 1;
                    continue :state .float;
                },
                'e', 'E', 'p', 'P' => {
                    continue :state .float_exponent;
                },
                else => {},
            },

            .float_exponent => {
                // Step past the exponent letter.
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '-', '+' => {
                        self.index += 1;
                        continue :state .float;
                    },
                    else => continue :state .float,
                }
            },
        }

        // Common exit for every prong that fell out of the switch: the token
        // ends at the current offset.
        result.loc.end = self.index;
        return result;
    }
};