struct Tokenizer [src]
Alias for std.zig.tokenizer.Tokenizer
Fields
buffer: [:0]const u8
index: usize
Members
Source
/// Incremental tokenizer over a null-terminated source buffer.
///
/// Each call to `next` scans forward from `index` and returns a single `Token`
/// whose `loc.start` / `loc.end` are byte offsets into `buffer`.
pub const Tokenizer = struct {
    /// Source text being tokenized. The 0 sentinel located at `buffer.len`
    /// is what `next` treats as the end of input.
    buffer: [:0]const u8,
    /// Byte offset into `buffer` where the next call to `next` starts scanning.
    index: usize,

    /// For debugging purposes.
    ///
    /// Prints the tag name of `token` followed by the bytes of `buffer` that
    /// `token.loc` covers, using `std.debug.print`. Only reads from `self`.
    pub fn dump(self: *const Tokenizer, token: *const Token) void {
        std.debug.print("{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.start..token.loc.end] });
    }

    /// Creates a tokenizer positioned at the start of `buffer`, or just past
    /// a leading UTF-8 byte order mark when one is present.
    pub fn init(buffer: [:0]const u8) Tokenizer {
        // Skip the UTF-8 BOM if present.
        return .{
            .buffer = buffer,
            .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
    }

    /// States of the scanner used by `next`. Every call to `next` begins in
    /// `start`; the remaining states record what has been seen so far of the
    /// token currently being scanned.
    const State = enum {
        start,
        expect_newline,
        identifier,
        builtin,
        string_literal,
        string_literal_backslash,
        multiline_string_literal_line,
        char_literal,
        char_literal_backslash,
        backslash,
        equal,
        bang,
        pipe,
        minus,
        minus_percent,
        minus_pipe,
        asterisk,
        asterisk_percent,
        asterisk_pipe,
        slash,
        line_comment_start,
        line_comment,
        doc_comment_start,
        doc_comment,
        int,
        int_exponent,
        int_period,
        float,
        float_exponent,
        ampersand,
        caret,
        percent,
        plus,
        plus_percent,
        plus_pipe,
        angle_bracket_left,
        angle_bracket_angle_bracket_left,
        angle_bracket_angle_bracket_left_pipe,
        angle_bracket_right,
        angle_bracket_angle_bracket_right,
        period,
        period_2,
        period_asterisk,
        saw_at_sign,
        invalid,
    };

    /// Scans and returns the next token, advancing `index` past it.
    ///
    /// After this returns invalid, it will reset on the next newline, returning tokens starting from there.
    /// An eof token will always be returned at the end.
    ///
    /// Whitespace and ordinary `//` line comments are skipped and never
    /// returned as tokens; `///` and `//!` comments are returned as
    /// `.doc_comment` and `.container_doc_comment` tokens.
    pub fn next(self: *Tokenizer) Token {
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .start = self.index,
                .end = undefined,
            },
        };
        // Prongs that fall out of the labeled switch have finished a token:
        // `result.tag` is set and `self.index` is one past the token's last byte.
        state: switch (State.start) {
            .start => switch (self.buffer[self.index]) {
                0 => {
                    // A 0 byte is end of input only when it is the sentinel
                    // at `buffer.len`; an embedded 0 byte is invalid.
                    if (self.index == self.buffer.len) {
                        return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                    } else {
                        continue :state .invalid;
                    }
                },
                ' ', '\n', '\t', '\r' => {
                    // Skip whitespace and move the token start along with it.
                    self.index += 1;
                    result.loc.start = self.index;
                    continue :state .start;
                },
                '"' => {
                    result.tag = .string_literal;
                    continue :state .string_literal;
                },
                '\'' => {
                    result.tag = .char_literal;
                    continue :state .char_literal;
                },
                'a'...'z', 'A'...'Z', '_' => {
                    result.tag = .identifier;
                    continue :state .identifier;
                },
                '@' => continue :state .saw_at_sign,
                '=' => continue :state .equal,
                '!' => continue :state .bang,
                '|' => continue :state .pipe,
                '(' => {
                    result.tag = .l_paren;
                    self.index += 1;
                },
                ')' => {
                    result.tag = .r_paren;
                    self.index += 1;
                },
                '[' => {
                    result.tag = .l_bracket;
                    self.index += 1;
                },
                ']' => {
                    result.tag = .r_bracket;
                    self.index += 1;
                },
                ';' => {
                    result.tag = .semicolon;
                    self.index += 1;
                },
                ',' => {
                    result.tag = .comma;
                    self.index += 1;
                },
                '?' => {
                    result.tag = .question_mark;
                    self.index += 1;
                },
                ':' => {
                    result.tag = .colon;
                    self.index += 1;
                },
                '%' => continue :state .percent,
                '*' => continue :state .asterisk,
                '+' => continue :state .plus,
                '<' => continue :state .angle_bracket_left,
                '>' => continue :state .angle_bracket_right,
                '^' => continue :state .caret,
                '\\' => {
                    result.tag = .multiline_string_literal_line;
                    continue :state .backslash;
                },
                '{' => {
                    result.tag = .l_brace;
                    self.index += 1;
                },
                '}' => {
                    result.tag = .r_brace;
                    self.index += 1;
                },
                '~' => {
                    result.tag = .tilde;
                    self.index += 1;
                },
                '.' => continue :state .period,
                '-' => continue :state .minus,
                '/' => continue :state .slash,
                '&' => continue :state .ampersand,
                '0'...'9' => {
                    result.tag = .number_literal;
                    // The number states inspect the current byte without
                    // advancing first, so consume the leading digit here.
                    self.index += 1;
                    continue :state .int;
                },
                else => continue :state .invalid,
            },

            // Entered on a '\r' inside an ordinary line comment: only a
            // following '\n' is accepted.
            .expect_newline => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index == self.buffer.len) {
                            result.tag = .invalid;
                        } else {
                            continue :state .invalid;
                        }
                    },
                    '\n' => {
                        self.index += 1;
                        result.loc.start = self.index;
                        continue :state .start;
                    },
                    else => continue :state .invalid,
                }
            },

            // Consumes bytes until a newline or the end of input, then
            // yields one `.invalid` token covering everything consumed.
            .invalid => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => if (self.index == self.buffer.len) {
                        result.tag = .invalid;
                    } else {
                        continue :state .invalid;
                    },
                    '\n' => result.tag = .invalid,
                    else => continue :state .invalid,
                }
            },

            .saw_at_sign => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0, '\n' => result.tag = .invalid,
                    '"' => {
                        // `@"..."` is scanned with the string literal states
                        // but is tagged as an identifier.
                        result.tag = .identifier;
                        continue :state .string_literal;
                    },
                    'a'...'z', 'A'...'Z', '_' => {
                        result.tag = .builtin;
                        continue :state .builtin;
                    },
                    else => continue :state .invalid,
                }
            },

            .ampersand => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .ampersand_equal;
                        self.index += 1;
                    },
                    else => result.tag = .ampersand,
                }
            },

            .asterisk => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .asterisk_equal;
                        self.index += 1;
                    },
                    '*' => {
                        result.tag = .asterisk_asterisk;
                        self.index += 1;
                    },
                    '%' => continue :state .asterisk_percent,
                    '|' => continue :state .asterisk_pipe,
                    else => result.tag = .asterisk,
                }
            },

            .asterisk_percent => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .asterisk_percent_equal;
                        self.index += 1;
                    },
                    else => result.tag = .asterisk_percent,
                }
            },

            .asterisk_pipe => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .asterisk_pipe_equal;
                        self.index += 1;
                    },
                    else => result.tag = .asterisk_pipe,
                }
            },

            .percent => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .percent_equal;
                        self.index += 1;
                    },
                    else => result.tag = .percent,
                }
            },

            .plus => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .plus_equal;
                        self.index += 1;
                    },
                    '+' => {
                        result.tag = .plus_plus;
                        self.index += 1;
                    },
                    '%' => continue :state .plus_percent,
                    '|' => continue :state .plus_pipe,
                    else => result.tag = .plus,
                }
            },

            .plus_percent => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .plus_percent_equal;
                        self.index += 1;
                    },
                    else => result.tag = .plus_percent,
                }
            },

            .plus_pipe => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .plus_pipe_equal;
                        self.index += 1;
                    },
                    else => result.tag = .plus_pipe,
                }
            },

            .caret => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .caret_equal;
                        self.index += 1;
                    },
                    else => result.tag = .caret,
                }
            },

            .identifier => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => continue :state .identifier,
                    else => {
                        // End of the word: upgrade the tag when the text is a keyword.
                        const ident = self.buffer[result.loc.start..self.index];
                        if (Token.getKeyword(ident)) |tag| {
                            result.tag = tag;
                        }
                    },
                }
            },

            .builtin => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => continue :state .builtin,
                    else => {},
                }
            },

            // A single '\' has been seen; only a second '\' starts a
            // multiline string literal line.
            .backslash => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => result.tag = .invalid,
                    '\\' => continue :state .multiline_string_literal_line,
                    '\n' => result.tag = .invalid,
                    else => continue :state .invalid,
                }
            },

            .string_literal => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else {
                            result.tag = .invalid;
                        }
                    },
                    '\n' => result.tag = .invalid,
                    '\\' => continue :state .string_literal_backslash,
                    // Closing quote: include it in the token and finish.
                    '"' => self.index += 1,
                    // Control characters are rejected inside the literal.
                    0x01...0x09, 0x0b...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => continue :state .string_literal,
                }
            },

            .string_literal_backslash => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0, '\n' => result.tag = .invalid,
                    else => continue :state .string_literal,
                }
            },

            .char_literal => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else {
                            result.tag = .invalid;
                        }
                    },
                    '\n' => result.tag = .invalid,
                    '\\' => continue :state .char_literal_backslash,
                    // Closing quote: include it in the token and finish.
                    '\'' => self.index += 1,
                    0x01...0x09, 0x0b...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => continue :state .char_literal,
                }
            },

            .char_literal_backslash => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else {
                            result.tag = .invalid;
                        }
                    },
                    '\n' => result.tag = .invalid,
                    0x01...0x09, 0x0b...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => continue :state .char_literal,
                }
            },

            // The token ends before the line terminator ('\n', or '\r'
            // immediately followed by '\n') or at the end of input.
            .multiline_string_literal_line => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => if (self.index != self.buffer.len) {
                        continue :state .invalid;
                    },
                    '\n' => {},
                    '\r' => if (self.buffer[self.index + 1] != '\n') {
                        continue :state .invalid;
                    },
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => continue :state .invalid,
                    else => continue :state .multiline_string_literal_line,
                }
            },

            .bang => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .bang_equal;
                        self.index += 1;
                    },
                    else => result.tag = .bang,
                }
            },

            .pipe => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .pipe_equal;
                        self.index += 1;
                    },
                    '|' => {
                        result.tag = .pipe_pipe;
                        self.index += 1;
                    },
                    else => result.tag = .pipe,
                }
            },

            .equal => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .equal_equal;
                        self.index += 1;
                    },
                    '>' => {
                        result.tag = .equal_angle_bracket_right;
                        self.index += 1;
                    },
                    else => result.tag = .equal,
                }
            },

            .minus => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '>' => {
                        result.tag = .arrow;
                        self.index += 1;
                    },
                    '=' => {
                        result.tag = .minus_equal;
                        self.index += 1;
                    },
                    '%' => continue :state .minus_percent,
                    '|' => continue :state .minus_pipe,
                    else => result.tag = .minus,
                }
            },

            .minus_percent => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .minus_percent_equal;
                        self.index += 1;
                    },
                    else => result.tag = .minus_percent,
                }
            },

            .minus_pipe => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .minus_pipe_equal;
                        self.index += 1;
                    },
                    else => result.tag = .minus_pipe,
                }
            },

            .angle_bracket_left => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '<' => continue :state .angle_bracket_angle_bracket_left,
                    '=' => {
                        result.tag = .angle_bracket_left_equal;
                        self.index += 1;
                    },
                    else => result.tag = .angle_bracket_left,
                }
            },

            .angle_bracket_angle_bracket_left => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .angle_bracket_angle_bracket_left_equal;
                        self.index += 1;
                    },
                    '|' => continue :state .angle_bracket_angle_bracket_left_pipe,
                    else => result.tag = .angle_bracket_angle_bracket_left,
                }
            },

            .angle_bracket_angle_bracket_left_pipe => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .angle_bracket_angle_bracket_left_pipe_equal;
                        self.index += 1;
                    },
                    else => result.tag = .angle_bracket_angle_bracket_left_pipe,
                }
            },

            .angle_bracket_right => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '>' => continue :state .angle_bracket_angle_bracket_right,
                    '=' => {
                        result.tag = .angle_bracket_right_equal;
                        self.index += 1;
                    },
                    else => result.tag = .angle_bracket_right,
                }
            },

            .angle_bracket_angle_bracket_right => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '=' => {
                        result.tag = .angle_bracket_angle_bracket_right_equal;
                        self.index += 1;
                    },
                    else => result.tag = .angle_bracket_angle_bracket_right,
                }
            },

            .period => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '.' => continue :state .period_2,
                    '*' => continue :state .period_asterisk,
                    else => result.tag = .period,
                }
            },

            .period_2 => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '.' => {
                        result.tag = .ellipsis3;
                        self.index += 1;
                    },
                    else => result.tag = .ellipsis2,
                }
            },

            .period_asterisk => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    // `.*` directly followed by another '*': the token is
                    // flagged, and the extra '*' is not consumed here.
                    '*' => result.tag = .invalid_periodasterisks,
                    else => result.tag = .period_asterisk,
                }
            },

            .slash => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '/' => continue :state .line_comment_start,
                    '=' => {
                        result.tag = .slash_equal;
                        self.index += 1;
                    },
                    else => result.tag = .slash,
                }
            },

            // `//` has been seen; the next byte decides between an ordinary
            // comment, `///` (doc comment) and `//!` (container doc comment).
            .line_comment_start => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                    },
                    '!' => {
                        result.tag = .container_doc_comment;
                        continue :state .doc_comment;
                    },
                    '\n' => {
                        // Empty comment: discard it and restart after the newline.
                        self.index += 1;
                        result.loc.start = self.index;
                        continue :state .start;
                    },
                    '/' => continue :state .doc_comment_start,
                    '\r' => continue :state .expect_newline,
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => continue :state .line_comment,
                }
            },

            // `///` has been seen.
            .doc_comment_start => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0, '\n' => result.tag = .doc_comment,
                    '\r' => {
                        if (self.buffer[self.index + 1] == '\n') {
                            result.tag = .doc_comment;
                        } else {
                            continue :state .invalid;
                        }
                    },
                    // A fourth '/' makes this an ordinary line comment.
                    '/' => continue :state .line_comment,
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => {
                        result.tag = .doc_comment;
                        continue :state .doc_comment;
                    },
                }
            },

            // Ordinary comments produce no token: at the newline the token
            // start is reset and scanning restarts from `.start`.
            .line_comment => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            continue :state .invalid;
                        } else return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                    },
                    '\n' => {
                        self.index += 1;
                        result.loc.start = self.index;
                        continue :state .start;
                    },
                    '\r' => continue :state .expect_newline,
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => continue :state .line_comment,
                }
            },

            // Shared by `///` and `//!` comments; the tag was set on entry.
            .doc_comment => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    0, '\n' => {},
                    '\r' => if (self.buffer[self.index + 1] != '\n') {
                        continue :state .invalid;
                    },
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        continue :state .invalid;
                    },
                    else => continue :state .doc_comment,
                }
            },

            // Number states look at the current byte first and only then
            // advance. 'e', 'E', 'p' and 'P' are split out of the letter
            // ranges so that a following '+' or '-' can be consumed.
            .int => switch (self.buffer[self.index]) {
                '.' => continue :state .int_period,
                '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {
                    self.index += 1;
                    continue :state .int;
                },
                'e', 'E', 'p', 'P' => {
                    continue :state .int_exponent;
                },
                else => {},
            },

            .int_exponent => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '-', '+' => {
                        self.index += 1;
                        continue :state .float;
                    },
                    else => continue :state .int,
                }
            },

            .int_period => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {
                        self.index += 1;
                        continue :state .float;
                    },
                    'e', 'E', 'p', 'P' => {
                        continue :state .float_exponent;
                    },
                    // The '.' is not part of this number: step back so it
                    // is left for the next token.
                    else => self.index -= 1,
                }
            },

            .float => switch (self.buffer[self.index]) {
                '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {
                    self.index += 1;
                    continue :state .float;
                },
                'e', 'E', 'p', 'P' => {
                    continue :state .float_exponent;
                },
                else => {},
            },

            .float_exponent => {
                self.index += 1;
                switch (self.buffer[self.index]) {
                    '-', '+' => {
                        self.index += 1;
                        continue :state .float;
                    },
                    else => continue :state .float,
                }
            },
        }

        result.loc.end = self.index;
        return result;
    }
};