struct Scanner [src]

Alias for std.json.scanner.Scanner

The lowest level parsing API in this package; supports streaming input with a low memory footprint. The memory requirement is O(d) where d is the nesting depth of [] or {} containers in the input. Specifically d/8 bytes are required for this purpose, with some extra buffer according to the implementation of std.ArrayList. This scanner can emit partial tokens; see std.json.Token. The input to this scanner is a sequence of input buffers that you must supply one at a time. Call feedInput() with the first buffer, then call next() repeatedly until error.BufferUnderrun is returned. Then call feedInput() again and so forth. Call endInput() when the last input buffer has been given to feedInput(), either immediately after calling feedInput(), or when error.BufferUnderrun requests more data and there is no more. Be sure to call next() after calling endInput() until Token.end_of_document has been returned.

Fields

state: State = .value
string_is_object_key: bool = false
stack: BitStack
value_start: usize = undefined
utf16_code_units: [2]u16 = undefined
input: []const u8 = ""
cursor: usize = 0
is_end_of_input: bool = false
diagnostics: ?*Diagnostics = null

Members

Source

pub const Scanner = struct {
    /// Current state of the tokenizer state machine; a fresh scanner expects a value.
    state: State = .value,
    /// Set when a string is started from `.object_start` or `.object_post_comma`.
    /// While set, `.post_value` requires a `:` and then clears the flag.
    string_is_object_key: bool = false,
    /// Records one `OBJECT_MODE`/`ARRAY_MODE` entry per open `{` or `[`.
    stack: BitStack,
    /// Index into `input` where the not-yet-emitted part of the current number/string begins.
    value_start: usize = undefined,
    /// Scratch space for `\uXXXX` escapes: `[0]` holds the first code unit,
    /// `[1]` holds the low surrogate when `[0]` is a high surrogate.
    utf16_code_units: [2]u16 = undefined,

    /// The input buffer currently being scanned.
    input: []const u8 = "",
    /// Index into `input` of the next byte to examine.
    cursor: usize = 0,
    /// True once `endInput()` has been called (or the scanner was made with `initCompleteInput()`).
    is_end_of_input: bool = false,
    /// Optional diagnostics sink registered through `enableDiagnostics()`.
    diagnostics: ?*Diagnostics = null,

    /// The allocator is only used to track `[]` and `{}` nesting levels.
    pub fn initStreaming(allocator: Allocator) @This() {
        return .{
            .stack = BitStack.init(allocator),
        };
    }
    /// Use this if your input is a single slice.
    /// This is effectively equivalent to:
    /// ```
    /// initStreaming(allocator);
    /// feedInput(complete_input);
    /// endInput();
    /// ```
    pub fn initCompleteInput(allocator: Allocator, complete_input: []const u8) @This() {
        return .{
            .stack = BitStack.init(allocator),
            .input = complete_input,
            .is_end_of_input = true,
        };
    }
    /// Releases the nesting stack and sets the whole scanner to `undefined`.
    /// The scanner must not be used after this call.
    pub fn deinit(self: *@This()) void {
        self.stack.deinit();
        self.* = undefined;
    }

    /// Registers `diagnostics` with this scanner.
    /// The diagnostics object is given a pointer to this scanner's `cursor` field,
    /// so presumably the scanner must not be moved or copied while diagnostics are in use.
    pub fn enableDiagnostics(self: *@This(), diagnostics: *Diagnostics) void {
        diagnostics.cursor_pointer = &self.cursor;
        self.diagnostics = diagnostics;
    }

    /// Call this whenever you get `error.BufferUnderrun` from `next()`.
    /// When there is no more input to provide, call `endInput()`.
    /// The previous buffer must have been fully consumed (this is asserted).
    pub fn feedInput(self: *@This(), input: []const u8) void {
        assert(self.cursor == self.input.len); // Not done with the last input slice.
        if (self.diagnostics) |diag| {
            diag.total_bytes_before_current_input += self.input.len;
            // This usually goes "negative" to measure how far before the beginning
            // of the new buffer the current line started.
            diag.line_start_cursor -%= self.cursor;
        }
        self.input = input;
        self.cursor = 0;
        self.value_start = 0;
    }
    /// Call this when you will no longer call `feedInput()` anymore.
    /// This can be called either immediately after the last `feedInput()`,
    /// or at any time afterward, such as when getting `error.BufferUnderrun` from `next()`.
    /// Don't forget to call `next*()` after `endInput()` until you get `.end_of_document`.
    pub fn endInput(self: *@This()) void {
        self.is_end_of_input = true;
    }

    /// Errors returned by `next()` and `skipUntilStackHeight()`.
    pub const NextError = Error || Allocator.Error || error{BufferUnderrun};
    /// Errors returned by `nextAlloc()` and `nextAllocMax()`.
    pub const AllocError = Error || Allocator.Error || error{ValueTooLong};
    /// Errors returned by `peekNextTokenType()`.
    pub const PeekError = Error || error{BufferUnderrun};
    /// Errors returned by `skipValue()`.
    pub const SkipError = Error || Allocator.Error;
    /// Errors returned by `allocNextIntoArrayList()` and `allocNextIntoArrayListMax()`.
    pub const AllocIntoArrayListError = AllocError || error{BufferUnderrun};

    /// Equivalent to `nextAllocMax(allocator, when, default_max_value_len);`
    /// This function is only available after `endInput()` (or `initCompleteInput()`) has been called.
    /// See also `std.json.Token` for documentation of `nextAlloc*()` function behavior.
    pub fn nextAlloc(self: *@This(), allocator: Allocator, when: AllocWhen) AllocError!Token {
        return self.nextAllocMax(allocator, when, default_max_value_len);
    }

    /// This function is only available after `endInput()` (or `initCompleteInput()`) has been called.
    /// See also `std.json.Token` for documentation of `nextAlloc*()` function behavior.
    /// For `.number` and `.string` tokens the result is either a slice of the input
    /// (`.number`/`.string`) or a newly allocated copy (`.allocated_number`/`.allocated_string`);
    /// all other token types are forwarded from `next()` unchanged.
    pub fn nextAllocMax(self: *@This(), allocator: Allocator, when: AllocWhen, max_value_len: usize) AllocError!Token {
        assert(self.is_end_of_input); // This function is not available in streaming mode.
        // With all input present, BufferUnderrun cannot happen; every other error is propagated.
        const token_type = self.peekNextTokenType() catch |e| switch (e) {
            error.BufferUnderrun => unreachable,
            else => |err| return err,
        };
        switch (token_type) {
            .number, .string => {
                var value_list = ArrayList(u8).init(allocator);
                // Free any partially accumulated value if we fail below.
                errdefer {
                    value_list.deinit();
                }
                if (self.allocNextIntoArrayListMax(&value_list, when, max_value_len) catch |e| switch (e) {
                    error.BufferUnderrun => unreachable,
                    else => |err| return err,
                }) |slice| {
                    // The value was returned as a slice of the input; `value_list` was not used.
                    return if (token_type == .number)
                        Token{ .number = slice }
                    else
                        Token{ .string = slice };
                } else {
                    // The value was accumulated in `value_list`; hand ownership to the caller.
                    return if (token_type == .number)
                        Token{ .allocated_number = try value_list.toOwnedSlice() }
                    else
                        Token{ .allocated_string = try value_list.toOwnedSlice() };
                }
            },

            // Simple tokens never alloc.
            .object_begin,
            .object_end,
            .array_begin,
            .array_end,
            .true,
            .false,
            .null,
            .end_of_document,
            => return self.next() catch |e| switch (e) {
                error.BufferUnderrun => unreachable,
                else => |err| return err,
            },
        }
    }

    /// Equivalent to `allocNextIntoArrayListMax(value_list, when, default_max_value_len);`
    pub fn allocNextIntoArrayList(self: *@This(), value_list: *ArrayList(u8), when: AllocWhen) AllocIntoArrayListError!?[]const u8 {
        return self.allocNextIntoArrayListMax(value_list, when, default_max_value_len);
    }
    /// The next token type must be either `.number` or `.string`. See `peekNextTokenType()`.
    /// When allocation is not necessary with `.alloc_if_needed`,
    /// this method returns the content slice from the input buffer, and `value_list` is not touched.
    /// When allocation is necessary or with `.alloc_always`, this method concatenates partial tokens into the given `value_list`,
    /// and returns `null` once the final `.number` or `.string` token has been written into it.
    /// In case of an `error.BufferUnderrun`, partial values will be left in the given value_list.
    /// The given `value_list` is never reset by this method, so an `error.BufferUnderrun` situation
    /// can be resumed by passing the same array list in again.
    /// This method does not indicate whether the token content being returned is for a `.number` or `.string` token type;
    /// the caller of this method is expected to know which type of token is being processed.
    pub fn allocNextIntoArrayListMax(self: *@This(), value_list: *ArrayList(u8), when: AllocWhen, max_value_len: usize) AllocIntoArrayListError!?[]const u8 {
        while (true) {
            const token = try self.next();
            switch (token) {
                // Accumulate partial values.
                .partial_number, .partial_string => |slice| {
                    try appendSlice(value_list, slice, max_value_len);
                },
                .partial_string_escaped_1 => |buf| {
                    try appendSlice(value_list, buf[0..], max_value_len);
                },
                .partial_string_escaped_2 => |buf| {
                    try appendSlice(value_list, buf[0..], max_value_len);
                },
                .partial_string_escaped_3 => |buf| {
                    try appendSlice(value_list, buf[0..], max_value_len);
                },
                .partial_string_escaped_4 => |buf| {
                    try appendSlice(value_list, buf[0..], max_value_len);
                },

                // Return complete values.
                .number => |slice| {
                    if (when == .alloc_if_needed and value_list.items.len == 0) {
                        // No alloc necessary.
                        return slice;
                    }
                    try appendSlice(value_list, slice, max_value_len);
                    // The token is complete.
                    return null;
                },
                .string => |slice| {
                    if (when == .alloc_if_needed and value_list.items.len == 0) {
                        // No alloc necessary.
                        return slice;
                    }
                    try appendSlice(value_list, slice, max_value_len);
                    // The token is complete.
                    return null;
                },

                .object_begin,
                .object_end,
                .array_begin,
                .array_end,
                .true,
                .false,
                .null,
                .end_of_document,
                => unreachable, // Only .number and .string token types are allowed here. Check peekNextTokenType() before calling this.

                .allocated_number, .allocated_string => unreachable,
            }
        }
    }

    /// This function is only available after `endInput()` (or `initCompleteInput()`) has been called.
    /// If the next token type is `.object_begin` or `.array_begin`,
    /// this function calls `next()` repeatedly until the corresponding `.object_end` or `.array_end` is found.
    /// If the next token type is `.number` or `.string`,
    /// this function calls `next()` repeatedly until the (non `.partial_*`) `.number` or `.string` token is found.
    /// If the next token type is `.true`, `.false`, or `.null`, this function calls `next()` once.
    /// The next token type must not be `.object_end`, `.array_end`, or `.end_of_document`;
    /// see `peekNextTokenType()`.
    pub fn skipValue(self: *@This()) SkipError!void {
        assert(self.is_end_of_input); // This function is not available in streaming mode.
        switch (self.peekNextTokenType() catch |e| switch (e) {
            error.BufferUnderrun => unreachable,
            else => |err| return err,
        }) {
            .object_begin, .array_begin => {
                // The container has not been entered yet, so the current height is
                // exactly the height reached again after its closing token.
                self.skipUntilStackHeight(self.stackHeight()) catch |e| switch (e) {
                    error.BufferUnderrun => unreachable,
                    else => |err| return err,
                };
            },
            .number, .string => {
                while (true) {
                    switch (self.next() catch |e| switch (e) {
                        error.BufferUnderrun => unreachable,
                        else => |err| return err,
                    }) {
                        .partial_number,
                        .partial_string,
                        .partial_string_escaped_1,
                        .partial_string_escaped_2,
                        .partial_string_escaped_3,
                        .partial_string_escaped_4,
                        => continue,
                        .number, .string => break,
                        else => unreachable,
                    }
                }
            },
            .true, .false, .null => {
                _ = self.next() catch |e| switch (e) {
                    error.BufferUnderrun => unreachable,
                    else => |err| return err,
                };
            },

            .object_end, .array_end, .end_of_document => unreachable, // Attempt to skip a non-value token.
        }
    }

    /// Skip tokens until an `.object_end` or `.array_end` token results in a `stackHeight()` equal the given stack height.
    /// Unlike `skipValue()`, this function is available in streaming mode.
    /// Reaching `.end_of_document` before that height is seen is treated as unreachable.
    pub fn skipUntilStackHeight(self: *@This(), terminal_stack_height: usize) NextError!void {
        while (true) {
            switch (try self.next()) {
                .object_end, .array_end => {
                    if (self.stackHeight() == terminal_stack_height) break;
                },
                .end_of_document => unreachable,
                else => continue,
            }
        }
    }

    /// The depth of `{}` or `[]` nesting levels at the current position.
    pub fn stackHeight(self: *const @This()) usize {
        return self.stack.bit_len;
    }

    /// Pre allocate memory to hold the given number of nesting levels.
    /// `stackHeight()` up to the given number will not cause allocations.
    pub fn ensureTotalStackCapacity(self: *@This(), height: usize) Allocator.Error!void {
        try self.stack.ensureTotalCapacity(height);
    }

    /// See `std.json.Token` for documentation of this function.
    /// Runs the state machine from `self.state` until a token can be returned.
    /// When the current buffer runs out mid-token, this returns either a `.partial_*`
    /// token for the bytes seen so far or `error.BufferUnderrun`.
    pub fn next(self: *@This()) NextError!Token {
        state_loop: while (true) {
            switch (self.state) {
                // Expecting the first byte of any JSON value.
                .value => {
                    switch (try self.skipWhitespaceExpectByte()) {
                        // Object, Array
                        '{' => {
                            try self.stack.push(OBJECT_MODE);
                            self.cursor += 1;
                            self.state = .object_start;
                            return .object_begin;
                        },
                        '[' => {
                            try self.stack.push(ARRAY_MODE);
                            self.cursor += 1;
                            self.state = .array_start;
                            return .array_begin;
                        },

                        // String
                        '"' => {
                            // The opening quote is not part of the value slice.
                            self.cursor += 1;
                            self.value_start = self.cursor;
                            self.state = .string;
                            continue :state_loop;
                        },

                        // Number
                        '1'...'9' => {
                            // The first character of a number is part of the value slice.
                            self.value_start = self.cursor;
                            self.cursor += 1;
                            self.state = .number_int;
                            continue :state_loop;
                        },
                        '0' => {
                            self.value_start = self.cursor;
                            self.cursor += 1;
                            self.state = .number_leading_zero;
                            continue :state_loop;
                        },
                        '-' => {
                            self.value_start = self.cursor;
                            self.cursor += 1;
                            self.state = .number_minus;
                            continue :state_loop;
                        },

                        // literal values
                        't' => {
                            self.cursor += 1;
                            self.state = .literal_t;
                            continue :state_loop;
                        },
                        'f' => {
                            self.cursor += 1;
                            self.state = .literal_f;
                            continue :state_loop;
                        },
                        'n' => {
                            self.cursor += 1;
                            self.state = .literal_n;
                            continue :state_loop;
                        },

                        else => return error.SyntaxError,
                    }
                },

                // A value (or object key) has just been completed.
                .post_value => {
                    if (try self.skipWhitespaceCheckEnd()) return .end_of_document;

                    const c = self.input[self.cursor];
                    if (self.string_is_object_key) {
                        // The completed string was an object key, so only ':' may follow.
                        self.string_is_object_key = false;
                        switch (c) {
                            ':' => {
                                self.cursor += 1;
                                self.state = .value;
                                continue :state_loop;
                            },
                            else => return error.SyntaxError,
                        }
                    }

                    switch (c) {
                        '}' => {
                            // The closer must match the innermost open container.
                            if (self.stack.pop() != OBJECT_MODE) return error.SyntaxError;
                            self.cursor += 1;
                            // stay in .post_value state.
                            return .object_end;
                        },
                        ']' => {
                            if (self.stack.pop() != ARRAY_MODE) return error.SyntaxError;
                            self.cursor += 1;
                            // stay in .post_value state.
                            return .array_end;
                        },
                        ',' => {
                            // After a comma, an object expects a key and an array expects a value.
                            switch (self.stack.peek()) {
                                OBJECT_MODE => {
                                    self.state = .object_post_comma;
                                },
                                ARRAY_MODE => {
                                    self.state = .value;
                                },
                            }
                            self.cursor += 1;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },

                // Just after '{': either a key string or an immediate '}'.
                .object_start => {
                    switch (try self.skipWhitespaceExpectByte()) {
                        '"' => {
                            self.cursor += 1;
                            self.value_start = self.cursor;
                            self.state = .string;
                            self.string_is_object_key = true;
                            continue :state_loop;
                        },
                        '}' => {
                            self.cursor += 1;
                            _ = self.stack.pop();
                            self.state = .post_value;
                            return .object_end;
                        },
                        else => return error.SyntaxError,
                    }
                },
                // Just after ',' inside an object: only a key string is accepted.
                .object_post_comma => {
                    switch (try self.skipWhitespaceExpectByte()) {
                        '"' => {
                            self.cursor += 1;
                            self.value_start = self.cursor;
                            self.state = .string;
                            self.string_is_object_key = true;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },

                // Just after '[': either an immediate ']' or the first element.
                .array_start => {
                    switch (try self.skipWhitespaceExpectByte()) {
                        ']' => {
                            self.cursor += 1;
                            _ = self.stack.pop();
                            self.state = .post_value;
                            return .array_end;
                        },
                        else => {
                            // Do not consume the byte; re-dispatch it as the start of a value.
                            self.state = .value;
                            continue :state_loop;
                        },
                    }
                },

                // Number states. The argument to `endOfBufferInNumber()` says whether
                // the number would be complete if the input ended at this point.
                .number_minus => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
                    switch (self.input[self.cursor]) {
                        '0' => {
                            self.cursor += 1;
                            self.state = .number_leading_zero;
                            continue :state_loop;
                        },
                        '1'...'9' => {
                            self.cursor += 1;
                            self.state = .number_int;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .number_leading_zero => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInNumber(true);
                    switch (self.input[self.cursor]) {
                        '.' => {
                            self.cursor += 1;
                            self.state = .number_post_dot;
                            continue :state_loop;
                        },
                        'e', 'E' => {
                            self.cursor += 1;
                            self.state = .number_post_e;
                            continue :state_loop;
                        },
                        else => {
                            // Any other byte ends the number; it is left for `.post_value` to examine.
                            self.state = .post_value;
                            return Token{ .number = self.takeValueSlice() };
                        },
                    }
                },
                .number_int => {
                    while (self.cursor < self.input.len) : (self.cursor += 1) {
                        switch (self.input[self.cursor]) {
                            '0'...'9' => continue,
                            '.' => {
                                self.cursor += 1;
                                self.state = .number_post_dot;
                                continue :state_loop;
                            },
                            'e', 'E' => {
                                self.cursor += 1;
                                self.state = .number_post_e;
                                continue :state_loop;
                            },
                            else => {
                                self.state = .post_value;
                                return Token{ .number = self.takeValueSlice() };
                            },
                        }
                    }
                    return self.endOfBufferInNumber(true);
                },
                .number_post_dot => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
                    switch (self.input[self.cursor]) {
                        '0'...'9' => {
                            self.cursor += 1;
                            self.state = .number_frac;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .number_frac => {
                    while (self.cursor < self.input.len) : (self.cursor += 1) {
                        switch (self.input[self.cursor]) {
                            '0'...'9' => continue,
                            'e', 'E' => {
                                self.cursor += 1;
                                self.state = .number_post_e;
                                continue :state_loop;
                            },
                            else => {
                                self.state = .post_value;
                                return Token{ .number = self.takeValueSlice() };
                            },
                        }
                    }
                    return self.endOfBufferInNumber(true);
                },
                .number_post_e => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
                    switch (self.input[self.cursor]) {
                        '0'...'9' => {
                            self.cursor += 1;
                            self.state = .number_exp;
                            continue :state_loop;
                        },
                        '+', '-' => {
                            self.cursor += 1;
                            self.state = .number_post_e_sign;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .number_post_e_sign => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
                    switch (self.input[self.cursor]) {
                        '0'...'9' => {
                            self.cursor += 1;
                            self.state = .number_exp;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .number_exp => {
                    while (self.cursor < self.input.len) : (self.cursor += 1) {
                        switch (self.input[self.cursor]) {
                            '0'...'9' => continue,
                            else => {
                                self.state = .post_value;
                                return Token{ .number = self.takeValueSlice() };
                            },
                        }
                    }
                    return self.endOfBufferInNumber(true);
                },

                // Inside a string, between escape sequences and multi-byte characters.
                .string => {
                    while (self.cursor < self.input.len) : (self.cursor += 1) {
                        switch (self.input[self.cursor]) {
                            0...0x1f => return error.SyntaxError, // Bare ASCII control code in string.

                            // ASCII plain text.
                            0x20...('"' - 1), ('"' + 1)...('\\' - 1), ('\\' + 1)...0x7F => continue,

                            // Special characters.
                            '"' => {
                                // Take the slice before stepping over the closing quote.
                                const result = Token{ .string = self.takeValueSlice() };
                                self.cursor += 1;
                                self.state = .post_value;
                                return result;
                            },
                            '\\' => {
                                // Emit the plain text before the escape (if any) as a partial string.
                                const slice = self.takeValueSlice();
                                self.cursor += 1;
                                self.state = .string_backslash;
                                if (slice.len > 0) return Token{ .partial_string = slice };
                                continue :state_loop;
                            },

                            // UTF-8 validation.
                            // See http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
                            0xC2...0xDF => {
                                self.cursor += 1;
                                self.state = .string_utf8_last_byte;
                                continue :state_loop;
                            },
                            0xE0 => {
                                self.cursor += 1;
                                self.state = .string_utf8_second_to_last_byte_guard_against_overlong;
                                continue :state_loop;
                            },
                            0xE1...0xEC, 0xEE...0xEF => {
                                self.cursor += 1;
                                self.state = .string_utf8_second_to_last_byte;
                                continue :state_loop;
                            },
                            0xED => {
                                self.cursor += 1;
                                self.state = .string_utf8_second_to_last_byte_guard_against_surrogate_half;
                                continue :state_loop;
                            },
                            0xF0 => {
                                self.cursor += 1;
                                self.state = .string_utf8_third_to_last_byte_guard_against_overlong;
                                continue :state_loop;
                            },
                            0xF1...0xF3 => {
                                self.cursor += 1;
                                self.state = .string_utf8_third_to_last_byte;
                                continue :state_loop;
                            },
                            0xF4 => {
                                self.cursor += 1;
                                self.state = .string_utf8_third_to_last_byte_guard_against_too_large;
                                continue :state_loop;
                            },
                            0x80...0xC1, 0xF5...0xFF => return error.SyntaxError, // Invalid UTF-8.
                        }
                    }
                    // Ran out of buffer inside the string.
                    if (self.is_end_of_input) return error.UnexpectedEndOfInput;
                    const slice = self.takeValueSlice();
                    if (slice.len > 0) return Token{ .partial_string = slice };
                    return error.BufferUnderrun;
                },
                // Just after a backslash inside a string.
                .string_backslash => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    switch (self.input[self.cursor]) {
                        '"', '\\', '/' => {
                            // Since these characters now represent themselves literally,
                            // we can simply begin the next plaintext slice here.
                            self.value_start = self.cursor;
                            self.cursor += 1;
                            self.state = .string;
                            continue :state_loop;
                        },
                        'b' => {
                            self.cursor += 1;
                            self.value_start = self.cursor;
                            self.state = .string;
                            return Token{ .partial_string_escaped_1 = [_]u8{0x08} };
                        },
                        'f' => {
                            self.cursor += 1;
                            self.value_start = self.cursor;
                            self.state = .string;
                            return Token{ .partial_string_escaped_1 = [_]u8{0x0c} };
                        },
                        'n' => {
                            self.cursor += 1;
                            self.value_start = self.cursor;
                            self.state = .string;
                            return Token{ .partial_string_escaped_1 = [_]u8{'\n'} };
                        },
                        'r' => {
                            self.cursor += 1;
                            self.value_start = self.cursor;
                            self.state = .string;
                            return Token{ .partial_string_escaped_1 = [_]u8{'\r'} };
                        },
                        't' => {
                            self.cursor += 1;
                            self.value_start = self.cursor;
                            self.state = .string;
                            return Token{ .partial_string_escaped_1 = [_]u8{'\t'} };
                        },
                        'u' => {
                            self.cursor += 1;
                            self.state = .string_backslash_u;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                // The four hex digits of `\uXXXX`, most significant first,
                // are accumulated into `utf16_code_units[0]`.
                .string_backslash_u => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    const c = self.input[self.cursor];
                    switch (c) {
                        '0'...'9' => {
                            self.utf16_code_units[0] = @as(u16, c - '0') << 12;
                        },
                        'A'...'F' => {
                            self.utf16_code_units[0] = @as(u16, c - 'A' + 10) << 12;
                        },
                        'a'...'f' => {
                            self.utf16_code_units[0] = @as(u16, c - 'a' + 10) << 12;
                        },
                        else => return error.SyntaxError,
                    }
                    self.cursor += 1;
                    self.state = .string_backslash_u_1;
                    continue :state_loop;
                },
                .string_backslash_u_1 => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    const c = self.input[self.cursor];
                    switch (c) {
                        '0'...'9' => {
                            self.utf16_code_units[0] |= @as(u16, c - '0') << 8;
                        },
                        'A'...'F' => {
                            self.utf16_code_units[0] |= @as(u16, c - 'A' + 10) << 8;
                        },
                        'a'...'f' => {
                            self.utf16_code_units[0] |= @as(u16, c - 'a' + 10) << 8;
                        },
                        else => return error.SyntaxError,
                    }
                    self.cursor += 1;
                    self.state = .string_backslash_u_2;
                    continue :state_loop;
                },
                .string_backslash_u_2 => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    const c = self.input[self.cursor];
                    switch (c) {
                        '0'...'9' => {
                            self.utf16_code_units[0] |= @as(u16, c - '0') << 4;
                        },
                        'A'...'F' => {
                            self.utf16_code_units[0] |= @as(u16, c - 'A' + 10) << 4;
                        },
                        'a'...'f' => {
                            self.utf16_code_units[0] |= @as(u16, c - 'a' + 10) << 4;
                        },
                        else => return error.SyntaxError,
                    }
                    self.cursor += 1;
                    self.state = .string_backslash_u_3;
                    continue :state_loop;
                },
                .string_backslash_u_3 => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    const c = self.input[self.cursor];
                    switch (c) {
                        '0'...'9' => {
                            self.utf16_code_units[0] |= c - '0';
                        },
                        'A'...'F' => {
                            self.utf16_code_units[0] |= c - 'A' + 10;
                        },
                        'a'...'f' => {
                            self.utf16_code_units[0] |= c - 'a' + 10;
                        },
                        else => return error.SyntaxError,
                    }
                    self.cursor += 1;
                    if (std.unicode.utf16IsHighSurrogate(self.utf16_code_units[0])) {
                        // A high surrogate must be followed by a `\uXXXX` low surrogate.
                        self.state = .string_surrogate_half;
                        continue :state_loop;
                    } else if (std.unicode.utf16IsLowSurrogate(self.utf16_code_units[0])) {
                        return error.SyntaxError; // Unexpected low surrogate half.
                    } else {
                        self.value_start = self.cursor;
                        self.state = .string;
                        return partialStringCodepoint(self.utf16_code_units[0]);
                    }
                },
                // After a high surrogate: require `\u`, then a low surrogate,
                // accumulated into `utf16_code_units[1]`.
                .string_surrogate_half => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    switch (self.input[self.cursor]) {
                        '\\' => {
                            self.cursor += 1;
                            self.state = .string_surrogate_half_backslash;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError, // Expected low surrogate half.
                    }
                },
                .string_surrogate_half_backslash => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    switch (self.input[self.cursor]) {
                        'u' => {
                            self.cursor += 1;
                            self.state = .string_surrogate_half_backslash_u;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError, // Expected low surrogate half.
                    }
                },
                .string_surrogate_half_backslash_u => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    switch (self.input[self.cursor]) {
                        // Only the digit D is accepted as the first hex digit here.
                        'D', 'd' => {
                            self.cursor += 1;
                            self.utf16_code_units[1] = 0xD << 12;
                            self.state = .string_surrogate_half_backslash_u_1;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError, // Expected low surrogate half.
                    }
                },
                .string_surrogate_half_backslash_u_1 => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    const c = self.input[self.cursor];
                    switch (c) {
                        // Only the digits C through F are accepted as the second hex digit here.
                        'C'...'F' => {
                            self.cursor += 1;
                            self.utf16_code_units[1] |= @as(u16, c - 'A' + 10) << 8;
                            self.state = .string_surrogate_half_backslash_u_2;
                            continue :state_loop;
                        },
                        'c'...'f' => {
                            self.cursor += 1;
                            self.utf16_code_units[1] |= @as(u16, c - 'a' + 10) << 8;
                            self.state = .string_surrogate_half_backslash_u_2;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError, // Expected low surrogate half.
                    }
                },
                .string_surrogate_half_backslash_u_2 => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    const c = self.input[self.cursor];
                    switch (c) {
                        '0'...'9' => {
                            self.cursor += 1;
                            self.utf16_code_units[1] |= @as(u16, c - '0') << 4;
                            self.state = .string_surrogate_half_backslash_u_3;
                            continue :state_loop;
                        },
                        'A'...'F' => {
                            self.cursor += 1;
                            self.utf16_code_units[1] |= @as(u16, c - 'A' + 10) << 4;
                            self.state = .string_surrogate_half_backslash_u_3;
                            continue :state_loop;
                        },
                        'a'...'f' => {
                            self.cursor += 1;
                            self.utf16_code_units[1] |= @as(u16, c - 'a' + 10) << 4;
                            self.state = .string_surrogate_half_backslash_u_3;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .string_surrogate_half_backslash_u_3 => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    const c = self.input[self.cursor];
                    switch (c) {
                        '0'...'9' => {
                            self.utf16_code_units[1] |= c - '0';
                        },
                        'A'...'F' => {
                            self.utf16_code_units[1] |= c - 'A' + 10;
                        },
                        'a'...'f' => {
                            self.utf16_code_units[1] |= c - 'a' + 10;
                        },
                        else => return error.SyntaxError,
                    }
                    self.cursor += 1;
                    self.value_start = self.cursor;
                    self.state = .string;
                    // The preceding states only admit a high surrogate followed by a
                    // \uDC00-\uDFFF escape, so decoding the pair is treated as infallible.
                    const code_point = std.unicode.utf16DecodeSurrogatePair(&self.utf16_code_units) catch unreachable;
                    return partialStringCodepoint(code_point);
                },

                // UTF-8 continuation byte states. The validated bytes stay in the
                // plain text slice; `value_start` is not changed here.
                .string_utf8_last_byte => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    switch (self.input[self.cursor]) {
                        0x80...0xBF => {
                            self.cursor += 1;
                            self.state = .string;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError, // Invalid UTF-8.
                    }
                },
                .string_utf8_second_to_last_byte => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    switch (self.input[self.cursor]) {
                        0x80...0xBF => {
                            self.cursor += 1;
                            self.state = .string_utf8_last_byte;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError, // Invalid UTF-8.
                    }
                },
                .string_utf8_second_to_last_byte_guard_against_overlong => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    switch (self.input[self.cursor]) {
                        0xA0...0xBF => {
                            self.cursor += 1;
                            self.state = .string_utf8_last_byte;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError, // Invalid UTF-8.
                    }
                },
                .string_utf8_second_to_last_byte_guard_against_surrogate_half => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    switch (self.input[self.cursor]) {
                        0x80...0x9F => {
                            self.cursor += 1;
                            self.state = .string_utf8_last_byte;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError, // Invalid UTF-8.
                    }
                },
                .string_utf8_third_to_last_byte => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    switch (self.input[self.cursor]) {
                        0x80...0xBF => {
                            self.cursor += 1;
                            self.state = .string_utf8_second_to_last_byte;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError, // Invalid UTF-8.
                    }
                },
                .string_utf8_third_to_last_byte_guard_against_overlong => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    switch (self.input[self.cursor]) {
                        0x90...0xBF => {
                            self.cursor += 1;
                            self.state = .string_utf8_second_to_last_byte;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError, // Invalid UTF-8.
                    }
                },
                .string_utf8_third_to_last_byte_guard_against_too_large => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
                    switch (self.input[self.cursor]) {
                        0x80...0x8F => {
                            self.cursor += 1;
                            self.state = .string_utf8_second_to_last_byte;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError, // Invalid UTF-8.
                    }
                },

                // Literal states: each one expects the next letter of `true`, `false` or `null`.
                .literal_t => {
                    switch (try self.expectByte()) {
                        'r' => {
                            self.cursor += 1;
                            self.state = .literal_tr;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .literal_tr => {
                    switch (try self.expectByte()) {
                        'u' => {
                            self.cursor += 1;
                            self.state = .literal_tru;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .literal_tru => {
                    switch (try self.expectByte()) {
                        'e' => {
                            self.cursor += 1;
                            self.state = .post_value;
                            return .true;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .literal_f => {
                    switch (try self.expectByte()) {
                        'a' => {
                            self.cursor += 1;
                            self.state = .literal_fa;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .literal_fa => {
                    switch (try self.expectByte()) {
                        'l' => {
                            self.cursor += 1;
                            self.state = .literal_fal;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .literal_fal => {
                    switch (try self.expectByte()) {
                        's' => {
                            self.cursor += 1;
                            self.state = .literal_fals;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .literal_fals => {
                    switch (try self.expectByte()) {
                        'e' => {
                            self.cursor += 1;
                            self.state = .post_value;
                            return .false;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .literal_n => {
                    switch (try self.expectByte()) {
                        'u' => {
                            self.cursor += 1;
                            self.state = .literal_nu;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .literal_nu => {
                    switch (try self.expectByte()) {
                        'l' => {
                            self.cursor += 1;
                            self.state = .literal_nul;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },
                .literal_nul => {
                    switch (try self.expectByte()) {
                        'l' => {
                            self.cursor += 1;
                            self.state = .post_value;
                            return .null;
                        },
                        else => return error.SyntaxError,
                    }
                },
            }
            unreachable;
        }
    }

    /// Seeks ahead in the input until the first byte of the next token (or the end of the input)
    /// determines which type of token will be returned from the next `next*()` call.
    /// This function is idempotent, only advancing past commas, colons, and inter-token whitespace.
    pub fn peekNextTokenType(self: *@This()) PeekError!TokenType {
        state_loop: while (true) {
            switch (self.state) {
                .value => {
                    switch (try self.skipWhitespaceExpectByte()) {
                        '{' => return .object_begin,
                        '[' => return .array_begin,
                        '"' => return .string,
                        '-', '0'...'9' => return .number,
                        't' => return .true,
                        'f' => return .false,
                        'n' => return .null,
                        else => return error.SyntaxError,
                    }
                },

                .post_value => {
                    if (try self.skipWhitespaceCheckEnd()) return .end_of_document;

                    const c = self.input[self.cursor];
                    if (self.string_is_object_key) {
                        // Same handling as in `next()`: consume the ':' and look at the value.
                        self.string_is_object_key = false;
                        switch (c) {
                            ':' => {
                                self.cursor += 1;
                                self.state = .value;
                                continue :state_loop;
                            },
                            else => return error.SyntaxError,
                        }
                    }

                    switch (c) {
                        // Closers are reported but not consumed, and the stack is left untouched.
                        '}' => return .object_end,
                        ']' => return .array_end,
                        ',' => {
                            switch (self.stack.peek()) {
                                OBJECT_MODE => {
                                    self.state = .object_post_comma;
                                },
                                ARRAY_MODE => {
                                    self.state = .value;
                                },
                            }
                            self.cursor += 1;
                            continue :state_loop;
                        },
                        else => return error.SyntaxError,
                    }
                },

                .object_start => {
                    switch (try self.skipWhitespaceExpectByte()) {
                        '"' => return .string,
                        '}' => return .object_end,
                        else => return error.SyntaxError,
                    }
                },
                .object_post_comma => {
                    switch (try self.skipWhitespaceExpectByte()) {
                        '"' => return .string,
                        else => return error.SyntaxError,
                    }
                },

                .array_start => {
                    switch (try self.skipWhitespaceExpectByte()) {
                        ']' => return .array_end,
                        else => {
                            self.state = .value;
                            continue :state_loop;
                        },
                    }
                },

                // In the middle of a token: report the type of the token in progress.
                .number_minus,
                .number_leading_zero,
                .number_int,
                .number_post_dot,
                .number_frac,
                .number_post_e,
                .number_post_e_sign,
                .number_exp,
                => return .number,

                .string,
                .string_backslash,
                .string_backslash_u,
                .string_backslash_u_1,
                .string_backslash_u_2,
                .string_backslash_u_3,
                .string_surrogate_half,
                .string_surrogate_half_backslash,
                .string_surrogate_half_backslash_u,
                .string_surrogate_half_backslash_u_1,
                .string_surrogate_half_backslash_u_2,
                .string_surrogate_half_backslash_u_3,
                => return .string,

                .string_utf8_last_byte,
                .string_utf8_second_to_last_byte,
                .string_utf8_second_to_last_byte_guard_against_overlong,
                .string_utf8_second_to_last_byte_guard_against_surrogate_half,
                .string_utf8_third_to_last_byte,
                .string_utf8_third_to_last_byte_guard_against_overlong,
                .string_utf8_third_to_last_byte_guard_against_too_large,
                => return .string,

                .literal_t,
                .literal_tr,
                .literal_tru,
                => return .true,
                .literal_f,
                .literal_fa,
                .literal_fal,
                .literal_fals,
                => return .false,
                .literal_n,
                .literal_nu,
                .literal_nul,
                => return .null,
            }
            unreachable;
        }
    }

    /// States of the tokenizer state machine driven by `next()` and `peekNextTokenType()`.
    const State = enum {
        // Between tokens.
        value,
        post_value,

        object_start,
        object_post_comma,

        array_start,

        // Inside a number.
        number_minus,
        number_leading_zero,
        number_int,
        number_post_dot,
        number_frac,
        number_post_e,
        number_post_e_sign,
        number_exp,

        // Inside a string, including escape sequences.
        string,
        string_backslash,
        string_backslash_u,
        string_backslash_u_1,
        string_backslash_u_2,
        string_backslash_u_3,
        string_surrogate_half,
        string_surrogate_half_backslash,
        string_surrogate_half_backslash_u,
        string_surrogate_half_backslash_u_1,
        string_surrogate_half_backslash_u_2,
        string_surrogate_half_backslash_u_3,

        // From http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
        string_utf8_last_byte, // State A
        string_utf8_second_to_last_byte, // State B
        string_utf8_second_to_last_byte_guard_against_overlong, // State C
        string_utf8_second_to_last_byte_guard_against_surrogate_half, // State D
        string_utf8_third_to_last_byte, // State E
        string_utf8_third_to_last_byte_guard_against_overlong, // State F
        string_utf8_third_to_last_byte_guard_against_too_large, // State G

        // Inside `true`, `false` or `null`; the name lists the letters consumed so far.
        literal_t,
        literal_tr,
        literal_tru,
        literal_f,
        literal_fa,
        literal_fal,
        literal_fals,
        literal_n,
        literal_nu,
        literal_nul,
    };

    /// Returns the byte at the cursor without consuming it.
    /// When the buffer is exhausted, returns `error.UnexpectedEndOfInput` if the input
    /// has ended, otherwise `error.BufferUnderrun`.
    fn expectByte(self: *const @This()) !u8 {
        if (self.cursor < self.input.len) {
            return self.input[self.cursor];
        }
        // No byte.
        if (self.is_end_of_input) return error.UnexpectedEndOfInput;
        return error.BufferUnderrun;
    }

    /// Advances the cursor past spaces, tabs, carriage returns and newlines,
    /// stopping at the first other byte or at the end of the buffer.
    /// Newlines update the line bookkeeping of `diagnostics` when it is set.
    fn skipWhitespace(self: *@This()) void {
        while (self.cursor < self.input.len) : (self.cursor += 1) {
            switch (self.input[self.cursor]) {
                // Whitespace
                ' ', '\t', '\r' => continue,
                '\n' => {
                    if (self.diagnostics) |diag| {
                        diag.line_number += 1;
                        // This will count the newline itself,
                        // which means a straight-forward subtraction will give a 1-based column number.
                        diag.line_start_cursor = self.cursor;
                    }
                    continue;
                },
                else => return,
            }
        }
    }

    /// Skips whitespace, then behaves like `expectByte()`.
    fn skipWhitespaceExpectByte(self: *@This()) !u8 {
        self.skipWhitespace();
        return self.expectByte();
    }

    /// Skips whitespace and reports whether the document has ended.
    /// Returns `true` only when the input has ended and no containers are open.
    /// Returns `false` when another byte is available inside an open container.
    /// A further byte at nesting depth 0 is `error.SyntaxError`; running out of buffer
    /// is `error.UnexpectedEndOfInput` or `error.BufferUnderrun` depending on `is_end_of_input`.
    fn skipWhitespaceCheckEnd(self: *@This()) !bool {
        self.skipWhitespace();
        if (self.cursor >= self.input.len) {
            // End of buffer.
            if (self.is_end_of_input) {
                // End of everything.
                if (self.stackHeight() == 0) {
                    // We did it!
                    return true;
                }
                return error.UnexpectedEndOfInput;
            }
            return error.BufferUnderrun;
        }
        if (self.stackHeight() == 0) return error.SyntaxError;
        return false;
    }

    /// Returns `input[value_start..cursor]` and moves `value_start` up to the cursor.
    fn takeValueSlice(self: *@This()) []const u8 {
        const slice = self.input[self.value_start..self.cursor];
        self.value_start = self.cursor;
        return slice;
    }
    /// Like `takeValueSlice()`, but leaves out the last `trailing_negative_offset` bytes
    /// before the cursor. Returns an empty slice when nothing remains after removing them.
    fn takeValueSliceMinusTrailingOffset(self: *@This(), trailing_negative_offset: usize) []const u8 {
        // Check if the escape sequence started before the current input buffer.
        // (The algebra here is awkward to avoid unsigned underflow,
        // but it's just making sure the slice on the next line isn't UB.)
        if (self.cursor <= self.value_start + trailing_negative_offset) return "";
        const slice = self.input[self.value_start .. self.cursor - trailing_negative_offset];
        // When trailing_negative_offset is non-zero, setting self.value_start doesn't matter,
        // because we always set it again while emitting the .partial_string_escaped_*.
        self.value_start = self.cursor;
        return slice;
    }

    /// Handles running out of buffer while inside a number.
    /// `allow_end` says whether the number would be complete if the input stopped here.
    /// After end of input this yields the final `.number` token (or `error.UnexpectedEndOfInput`);
    /// otherwise it yields a `.partial_number`, or `error.BufferUnderrun` if there is nothing to emit.
    fn endOfBufferInNumber(self: *@This(), allow_end: bool) !Token {
        const slice = self.takeValueSlice();
        if (self.is_end_of_input) {
            if (!allow_end) return error.UnexpectedEndOfInput;
            self.state = .post_value;
            return Token{ .number = slice };
        }
        if (slice.len == 0) return error.BufferUnderrun;
        return Token{ .partial_number = slice };
    }

    /// Handles running out of buffer while inside a string.
    /// After end of input this is always `error.UnexpectedEndOfInput`.
    /// Otherwise the plain text seen so far is emitted as a `.partial_string`,
    /// or `error.BufferUnderrun` is returned if there is none.
    fn endOfBufferInString(self: *@This()) !Token {
        if (self.is_end_of_input) return error.UnexpectedEndOfInput;
        // Each offset is the number of escape-sequence bytes consumed before reaching that state.
        const slice = self.takeValueSliceMinusTrailingOffset(switch (self.state) {
            // Don't include the escape sequence in the partial string.
            .string_backslash => 1,
            .string_backslash_u => 2,
            .string_backslash_u_1 => 3,
            .string_backslash_u_2 => 4,
            .string_backslash_u_3 => 5,
            .string_surrogate_half => 6,
            .string_surrogate_half_backslash => 7,
            .string_surrogate_half_backslash_u => 8,
            .string_surrogate_half_backslash_u_1 => 9,
            .string_surrogate_half_backslash_u_2 => 10,
            .string_surrogate_half_backslash_u_3 => 11,

            // Include everything up to the cursor otherwise.
            .string,
            .string_utf8_last_byte,
            .string_utf8_second_to_last_byte,
            .string_utf8_second_to_last_byte_guard_against_overlong,
            .string_utf8_second_to_last_byte_guard_against_surrogate_half,
            .string_utf8_third_to_last_byte,
            .string_utf8_third_to_last_byte_guard_against_overlong,
            .string_utf8_third_to_last_byte_guard_against_too_large,
            => 0,

            else => unreachable,
        });
        if (slice.len == 0) return error.BufferUnderrun;
        return Token{ .partial_string = slice };
    }

    /// Encodes `code_point` as UTF-8 and returns it in the `.partial_string_escaped_N`
    /// token whose `N` matches the encoded length (1 to 4 bytes).
    /// Encoding failure is treated as unreachable.
    fn partialStringCodepoint(code_point: u21) Token {
        var buf: [4]u8 = undefined;
        switch (std.unicode.utf8Encode(code_point, &buf) catch unreachable) {
            1 => return Token{ .partial_string_escaped_1 = buf[0..1].* },
            2 => return Token{ .partial_string_escaped_2 = buf[0..2].* },
            3 => return Token{ .partial_string_escaped_3 = buf[0..3].* },
            4 => return Token{ .partial_string_escaped_4 = buf[0..4].* },
            else => unreachable,
        }
    }
}