struct ArgIteratorWindows [src]

Iterator that implements the Windows command-line parsing algorithm. The implementation is intended to be compatible with the post-2008 C runtime, but is not intended to be compatible with CommandLineToArgvW since CommandLineToArgvW uses the pre-2008 parsing rules. This iterator faithfully implements the parsing behavior observed from the C runtime with one exception: if the command-line string is empty, the iterator will immediately complete without returning any arguments (whereas the C runtime will return a single argument representing the name of the current executable). The essential parts of the algorithm are described in Microsoft's documentation: https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-170#parsing-c-command-line-arguments David Deley explains some additional undocumented quirks in great detail: https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULES

Fields

allocator: Allocator
cmd_line: []const u16 — Encoded as WTF-16 LE.
index: usize = 0
buffer: []u8 — Owned by the iterator. Long enough to hold contiguous NUL-terminated slices of each argument encoded as WTF-8.
start: usize = 0
end: usize = 0

Members

Source

/// Iterator over the arguments of a Windows command-line string, following the
/// parsing rules of the post-2008 C runtime. An empty command line yields no
/// arguments at all.
pub const ArgIteratorWindows = struct {
    allocator: Allocator,
    /// Encoded as WTF-16 LE.
    cmd_line: []const u16,
    /// Position of the next unread code unit in `cmd_line`.
    index: usize = 0,
    /// Owned by the iterator. Long enough to hold contiguous NUL-terminated slices
    /// of each argument encoded as WTF-8.
    buffer: []u8,
    /// Offset in `buffer` where the argument currently being built begins.
    start: usize = 0,
    /// Offset in `buffer` one past the last byte written so far.
    end: usize = 0,

    pub const InitError = error{OutOfMemory};

    /// Creates an iterator over `cmd_line_w`, which *must* be WTF-16 LE encoded.
    ///
    /// `cmd_line_w` is stored, not copied: it has to stay valid for as long as
    /// the returned iterator is in use. Release the iterator with `deinit`.
    pub fn init(allocator: Allocator, cmd_line_w: []const u16) InitError!ArgIteratorWindows {
        // Every argument is stored in `buffer` back to back, each followed by a NUL.
        // - A parsed argument is never longer than its unparsed form.
        // - Only the first argument needs an extra byte for its NUL terminator;
        //   every later argument is preceded by whitespace in the command line,
        //   which already accounts for the room its terminator takes.
        const capacity = unicode.calcWtf8Len(cmd_line_w) + 1;
        const buffer = try allocator.alloc(u8, capacity);
        return .{
            .allocator = allocator,
            .cmd_line = cmd_line_w,
            .buffer = buffer,
        };
    }

    /// Parses and returns the next argument, or `null` once the command line is
    /// exhausted. The returned slice lives in memory owned by the iterator.
    /// The result is encoded as [WTF-8](https://simonsapin.github.io/wtf-8/).
    pub fn next(self: *ArgIteratorWindows) ?[:0]const u8 {
        return self.nextWithStrategy(next_strategy);
    }

    /// Advances past the next argument without producing it. Returns `true` if
    /// there was an argument to skip and `false` at the end of the command line.
    pub fn skip(self: *ArgIteratorWindows) bool {
        return self.nextWithStrategy(skip_strategy);
    }

    /// Strategy used by `next`: parsed output is written into `self.buffer`.
    const next_strategy = struct {
        const T = ?[:0]const u8;

        const eof = null;

        /// Appends `count` backslashes to the buffer.
        /// Returns '\' if any were written, otherwise `last_emitted_code_unit`.
        fn emitBackslashes(self: *ArgIteratorWindows, count: usize, last_emitted_code_unit: ?u16) ?u16 {
            if (count == 0) return last_emitted_code_unit;
            @memset(self.buffer[self.end..][0..count], '\\');
            self.end += count;
            return '\\';
        }

        /// Appends `code_unit` to the buffer as WTF-8 and returns it.
        ///
        /// Exception: when `last_emitted_code_unit` is a high surrogate and
        /// `code_unit` is a low surrogate, the bytes already written for the high
        /// surrogate are replaced by the encoding of the codepoint the pair
        /// represents, and `null` is returned.
        fn emitCharacter(self: *ArgIteratorWindows, code_unit: u16, last_emitted_code_unit: ?u16) ?u16 {
            // Well-formed WTF-8 never contains a high surrogate directly followed
            // by a low surrogate (https://simonsapin.github.io/wtf-8/#concatenating).
            // Quote removal can make two surrogates adjacent that were not adjacent
            // in the raw command line, e.g.
            //   "<0xD801>"<0xDC37>
            // Encoding them separately would give
            //   <0xED><0xA0><0x81><0xED><0xB0><0xB7>
            // so the pair is merged into U+10437 instead:
            //   <0xF0><0x90><0x90><0xB7>
            if (last_emitted_code_unit) |previous| {
                if (std.unicode.utf16IsHighSurrogate(previous) and std.unicode.utf16IsLowSurrogate(code_unit)) {
                    const codepoint = std.unicode.utf16DecodeSurrogatePair(&.{ previous, code_unit }) catch unreachable;
                    // The lone high surrogate occupies the last 3 bytes written.
                    const high_start = self.end - 3;
                    const pair_len = unicode.utf8Encode(codepoint, self.buffer[high_start..]) catch unreachable;
                    // Codepoints above U+FFFF always take 4 bytes.
                    assert(pair_len == 4);
                    self.end = high_start + 4;
                    return null;
                }
            }

            const encoded_len = std.unicode.wtf8Encode(code_unit, self.buffer[self.end..]) catch unreachable;
            self.end += encoded_len;
            return code_unit;
        }

        /// NUL-terminates the argument built so far, returns it, and positions
        /// `start`/`end` right after the terminator for the next argument.
        fn yieldArg(self: *ArgIteratorWindows) [:0]const u8 {
            const terminator = self.end;
            self.buffer[terminator] = 0;
            const arg = self.buffer[self.start..terminator :0];
            self.start = terminator + 1;
            self.end = self.start;
            return arg;
        }
    };

    /// Strategy used by `skip`: nothing is written, only the position advances.
    const skip_strategy = struct {
        const T = bool;

        const eof = false;

        fn emitBackslashes(_: *ArgIteratorWindows, _: usize, last_emitted_code_unit: ?u16) ?u16 {
            return last_emitted_code_unit;
        }

        fn emitCharacter(_: *ArgIteratorWindows, _: u16, last_emitted_code_unit: ?u16) ?u16 {
            return last_emitted_code_unit;
        }

        fn yieldArg(_: *ArgIteratorWindows) bool {
            return true;
        }
    };

    /// Returns the code unit at `self.index` in native byte order, or 0 when
    /// `self.index` is at the end of the command line.
    fn currentCodeUnit(self: *const ArgIteratorWindows) u16 {
        if (self.index == self.cmd_line.len) return 0;
        return mem.littleToNative(u16, self.cmd_line[self.index]);
    }

    /// Shared parser behind `next` and `skip`. `strategy` decides what happens
    /// with the parsed code units and what is returned for an argument
    /// (`yieldArg`) or at the end of the command line (`eof`).
    fn nextWithStrategy(self: *ArgIteratorWindows, comptime strategy: type) strategy.T {
        var previous: ?u16 = null;

        // The first argument (the executable name) has its own, simpler rules:
        // quotes only toggle quoting and backslashes are ordinary characters.
        if (self.index == 0) {
            // Nothing to parse: finish right away. The C runtime would report the
            // name of the current executable here instead.
            if (self.cmd_line.len == 0 or self.cmd_line[0] == 0) return strategy.eof;

            var quoted = false;
            while (true) : (self.index += 1) {
                const unit = self.currentCodeUnit();
                switch (unit) {
                    0 => return strategy.yieldArg(self),
                    '"' => quoted = !quoted,
                    ' ', '\t' => {
                        if (!quoted) {
                            // Step over the separator before handing out the argument.
                            self.index += 1;
                            return strategy.yieldArg(self);
                        }
                        previous = strategy.emitCharacter(self, unit, previous);
                    },
                    else => previous = strategy.emitCharacter(self, unit, previous),
                }
            }
        }

        // Consume separating spaces and tabs. Reaching the end of the string
        // here means there are no more arguments.
        while (true) : (self.index += 1) {
            switch (self.currentCodeUnit()) {
                0 => return strategy.eof,
                ' ', '\t' => {},
                else => break,
            }
        }

        // Rules for every argument after the first:
        //
        // - The end of the string always ends the argument.
        // - Outside quotes, a space or tab ends the argument.
        // - 2n backslashes followed by a quote produce n backslashes (n may be 0).
        //   If currently quoted and the quote is directly followed by another
        //   quote, one quote is produced and the second is consumed; otherwise
        //   the quote is dropped and quoting is toggled.
        // - 2n + 1 backslashes followed by a quote produce n backslashes and a quote.
        // - n backslashes not followed by a quote produce n backslashes.
        var pending_backslashes: usize = 0;
        var quoted = false;
        while (true) : (self.index += 1) {
            const unit = self.currentCodeUnit();
            if (unit == '\\') {
                pending_backslashes += 1;
                continue;
            }

            // Any other code unit settles the pending run of backslashes:
            // halved in front of a quote, verbatim otherwise.
            const quote_is_escaped = pending_backslashes % 2 != 0;
            const flush_count = if (unit == '"') pending_backslashes / 2 else pending_backslashes;
            previous = strategy.emitBackslashes(self, flush_count, previous);
            pending_backslashes = 0;

            switch (unit) {
                0 => return strategy.yieldArg(self),
                ' ', '\t' => {
                    if (!quoted) return strategy.yieldArg(self);
                    previous = strategy.emitCharacter(self, unit, previous);
                },
                '"' => {
                    if (quote_is_escaped) {
                        previous = strategy.emitCharacter(self, '"', previous);
                    } else if (quoted and
                        self.index + 1 != self.cmd_line.len and
                        mem.littleToNative(u16, self.cmd_line[self.index + 1]) == '"')
                    {
                        // `""` while quoted: keep one quote, consume the other.
                        previous = strategy.emitCharacter(self, '"', previous);
                        self.index += 1;
                    } else {
                        quoted = !quoted;
                    }
                },
                else => previous = strategy.emitCharacter(self, unit, previous),
            }
        }
    }

    /// Frees the buffer owned by the iterator, which invalidates every argument
    /// slice previously returned by `next`. The command-line string passed to
    /// `init` is not freed here.
    pub fn deinit(self: *ArgIteratorWindows) void {
        self.allocator.free(self.buffer);
    }
};