struct ArgIteratorWindows [src]
Iterator that implements the Windows command-line parsing algorithm.
The implementation is intended to be compatible with the post-2008 C runtime,
but is not intended to be compatible with CommandLineToArgvW since
CommandLineToArgvW uses the pre-2008 parsing rules.
This iterator faithfully implements the parsing behavior observed from the C runtime with
one exception: if the command-line string is empty, the iterator will immediately complete
without returning any arguments (whereas the C runtime will return a single argument
representing the name of the current executable).
The essential parts of the algorithm are described in Microsoft's documentation:
https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-170#parsing-c-command-line-arguments
David Deley explains some additional undocumented quirks in great detail:
https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULES
Fields
allocator: Allocator
cmd_line: []const u16 — Encoded as WTF-16 LE.
index: usize = 0
buffer: []u8 — Owned by the iterator. Long enough to hold contiguous NUL-terminated slices
of each argument encoded as WTF-8.
start: usize = 0
end: usize = 0
Members
Source
/// Iterator over the arguments of a WTF-16 LE Windows command line.
/// Arguments are returned as NUL-terminated WTF-8 slices stored in `buffer`.
pub const ArgIteratorWindows = struct {
    allocator: Allocator,
    /// Encoded as WTF-16 LE. Borrowed from the caller; never freed by the iterator.
    cmd_line: []const u16,
    /// Position of the next unparsed code unit in `cmd_line`.
    index: usize = 0,
    /// Owned by the iterator. Long enough to hold contiguous NUL-terminated slices
    /// of each argument encoded as WTF-8.
    buffer: []u8,
    /// Offset in `buffer` where the argument currently being built starts.
    start: usize = 0,
    /// Offset in `buffer` one past the last byte written so far.
    end: usize = 0,

    pub const InitError = error{OutOfMemory};

    /// `cmd_line_w` *must* be a WTF16-LE-encoded string.
    ///
    /// The iterator stores and uses `cmd_line_w`, so its memory must be valid for
    /// at least as long as the returned ArgIteratorWindows.
    ///
    /// Returns `error.OutOfMemory` if the argument buffer cannot be allocated.
    /// On success, release the buffer with `deinit`.
    pub fn init(allocator: Allocator, cmd_line_w: []const u16) InitError!ArgIteratorWindows {
        const wtf8_len = unicode.calcWtf8Len(cmd_line_w);

        // This buffer must be large enough to contain contiguous NUL-terminated slices
        // of each argument.
        // - During parsing, the length of a parsed argument will always be equal to
        //   or less than its unparsed length.
        // - The first argument needs one extra byte of space allocated for its NUL
        //   terminator, but for each subsequent argument the necessary whitespace
        //   between arguments guarantees room for their NUL terminator(s).
        //
        // Nothing after this allocation can fail, so no `errdefer` is needed.
        const buffer = try allocator.alloc(u8, wtf8_len + 1);

        return .{
            .allocator = allocator,
            .cmd_line = cmd_line_w,
            .buffer = buffer,
        };
    }

    /// Returns the next argument and advances the iterator. Returns `null` if at the end of the
    /// command-line string. The iterator owns the returned slice.
    /// The result is encoded as [WTF-8](https://simonsapin.github.io/wtf-8/).
    pub fn next(self: *ArgIteratorWindows) ?[:0]const u8 {
        return self.nextWithStrategy(next_strategy);
    }

    /// Skips the next argument and advances the iterator. Returns `true` if an argument was
    /// skipped, `false` if at the end of the command-line string.
    pub fn skip(self: *ArgIteratorWindows) bool {
        return self.nextWithStrategy(skip_strategy);
    }

    /// Strategy used by `next`: writes the parsed argument into `buffer` and
    /// yields it as a NUL-terminated slice.
    const next_strategy = struct {
        const T = ?[:0]const u8;

        const eof = null;

        /// Appends `count` backslashes to the buffer.
        /// Returns '\' if any backslashes are emitted, otherwise returns `last_emitted_code_unit`.
        fn emitBackslashes(self: *ArgIteratorWindows, count: usize, last_emitted_code_unit: ?u16) ?u16 {
            for (0..count) |_| {
                self.buffer[self.end] = '\\';
                self.end += 1;
            }
            return if (count != 0) '\\' else last_emitted_code_unit;
        }

        /// If `last_emitted_code_unit` and `code_unit` form a surrogate pair, then
        /// the previously emitted high surrogate is overwritten by the codepoint encoded
        /// by the surrogate pair, and `null` is returned.
        /// Otherwise, `code_unit` is emitted and returned.
        fn emitCharacter(self: *ArgIteratorWindows, code_unit: u16, last_emitted_code_unit: ?u16) ?u16 {
            // Because we are emitting WTF-8, we need to
            // check to see if we've emitted two consecutive surrogate
            // codepoints that form a valid surrogate pair in order
            // to ensure that we're always emitting well-formed WTF-8
            // (https://simonsapin.github.io/wtf-8/#concatenating).
            //
            // If we do have a valid surrogate pair, we need to emit
            // the UTF-8 sequence for the codepoint that they encode
            // instead of the WTF-8 encoding for the two surrogates
            // separately.
            //
            // This is relevant when dealing with a WTF-16 encoded
            // command line like this:
            // "<0xD801>"<0xDC37>
            // which would get parsed and converted to WTF-8 as:
            // <0xED><0xA0><0x81><0xED><0xB0><0xB7>
            // but instead, we need to recognize the surrogate pair
            // and emit the codepoint it encodes, which in this
            // example is U+10437 (𐐷), which is encoded in UTF-8 as:
            // <0xF0><0x90><0x90><0xB7>
            if (last_emitted_code_unit) |previous| {
                if (std.unicode.utf16IsLowSurrogate(code_unit) and
                    std.unicode.utf16IsHighSurrogate(previous))
                {
                    const codepoint = std.unicode.utf16DecodeSurrogatePair(&.{ previous, code_unit }) catch unreachable;

                    // Unpaired surrogate is 3 bytes long
                    const dest = self.buffer[self.end - 3 ..];
                    const len = unicode.utf8Encode(codepoint, dest) catch unreachable;
                    // All codepoints that require a surrogate pair (> U+FFFF) are encoded as 4 bytes
                    assert(len == 4);
                    // The 3-byte surrogate was replaced by a 4-byte sequence: net growth is 1.
                    self.end += 1;
                    return null;
                }
            }

            const wtf8_len = std.unicode.wtf8Encode(code_unit, self.buffer[self.end..]) catch unreachable;
            self.end += wtf8_len;
            return code_unit;
        }

        /// NUL-terminates the bytes written since `start`, returns them as the
        /// argument, and moves `start` past the terminator for the next argument.
        fn yieldArg(self: *ArgIteratorWindows) [:0]const u8 {
            self.buffer[self.end] = 0;
            const arg = self.buffer[self.start..self.end :0];
            self.end += 1;
            self.start = self.end;
            return arg;
        }
    };

    /// Strategy used by `skip`: runs the same parse without writing to `buffer`.
    const skip_strategy = struct {
        const T = bool;

        const eof = false;

        fn emitBackslashes(_: *ArgIteratorWindows, _: usize, last_emitted_code_unit: ?u16) ?u16 {
            return last_emitted_code_unit;
        }

        fn emitCharacter(_: *ArgIteratorWindows, _: u16, last_emitted_code_unit: ?u16) ?u16 {
            return last_emitted_code_unit;
        }

        fn yieldArg(_: *ArgIteratorWindows) bool {
            return true;
        }
    };

    /// Returns the code unit at position `i` of `cmd_line` in native byte order,
    /// or 0 when `i` is the length of `cmd_line`, so that the end of the slice is
    /// handled exactly like an embedded NUL terminator.
    fn codeUnitAt(self: *const ArgIteratorWindows, i: usize) u16 {
        return if (i != self.cmd_line.len)
            mem.littleToNative(u16, self.cmd_line[i])
        else
            0;
    }

    /// Parses one argument starting at `index`, reporting output through `strategy`
    /// (`next_strategy` or `skip_strategy`). Returns `strategy.eof` when there are
    /// no more arguments, otherwise the result of `strategy.yieldArg`.
    fn nextWithStrategy(self: *ArgIteratorWindows, comptime strategy: type) strategy.T {
        // Tracked per argument so that surrogate pairs are only merged within one argument.
        var last_emitted_code_unit: ?u16 = null;

        // The first argument (the executable name) uses different parsing rules.
        if (self.index == 0) {
            if (self.cmd_line.len == 0 or self.cmd_line[0] == 0) {
                // Immediately complete the iterator.
                // The C runtime would return the name of the current executable here.
                return strategy.eof;
            }

            // In the first argument, quotes only toggle the mode and backslashes
            // are ordinary characters.
            var inside_quotes = false;
            while (true) : (self.index += 1) {
                const char = self.codeUnitAt(self.index);
                switch (char) {
                    0 => {
                        return strategy.yieldArg(self);
                    },
                    '"' => {
                        inside_quotes = !inside_quotes;
                    },
                    ' ', '\t' => {
                        if (!inside_quotes) {
                            // Step past the delimiter so that `index` is no longer 0
                            // on the next call, even for an empty first argument.
                            self.index += 1;
                            return strategy.yieldArg(self);
                        }
                        last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
                    },
                    else => {
                        last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
                    },
                }
            }
        }

        // Skip spaces and tabs. The iterator completes if we reach the end of the string here.
        while (true) : (self.index += 1) {
            switch (self.codeUnitAt(self.index)) {
                0 => return strategy.eof,
                ' ', '\t' => continue,
                else => break,
            }
        }

        // Parsing rules for subsequent arguments:
        //
        // - The end of the string always terminates the current argument.
        // - When not in 'inside_quotes' mode, a space or tab terminates the current argument.
        // - 2n backslashes followed by a quote emit n backslashes (note: n can be zero).
        //   If in 'inside_quotes' and the quote is immediately followed by a second quote,
        //   one quote is emitted and the other is skipped, otherwise, the quote is skipped
        //   and 'inside_quotes' is toggled.
        // - 2n + 1 backslashes followed by a quote emit n backslashes followed by a quote.
        // - n backslashes not followed by a quote emit n backslashes.
        var backslash_count: usize = 0;
        var inside_quotes = false;
        while (true) : (self.index += 1) {
            const char = self.codeUnitAt(self.index);
            switch (char) {
                0 => {
                    last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count, last_emitted_code_unit);
                    return strategy.yieldArg(self);
                },
                ' ', '\t' => {
                    last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count, last_emitted_code_unit);
                    backslash_count = 0;
                    if (inside_quotes) {
                        last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
                    } else return strategy.yieldArg(self);
                },
                '"' => {
                    const char_is_escaped_quote = backslash_count % 2 != 0;
                    last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count / 2, last_emitted_code_unit);
                    backslash_count = 0;
                    if (char_is_escaped_quote) {
                        last_emitted_code_unit = strategy.emitCharacter(self, '"', last_emitted_code_unit);
                    } else if (inside_quotes and self.codeUnitAt(self.index + 1) == '"') {
                        // A doubled quote inside quotes emits one literal quote;
                        // the extra increment consumes the second one.
                        last_emitted_code_unit = strategy.emitCharacter(self, '"', last_emitted_code_unit);
                        self.index += 1;
                    } else {
                        inside_quotes = !inside_quotes;
                    }
                },
                '\\' => {
                    // Deferred: how many are emitted depends on whether a quote follows.
                    backslash_count += 1;
                },
                else => {
                    last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count, last_emitted_code_unit);
                    backslash_count = 0;
                    last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
                },
            }
        }
    }

    /// Frees the buffer that backs all previously returned argument slices, which
    /// become invalid. The command-line string passed to `init` is not owned by the
    /// iterator and is not freed.
    pub fn deinit(self: *ArgIteratorWindows) void {
        self.allocator.free(self.buffer);
    }
}