Function wtf8ToUtf8Lossy [src]

Surrogate codepoints (U+D800 to U+DFFF) are replaced by the Unicode replacement character (U+FFFD). All surrogate codepoints and the replacement character are encoded as three bytes, meaning the input and output slices will always be the same length. In-place conversion is supported when utf8 and wtf8 refer to the same slice. Note: If wtf8 is entirely composed of well-formed UTF-8, then no conversion is necessary. utf8ValidateSlice can be used to check if lossy conversion is worthwhile. If wtf8 is not valid WTF-8, then error.InvalidWtf8 is returned.

Prototype

pub fn wtf8ToUtf8Lossy(utf8: []u8, wtf8: []const u8) error{InvalidWtf8}!void

Parameters

utf8: []u8wtf8: []const u8

Possible Errors

InvalidWtf8

Example

 test wtf8ToUtf8Lossy {
    var buf: [32]u8 = undefined;

    const invalid_utf8 = "\xff";
    try testing.expectError(error.InvalidWtf8, wtf8ToUtf8Lossy(&buf, invalid_utf8));

    const ascii = "abcd";
    try wtf8ToUtf8Lossy(&buf, ascii);
    try testing.expectEqualStrings("abcd", buf[0..ascii.len]);

    const high_surrogate_half = "ab\xed\xa0\xbdcd";
    try wtf8ToUtf8Lossy(&buf, high_surrogate_half);
    try testing.expectEqualStrings("ab\u{FFFD}cd", buf[0..high_surrogate_half.len]);

    const low_surrogate_half = "ab\xed\xb2\xa9cd";
    try wtf8ToUtf8Lossy(&buf, low_surrogate_half);
    try testing.expectEqualStrings("ab\u{FFFD}cd", buf[0..low_surrogate_half.len]);

    // If the WTF-8 is not well-formed, each surrogate half is converted into a separate
    // replacement character instead of being interpreted as a surrogate pair.
    const encoded_surrogate_pair = "ab\xed\xa0\xbd\xed\xb2\xa9cd";
    try wtf8ToUtf8Lossy(&buf, encoded_surrogate_pair);
    try testing.expectEqualStrings("ab\u{FFFD}\u{FFFD}cd", buf[0..encoded_surrogate_pair.len]);

    // in place
    @memcpy(buf[0..low_surrogate_half.len], low_surrogate_half);
    const slice = buf[0..low_surrogate_half.len];
    try wtf8ToUtf8Lossy(slice, slice);
    try testing.expectEqualStrings("ab\u{FFFD}cd", slice);
}

Source

  pub fn wtf8ToUtf8Lossy(utf8: []u8, wtf8: []const u8) error{InvalidWtf8}!void {
    assert(utf8.len >= wtf8.len);

    const in_place = utf8.ptr == wtf8.ptr;
    const replacement_char_bytes = comptime blk: {
        var buf: [3]u8 = undefined;
        assert((utf8Encode(replacement_character, &buf) catch unreachable) == 3);
        break :blk buf;
    };

    var dest_i: usize = 0;
    const view = try Wtf8View.init(wtf8);
    var it = view.iterator();
    while (it.nextCodepointSlice()) |codepoint_slice| {
        // All surrogate codepoints are encoded as 3 bytes
        if (codepoint_slice.len == 3) {
            const codepoint = wtf8Decode(codepoint_slice) catch unreachable;
            if (isSurrogateCodepoint(codepoint)) {
                @memcpy(utf8[dest_i..][0..replacement_char_bytes.len], &replacement_char_bytes);
                dest_i += replacement_char_bytes.len;
                continue;
            }
        }
        if (!in_place) {
            @memcpy(utf8[dest_i..][0..codepoint_slice.len], codepoint_slice);
        }
        dest_i += codepoint_slice.len;
    }
}