struct Frame

Fields

hasher_opt: ?std.hash.XxHash64
window_size: usize
has_checksum: bool
block_size_max: usize
content_size: ?usize
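
These fields are normally populated by `Frame.init` (shown under Source below) from a decoded Zstandard frame header rather than set by hand. A minimal sketch of that flow, assuming the struct is reachable as `std.compress.zstd.Frame` and using an illustrative 8 MiB window limit:

const std = @import("std");
const zstd = std.compress.zstd; // assumed module path

fn frameFromHeader(header: zstd.Frame.Zstandard.Header) !zstd.Frame {
    // Validates the header and derives per-frame limits; the second argument
    // caps the window we are willing to buffer, the third requests checksum
    // verification when the frame carries one.
    const frame = try zstd.Frame.init(header, 8 * 1024 * 1024, true);
    // frame.window_size: history the decoder must keep in memory
    // frame.block_size_max: @min(zstd.block_size_max, window_size)
    // frame.has_checksum: a 4-byte content checksum follows the data blocks
    // frame.hasher_opt: non-null only when that checksum will be verified
    // frame.content_size: decompressed size, if declared in the header
    return frame;
}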

Members

Magic
Kind
Zstandard
Skippable
init

Source

pub const Frame = struct {
    hasher_opt: ?std.hash.XxHash64,
    window_size: usize,
    has_checksum: bool,
    block_size_max: usize,
    content_size: ?usize,

    pub const Magic = enum(u32) {
        zstandard = 0xFD2FB528,
        _,

        pub fn kind(m: Magic) ?Kind {
            return switch (@intFromEnum(m)) {
                @intFromEnum(Magic.zstandard) => .zstandard,
                @intFromEnum(Skippable.magic_min)...@intFromEnum(Skippable.magic_max) => .skippable,
                else => null,
            };
        }

        pub fn isSkippable(m: Magic) bool {
            return switch (@intFromEnum(m)) {
                @intFromEnum(Skippable.magic_min)...@intFromEnum(Skippable.magic_max) => true,
                else => false,
            };
        }
    };

    pub const Kind = enum { zstandard, skippable };

    pub const Zstandard = struct {
        pub const magic: Magic = .zstandard;

        header: Header,
        data_blocks: []Block,
        checksum: ?u32,

        pub const Header = struct {
            descriptor: Descriptor,
            window_descriptor: ?u8,
            dictionary_id: ?u32,
            content_size: ?u64,

            pub const Descriptor = packed struct {
                dictionary_id_flag: u2,
                content_checksum_flag: bool,
                reserved: bool,
                unused: bool,
                single_segment_flag: bool,
                content_size_flag: u2,
            };

            pub const DecodeError = Reader.Error || error{ReservedBitSet};

            pub fn decode(in: *Reader) DecodeError!Header {
                const descriptor: Descriptor = @bitCast(try in.takeByte());

                if (descriptor.reserved) return error.ReservedBitSet;

                const window_descriptor: ?u8 = if (descriptor.single_segment_flag)
                    null
                else
                    try in.takeByte();

                const dictionary_id: ?u32 = if (descriptor.dictionary_id_flag > 0) d: {
                    // if flag is 3 then field_size = 4, else field_size = flag
                    const field_size = (@as(u4, 1) << descriptor.dictionary_id_flag) >> 1;
                    break :d try in.takeVarInt(u32, .little, field_size);
                } else null;

                const content_size: ?u64 = if (descriptor.single_segment_flag or descriptor.content_size_flag > 0) c: {
                    const field_size = @as(u4, 1) << descriptor.content_size_flag;
                    const content_size = try in.takeVarInt(u64, .little, field_size);
                    break :c if (field_size == 2) content_size + 256 else content_size;
                } else null;

                return .{
                    .descriptor = descriptor,
                    .window_descriptor = window_descriptor,
                    .dictionary_id = dictionary_id,
                    .content_size = content_size,
                };
            }

            /// Returns the window size required to decompress a frame, or `null` if it
            /// cannot be determined (which indicates a malformed frame header).
            pub fn windowSize(header: Header) ?u64 {
                if (header.window_descriptor) |descriptor| {
                    const exponent = (descriptor & 0b11111000) >> 3;
                    const mantissa = descriptor & 0b00000111;
                    const window_log = 10 + exponent;
                    const window_base = @as(u64, 1) << @as(u6, @intCast(window_log));
                    const window_add = (window_base / 8) * mantissa;
                    return window_base + window_add;
                } else return header.content_size;
            }
        };

        pub const Block = struct {
            pub const Header = packed struct(u24) {
                last: bool,
                type: Type,
                size: u21,
            };

            pub const Type = enum(u2) {
                raw,
                rle,
                compressed,
                reserved,
            };
        };

        pub const Decode = struct {
            repeat_offsets: [3]u32,

            offset: StateData(8),
            match: StateData(9),
            literal: StateData(9),

            literal_fse_buffer: [zstd.table_size_max.literal]Table.Fse,
            match_fse_buffer: [zstd.table_size_max.match]Table.Fse,
            offset_fse_buffer: [zstd.table_size_max.offset]Table.Fse,

            fse_tables_undefined: bool,

            literal_stream_reader: ReverseBitReader,
            literal_stream_index: usize,
            literal_streams: LiteralsSection.Streams,
            literal_header: LiteralsSection.Header,
            huffman_tree: ?LiteralsSection.HuffmanTree,

            literal_written_count: usize,

            fn StateData(comptime max_accuracy_log: comptime_int) type {
                return struct {
                    state: @This().State,
                    table: Table,
                    accuracy_log: u8,

                    const State = std.meta.Int(.unsigned, max_accuracy_log);
                };
            }

            const init: Decode = .{
                .repeat_offsets = .{
                    zstd.start_repeated_offset_1,
                    zstd.start_repeated_offset_2,
                    zstd.start_repeated_offset_3,
                },

                .offset = undefined,
                .match = undefined,
                .literal = undefined,

                .literal_fse_buffer = undefined,
                .match_fse_buffer = undefined,
                .offset_fse_buffer = undefined,

                .fse_tables_undefined = true,

                .literal_written_count = 0,
                .literal_header = undefined,
                .literal_streams = undefined,
                .literal_stream_reader = undefined,
                .literal_stream_index = undefined,
                .huffman_tree = null,
            };

            pub const PrepareError = error{
                /// the (reversed) literal bitstream's first byte does not have any bits set
                MissingStartBit,
                /// `literals` is a treeless literals section and the decode state does not
                /// have a Huffman tree from a previous block
                TreelessLiteralsFirst,
                /// on the first call if one of the sequence FSE tables is set to repeat mode
                RepeatModeFirst,
                /// an FSE table has an invalid accuracy
                MalformedAccuracyLog,
                /// failed decoding an FSE table
                MalformedFseTable,
                /// input stream ends before all FSE tables are read
                EndOfStream,
                ReadFailed,
                InputBufferUndersize,
            };

            /// Prepare the decoder to decode a compressed block. Loads the
            /// literals stream and Huffman tree from `literals` and reads the
            /// FSE tables from `in`.
            pub fn prepare(
                self: *Decode,
                in: *Reader,
                remaining: *Limit,
                literals: LiteralsSection,
                sequences_header: SequencesSection.Header,
            ) PrepareError!void {
                self.literal_written_count = 0;
                self.literal_header = literals.header;
                self.literal_streams = literals.streams;

                if (literals.huffman_tree) |tree| {
                    self.huffman_tree = tree;
                } else if (literals.header.block_type == .treeless and self.huffman_tree == null) {
                    return error.TreelessLiteralsFirst;
                }

                switch (literals.header.block_type) {
                    .raw, .rle => {},
                    .compressed, .treeless => {
                        self.literal_stream_index = 0;
                        switch (literals.streams) {
                            .one => |slice| try self.initLiteralStream(slice),
                            .four => |streams| try self.initLiteralStream(streams[0]),
                        }
                    },
                }

                if (sequences_header.sequence_count > 0) {
                    try self.updateFseTable(in, remaining, .literal, sequences_header.literal_lengths);
                    try self.updateFseTable(in, remaining, .offset, sequences_header.offsets);
                    try self.updateFseTable(in, remaining, .match, sequences_header.match_lengths);
                    self.fse_tables_undefined = false;
                }
            }

            /// Read initial FSE states for sequence decoding.
            pub fn readInitialFseState(self: *Decode, bit_reader: *ReverseBitReader) error{EndOfStream}!void {
                self.literal.state = try bit_reader.readBitsNoEof(u9, self.literal.accuracy_log);
                self.offset.state = try bit_reader.readBitsNoEof(u8, self.offset.accuracy_log);
                self.match.state = try bit_reader.readBitsNoEof(u9, self.match.accuracy_log);
            }

            fn updateRepeatOffset(self: *Decode, offset: u32) void {
                self.repeat_offsets[2] = self.repeat_offsets[1];
                self.repeat_offsets[1] = self.repeat_offsets[0];
                self.repeat_offsets[0] = offset;
            }

            fn useRepeatOffset(self: *Decode, index: usize) u32 {
                if (index == 1)
                    std.mem.swap(u32, &self.repeat_offsets[0], &self.repeat_offsets[1])
                else if (index == 2) {
                    std.mem.swap(u32, &self.repeat_offsets[0], &self.repeat_offsets[2]);
                    std.mem.swap(u32, &self.repeat_offsets[1], &self.repeat_offsets[2]);
                }
                return self.repeat_offsets[0];
            }

            const WhichFse = enum { offset, match, literal };

            /// TODO: don't use `@field`
            fn updateState(
                self: *Decode,
                comptime choice: WhichFse,
                bit_reader: *ReverseBitReader,
            ) error{ MalformedFseBits, EndOfStream }!void {
                switch (@field(self, @tagName(choice)).table) {
                    .rle => {},
                    .fse => |table| {
                        const data = table[@field(self, @tagName(choice)).state];
                        const T = @TypeOf(@field(self, @tagName(choice))).State;
                        const bits_summand = try bit_reader.readBitsNoEof(T, data.bits);
                        const next_state = std.math.cast(
                            @TypeOf(@field(self, @tagName(choice))).State,
                            data.baseline + bits_summand,
                        ) orelse return error.MalformedFseBits;
                        @field(self, @tagName(choice)).state = next_state;
                    },
                }
            }

            const FseTableError = error{
                MalformedFseTable,
                MalformedAccuracyLog,
                RepeatModeFirst,
                EndOfStream,
            };

            /// TODO: don't use `@field`
            fn updateFseTable(
                self: *Decode,
                in: *Reader,
                remaining: *Limit,
                comptime choice: WhichFse,
                mode: SequencesSection.Header.Mode,
            ) !void {
                const field_name = @tagName(choice);
                switch (mode) {
                    .predefined => {
                        @field(self, field_name).accuracy_log = @field(zstd.default_accuracy_log, field_name);
                        @field(self, field_name).table = @field(Table, "predefined_" ++ field_name);
                    },
                    .rle => {
                        @field(self, field_name).accuracy_log = 0;
                        remaining.* = remaining.subtract(1) orelse return error.EndOfStream;
                        @field(self, field_name).table = .{ .rle = try in.takeByte() };
                    },
                    .fse => {
                        const max_table_size = 2048;
                        const peek_len: usize = remaining.minInt(max_table_size);
                        if (in.buffer.len < peek_len) return error.InputBufferUndersize;
                        const limited_buffer = try in.peek(peek_len);
                        var bit_reader: BitReader = .{ .bytes = limited_buffer };
                        const table_size = try Table.decode(
                            &bit_reader,
                            @field(zstd.table_symbol_count_max, field_name),
                            @field(zstd.table_accuracy_log_max, field_name),
                            &@field(self, field_name ++ "_fse_buffer"),
                        );
                        @field(self, field_name).table = .{
                            .fse = (&@field(self, field_name ++ "_fse_buffer"))[0..table_size],
                        };
                        @field(self, field_name).accuracy_log = std.math.log2_int_ceil(usize, table_size);
                        in.toss(bit_reader.index);
                        remaining.* = remaining.subtract(bit_reader.index).?;
                    },
                    .repeat => if (self.fse_tables_undefined) return error.RepeatModeFirst,
                }
            }

            const Sequence = struct {
                literal_length: u32,
                match_length: u32,
                offset: u32,
            };

            fn nextSequence(
                self: *Decode,
                bit_reader: *ReverseBitReader,
            ) error{ InvalidBitStream, EndOfStream }!Sequence {
                const raw_code = self.getCode(.offset);
                const offset_code = std.math.cast(u5, raw_code) orelse {
                    return error.InvalidBitStream;
                };
                const offset_value = (@as(u32, 1) << offset_code) + try bit_reader.readBitsNoEof(u32, offset_code);

                const match_code = self.getCode(.match);
                if (match_code >= zstd.match_length_code_table.len)
                    return error.InvalidBitStream;
                const match = zstd.match_length_code_table[match_code];
                const match_length = match[0] + try bit_reader.readBitsNoEof(u32, match[1]);

                const literal_code = self.getCode(.literal);
                if (literal_code >= zstd.literals_length_code_table.len)
                    return error.InvalidBitStream;
                const literal = zstd.literals_length_code_table[literal_code];
                const literal_length = literal[0] + try bit_reader.readBitsNoEof(u32, literal[1]);

                const offset = if (offset_value > 3) offset: {
                    const offset = offset_value - 3;
                    self.updateRepeatOffset(offset);
                    break :offset offset;
                } else offset: {
                    if (literal_length == 0) {
                        if (offset_value == 3) {
                            const offset = self.repeat_offsets[0] - 1;
                            self.updateRepeatOffset(offset);
                            break :offset offset;
                        }
                        break :offset self.useRepeatOffset(offset_value);
                    }
                    break :offset self.useRepeatOffset(offset_value - 1);
                };
                if (offset == 0) return error.InvalidBitStream;

                return .{
                    .literal_length = literal_length,
                    .match_length = match_length,
                    .offset = offset,
                };
            }

            /// Decode one sequence from `bit_reader` into `dest`. Updates FSE states
            /// if `last_sequence` is `false`. Assumes `prepare` called for the block
            /// before attempting to decode sequences.
            fn decodeSequence(
                decode: *Decode,
                dest: []u8,
                write_pos: usize,
                bit_reader: *ReverseBitReader,
            ) !usize {
                const sequence = try decode.nextSequence(bit_reader);
                const literal_length: usize = sequence.literal_length;
                const match_length: usize = sequence.match_length;
                const sequence_length = literal_length + match_length;

                if (sequence_length > dest[write_pos..].len) return error.MalformedSequence;
                const copy_start = std.math.sub(usize, write_pos + sequence.literal_length, sequence.offset) catch
                    return error.MalformedSequence;

                if (decode.literal_written_count + literal_length > decode.literal_header.regenerated_size)
                    return error.MalformedLiteralsLength;
                var sub_bw: Writer = .fixed(dest[write_pos..]);
                try decodeLiterals(decode, &sub_bw, literal_length);
                decode.literal_written_count += literal_length;

                // This is not a @memmove; it intentionally repeats patterns
                // caused by iterating one byte at a time.
                for (
                    dest[write_pos + literal_length ..][0..match_length],
                    dest[copy_start..][0..match_length],
                ) |*d, s| d.* = s;

                return sequence_length;
            }

            fn nextLiteralMultiStream(self: *Decode) error{MissingStartBit}!void {
                self.literal_stream_index += 1;
                try self.initLiteralStream(self.literal_streams.four[self.literal_stream_index]);
            }

            fn initLiteralStream(self: *Decode, bytes: []const u8) error{MissingStartBit}!void {
                self.literal_stream_reader = try ReverseBitReader.init(bytes);
            }

            fn isLiteralStreamEmpty(self: *Decode) bool {
                switch (self.literal_streams) {
                    .one => return self.literal_stream_reader.isEmpty(),
                    .four => return self.literal_stream_index == 3 and self.literal_stream_reader.isEmpty(),
                }
            }

            const LiteralBitsError = error{
                MissingStartBit,
                UnexpectedEndOfLiteralStream,
            };

            fn readLiteralsBits(
                self: *Decode,
                bit_count_to_read: u16,
            ) LiteralBitsError!u16 {
                return self.literal_stream_reader.readBitsNoEof(u16, bit_count_to_read) catch bits: {
                    if (self.literal_streams == .four and self.literal_stream_index < 3) {
                        try self.nextLiteralMultiStream();
                        break :bits self.literal_stream_reader.readBitsNoEof(u16, bit_count_to_read) catch
                            return error.UnexpectedEndOfLiteralStream;
                    } else {
                        return error.UnexpectedEndOfLiteralStream;
                    }
                };
            }

            /// Decode `len` bytes of literals into `w`.
            fn decodeLiterals(d: *Decode, w: *Writer, len: usize) !void {
                switch (d.literal_header.block_type) {
                    .raw => {
                        try w.writeAll(d.literal_streams.one[d.literal_written_count..][0..len]);
                    },
                    .rle => {
                        try w.splatByteAll(d.literal_streams.one[0], len);
                    },
                    .compressed, .treeless => {
                        const buf = try w.writableSlice(len);
                        const huffman_tree = d.huffman_tree.?;
                        const max_bit_count = huffman_tree.max_bit_count;
                        const starting_bit_count = LiteralsSection.HuffmanTree.weightToBitCount(
                            huffman_tree.nodes[huffman_tree.symbol_count_minus_one].weight,
                            max_bit_count,
                        );
                        var bits_read: u4 = 0;
                        var huffman_tree_index: usize = huffman_tree.symbol_count_minus_one;
                        var bit_count_to_read: u4 = starting_bit_count;

                        for (buf) |*out| {
                            var prefix: u16 = 0;
                            while (true) {
                                const new_bits = try d.readLiteralsBits(bit_count_to_read);
                                prefix <<= bit_count_to_read;
                                prefix |= new_bits;
                                bits_read += bit_count_to_read;
                                const result = try huffman_tree.query(huffman_tree_index, prefix);
                                switch (result) {
                                    .symbol => |sym| {
                                        out.* = sym;
                                        bit_count_to_read = starting_bit_count;
                                        bits_read = 0;
                                        huffman_tree_index = huffman_tree.symbol_count_minus_one;
                                        break;
                                    },
                                    .index => |index| {
                                        huffman_tree_index = index;
                                        const bit_count = LiteralsSection.HuffmanTree.weightToBitCount(
                                            huffman_tree.nodes[index].weight,
                                            max_bit_count,
                                        );
                                        bit_count_to_read = bit_count - bits_read;
                                    },
                                }
                            }
                        }
                    },
                }
            }

            /// TODO: don't use `@field`
            fn getCode(self: *Decode, comptime choice: WhichFse) u32 {
                return switch (@field(self, @tagName(choice)).table) {
                    .rle => |value| value,
                    .fse => |table| table[@field(self, @tagName(choice)).state].symbol,
                };
            }
        };
    };

    pub const Skippable = struct {
        pub const magic_min: Magic = @enumFromInt(0x184D2A50);
        pub const magic_max: Magic = @enumFromInt(0x184D2A5F);

        pub const Header = struct {
            magic_number: u32,
            frame_size: u32,
        };
    };

    const InitError = error{
        /// Frame uses a dictionary.
        DictionaryIdFlagUnsupported,
        /// Frame does not have a valid window size.
        WindowSizeUnknown,
        /// Window size exceeds `window_size_max` or max `usize` value.
        WindowOversize,
        /// Frame header indicates a content size exceeding max `usize` value.
        ContentOversize,
    };

    /// Validates `frame_header` and returns the associated `Frame`.
    pub fn init(
        frame_header: Frame.Zstandard.Header,
        window_size_max: usize,
        verify_checksum: bool,
    ) InitError!Frame {
        if (frame_header.descriptor.dictionary_id_flag != 0)
            return error.DictionaryIdFlagUnsupported;

        const window_size_raw = frame_header.windowSize() orelse return error.WindowSizeUnknown;
        const window_size = if (window_size_raw > window_size_max)
            return error.WindowOversize
        else
            std.math.cast(usize, window_size_raw) orelse return error.WindowOversize;

        const should_compute_checksum = frame_header.descriptor.content_checksum_flag and verify_checksum;

        const content_size = if (frame_header.content_size) |size|
            std.math.cast(usize, size) orelse return error.ContentOversize
        else
            null;

        return .{
            .hasher_opt = if (should_compute_checksum) std.hash.XxHash64.init(0) else null,
            .window_size = window_size,
            .has_checksum = frame_header.descriptor.content_checksum_flag,
            .block_size_max = @min(zstd.block_size_max, window_size),
            .content_size = content_size,
        };
    }
};
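
For context, a short sketch of how the declarations above compose when reading a frame header from an in-memory buffer. It assumes the `Reader` used throughout this file is `std.Io.Reader` (with `Reader.fixed` wrapping a byte slice) and that the module is reachable as `std.compress.zstd`; treat both as assumptions of this sketch rather than guarantees of the API:

const std = @import("std");
const zstd = std.compress.zstd; // assumed module path

fn inspectFrame(bytes_after_magic: []const u8) !void {
    var in: std.Io.Reader = .fixed(bytes_after_magic);

    // Decode the frame header fields that follow the 4-byte magic number.
    const header = try zstd.Frame.Zstandard.Header.decode(&in);

    // `null` means the window size cannot be determined, i.e. the header
    // has neither a window descriptor nor a content size to fall back on.
    const window = header.windowSize() orelse return error.WindowSizeUnknown;
    std.debug.print("window size: {d}\n", .{window});

    // Reject oversized windows and set up optional checksum verification.
    const frame = try zstd.Frame.init(header, 1 << 23, true);
    std.debug.print("max block size: {d}\n", .{frame.block_size_max});
}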