struct deflate [src]
Alias for std.compress.flate.deflate
Members
- compress (Function)
- compressor (Function)
- Compressor (Type Function)
- huffman (struct)
- Level (enum)
- Options (struct)
- store (struct)
Source
const std = @import("std");
const io = std.io;
const assert = std.debug.assert;
const testing = std.testing;
const expect = testing.expect;
const print = std.debug.print;
const Token = @import("Token.zig");
const consts = @import("consts.zig");
const BlockWriter = @import("block_writer.zig").BlockWriter;
const Container = @import("container.zig").Container;
const SlidingWindow = @import("SlidingWindow.zig");
const Lookup = @import("Lookup.zig");
pub const Options = struct {
level: Level = .default,
};
/// Trades between speed and compression size.
/// Levels start at 4: in [zlib](https://github.com/madler/zlib/blob/abd3d1a28930f89375d4b41408b39f6c1be157b2/deflate.c#L115C1-L117C43)
/// levels 1-3 use a different, faster algorithm with less compression. That
/// algorithm is not implemented here.
pub const Level = enum(u4) {
// zig fmt: off
fast = 0xb,        level_4 = 4,
                   level_5 = 5,
default = 0xc,     level_6 = 6,
                   level_7 = 7,
                   level_8 = 8,
best = 0xd,        level_9 = 9,
// zig fmt: on
};
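// An illustrative usage sketch: selecting a compression level through
// `Options`. Assumes highly repetitive input, which should compress well
// below its original size at any level; `.best` simply searches hardest.
test "Level selection (illustrative)" {
    const data = "abcd" ** 256; // 1024 bytes of repetitive input
    var fbs = io.fixedBufferStream(data);
    var al = std.ArrayList(u8).init(testing.allocator);
    defer al.deinit();
    try compress(.raw, fbs.reader(), al.writer(), .{ .level = .best });
    try expect(al.items.len < data.len);
}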
/// Algorithm knobs for each level.
const LevelArgs = struct {
good: u16, // Do fewer lookups if we already have a match of this length.
nice: u16, // Stop looking for a better match if we find one at least this long.
lazy: u16, // Don't do lazy match finding if we got a match at least this long.
chain: u16, // How many lookups for a previous match to perform.
pub fn get(level: Level) LevelArgs {
// zig fmt: off
return switch (level) {
.fast, .level_4 => .{ .good = 4, .lazy = 4, .nice = 16, .chain = 16 },
.level_5 => .{ .good = 8, .lazy = 16, .nice = 32, .chain = 32 },
.default, .level_6 => .{ .good = 8, .lazy = 16, .nice = 128, .chain = 128 },
.level_7 => .{ .good = 8, .lazy = 32, .nice = 128, .chain = 256 },
.level_8 => .{ .good = 32, .lazy = 128, .nice = 258, .chain = 1024 },
.best, .level_9 => .{ .good = 32, .lazy = 258, .nice = 258, .chain = 4096 },
};
// zig fmt: on
}
};
/// Compress plain data from reader into compressed stream written to writer.
pub fn compress(comptime container: Container, reader: anytype, writer: anytype, options: Options) !void {
var c = try compressor(container, writer, options);
try c.compress(reader);
try c.finish();
}
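// An illustrative usage sketch: one-shot gzip compression from a fixed buffer
// into an ArrayList. The two leading magic bytes 0x1f 0x8b come from the gzip
// header specification.
test "compress into gzip container (illustrative)" {
    var fbs = io.fixedBufferStream("Hello world!");
    var al = std.ArrayList(u8).init(testing.allocator);
    defer al.deinit();
    try compress(.gzip, fbs.reader(), al.writer(), .{});
    try expect(al.items.len > 2);
    try testing.expectEqualSlices(u8, &[_]u8{ 0x1f, 0x8b }, al.items[0..2]);
}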
/// Create compressor for writer type.
pub fn compressor(comptime container: Container, writer: anytype, options: Options) !Compressor(
container,
@TypeOf(writer),
) {
return try Compressor(container, @TypeOf(writer)).init(writer, options);
}
/// Compressor type.
pub fn Compressor(comptime container: Container, comptime WriterType: type) type {
const TokenWriterType = BlockWriter(WriterType);
return Deflate(container, WriterType, TokenWriterType);
}
/// Default compression algorithm. Has two steps: tokenization and token
/// encoding.
///
/// Tokenization takes the uncompressed input stream and produces a list of
/// tokens. Each token can be a literal (byte of data) or a match (back-reference
/// to previous data with length and distance). Tokenization accumulates 32K
/// tokens; when the buffer is full or `flush` is called, tokens are passed to
/// the `block_writer`. Level defines how hard (how slow) it tries to find a match.
///
/// The block writer decides which type of deflate block to write (stored, fixed,
/// dynamic) and encodes tokens to the output byte stream. The client has to call
/// `finish` to write the block with the final bit set.
///
/// Container defines the type of header and footer, which can be gzip, zlib or
/// raw. They all share the same deflate body. Raw has no header or footer, just
/// the deflate body.
///
/// Compression algorithm explained in rfc-1951 (slightly edited for this case):
///
/// The compressor uses a chained hash table `lookup` to find duplicated
/// strings, using a hash function that operates on 4-byte sequences. At any
/// given point during compression, let XYZW be the next 4 input bytes
/// (lookahead) to be examined (not necessarily all different, of course).
/// First, the compressor examines the hash chain for XYZW. If the chain is
/// empty, the compressor simply writes out X as a literal byte and advances
/// one byte in the input. If the hash chain is not empty, indicating that the
/// sequence XYZW (or, if we are unlucky, some other 4 bytes with the same
/// hash function value) has occurred recently, the compressor compares all
/// strings on the XYZW hash chain with the actual input data sequence
/// starting at the current point, and selects the longest match.
///
/// To improve overall compression, the compressor defers the selection of
/// matches ("lazy matching"): after a match of length N has been found, the
/// compressor searches for a longer match starting at the next input byte. If
/// it finds a longer match, it truncates the previous match to a length of
/// one (thus producing a single literal byte) and then emits the longer
/// match. Otherwise, it emits the original match, and, as described above,
/// advances N bytes before continuing.
///
///
/// Allocates statically ~400K (192K lookup, 128K tokens, 64K window).
///
/// The Deflate function accepts BlockWriterType so we can swap it in tests to
/// exercise just the tokenization part.
///
fn Deflate(comptime container: Container, comptime WriterType: type, comptime BlockWriterType: type) type {
return struct {
lookup: Lookup = .{},
win: SlidingWindow = .{},
tokens: Tokens = .{},
wrt: WriterType,
block_writer: BlockWriterType,
level: LevelArgs,
hasher: container.Hasher() = .{},
// Match and literal at the previous position.
// Used for lazy match finding in tokenize.
prev_match: ?Token = null,
prev_literal: ?u8 = null,
const Self = @This();
pub fn init(wrt: WriterType, options: Options) !Self {
const self = Self{
.wrt = wrt,
.block_writer = BlockWriterType.init(wrt),
.level = LevelArgs.get(options.level),
};
try container.writeHeader(self.wrt);
return self;
}
const FlushOption = enum { none, flush, final };
// Process data in the window and create tokens. If the token buffer is
// full, flush tokens to the token writer. In the case of the `flush` or
// `final` option it will process all data from the window. In the `none`
// case it will preserve some data for the next match.
fn tokenize(self: *Self, flush_opt: FlushOption) !void {
// flush - process all data from window
const should_flush = (flush_opt != .none);
// While there is data in active lookahead buffer.
while (self.win.activeLookahead(should_flush)) |lh| {
var step: u16 = 1; // 1 in the case of literal, match length otherwise
const pos: u16 = self.win.pos();
const literal = lh[0]; // literal at current position
const min_len: u16 = if (self.prev_match) |m| m.length() else 0;
// Try to find match at least min_len long.
if (self.findMatch(pos, lh, min_len)) |match| {
// Found better match than previous.
try self.addPrevLiteral();
// Is the found match long enough?
if (match.length() >= self.level.lazy) {
// Don't try to lazy find better match, use this.
step = try self.addMatch(match);
} else {
// Store this match.
self.prev_literal = literal;
self.prev_match = match;
}
} else {
// There is no better match at the current position than the
// previous one. Write the previous match or literal.
if (self.prev_match) |m| {
// Write match from previous position.
step = try self.addMatch(m) - 1; // we already advanced 1 from previous position
} else {
// No match at previous position.
// Write previous literal if any, and remember this literal.
try self.addPrevLiteral();
self.prev_literal = literal;
}
}
// Advance window and add hashes.
self.windowAdvance(step, lh, pos);
}
if (should_flush) {
// In the case of flushing, the last few lookahead buffers were
// smaller than the min match length, so only the last literal
// can be unwritten.
assert(self.prev_match == null);
try self.addPrevLiteral();
self.prev_literal = null;
try self.flushTokens(flush_opt);
}
}
fn windowAdvance(self: *Self, step: u16, lh: []const u8, pos: u16) void {
// current position is already added in findMatch
self.lookup.bulkAdd(lh[1..], step - 1, pos + 1);
self.win.advance(step);
}
// Add previous literal (if any) to the tokens list.
fn addPrevLiteral(self: *Self) !void {
if (self.prev_literal) |l| try self.addToken(Token.initLiteral(l));
}
// Add match to the tokens list, reset prev pointers.
// Returns length of the added match.
fn addMatch(self: *Self, m: Token) !u16 {
try self.addToken(m);
self.prev_literal = null;
self.prev_match = null;
return m.length();
}
fn addToken(self: *Self, token: Token) !void {
self.tokens.add(token);
if (self.tokens.full()) try self.flushTokens(.none);
}
// Finds the largest match in the history window for the data at the current position.
fn findMatch(self: *Self, pos: u16, lh: []const u8, min_len: u16) ?Token {
var len: u16 = min_len;
// Previous location with the same hash (same 4 bytes).
var prev_pos = self.lookup.add(lh, pos);
// Last found match.
var match: ?Token = null;
// How many back-references to try; performance knob.
var chain: usize = self.level.chain;
if (len >= self.level.good) {
// If we've got a match that's good enough, only look in 1/4 the chain.
chain >>= 2;
}
// Hot path loop!
while (prev_pos > 0 and chain > 0) : (chain -= 1) {
const distance = pos - prev_pos;
if (distance > consts.match.max_distance)
break;
const new_len = self.win.match(prev_pos, pos, len);
if (new_len > len) {
match = Token.initMatch(@intCast(distance), new_len);
if (new_len >= self.level.nice) {
// The match is good enough that we don't try to find a better one.
return match;
}
len = new_len;
}
prev_pos = self.lookup.prev(prev_pos);
}
return match;
}
fn flushTokens(self: *Self, flush_opt: FlushOption) !void {
// Pass tokens to the token writer
try self.block_writer.write(self.tokens.tokens(), flush_opt == .final, self.win.tokensBuffer());
// A stored block ensures byte alignment.
// It has 3 bits (final, block_type) and then padding until the byte boundary.
// After that everything in the stored block is aligned to the boundary.
// An empty stored block is 0b000 + (0-7) bits of padding + 0x00 0x00 0xFF 0xFF.
// The last 4 bytes are byte aligned.
if (flush_opt == .flush) {
try self.block_writer.storedBlock("", false);
}
if (flush_opt != .none) {
// Safe to call only when byte aligned or it is OK to add
// padding bits (on last byte of the final block).
try self.block_writer.flush();
}
// Reset internal tokens store.
self.tokens.reset();
// Notify win that tokens are flushed.
self.win.flush();
}
// Slide the window and, if needed, the lookup tables.
fn slide(self: *Self) void {
const n = self.win.slide();
self.lookup.slide(n);
}
/// Compresses as much data as possible, stopping when the reader becomes
/// empty. It will introduce some output latency (reading input without
/// producing all output) because some data are still in internal
/// buffers.
///
/// It is up to the caller to call flush (if needed) or finish (required)
/// when there is a need to output any pending data or to complete the
/// stream.
///
pub fn compress(self: *Self, reader: anytype) !void {
while (true) {
// Fill window from reader
const buf = self.win.writable();
if (buf.len == 0) {
try self.tokenize(.none);
self.slide();
continue;
}
const n = try reader.readAll(buf);
self.hasher.update(buf[0..n]);
self.win.written(n);
// Process window
try self.tokenize(.none);
// Exit when no more data in reader
if (n < buf.len) break;
}
}
/// Flushes internal buffers to the output writer. Outputs an empty stored
/// block to sync the bit stream to the byte boundary, so that the
/// decompressor can get all input data available so far.
///
/// It is useful mainly in compressed network protocols, to ensure that
/// the deflate bit stream can be used as a byte stream. It may degrade
/// compression, so it should be used only when necessary.
///
/// Completes the current deflate block and follows it with an empty
/// stored block that is three zero bits plus filler bits to the next
/// byte, followed by four bytes (00 00 ff ff). An illustrative test of
/// this marker follows this type definition.
///
pub fn flush(self: *Self) !void {
try self.tokenize(.flush);
}
/// Completes the deflate bit stream by writing any pending data as the
/// final deflate block. HAS to be called once all data are written to
/// the compressor, as a signal that the next block has to have the
/// final bit set.
///
pub fn finish(self: *Self) !void {
try self.tokenize(.final);
try container.writeFooter(&self.hasher, self.wrt);
}
/// Use another writer while preserving history. Most probably `flush`
/// should be called on the old writer before setting a new one.
pub fn setWriter(self: *Self, new_writer: WriterType) void {
self.block_writer.setWriter(new_writer);
self.wrt = new_writer;
}
// Writer interface
pub const Writer = io.Writer(*Self, Error, write);
pub const Error = BlockWriterType.Error;
/// Write `input` of uncompressed data.
/// See compress.
pub fn write(self: *Self, input: []const u8) !usize {
var fbs = io.fixedBufferStream(input);
try self.compress(fbs.reader());
return input.len;
}
pub fn writer(self: *Self) Writer {
return .{ .context = self };
}
};
}
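// An illustrative sketch of the `flush` behavior described above: after
// `flush`, the raw deflate stream is byte aligned and ends with the empty
// stored block marker 0x00 0x00 0xff 0xff.
test "flush emits empty stored block marker (illustrative)" {
    var al = std.ArrayList(u8).init(testing.allocator);
    defer al.deinit();
    var cmp = try compressor(.raw, al.writer(), .{});
    _ = try cmp.write("Hello world!");
    try cmp.flush();
    const n = al.items.len;
    try expect(n >= 4);
    try testing.expectEqualSlices(u8, &[_]u8{ 0x00, 0x00, 0xff, 0xff }, al.items[n - 4 ..]);
}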
// Tokens store
const Tokens = struct {
list: [consts.deflate.tokens]Token = undefined,
pos: usize = 0,
fn add(self: *Tokens, t: Token) void {
self.list[self.pos] = t;
self.pos += 1;
}
fn full(self: *Tokens) bool {
return self.pos == self.list.len;
}
fn reset(self: *Tokens) void {
self.pos = 0;
}
fn tokens(self: *Tokens) []const Token {
return self.list[0..self.pos];
}
};
/// Creates Huffman-only deflate blocks. Disables Lempel-Ziv match searching and
/// only performs Huffman entropy encoding. Results in faster compression and
/// much lower memory requirements during compression, but bigger compressed sizes.
pub const huffman = struct {
pub fn compress(comptime container: Container, reader: anytype, writer: anytype) !void {
var c = try huffman.compressor(container, writer);
try c.compress(reader);
try c.finish();
}
pub fn Compressor(comptime container: Container, comptime WriterType: type) type {
return SimpleCompressor(.huffman, container, WriterType);
}
pub fn compressor(comptime container: Container, writer: anytype) !huffman.Compressor(container, @TypeOf(writer)) {
return try huffman.Compressor(container, @TypeOf(writer)).init(writer);
}
};
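// An illustrative sketch: without match searching, Huffman-only output should
// typically be larger than default deflate output on repetitive data.
test "huffman only vs default deflate size (illustrative)" {
    const data = "0123456789" ** 200;

    var deflated = std.ArrayList(u8).init(testing.allocator);
    defer deflated.deinit();
    var fbs = io.fixedBufferStream(data);
    try compress(.raw, fbs.reader(), deflated.writer(), .{});

    var huffman_only = std.ArrayList(u8).init(testing.allocator);
    defer huffman_only.deinit();
    fbs.reset();
    try huffman.compress(.raw, fbs.reader(), huffman_only.writer());

    try expect(deflated.items.len < huffman_only.items.len);
}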
/// Creates stored blocks only. Data are not compressed, only packed into deflate
/// stored blocks. That adds 5 bytes of header for each block. The max stored
/// block size is 64K. A block is emitted when flush or finish is called.
pub const store = struct {
pub fn compress(comptime container: Container, reader: anytype, writer: anytype) !void {
var c = try store.compressor(container, writer);
try c.compress(reader);
try c.finish();
}
pub fn Compressor(comptime container: Container, comptime WriterType: type) type {
return SimpleCompressor(.store, container, WriterType);
}
pub fn compressor(comptime container: Container, writer: anytype) !store.Compressor(container, @TypeOf(writer)) {
return try store.Compressor(container, @TypeOf(writer)).init(writer);
}
};
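// An illustrative sketch: stored blocks are limited by the 65535-byte buffer,
// so 70000 input bytes should produce two stored blocks, each with 5 bytes of
// header (raw container, so no extra header or footer).
test "store block size limit (illustrative)" {
    const data = [_]u8{0xaa} ** 70000;
    var fbs = io.fixedBufferStream(&data);
    var al = std.ArrayList(u8).init(testing.allocator);
    defer al.deinit();
    var cmp = try store.compressor(.raw, al.writer());
    try cmp.compress(fbs.reader());
    try cmp.finish();
    // 70000 data bytes plus 5 header bytes for each of the two stored blocks
    try testing.expectEqual(data.len + 10, al.items.len);
}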
const SimpleCompressorKind = enum {
huffman,
store,
};
fn simpleCompressor(
comptime kind: SimpleCompressorKind,
comptime container: Container,
writer: anytype,
) !SimpleCompressor(kind, container, @TypeOf(writer)) {
return try SimpleCompressor(kind, container, @TypeOf(writer)).init(writer);
}
fn SimpleCompressor(
comptime kind: SimpleCompressorKind,
comptime container: Container,
comptime WriterType: type,
) type {
const BlockWriterType = BlockWriter(WriterType);
return struct {
buffer: [65535]u8 = undefined, // because store blocks are limited to 65535 bytes
wp: usize = 0,
wrt: WriterType,
block_writer: BlockWriterType,
hasher: container.Hasher() = .{},
const Self = @This();
pub fn init(wrt: WriterType) !Self {
const self = Self{
.wrt = wrt,
.block_writer = BlockWriterType.init(wrt),
};
try container.writeHeader(self.wrt);
return self;
}
pub fn flush(self: *Self) !void {
try self.flushBuffer(false);
try self.block_writer.storedBlock("", false);
try self.block_writer.flush();
}
pub fn finish(self: *Self) !void {
try self.flushBuffer(true);
try self.block_writer.flush();
try container.writeFooter(&self.hasher, self.wrt);
}
fn flushBuffer(self: *Self, final: bool) !void {
const buf = self.buffer[0..self.wp];
switch (kind) {
.huffman => try self.block_writer.huffmanBlock(buf, final),
.store => try self.block_writer.storedBlock(buf, final),
}
self.wp = 0;
}
// Writes all data from the input reader of uncompressed data.
// It is up to the caller to call flush or finish if there is a need to
// output compressed blocks.
pub fn compress(self: *Self, reader: anytype) !void {
while (true) {
// read from reader into buffer
const buf = self.buffer[self.wp..];
if (buf.len == 0) {
try self.flushBuffer(false);
continue;
}
const n = try reader.readAll(buf);
self.hasher.update(buf[0..n]);
self.wp += n;
if (n < buf.len) break; // no more data in reader
}
}
// Writer interface
pub const Writer = io.Writer(*Self, Error, write);
pub const Error = BlockWriterType.Error;
// Write `input` of uncompressed data.
pub fn write(self: *Self, input: []const u8) !usize {
var fbs = io.fixedBufferStream(input);
try self.compress(fbs.reader());
return input.len;
}
pub fn writer(self: *Self) Writer {
return .{ .context = self };
}
};
}
const builtin = @import("builtin");
test "tokenization" {
const L = Token.initLiteral;
const M = Token.initMatch;
const cases = [_]struct {
data: []const u8,
tokens: []const Token,
}{
.{
.data = "Blah blah blah blah blah!",
.tokens = &[_]Token{ L('B'), L('l'), L('a'), L('h'), L(' '), L('b'), M(5, 18), L('!') },
},
.{
.data = "ABCDEABCD ABCDEABCD",
.tokens = &[_]Token{
L('A'), L('B'), L('C'), L('D'), L('E'), L('A'), L('B'), L('C'), L('D'), L(' '),
L('A'), M(10, 8),
},
},
};
for (cases) |c| {
inline for (Container.list) |container| { // for each wrapping
var cw = io.countingWriter(io.null_writer);
const cww = cw.writer();
var df = try Deflate(container, @TypeOf(cww), TestTokenWriter).init(cww, .{});
_ = try df.write(c.data);
try df.flush();
// df.block_writer.show();
try expect(df.block_writer.pos == c.tokens.len); // number of tokens written
try testing.expectEqualSlices(Token, df.block_writer.get(), c.tokens); // tokens match
try testing.expectEqual(container.headerSize(), cw.bytes_written);
try df.finish();
try testing.expectEqual(container.size(), cw.bytes_written);
}
}
}
// Tests that tokens written are equal to expected token list.
const TestTokenWriter = struct {
const Self = @This();
pos: usize = 0,
actual: [128]Token = undefined,
pub fn init(_: anytype) Self {
return .{};
}
pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
for (tokens) |t| {
self.actual[self.pos] = t;
self.pos += 1;
}
}
pub fn storedBlock(_: *Self, _: []const u8, _: bool) !void {}
pub fn get(self: *Self) []Token {
return self.actual[0..self.pos];
}
pub fn show(self: *Self) void {
print("\n", .{});
for (self.get()) |t| {
t.show();
}
}
pub fn flush(_: *Self) !void {}
};
test "file tokenization" {
const levels = [_]Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 };
const cases = [_]struct {
data: []const u8, // uncompressed content
// expected number of tokens produced in deflate tokenization
tokens_count: [levels.len]usize = .{0} ** levels.len,
}{
.{
.data = @embedFile("testdata/rfc1951.txt"),
.tokens_count = .{ 7675, 7672, 7599, 7594, 7598, 7599 },
},
.{
.data = @embedFile("testdata/block_writer/huffman-null-max.input"),
.tokens_count = .{ 257, 257, 257, 257, 257, 257 },
},
.{
.data = @embedFile("testdata/block_writer/huffman-pi.input"),
.tokens_count = .{ 2570, 2564, 2564, 2564, 2564, 2564 },
},
.{
.data = @embedFile("testdata/block_writer/huffman-text.input"),
.tokens_count = .{ 235, 234, 234, 234, 234, 234 },
},
.{
.data = @embedFile("testdata/fuzz/roundtrip1.input"),
.tokens_count = .{ 333, 331, 331, 331, 331, 331 },
},
.{
.data = @embedFile("testdata/fuzz/roundtrip2.input"),
.tokens_count = .{ 334, 334, 334, 334, 334, 334 },
},
};
for (cases) |case| { // for each case
const data = case.data;
for (levels, 0..) |level, i| { // for each compression level
var original = io.fixedBufferStream(data);
// buffer for decompressed data
var al = std.ArrayList(u8).init(testing.allocator);
defer al.deinit();
const writer = al.writer();
// create compressor
const WriterType = @TypeOf(writer);
const TokenWriter = TokenDecoder(@TypeOf(writer));
var cmp = try Deflate(.raw, WriterType, TokenWriter).init(writer, .{ .level = level });
// Stream uncompressed `original` data to the compressor. It will
// produce a list of tokens and pass that list to the TokenDecoder. The
// TokenDecoder uses the CircularBuffer from inflate to convert the list
// of tokens back to the uncompressed stream.
try cmp.compress(original.reader());
try cmp.flush();
const expected_count = case.tokens_count[i];
const actual = cmp.block_writer.tokens_count;
if (expected_count == 0) {
print("actual token count {d}\n", .{actual});
} else {
try testing.expectEqual(expected_count, actual);
}
try testing.expectEqual(data.len, al.items.len);
try testing.expectEqualSlices(u8, data, al.items);
}
}
}
fn TokenDecoder(comptime WriterType: type) type {
return struct {
const CircularBuffer = @import("CircularBuffer.zig");
hist: CircularBuffer = .{},
wrt: WriterType,
tokens_count: usize = 0,
const Self = @This();
pub fn init(wrt: WriterType) Self {
return .{ .wrt = wrt };
}
pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
self.tokens_count += tokens.len;
for (tokens) |t| {
switch (t.kind) {
.literal => self.hist.write(t.literal()),
.match => try self.hist.writeMatch(t.length(), t.distance()),
}
if (self.hist.free() < 285) try self.flushWin();
}
try self.flushWin();
}
pub fn storedBlock(_: *Self, _: []const u8, _: bool) !void {}
fn flushWin(self: *Self) !void {
while (true) {
const buf = self.hist.read();
if (buf.len == 0) break;
try self.wrt.writeAll(buf);
}
}
pub fn flush(_: *Self) !void {}
};
}
test "store simple compressor" {
const data = "Hello world!";
const expected = [_]u8{
0x1, // block type 0, final bit set
0xc, 0x0, // len = 12
0xf3, 0xff, // ~len
'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', //
//0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21,
};
var fbs = std.io.fixedBufferStream(data);
var al = std.ArrayList(u8).init(testing.allocator);
defer al.deinit();
var cmp = try store.compressor(.raw, al.writer());
try cmp.compress(fbs.reader());
try cmp.finish();
try testing.expectEqualSlices(u8, &expected, al.items);
fbs.reset();
try al.resize(0);
// huffman only compressor will also emit a stored block for this small sample
var hc = try huffman.compressor(.raw, al.writer());
try hc.compress(fbs.reader());
try hc.finish();
try testing.expectEqualSlices(u8, &expected, al.items);
}