diff options
Diffstat (limited to 'lib/std/compress/flate/BlockWriter.zig')
| -rw-r--r-- | lib/std/compress/flate/BlockWriter.zig | 591 |
1 files changed, 0 insertions, 591 deletions
diff --git a/lib/std/compress/flate/BlockWriter.zig b/lib/std/compress/flate/BlockWriter.zig deleted file mode 100644 index 47b1b298ce..0000000000 --- a/lib/std/compress/flate/BlockWriter.zig +++ /dev/null @@ -1,591 +0,0 @@ -//! Accepts list of tokens, decides what is best block type to write. What block -//! type will provide best compression. Writes header and body of the block. -const std = @import("std"); -const assert = std.debug.assert; -const Writer = std.Io.Writer; - -const BlockWriter = @This(); -const flate = @import("../flate.zig"); -const Compress = flate.Compress; -const HuffmanEncoder = flate.HuffmanEncoder; -const Token = @import("Token.zig"); - -const codegen_order = HuffmanEncoder.codegen_order; -const end_code_mark = 255; - -output: *Writer, - -codegen_freq: [HuffmanEncoder.codegen_code_count]u16, -literal_freq: [HuffmanEncoder.max_num_lit]u16, -distance_freq: [HuffmanEncoder.distance_code_count]u16, -codegen: [HuffmanEncoder.max_num_lit + HuffmanEncoder.distance_code_count + 1]u8, -literal_encoding: HuffmanEncoder, -distance_encoding: HuffmanEncoder, -codegen_encoding: HuffmanEncoder, -fixed_literal_encoding: HuffmanEncoder, -fixed_distance_encoding: HuffmanEncoder, -huff_distance: HuffmanEncoder, - -fixed_literal_codes: [HuffmanEncoder.max_num_frequencies]HuffmanEncoder.Code, -fixed_distance_codes: [HuffmanEncoder.distance_code_count]HuffmanEncoder.Code, -distance_codes: [HuffmanEncoder.distance_code_count]HuffmanEncoder.Code, - -pub fn init(output: *Writer) BlockWriter { - return .{ - .output = output, - .codegen_freq = undefined, - .literal_freq = undefined, - .distance_freq = undefined, - .codegen = undefined, - .literal_encoding = undefined, - .distance_encoding = undefined, - .codegen_encoding = undefined, - .fixed_literal_encoding = undefined, - .fixed_distance_encoding = undefined, - .huff_distance = undefined, - .fixed_literal_codes = undefined, - .fixed_distance_codes = undefined, - .distance_codes = undefined, - }; -} - -pub fn initBuffers(bw: *BlockWriter) void { - bw.fixed_literal_encoding = .fixedLiteralEncoder(&bw.fixed_literal_codes); - bw.fixed_distance_encoding = .fixedDistanceEncoder(&bw.fixed_distance_codes); - bw.huff_distance = .huffmanDistanceEncoder(&bw.distance_codes); -} - -/// Flush intrenal bit buffer to the writer. -/// Should be called only when bit stream is at byte boundary. -/// -/// That is after final block; when last byte could be incomplete or -/// after stored block; which is aligned to the byte boundary (it has x -/// padding bits after first 3 bits). -pub fn flush(self: *BlockWriter) Writer.Error!void { - try self.bit_writer.flush(); -} - -fn writeCode(self: *BlockWriter, c: Compress.HuffCode) Writer.Error!void { - try self.bit_writer.writeBits(c.code, c.len); -} - -/// RFC 1951 3.2.7 specifies a special run-length encoding for specifying -/// the literal and distance lengths arrays (which are concatenated into a single -/// array). This method generates that run-length encoding. -/// -/// The result is written into the codegen array, and the frequencies -/// of each code is written into the codegen_freq array. -/// Codes 0-15 are single byte codes. Codes 16-18 are followed by additional -/// information. Code bad_code is an end marker -/// -/// num_literals: The number of literals in literal_encoding -/// num_distances: The number of distances in distance_encoding -/// lit_enc: The literal encoder to use -/// dist_enc: The distance encoder to use -fn generateCodegen( - self: *BlockWriter, - num_literals: u32, - num_distances: u32, - lit_enc: *Compress.LiteralEncoder, - dist_enc: *Compress.DistanceEncoder, -) void { - for (self.codegen_freq, 0..) |_, i| { - self.codegen_freq[i] = 0; - } - - // Note that we are using codegen both as a temporary variable for holding - // a copy of the frequencies, and as the place where we put the result. - // This is fine because the output is always shorter than the input used - // so far. - var codegen = &self.codegen; // cache - // Copy the concatenated code sizes to codegen. Put a marker at the end. - var cgnl = codegen[0..num_literals]; - for (cgnl, 0..) |_, i| { - cgnl[i] = @as(u8, @intCast(lit_enc.codes[i].len)); - } - - cgnl = codegen[num_literals .. num_literals + num_distances]; - for (cgnl, 0..) |_, i| { - cgnl[i] = @as(u8, @intCast(dist_enc.codes[i].len)); - } - codegen[num_literals + num_distances] = end_code_mark; - - var size = codegen[0]; - var count: i32 = 1; - var out_index: u32 = 0; - var in_index: u32 = 1; - while (size != end_code_mark) : (in_index += 1) { - // INVARIANT: We have seen "count" copies of size that have not yet - // had output generated for them. - const next_size = codegen[in_index]; - if (next_size == size) { - count += 1; - continue; - } - // We need to generate codegen indicating "count" of size. - if (size != 0) { - codegen[out_index] = size; - out_index += 1; - self.codegen_freq[size] += 1; - count -= 1; - while (count >= 3) { - var n: i32 = 6; - if (n > count) { - n = count; - } - codegen[out_index] = 16; - out_index += 1; - codegen[out_index] = @as(u8, @intCast(n - 3)); - out_index += 1; - self.codegen_freq[16] += 1; - count -= n; - } - } else { - while (count >= 11) { - var n: i32 = 138; - if (n > count) { - n = count; - } - codegen[out_index] = 18; - out_index += 1; - codegen[out_index] = @as(u8, @intCast(n - 11)); - out_index += 1; - self.codegen_freq[18] += 1; - count -= n; - } - if (count >= 3) { - // 3 <= count <= 10 - codegen[out_index] = 17; - out_index += 1; - codegen[out_index] = @as(u8, @intCast(count - 3)); - out_index += 1; - self.codegen_freq[17] += 1; - count = 0; - } - } - count -= 1; - while (count >= 0) : (count -= 1) { - codegen[out_index] = size; - out_index += 1; - self.codegen_freq[size] += 1; - } - // Set up invariant for next time through the loop. - size = next_size; - count = 1; - } - // Marker indicating the end of the codegen. - codegen[out_index] = end_code_mark; -} - -const DynamicSize = struct { - size: u32, - num_codegens: u32, -}; - -/// dynamicSize returns the size of dynamically encoded data in bits. -fn dynamicSize( - self: *BlockWriter, - lit_enc: *Compress.LiteralEncoder, // literal encoder - dist_enc: *Compress.DistanceEncoder, // distance encoder - extra_bits: u32, -) DynamicSize { - var num_codegens = self.codegen_freq.len; - while (num_codegens > 4 and self.codegen_freq[codegen_order[num_codegens - 1]] == 0) { - num_codegens -= 1; - } - const header = 3 + 5 + 5 + 4 + (3 * num_codegens) + - self.codegen_encoding.bitLength(self.codegen_freq[0..]) + - self.codegen_freq[16] * 2 + - self.codegen_freq[17] * 3 + - self.codegen_freq[18] * 7; - const size = header + - lit_enc.bitLength(&self.literal_freq) + - dist_enc.bitLength(&self.distance_freq) + - extra_bits; - - return DynamicSize{ - .size = @as(u32, @intCast(size)), - .num_codegens = @as(u32, @intCast(num_codegens)), - }; -} - -/// fixedSize returns the size of dynamically encoded data in bits. -fn fixedSize(self: *BlockWriter, extra_bits: u32) u32 { - return 3 + - self.fixed_literal_encoding.bitLength(&self.literal_freq) + - self.fixed_distance_encoding.bitLength(&self.distance_freq) + - extra_bits; -} - -const StoredSize = struct { - size: u32, - storable: bool, -}; - -/// storedSizeFits calculates the stored size, including header. -/// The function returns the size in bits and whether the block -/// fits inside a single block. -fn storedSizeFits(in: ?[]const u8) StoredSize { - if (in == null) { - return .{ .size = 0, .storable = false }; - } - if (in.?.len <= HuffmanEncoder.max_store_block_size) { - return .{ .size = @as(u32, @intCast((in.?.len + 5) * 8)), .storable = true }; - } - return .{ .size = 0, .storable = false }; -} - -/// Write the header of a dynamic Huffman block to the output stream. -/// -/// num_literals: The number of literals specified in codegen -/// num_distances: The number of distances specified in codegen -/// num_codegens: The number of codegens used in codegen -/// eof: Is it the end-of-file? (end of stream) -fn dynamicHeader( - self: *BlockWriter, - num_literals: u32, - num_distances: u32, - num_codegens: u32, - eof: bool, -) Writer.Error!void { - const first_bits: u32 = if (eof) 5 else 4; - try self.bit_writer.writeBits(first_bits, 3); - try self.bit_writer.writeBits(num_literals - 257, 5); - try self.bit_writer.writeBits(num_distances - 1, 5); - try self.bit_writer.writeBits(num_codegens - 4, 4); - - var i: u32 = 0; - while (i < num_codegens) : (i += 1) { - const value = self.codegen_encoding.codes[codegen_order[i]].len; - try self.bit_writer.writeBits(value, 3); - } - - i = 0; - while (true) { - const code_word: u32 = @as(u32, @intCast(self.codegen[i])); - i += 1; - if (code_word == end_code_mark) { - break; - } - try self.writeCode(self.codegen_encoding.codes[@as(u32, @intCast(code_word))]); - - switch (code_word) { - 16 => { - try self.bit_writer.writeBits(self.codegen[i], 2); - i += 1; - }, - 17 => { - try self.bit_writer.writeBits(self.codegen[i], 3); - i += 1; - }, - 18 => { - try self.bit_writer.writeBits(self.codegen[i], 7); - i += 1; - }, - else => {}, - } - } -} - -fn storedHeader(self: *BlockWriter, length: usize, eof: bool) Writer.Error!void { - assert(length <= 65535); - const flag: u32 = if (eof) 1 else 0; - try self.bit_writer.writeBits(flag, 3); - try self.flush(); - const l: u16 = @intCast(length); - try self.bit_writer.writeBits(l, 16); - try self.bit_writer.writeBits(~l, 16); -} - -fn fixedHeader(self: *BlockWriter, eof: bool) Writer.Error!void { - // Indicate that we are a fixed Huffman block - var value: u32 = 2; - if (eof) { - value = 3; - } - try self.bit_writer.writeBits(value, 3); -} - -/// Write a block of tokens with the smallest encoding. Will choose block type. -/// The original input can be supplied, and if the huffman encoded data -/// is larger than the original bytes, the data will be written as a -/// stored block. -/// If the input is null, the tokens will always be Huffman encoded. -pub fn write(self: *BlockWriter, tokens: []const Token, eof: bool, input: ?[]const u8) Writer.Error!void { - const lit_and_dist = self.indexTokens(tokens); - const num_literals = lit_and_dist.num_literals; - const num_distances = lit_and_dist.num_distances; - - var extra_bits: u32 = 0; - const ret = storedSizeFits(input); - const stored_size = ret.size; - const storable = ret.storable; - - if (storable) { - // We only bother calculating the costs of the extra bits required by - // the length of distance fields (which will be the same for both fixed - // and dynamic encoding), if we need to compare those two encodings - // against stored encoding. - var length_code: u16 = Token.length_codes_start + 8; - while (length_code < num_literals) : (length_code += 1) { - // First eight length codes have extra size = 0. - extra_bits += @as(u32, @intCast(self.literal_freq[length_code])) * - @as(u32, @intCast(Token.lengthExtraBits(length_code))); - } - var distance_code: u16 = 4; - while (distance_code < num_distances) : (distance_code += 1) { - // First four distance codes have extra size = 0. - extra_bits += @as(u32, @intCast(self.distance_freq[distance_code])) * - @as(u32, @intCast(Token.distanceExtraBits(distance_code))); - } - } - - // Figure out smallest code. - // Fixed Huffman baseline. - var literal_encoding = &self.fixed_literal_encoding; - var distance_encoding = &self.fixed_distance_encoding; - var size = self.fixedSize(extra_bits); - - // Dynamic Huffman? - var num_codegens: u32 = 0; - - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literal_encoding and the distance_encoding. - self.generateCodegen( - num_literals, - num_distances, - &self.literal_encoding, - &self.distance_encoding, - ); - self.codegen_encoding.generate(self.codegen_freq[0..], 7); - const dynamic_size = self.dynamicSize( - &self.literal_encoding, - &self.distance_encoding, - extra_bits, - ); - const dyn_size = dynamic_size.size; - num_codegens = dynamic_size.num_codegens; - - if (dyn_size < size) { - size = dyn_size; - literal_encoding = &self.literal_encoding; - distance_encoding = &self.distance_encoding; - } - - // Stored bytes? - if (storable and stored_size < size) { - try self.storedBlock(input.?, eof); - return; - } - - // Huffman. - if (@intFromPtr(literal_encoding) == @intFromPtr(&self.fixed_literal_encoding)) { - try self.fixedHeader(eof); - } else { - try self.dynamicHeader(num_literals, num_distances, num_codegens, eof); - } - - // Write the tokens. - try self.writeTokens(tokens, &literal_encoding.codes, &distance_encoding.codes); -} - -pub fn storedBlock(self: *BlockWriter, input: []const u8, eof: bool) Writer.Error!void { - try self.storedHeader(input.len, eof); - try self.bit_writer.writeBytes(input); -} - -/// writeBlockDynamic encodes a block using a dynamic Huffman table. -/// This should be used if the symbols used have a disproportionate -/// histogram distribution. -/// If input is supplied and the compression savings are below 1/16th of the -/// input size the block is stored. -fn dynamicBlock( - self: *BlockWriter, - tokens: []const Token, - eof: bool, - input: ?[]const u8, -) Writer.Error!void { - const total_tokens = self.indexTokens(tokens); - const num_literals = total_tokens.num_literals; - const num_distances = total_tokens.num_distances; - - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literal_encoding and the distance_encoding. - self.generateCodegen( - num_literals, - num_distances, - &self.literal_encoding, - &self.distance_encoding, - ); - self.codegen_encoding.generate(self.codegen_freq[0..], 7); - const dynamic_size = self.dynamicSize(&self.literal_encoding, &self.distance_encoding, 0); - const size = dynamic_size.size; - const num_codegens = dynamic_size.num_codegens; - - // Store bytes, if we don't get a reasonable improvement. - - const stored_size = storedSizeFits(input); - const ssize = stored_size.size; - const storable = stored_size.storable; - if (storable and ssize < (size + (size >> 4))) { - try self.storedBlock(input.?, eof); - return; - } - - // Write Huffman table. - try self.dynamicHeader(num_literals, num_distances, num_codegens, eof); - - // Write the tokens. - try self.writeTokens(tokens, &self.literal_encoding.codes, &self.distance_encoding.codes); -} - -const TotalIndexedTokens = struct { - num_literals: u32, - num_distances: u32, -}; - -/// Indexes a slice of tokens followed by an end_block_marker, and updates -/// literal_freq and distance_freq, and generates literal_encoding -/// and distance_encoding. -/// The number of literal and distance tokens is returned. -fn indexTokens(self: *BlockWriter, tokens: []const Token) TotalIndexedTokens { - var num_literals: u32 = 0; - var num_distances: u32 = 0; - - for (self.literal_freq, 0..) |_, i| { - self.literal_freq[i] = 0; - } - for (self.distance_freq, 0..) |_, i| { - self.distance_freq[i] = 0; - } - - for (tokens) |t| { - if (t.kind == Token.Kind.literal) { - self.literal_freq[t.literal()] += 1; - continue; - } - self.literal_freq[t.lengthCode()] += 1; - self.distance_freq[t.distanceCode()] += 1; - } - // add end_block_marker token at the end - self.literal_freq[HuffmanEncoder.end_block_marker] += 1; - - // get the number of literals - num_literals = @as(u32, @intCast(self.literal_freq.len)); - while (self.literal_freq[num_literals - 1] == 0) { - num_literals -= 1; - } - // get the number of distances - num_distances = @as(u32, @intCast(self.distance_freq.len)); - while (num_distances > 0 and self.distance_freq[num_distances - 1] == 0) { - num_distances -= 1; - } - if (num_distances == 0) { - // We haven't found a single match. If we want to go with the dynamic encoding, - // we should count at least one distance to be sure that the distance huffman tree could be encoded. - self.distance_freq[0] = 1; - num_distances = 1; - } - self.literal_encoding.generate(&self.literal_freq, 15); - self.distance_encoding.generate(&self.distance_freq, 15); - return TotalIndexedTokens{ - .num_literals = num_literals, - .num_distances = num_distances, - }; -} - -/// Writes a slice of tokens to the output followed by and end_block_marker. -/// codes for literal and distance encoding must be supplied. -fn writeTokens( - self: *BlockWriter, - tokens: []const Token, - le_codes: []Compress.HuffCode, - oe_codes: []Compress.HuffCode, -) Writer.Error!void { - for (tokens) |t| { - if (t.kind == Token.Kind.literal) { - try self.writeCode(le_codes[t.literal()]); - continue; - } - - // Write the length - const le = t.lengthEncoding(); - try self.writeCode(le_codes[le.code]); - if (le.extra_bits > 0) { - try self.bit_writer.writeBits(le.extra_length, le.extra_bits); - } - - // Write the distance - const oe = t.distanceEncoding(); - try self.writeCode(oe_codes[oe.code]); - if (oe.extra_bits > 0) { - try self.bit_writer.writeBits(oe.extra_distance, oe.extra_bits); - } - } - // add end_block_marker at the end - try self.writeCode(le_codes[HuffmanEncoder.end_block_marker]); -} - -/// Encodes a block of bytes as either Huffman encoded literals or uncompressed bytes -/// if the results only gains very little from compression. -pub fn huffmanBlock(self: *BlockWriter, input: []const u8, eof: bool) Writer.Error!void { - // Add everything as literals - histogram(input, &self.literal_freq); - - self.literal_freq[HuffmanEncoder.end_block_marker] = 1; - - const num_literals = HuffmanEncoder.end_block_marker + 1; - self.distance_freq[0] = 1; - const num_distances = 1; - - self.literal_encoding.generate(&self.literal_freq, 15); - - // Figure out smallest code. - // Always use dynamic Huffman or Store - var num_codegens: u32 = 0; - - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literal_encoding and the distance_encoding. - self.generateCodegen( - num_literals, - num_distances, - &self.literal_encoding, - &self.huff_distance, - ); - self.codegen_encoding.generate(self.codegen_freq[0..], 7); - const dynamic_size = self.dynamicSize(&self.literal_encoding, &self.huff_distance, 0); - const size = dynamic_size.size; - num_codegens = dynamic_size.num_codegens; - - // Store bytes, if we don't get a reasonable improvement. - const stored_size_ret = storedSizeFits(input); - const ssize = stored_size_ret.size; - const storable = stored_size_ret.storable; - - if (storable and ssize < (size + (size >> 4))) { - try self.storedBlock(input, eof); - return; - } - - // Huffman. - try self.dynamicHeader(num_literals, num_distances, num_codegens, eof); - const encoding = self.literal_encoding.codes[0..257]; - - for (input) |t| { - const c = encoding[t]; - try self.bit_writer.writeBits(c.code, c.len); - } - try self.writeCode(encoding[HuffmanEncoder.end_block_marker]); -} - -fn histogram(b: []const u8, h: *[286]u16) void { - // Clear histogram - for (h, 0..) |_, i| { - h[i] = 0; - } - - var lh = h.*[0..256]; - for (b) |t| { - lh[t] += 1; - } -} |
