diff options
| author | Andrew Kelley <andrew@ziglang.org> | 2025-08-25 21:33:57 -0700 |
|---|---|---|
| committer | Andrew Kelley <andrew@ziglang.org> | 2025-08-26 21:00:58 -0700 |
| commit | d87eb7d4e4f2ea606a18640fcc019b60cc435cdd (patch) | |
| tree | 437c0e00c1b9a98e92b3045613de39ad36710429 | |
| parent | a8ae6c2f4265a66c7a63d788a13549c48a1dd8c0 (diff) | |
| download | zig-d87eb7d4e4f2ea606a18640fcc019b60cc435cdd.tar.gz zig-d87eb7d4e4f2ea606a18640fcc019b60cc435cdd.zip | |
std.compress.xz: skeleton in place
missing these things:
- implementation of finish()
- detection of packed bytes read for check and block padding
- implementation of discard()
- implementation of block stream checksum
| -rw-r--r-- | lib/std/compress/xz/Decompress.zig | 419 |
| -rw-r--r-- | lib/std/compress/xz/test.zig | 16 |
2 files changed, 234 insertions, 201 deletions
diff --git a/lib/std/compress/xz/Decompress.zig b/lib/std/compress/xz/Decompress.zig index 6b925020d6..99cfa266b1 100644 --- a/lib/std/compress/xz/Decompress.zig +++ b/lib/std/compress/xz/Decompress.zig @@ -26,6 +26,8 @@ pub const Error = error{ WrongChecksum, Unsupported, Overflow, + InvalidRangeCode, + DecompressedSizeMismatch, }; pub const Check = enum(u4) { @@ -55,14 +57,14 @@ pub fn init( gpa: Allocator, /// Decompress takes ownership of this buffer and resizes it with `gpa`. buffer: []u8, -) Decompress { - const magic = try input.takeBytes(6); - if (!std.mem.eql(u8, &magic, &.{ 0xFD, '7', 'z', 'X', 'Z', 0x00 })) +) !Decompress { + const magic = try input.takeArray(6); + if (!std.mem.eql(u8, magic, &.{ 0xFD, '7', 'z', 'X', 'Z', 0x00 })) return error.NotXzStream; const actual_hash = Crc32.hash(try input.peek(@sizeOf(StreamFlags))); const stream_flags = input.takeStruct(StreamFlags, .little) catch unreachable; - const stored_hash = try input.readInt(u32, .little); + const stored_hash = try input.takeInt(u32, .little); if (actual_hash != stored_hash) return error.WrongChecksum; return .{ @@ -71,6 +73,7 @@ pub fn init( .vtable = &.{ .stream = stream, .readVec = readVec, + .discard = discard, }, .buffer = buffer, .seek = 0, @@ -83,206 +86,232 @@ pub fn init( }; } +/// Reclaim ownership of the buffer passed to `init`. 
+pub fn takeBuffer(d: *Decompress) []u8 { + const buffer = d.reader.buffer; + d.reader.buffer = &.{}; + return buffer; +} + +pub fn deinit(d: *Decompress) void { + const gpa = d.gpa; + gpa.free(d.reader.buffer); + d.* = undefined; +} + +fn readVec(r: *Reader, data: [][]u8) Reader.Error!usize { + _ = data; + return readIndirect(r); +} + fn stream(r: *Reader, w: *Writer, limit: std.Io.Limit) Reader.StreamError!usize { _ = w; _ = limit; + return readIndirect(r); +} + +fn discard(r: *Reader, limit: std.Io.Limit) Reader.Error!usize { const d: *Decompress = @alignCast(@fieldParentPtr("reader", r)); _ = d; + _ = limit; @panic("TODO"); } -fn readVec(r: *Reader, data: [][]u8) Reader.Error!usize { - _ = data; +fn readIndirect(r: *Reader) Reader.Error!usize { const d: *Decompress = @alignCast(@fieldParentPtr("reader", r)); + const gpa = d.gpa; + const input = d.input; + + var allocating = Writer.Allocating.initOwnedSlice(gpa, r.buffer); + allocating.writer.end = r.end; + defer { + r.buffer = allocating.writer.buffer; + r.end = allocating.writer.end; + } + + if (d.block_count == std.math.maxInt(usize)) return error.EndOfStream; + + readBlock(input, &allocating) catch |err| switch (err) { + error.WriteFailed => { + d.err = error.OutOfMemory; + return error.ReadFailed; + }, + error.SuccessfulEndOfStream => { + finish(d); + d.block_count = std.math.maxInt(usize); + return error.EndOfStream; + }, + else => |e| { + d.err = e; + return error.ReadFailed; + }, + }; + switch (d.check) { + .none => {}, + .crc32 => { + const declared_checksum = try input.takeInt(u32, .little); + // TODO + //const hash_a = Crc32.hash(unpacked_bytes); + //if (hash_a != hash_b) return error.WrongChecksum; + _ = declared_checksum; + }, + .crc64 => { + const declared_checksum = try input.takeInt(u64, .little); + // TODO + //const hash_a = Crc64.hash(unpacked_bytes); + //if (hash_a != hash_b) return error.WrongChecksum; + _ = declared_checksum; + }, + .sha256 => { + const declared_hash = try 
input.take(Sha256.digest_length); + // TODO + //var hash_a: [Sha256.digest_length]u8 = undefined; + //Sha256.hash(unpacked_bytes, &hash_a, .{}); + //if (!std.mem.eql(u8, &hash_a, &hash_b)) + // return error.WrongChecksum; + _ = declared_hash; + }, + else => { + d.err = error.Unsupported; + return error.ReadFailed; + }, + } + d.block_count += 1; + return 0; +} + +fn readBlock(input: *Reader, allocating: *Writer.Allocating) !void { + var packed_size: ?u64 = null; + var unpacked_size: ?u64 = null; + + { + // Read the block header via peeking so that we can hash the whole thing too. + const first_byte: usize = try input.peekByte(); + if (first_byte == 0) return error.SuccessfulEndOfStream; + + const declared_header_size = first_byte * 4; + try input.fill(declared_header_size); + const header_seek_start = input.seek; + input.toss(1); + + const Flags = packed struct(u8) { + last_filter_index: u2, + reserved: u4, + has_packed_size: bool, + has_unpacked_size: bool, + }; + const flags = try input.takeStruct(Flags, .little); + + const filter_count = @as(u3, flags.last_filter_index) + 1; + if (filter_count > 1) return error.Unsupported; + + if (flags.has_packed_size) packed_size = try input.takeLeb128(u64); + if (flags.has_unpacked_size) unpacked_size = try input.takeLeb128(u64); + + const FilterId = enum(u64) { + lzma2 = 0x21, + _, + }; + + const filter_id: FilterId = @enumFromInt(try input.takeLeb128(u64)); + if (filter_id != .lzma2) return error.Unsupported; + + const properties_size = try input.takeLeb128(u64); + if (properties_size != 1) return error.CorruptInput; + // TODO: use filter properties + _ = try input.takeByte(); + + const actual_header_size = input.seek - header_seek_start; + if (actual_header_size > declared_header_size) return error.CorruptInput; + var remaining_bytes = declared_header_size - actual_header_size; + while (remaining_bytes != 0) { + if (try input.takeByte() != 0) return error.CorruptInput; + remaining_bytes -= 1; + } + + const header_slice = 
input.buffer[header_seek_start..][0..declared_header_size]; + const actual_hash = Crc32.hash(header_slice); + const declared_hash = try input.takeInt(u32, .little); + if (actual_hash != declared_hash) return error.WrongChecksum; + } + + // Compressed Data + + var lzma2_decode = try lzma2.Decode.init(allocating.allocator); + const before_size = allocating.writer.end; + try lzma2_decode.decompress(input, allocating); + const unpacked_bytes = allocating.writer.end - before_size; + + // TODO restore this check + //if (packed_size) |s| { + // if (s != packed_counter.bytes_read) + // return error.CorruptInput; + //} + + if (unpacked_size) |s| { + if (s != unpacked_bytes) return error.CorruptInput; + } + + // Block Padding + if (true) @panic("TODO account for block padding"); + //while (block_counter.bytes_read % 4 != 0) { + // if (try block_reader.takeByte() != 0) + // return error.CorruptInput; + //} + +} + +fn finish(d: *Decompress) void { _ = d; @panic("TODO"); -} + //const input = d.input; + //const index_size = blk: { + // const record_count = try input.takeLeb128(u64); + // if (record_count != d.block_decode.block_count) + // return error.CorruptInput; + + // var i: usize = 0; + // while (i < record_count) : (i += 1) { + // // TODO: validate records + // _ = try std.leb.readUleb128(u64, counting_reader); + // _ = try std.leb.readUleb128(u64, counting_reader); + // } + + // while (counter.bytes_read % 4 != 0) { + // if (try counting_reader.takeByte() != 0) + // return error.CorruptInput; + // } + + // const hash_a = hasher.hasher.final(); + // const hash_b = try counting_reader.takeInt(u32, .little); + // if (hash_a != hash_b) + // return error.WrongChecksum; + + // break :blk counter.bytes_read; + //}; + + //const hash_a = try d.in_reader.takeInt(u32, .little); -// if (buffer.len == 0) -// return 0; -// -// const r = try self.block_decode.read(buffer); -// if (r != 0) -// return r; -// -// const index_size = blk: { -// var hasher = hashedReader(self.in_reader, 
Crc32.init()); -// hasher.hasher.update(&[1]u8{0x00}); -// -// var counter = std.io.countingReader(hasher.reader()); -// counter.bytes_read += 1; -// -// const counting_reader = counter.reader(); -// -// const record_count = try std.leb.readUleb128(u64, counting_reader); -// if (record_count != self.block_decode.block_count) -// return error.CorruptInput; -// -// var i: usize = 0; -// while (i < record_count) : (i += 1) { -// // TODO: validate records -// _ = try std.leb.readUleb128(u64, counting_reader); -// _ = try std.leb.readUleb128(u64, counting_reader); -// } -// -// while (counter.bytes_read % 4 != 0) { -// if (try counting_reader.readByte() != 0) -// return error.CorruptInput; -// } -// -// const hash_a = hasher.hasher.final(); -// const hash_b = try counting_reader.readInt(u32, .little); -// if (hash_a != hash_b) -// return error.WrongChecksum; -// -// break :blk counter.bytes_read; -// }; -// -// const hash_a = try self.in_reader.readInt(u32, .little); -// -// const hash_b = blk: { -// var hasher = hashedReader(self.in_reader, Crc32.init()); -// const hashed_reader = hasher.reader(); -// -// const backward_size = (@as(u64, try hashed_reader.readInt(u32, .little)) + 1) * 4; -// if (backward_size != index_size) -// return error.CorruptInput; -// -// var check: Check = undefined; -// try readStreamFlags(hashed_reader, &check); -// -// break :blk hasher.hasher.final(); -// }; -// -// if (hash_a != hash_b) -// return error.WrongChecksum; -// -// const magic = try self.in_reader.readBytesNoEof(2); -// if (!std.mem.eql(u8, &magic, &.{ 'Y', 'Z' })) -// return error.CorruptInput; -// -// return 0; -//} - -//fn readBlock(self: *BlockDecode) Error!void { -// var block_counter = std.io.countingReader(self.inner_reader); -// const block_reader = block_counter.reader(); -// -// var packed_size: ?u64 = null; -// var unpacked_size: ?u64 = null; -// -// // Block Header -// { -// var header_hasher = hashedReader(block_reader, Crc32.init()); -// const header_reader = 
header_hasher.reader(); -// -// const header_size = @as(u64, try header_reader.readByte()) * 4; -// if (header_size == 0) -// return error.EndOfStreamWithNoError; -// -// const Flags = packed struct(u8) { -// last_filter_index: u2, -// reserved: u4, -// has_packed_size: bool, -// has_unpacked_size: bool, -// }; -// -// const flags = @as(Flags, @bitCast(try header_reader.readByte())); -// const filter_count = @as(u3, flags.last_filter_index) + 1; -// if (filter_count > 1) -// return error.Unsupported; -// -// if (flags.has_packed_size) -// packed_size = try std.leb.readUleb128(u64, header_reader); -// -// if (flags.has_unpacked_size) -// unpacked_size = try std.leb.readUleb128(u64, header_reader); -// -// const FilterId = enum(u64) { -// lzma2 = 0x21, -// _, -// }; -// -// const filter_id = @as( -// FilterId, -// @enumFromInt(try std.leb.readUleb128(u64, header_reader)), -// ); -// -// if (@intFromEnum(filter_id) >= 0x4000_0000_0000_0000) -// return error.CorruptInput; -// -// if (filter_id != .lzma2) -// return error.Unsupported; -// -// const properties_size = try std.leb.readUleb128(u64, header_reader); -// if (properties_size != 1) -// return error.CorruptInput; -// -// // TODO: use filter properties -// _ = try header_reader.readByte(); -// -// while (block_counter.bytes_read != header_size) { -// if (try header_reader.readByte() != 0) -// return error.CorruptInput; -// } -// -// const hash_a = header_hasher.hasher.final(); -// const hash_b = try header_reader.readInt(u32, .little); -// if (hash_a != hash_b) -// return error.WrongChecksum; -// } -// -// // Compressed Data -// var packed_counter = std.io.countingReader(block_reader); -// try lzma2.decompress( -// self.allocator, -// packed_counter.reader(), -// self.to_read.writer(self.allocator), -// ); -// -// if (packed_size) |s| { -// if (s != packed_counter.bytes_read) -// return error.CorruptInput; -// } -// -// const unpacked_bytes = self.to_read.items; -// if (unpacked_size) |s| { -// if (s != 
unpacked_bytes.len) -// return error.CorruptInput; -// } -// -// // Block Padding -// while (block_counter.bytes_read % 4 != 0) { -// if (try block_reader.readByte() != 0) -// return error.CorruptInput; -// } -// -// switch (self.check) { -// .none => {}, -// .crc32 => { -// const hash_a = Crc32.hash(unpacked_bytes); -// const hash_b = try self.inner_reader.readInt(u32, .little); -// if (hash_a != hash_b) -// return error.WrongChecksum; -// }, -// .crc64 => { -// const hash_a = Crc64.hash(unpacked_bytes); -// const hash_b = try self.inner_reader.readInt(u64, .little); -// if (hash_a != hash_b) -// return error.WrongChecksum; -// }, -// .sha256 => { -// var hash_a: [Sha256.digest_length]u8 = undefined; -// Sha256.hash(unpacked_bytes, &hash_a, .{}); -// -// var hash_b: [Sha256.digest_length]u8 = undefined; -// try self.inner_reader.readNoEof(&hash_b); -// -// if (!std.mem.eql(u8, &hash_a, &hash_b)) -// return error.WrongChecksum; -// }, -// else => return error.Unsupported, -// } -// -// self.block_count += 1; -//} + //const hash_b = blk: { + // var hasher = hashedReader(d.in_reader, Crc32.init()); + // const hashed_reader = hasher.reader(); + + // const backward_size = (@as(u64, try hashed_reader.takeInt(u32, .little)) + 1) * 4; + // if (backward_size != index_size) + // return error.CorruptInput; + + // var check: Check = undefined; + // try readStreamFlags(hashed_reader, &check); + + // break :blk hasher.hasher.final(); + //}; + + //if (hash_a != hash_b) + // return error.WrongChecksum; + + //const magic = try d.in_reader.takeBytesNoEof(2); + //if (!std.mem.eql(u8, &magic, &.{ 'Y', 'Z' })) + // return error.CorruptInput; + + //return 0; +} diff --git a/lib/std/compress/xz/test.zig b/lib/std/compress/xz/test.zig index 08180e45c0..a25cc08df0 100644 --- a/lib/std/compress/xz/test.zig +++ b/lib/std/compress/xz/test.zig @@ -3,19 +3,23 @@ const testing = std.testing; const xz = std.compress.xz; fn decompress(data: []const u8) ![]u8 { - var in_stream = 
std.io.fixedBufferStream(data); + const gpa = testing.allocator; - var xz_stream = try xz.decompress(testing.allocator, in_stream.reader()); + var in_stream: std.Io.Reader = .fixed(data); + + var xz_stream = try xz.Decompress.init(&in_stream, gpa, &.{}); defer xz_stream.deinit(); - return xz_stream.reader().readAllAlloc(testing.allocator, std.math.maxInt(usize)); + return xz_stream.reader.allocRemaining(gpa, .unlimited); } fn testReader(data: []const u8, comptime expected: []const u8) !void { - const buf = try decompress(data); - defer testing.allocator.free(buf); + const gpa = testing.allocator; + + const result = try decompress(data); + defer gpa.free(result); - try testing.expectEqualSlices(u8, expected, buf); + try testing.expectEqualSlices(u8, expected, result); } test "compressed data" { |
