-rw-r--r--  lib/std/ascii.zig              53
-rw-r--r--  lib/std/math/big/int.zig      161
-rw-r--r--  lib/std/math/big/int_test.zig 102
-rw-r--r--  lib/std/mem.zig               480
-rw-r--r--  src/Compilation.zig             1
-rw-r--r--  src/Sema.zig                   42
-rw-r--r--  src/codegen.zig                 2
-rw-r--r--  src/value.zig                 489
-rw-r--r--  test/behavior/bitcast.zig      94
9 files changed, 987 insertions(+), 437 deletions(-)
diff --git a/lib/std/ascii.zig b/lib/std/ascii.zig
index cd8b14e98f..eac3ba0565 100644
--- a/lib/std/ascii.zig
+++ b/lib/std/ascii.zig
@@ -555,22 +555,54 @@ test "ascii.endsWithIgnoreCase" {
     try std.testing.expect(!endsWithIgnoreCase("BoB", "Bo"));
 }
 
-/// Finds `substr` in `container`, ignoring case, starting at `start_index`.
-/// TODO boyer-moore algorithm
-pub fn indexOfIgnoreCasePos(container: []const u8, start_index: usize, substr: []const u8) ?usize {
-    if (substr.len > container.len) return null;
+/// Finds `needle` in `haystack`, ignoring case, starting at index 0.
+pub fn indexOfIgnoreCase(haystack: []const u8, needle: []const u8) ?usize {
+    return indexOfIgnoreCasePos(haystack, 0, needle);
+}
+
+/// Finds `needle` in `haystack`, ignoring case, starting at `start_index`.
+/// Uses Boyer-Moore-Horspool algorithm on large inputs; `indexOfIgnoreCasePosLinear` on small inputs.
+pub fn indexOfIgnoreCasePos(haystack: []const u8, start_index: usize, needle: []const u8) ?usize {
+    if (needle.len > haystack.len) return null;
+    if (needle.len == 0) return start_index;
+
+    if (haystack.len < 52 or needle.len <= 4)
+        return indexOfIgnoreCasePosLinear(haystack, start_index, needle);
+
+    var skip_table: [256]usize = undefined;
+    boyerMooreHorspoolPreprocessIgnoreCase(needle, skip_table[0..]);
 
     var i: usize = start_index;
-    const end = container.len - substr.len;
+    while (i <= haystack.len - needle.len) {
+        if (eqlIgnoreCase(haystack[i .. i + needle.len], needle)) return i;
+        i += skip_table[toLower(haystack[i + needle.len - 1])];
+    }
+
+    return null;
+}
+
+/// Consider using `indexOfIgnoreCasePos` instead of this, which will automatically use a
+/// more sophisticated algorithm on larger inputs.
+pub fn indexOfIgnoreCasePosLinear(haystack: []const u8, start_index: usize, needle: []const u8) ?usize {
+    var i: usize = start_index;
+    const end = haystack.len - needle.len;
     while (i <= end) : (i += 1) {
-        if (eqlIgnoreCase(container[i .. i + substr.len], substr)) return i;
+        if (eqlIgnoreCase(haystack[i .. i + needle.len], needle)) return i;
     }
     return null;
 }
 
-/// Finds `substr` in `container`, ignoring case, starting at index 0.
-pub fn indexOfIgnoreCase(container: []const u8, substr: []const u8) ?usize {
-    return indexOfIgnoreCasePos(container, 0, substr);
+fn boyerMooreHorspoolPreprocessIgnoreCase(pattern: []const u8, table: *[256]usize) void {
+    for (table) |*c| {
+        c.* = pattern.len;
+    }
+
+    var i: usize = 0;
+    // The last item is intentionally ignored and the skip size will be pattern.len.
+    // This is the standard way Boyer-Moore-Horspool is implemented.
+    while (i < pattern.len - 1) : (i += 1) {
+        table[toLower(pattern[i])] = pattern.len - 1 - i;
+    }
 }
 
 test "indexOfIgnoreCase" {
@@ -579,6 +611,9 @@ test "indexOfIgnoreCase" {
     try std.testing.expect(indexOfIgnoreCase("foO", "Foo").? == 0);
     try std.testing.expect(indexOfIgnoreCase("foo", "fool") == null);
     try std.testing.expect(indexOfIgnoreCase("FOO foo", "fOo").? == 0);
+
+    try std.testing.expect(indexOfIgnoreCase("one two three four five six seven eight nine ten eleven", "ThReE fOUr").? == 8);
+    try std.testing.expect(indexOfIgnoreCase("one two three four five six seven eight nine ten eleven", "Two tWo") == null);
 }
 
 /// Returns the lexicographical order of two slices. O(n).
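A minimal usage sketch for the new case-insensitive search entry points (the header string and needles here are arbitrary examples, not part of this change):

    const std = @import("std");

    test "indexOfIgnoreCase sketch" {
        const haystack = "Content-Type: application/JSON; charset=utf-8";
        // "application/JSON" begins right after "Content-Type: " (14 bytes in).
        try std.testing.expectEqual(@as(?usize, 14), std.ascii.indexOfIgnoreCase(haystack, "APPLICATION/json"));
        // Needles of length <= 4 (and haystacks shorter than 52 bytes) always take the linear path.
        try std.testing.expectEqual(@as(?usize, null), std.ascii.indexOfIgnoreCase(haystack, "xml"));
    }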
diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig index b875f73b2e..ac2f089ea1 100644 --- a/lib/std/math/big/int.zig +++ b/lib/std/math/big/int.zig @@ -1762,16 +1762,32 @@ pub const Mutable = struct { } /// Read the value of `x` from `buffer` - /// Asserts that `buffer`, `abi_size`, and `bit_count` are large enough to store the value. + /// Asserts that `buffer` is large enough to contain a value of bit-size `bit_count`. /// /// The contents of `buffer` are interpreted as if they were the contents of - /// @ptrCast(*[abi_size]const u8, &x). Byte ordering is determined by `endian` + /// @ptrCast(*[buffer.len]const u8, &x). Byte ordering is determined by `endian` /// and any required padding bits are expected on the MSB end. pub fn readTwosComplement( x: *Mutable, buffer: []const u8, bit_count: usize, - abi_size: usize, + endian: Endian, + signedness: Signedness, + ) void { + return readPackedTwosComplement(x, buffer, 0, bit_count, endian, signedness); + } + + /// Read the value of `x` from a packed memory `buffer`. + /// Asserts that `buffer` is large enough to contain a value of bit-size `bit_count` + /// at offset `bit_offset`. + /// + /// This is equivalent to loading the value of an integer with `bit_count` bits as + /// if it were a field in packed memory at the provided bit offset. + pub fn readPackedTwosComplement( + x: *Mutable, + bytes: []const u8, + bit_offset: usize, + bit_count: usize, endian: Endian, signedness: Signedness, ) void { @@ -1782,75 +1798,54 @@ pub const Mutable = struct { return; } - // byte_count is our total read size: it cannot exceed abi_size, - // but may be less as long as it includes the required bits - const limb_count = calcTwosCompLimbCount(bit_count); - const byte_count = std.math.min(abi_size, @sizeOf(Limb) * limb_count); - assert(8 * byte_count >= bit_count); - // Check whether the input is negative var positive = true; if (signedness == .signed) { + const total_bits = bit_offset + bit_count; var last_byte = switch (endian) { - .Little => ((bit_count + 7) / 8) - 1, - .Big => abi_size - ((bit_count + 7) / 8), + .Little => ((total_bits + 7) / 8) - 1, + .Big => bytes.len - ((total_bits + 7) / 8), }; - const sign_bit = @as(u8, 1) << @intCast(u3, (bit_count - 1) % 8); - positive = ((buffer[last_byte] & sign_bit) == 0); + const sign_bit = @as(u8, 1) << @intCast(u3, (total_bits - 1) % 8); + positive = ((bytes[last_byte] & sign_bit) == 0); } // Copy all complete limbs - var carry: u1 = if (positive) 0 else 1; + var carry: u1 = 1; var limb_index: usize = 0; + var bit_index: usize = 0; while (limb_index < bit_count / @bitSizeOf(Limb)) : (limb_index += 1) { - var buf_index = switch (endian) { - .Little => @sizeOf(Limb) * limb_index, - .Big => abi_size - (limb_index + 1) * @sizeOf(Limb), - }; - - const limb_buf = @ptrCast(*const [@sizeOf(Limb)]u8, buffer[buf_index..]); - var limb = mem.readInt(Limb, limb_buf, endian); + // Read one Limb of bits + var limb = mem.readPackedInt(Limb, bytes, bit_index + bit_offset, endian); + bit_index += @bitSizeOf(Limb); // 2's complement (bitwise not, then add carry bit) if (!positive) carry = @boolToInt(@addWithOverflow(Limb, ~limb, carry, &limb)); x.limbs[limb_index] = limb; } - // Copy the remaining N bytes (N <= @sizeOf(Limb)) - var bytes_read = limb_index * @sizeOf(Limb); - if (bytes_read != byte_count) { - var limb: Limb = 0; - - while (bytes_read != byte_count) { - const read_size = std.math.floorPowerOfTwo(usize, byte_count - bytes_read); - var int_buffer = switch (endian) { - .Little => buffer[bytes_read..], 
- .Big => buffer[(abi_size - bytes_read - read_size)..], - }; - limb |= @intCast(Limb, switch (read_size) { - 1 => mem.readInt(u8, int_buffer[0..1], endian), - 2 => mem.readInt(u16, int_buffer[0..2], endian), - 4 => mem.readInt(u32, int_buffer[0..4], endian), - 8 => mem.readInt(u64, int_buffer[0..8], endian), - 16 => mem.readInt(u128, int_buffer[0..16], endian), - else => unreachable, - }) << @intCast(Log2Limb, 8 * (bytes_read % @sizeOf(Limb))); - bytes_read += read_size; - } + // Copy the remaining bits + if (bit_count != bit_index) { + // Read all remaining bits + var limb = switch (signedness) { + .unsigned => mem.readVarPackedInt(Limb, bytes, bit_index + bit_offset, bit_count - bit_index, endian, .unsigned), + .signed => b: { + const SLimb = std.meta.Int(.signed, @bitSizeOf(Limb)); + const limb = mem.readVarPackedInt(SLimb, bytes, bit_index + bit_offset, bit_count - bit_index, endian, .signed); + break :b @bitCast(Limb, limb); + }, + }; // 2's complement (bitwise not, then add carry bit) - if (!positive) _ = @addWithOverflow(Limb, ~limb, carry, &limb); - - // Mask off any unused bits - const valid_bits = @intCast(Log2Limb, bit_count % @bitSizeOf(Limb)); - const mask = (@as(Limb, 1) << valid_bits) -% 1; // 0b0..01..1 with (valid_bits_in_limb) trailing ones - limb &= mask; + if (!positive) assert(!@addWithOverflow(Limb, ~limb, carry, &limb)); + x.limbs[limb_index] = limb; - x.limbs[limb_count - 1] = limb; + limb_index += 1; } + x.positive = positive; - x.len = limb_count; + x.len = limb_index; x.normalize(x.len); } @@ -2212,66 +2207,48 @@ pub const Const = struct { } /// Write the value of `x` into `buffer` - /// Asserts that `buffer`, `abi_size`, and `bit_count` are large enough to store the value. + /// Asserts that `buffer` is large enough to store the value. /// /// `buffer` is filled so that its contents match what would be observed via - /// @ptrCast(*[abi_size]const u8, &x). Byte ordering is determined by `endian`, + /// @ptrCast(*[buffer.len]const u8, &x). Byte ordering is determined by `endian`, /// and any required padding bits are added on the MSB end. - pub fn writeTwosComplement(x: Const, buffer: []u8, bit_count: usize, abi_size: usize, endian: Endian) void { + pub fn writeTwosComplement(x: Const, buffer: []u8, endian: Endian) void { + return writePackedTwosComplement(x, buffer, 0, 8 * buffer.len, endian); + } - // byte_count is our total write size - const byte_count = abi_size; - assert(8 * byte_count >= bit_count); - assert(buffer.len >= byte_count); + /// Write the value of `x` to a packed memory `buffer`. + /// Asserts that `buffer` is large enough to contain a value of bit-size `bit_count` + /// at offset `bit_offset`. + /// + /// This is equivalent to storing the value of an integer with `bit_count` bits as + /// if it were a field in packed memory at the provided bit offset. 
+ pub fn writePackedTwosComplement(x: Const, bytes: []u8, bit_offset: usize, bit_count: usize, endian: Endian) void { assert(x.fitsInTwosComp(if (x.positive) .unsigned else .signed, bit_count)); // Copy all complete limbs - var carry: u1 = if (x.positive) 0 else 1; + var carry: u1 = 1; var limb_index: usize = 0; - while (limb_index < byte_count / @sizeOf(Limb)) : (limb_index += 1) { - var buf_index = switch (endian) { - .Little => @sizeOf(Limb) * limb_index, - .Big => abi_size - (limb_index + 1) * @sizeOf(Limb), - }; - + var bit_index: usize = 0; + while (limb_index < bit_count / @bitSizeOf(Limb)) : (limb_index += 1) { var limb: Limb = if (limb_index < x.limbs.len) x.limbs[limb_index] else 0; + // 2's complement (bitwise not, then add carry bit) if (!x.positive) carry = @boolToInt(@addWithOverflow(Limb, ~limb, carry, &limb)); - var limb_buf = @ptrCast(*[@sizeOf(Limb)]u8, buffer[buf_index..]); - mem.writeInt(Limb, limb_buf, limb, endian); + // Write one Limb of bits + mem.writePackedInt(Limb, bytes, bit_index + bit_offset, limb, endian); + bit_index += @bitSizeOf(Limb); } - // Copy the remaining N bytes (N < @sizeOf(Limb)) - var bytes_written = limb_index * @sizeOf(Limb); - if (bytes_written != byte_count) { + // Copy the remaining bits + if (bit_count != bit_index) { var limb: Limb = if (limb_index < x.limbs.len) x.limbs[limb_index] else 0; + // 2's complement (bitwise not, then add carry bit) if (!x.positive) _ = @addWithOverflow(Limb, ~limb, carry, &limb); - while (bytes_written != byte_count) { - const write_size = std.math.floorPowerOfTwo(usize, byte_count - bytes_written); - var int_buffer = switch (endian) { - .Little => buffer[bytes_written..], - .Big => buffer[(abi_size - bytes_written - write_size)..], - }; - - if (write_size == 1) { - mem.writeInt(u8, int_buffer[0..1], @truncate(u8, limb), endian); - } else if (@sizeOf(Limb) >= 2 and write_size == 2) { - mem.writeInt(u16, int_buffer[0..2], @truncate(u16, limb), endian); - } else if (@sizeOf(Limb) >= 4 and write_size == 4) { - mem.writeInt(u32, int_buffer[0..4], @truncate(u32, limb), endian); - } else if (@sizeOf(Limb) >= 8 and write_size == 8) { - mem.writeInt(u64, int_buffer[0..8], @truncate(u64, limb), endian); - } else if (@sizeOf(Limb) >= 16 and write_size == 16) { - mem.writeInt(u128, int_buffer[0..16], @truncate(u128, limb), endian); - } else if (@sizeOf(Limb) >= 32) { - @compileError("@sizeOf(Limb) exceeded supported range"); - } else unreachable; - limb >>= @intCast(Log2Limb, 8 * write_size); - bytes_written += write_size; - } + // Write all remaining bits + mem.writeVarPackedInt(bytes, bit_index + bit_offset, bit_count - bit_index, limb, endian); } } diff --git a/lib/std/math/big/int_test.zig b/lib/std/math/big/int_test.zig index 5685a38d41..97de06bfcc 100644 --- a/lib/std/math/big/int_test.zig +++ b/lib/std/math/big/int_test.zig @@ -2603,13 +2603,13 @@ test "big int conversion read/write twos complement" { for (endians) |endian| { // Writing to buffer and back should not change anything - a.toConst().writeTwosComplement(buffer1, 493, abi_size, endian); - m.readTwosComplement(buffer1, 493, abi_size, endian, .unsigned); + a.toConst().writeTwosComplement(buffer1[0..abi_size], endian); + m.readTwosComplement(buffer1[0..abi_size], 493, endian, .unsigned); try testing.expect(m.toConst().order(a.toConst()) == .eq); // Equivalent to @bitCast(i493, @as(u493, intMax(u493)) - a.toConst().writeTwosComplement(buffer1, 493, abi_size, endian); - m.readTwosComplement(buffer1, 493, abi_size, endian, .signed); + 
a.toConst().writeTwosComplement(buffer1[0..abi_size], endian); + m.readTwosComplement(buffer1[0..abi_size], 493, endian, .signed); try testing.expect(m.toConst().orderAgainstScalar(-1) == .eq); } } @@ -2628,26 +2628,26 @@ test "big int conversion read twos complement with padding" { // (3) should sign-extend any bits from bit_count to 8 * abi_size var bit_count: usize = 12 * 8 + 1; - a.toConst().writeTwosComplement(buffer1, bit_count, 13, .Little); + a.toConst().writeTwosComplement(buffer1[0..13], .Little); try testing.expect(std.mem.eql(u8, buffer1, &[_]u8{ 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0xaa, 0xaa, 0xaa })); - a.toConst().writeTwosComplement(buffer1, bit_count, 13, .Big); + a.toConst().writeTwosComplement(buffer1[0..13], .Big); try testing.expect(std.mem.eql(u8, buffer1, &[_]u8{ 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xaa, 0xaa, 0xaa })); - a.toConst().writeTwosComplement(buffer1, bit_count, 16, .Little); + a.toConst().writeTwosComplement(buffer1[0..16], .Little); try testing.expect(std.mem.eql(u8, buffer1, &[_]u8{ 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, 0x0, 0x0 })); - a.toConst().writeTwosComplement(buffer1, bit_count, 16, .Big); + a.toConst().writeTwosComplement(buffer1[0..16], .Big); try testing.expect(std.mem.eql(u8, buffer1, &[_]u8{ 0x0, 0x0, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd })); @memset(buffer1.ptr, 0xaa, buffer1.len); try a.set(-0x01_02030405_06070809_0a0b0c0d); bit_count = 12 * 8 + 2; - a.toConst().writeTwosComplement(buffer1, bit_count, 13, .Little); + a.toConst().writeTwosComplement(buffer1[0..13], .Little); try testing.expect(std.mem.eql(u8, buffer1, &[_]u8{ 0xf3, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xaa, 0xaa, 0xaa })); - a.toConst().writeTwosComplement(buffer1, bit_count, 13, .Big); + a.toConst().writeTwosComplement(buffer1[0..13], .Big); try testing.expect(std.mem.eql(u8, buffer1, &[_]u8{ 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf3, 0xaa, 0xaa, 0xaa })); - a.toConst().writeTwosComplement(buffer1, bit_count, 16, .Little); + a.toConst().writeTwosComplement(buffer1[0..16], .Little); try testing.expect(std.mem.eql(u8, buffer1, &[_]u8{ 0xf3, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0xff, 0xff })); - a.toConst().writeTwosComplement(buffer1, bit_count, 16, .Big); + a.toConst().writeTwosComplement(buffer1[0..16], .Big); try testing.expect(std.mem.eql(u8, buffer1, &[_]u8{ 0xff, 0xff, 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf3 })); } @@ -2660,17 +2660,15 @@ test "big int write twos complement +/- zero" { defer testing.allocator.free(buffer1); @memset(buffer1.ptr, 0xaa, buffer1.len); - var bit_count: usize = 0; - // Test zero - m.toConst().writeTwosComplement(buffer1, bit_count, 13, .Little); + m.toConst().writeTwosComplement(buffer1[0..13], .Little); try testing.expect(std.mem.eql(u8, buffer1, &(([_]u8{0} ** 13) ++ ([_]u8{0xaa} ** 3)))); - m.toConst().writeTwosComplement(buffer1, bit_count, 13, .Big); + m.toConst().writeTwosComplement(buffer1[0..13], .Big); try testing.expect(std.mem.eql(u8, buffer1, &(([_]u8{0} ** 13) ++ ([_]u8{0xaa} ** 3)))); - m.toConst().writeTwosComplement(buffer1, bit_count, 16, .Little); + m.toConst().writeTwosComplement(buffer1[0..16], .Little); try testing.expect(std.mem.eql(u8, buffer1, &(([_]u8{0} ** 16)))); - m.toConst().writeTwosComplement(buffer1, bit_count, 16, .Big); + 
m.toConst().writeTwosComplement(buffer1[0..16], .Big); try testing.expect(std.mem.eql(u8, buffer1, &(([_]u8{0} ** 16)))); @memset(buffer1.ptr, 0xaa, buffer1.len); @@ -2678,13 +2676,13 @@ test "big int write twos complement +/- zero" { // Test negative zero - m.toConst().writeTwosComplement(buffer1, bit_count, 13, .Little); + m.toConst().writeTwosComplement(buffer1[0..13], .Little); try testing.expect(std.mem.eql(u8, buffer1, &(([_]u8{0} ** 13) ++ ([_]u8{0xaa} ** 3)))); - m.toConst().writeTwosComplement(buffer1, bit_count, 13, .Big); + m.toConst().writeTwosComplement(buffer1[0..13], .Big); try testing.expect(std.mem.eql(u8, buffer1, &(([_]u8{0} ** 13) ++ ([_]u8{0xaa} ** 3)))); - m.toConst().writeTwosComplement(buffer1, bit_count, 16, .Little); + m.toConst().writeTwosComplement(buffer1[0..16], .Little); try testing.expect(std.mem.eql(u8, buffer1, &(([_]u8{0} ** 16)))); - m.toConst().writeTwosComplement(buffer1, bit_count, 16, .Big); + m.toConst().writeTwosComplement(buffer1[0..16], .Big); try testing.expect(std.mem.eql(u8, buffer1, &(([_]u8{0} ** 16)))); } @@ -2705,62 +2703,82 @@ test "big int conversion write twos complement with padding" { // Test 0x01_02030405_06070809_0a0b0c0d buffer = &[_]u8{ 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0xb }; - m.readTwosComplement(buffer, bit_count, 13, .Little, .unsigned); + m.readTwosComplement(buffer[0..13], bit_count, .Little, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x01_02030405_06070809_0a0b0c0d) == .eq); buffer = &[_]u8{ 0xb, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd }; - m.readTwosComplement(buffer, bit_count, 13, .Big, .unsigned); + m.readTwosComplement(buffer[0..13], bit_count, .Big, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x01_02030405_06070809_0a0b0c0d) == .eq); buffer = &[_]u8{ 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0xab, 0xaa, 0xaa, 0xaa }; - m.readTwosComplement(buffer, bit_count, 16, .Little, .unsigned); + m.readTwosComplement(buffer[0..16], bit_count, .Little, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x01_02030405_06070809_0a0b0c0d) == .eq); buffer = &[_]u8{ 0xaa, 0xaa, 0xaa, 0xab, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd }; - m.readTwosComplement(buffer, bit_count, 16, .Big, .unsigned); + m.readTwosComplement(buffer[0..16], bit_count, .Big, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x01_02030405_06070809_0a0b0c0d) == .eq); + bit_count = @sizeOf(Limb) * 8; + + // Test 0x0a0a0a0a_02030405_06070809_0a0b0c0d + + buffer = &[_]u8{ 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0xaa }; + m.readTwosComplement(buffer[0..13], bit_count, .Little, .unsigned); + try testing.expect(m.toConst().orderAgainstScalar(@truncate(Limb, 0xaa_02030405_06070809_0a0b0c0d)) == .eq); + + buffer = &[_]u8{ 0xaa, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd }; + m.readTwosComplement(buffer[0..13], bit_count, .Big, .unsigned); + try testing.expect(m.toConst().orderAgainstScalar(@truncate(Limb, 0xaa_02030405_06070809_0a0b0c0d)) == .eq); + + buffer = &[_]u8{ 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0xaa, 0xaa, 0xaa, 0xaa }; + m.readTwosComplement(buffer[0..16], bit_count, .Little, .unsigned); + try testing.expect(m.toConst().orderAgainstScalar(@truncate(Limb, 0xaaaaaaaa_02030405_06070809_0a0b0c0d)) == .eq); + + buffer = &[_]u8{ 0xaa, 0xaa, 0xaa, 0xaa, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd }; + m.readTwosComplement(buffer[0..16], bit_count, .Big, 
.unsigned); + try testing.expect(m.toConst().orderAgainstScalar(@truncate(Limb, 0xaaaaaaaa_02030405_06070809_0a0b0c0d)) == .eq); + bit_count = 12 * 8 + 2; // Test -0x01_02030405_06070809_0a0b0c0d buffer = &[_]u8{ 0xf3, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0x02 }; - m.readTwosComplement(buffer, bit_count, 13, .Little, .signed); + m.readTwosComplement(buffer[0..13], bit_count, .Little, .signed); try testing.expect(m.toConst().orderAgainstScalar(-0x01_02030405_06070809_0a0b0c0d) == .eq); buffer = &[_]u8{ 0x02, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf3 }; - m.readTwosComplement(buffer, bit_count, 13, .Big, .signed); + m.readTwosComplement(buffer[0..13], bit_count, .Big, .signed); try testing.expect(m.toConst().orderAgainstScalar(-0x01_02030405_06070809_0a0b0c0d) == .eq); buffer = &[_]u8{ 0xf3, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0x02, 0xaa, 0xaa, 0xaa }; - m.readTwosComplement(buffer, bit_count, 16, .Little, .signed); + m.readTwosComplement(buffer[0..16], bit_count, .Little, .signed); try testing.expect(m.toConst().orderAgainstScalar(-0x01_02030405_06070809_0a0b0c0d) == .eq); buffer = &[_]u8{ 0xaa, 0xaa, 0xaa, 0x02, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf3 }; - m.readTwosComplement(buffer, bit_count, 16, .Big, .signed); + m.readTwosComplement(buffer[0..16], bit_count, .Big, .signed); try testing.expect(m.toConst().orderAgainstScalar(-0x01_02030405_06070809_0a0b0c0d) == .eq); // Test 0 buffer = &([_]u8{0} ** 16); - m.readTwosComplement(buffer, bit_count, 13, .Little, .unsigned); + m.readTwosComplement(buffer[0..13], bit_count, .Little, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x0) == .eq); - m.readTwosComplement(buffer, bit_count, 13, .Big, .unsigned); + m.readTwosComplement(buffer[0..13], bit_count, .Big, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x0) == .eq); - m.readTwosComplement(buffer, bit_count, 16, .Little, .unsigned); + m.readTwosComplement(buffer[0..16], bit_count, .Little, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x0) == .eq); - m.readTwosComplement(buffer, bit_count, 16, .Big, .unsigned); + m.readTwosComplement(buffer[0..16], bit_count, .Big, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x0) == .eq); bit_count = 0; buffer = &([_]u8{0xaa} ** 16); - m.readTwosComplement(buffer, bit_count, 13, .Little, .unsigned); + m.readTwosComplement(buffer[0..13], bit_count, .Little, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x0) == .eq); - m.readTwosComplement(buffer, bit_count, 13, .Big, .unsigned); + m.readTwosComplement(buffer[0..13], bit_count, .Big, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x0) == .eq); - m.readTwosComplement(buffer, bit_count, 16, .Little, .unsigned); + m.readTwosComplement(buffer[0..16], bit_count, .Little, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x0) == .eq); - m.readTwosComplement(buffer, bit_count, 16, .Big, .unsigned); + m.readTwosComplement(buffer[0..16], bit_count, .Big, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x0) == .eq); } @@ -2779,15 +2797,15 @@ test "big int conversion write twos complement zero" { var buffer: []const u8 = undefined; buffer = &([_]u8{0} ** 13); - m.readTwosComplement(buffer, bit_count, 13, .Little, .unsigned); + m.readTwosComplement(buffer[0..13], bit_count, .Little, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x0) == .eq); - 
m.readTwosComplement(buffer, bit_count, 13, .Big, .unsigned); + m.readTwosComplement(buffer[0..13], bit_count, .Big, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x0) == .eq); buffer = &([_]u8{0} ** 16); - m.readTwosComplement(buffer, bit_count, 16, .Little, .unsigned); + m.readTwosComplement(buffer[0..16], bit_count, .Little, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x0) == .eq); - m.readTwosComplement(buffer, bit_count, 16, .Big, .unsigned); + m.readTwosComplement(buffer[0..16], bit_count, .Big, .unsigned); try testing.expect(m.toConst().orderAgainstScalar(0x0) == .eq); } diff --git a/lib/std/mem.zig b/lib/std/mem.zig index 4000030fc0..93dd54bdb5 100644 --- a/lib/std/mem.zig +++ b/lib/std/mem.zig @@ -1083,7 +1083,7 @@ fn boyerMooreHorspoolPreprocessReverse(pattern: []const u8, table: *[256]usize) var i: usize = pattern.len - 1; // The first item is intentionally ignored and the skip size will be pattern.len. - // This is the standard way boyer-moore-horspool is implemented. + // This is the standard way Boyer-Moore-Horspool is implemented. while (i > 0) : (i -= 1) { table[pattern[i]] = i; } @@ -1096,14 +1096,15 @@ fn boyerMooreHorspoolPreprocess(pattern: []const u8, table: *[256]usize) void { var i: usize = 0; // The last item is intentionally ignored and the skip size will be pattern.len. - // This is the standard way boyer-moore-horspool is implemented. + // This is the standard way Boyer-Moore-Horspool is implemented. while (i < pattern.len - 1) : (i += 1) { table[pattern[i]] = pattern.len - 1 - i; } } + /// Find the index in a slice of a sub-slice, searching from the end backwards. /// To start looking at a different index, slice the haystack first. -/// Uses the Reverse boyer-moore-horspool algorithm on large inputs; +/// Uses the Reverse Boyer-Moore-Horspool algorithm on large inputs; /// `lastIndexOfLinear` on small inputs. pub fn lastIndexOf(comptime T: type, haystack: []const T, needle: []const T) ?usize { if (needle.len > haystack.len) return null; @@ -1131,7 +1132,7 @@ pub fn lastIndexOf(comptime T: type, haystack: []const T, needle: []const T) ?us return null; } -/// Uses Boyer-moore-horspool algorithm on large inputs; `indexOfPosLinear` on small inputs. +/// Uses Boyer-Moore-Horspool algorithm on large inputs; `indexOfPosLinear` on small inputs. 
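Stepping back from the test churn, here is a round-trip sketch of the packed big-int API introduced above (the bit offset, width, and value are arbitrary; this assumes only the Mutable/Const methods added in this diff):

    const std = @import("std");
    const big = std.math.big;

    test "packed two's complement round-trip sketch" {
        // One limb is enough for a 12-bit value.
        var limbs: [big.int.calcTwosCompLimbCount(12)]big.Limb = undefined;
        var x = big.int.Mutable.init(&limbs, -1234);

        // Store x as a signed 12-bit field at bit offset 4 of the buffer.
        var buffer = [_]u8{0} ** 4;
        x.toConst().writePackedTwosComplement(&buffer, 4, 12, .Little);

        // Load it back from the same bit range.
        var limbs2: [big.int.calcTwosCompLimbCount(12)]big.Limb = undefined;
        var y = big.int.Mutable.init(&limbs2, 0);
        y.readPackedTwosComplement(&buffer, 4, 12, .Little, .signed);
        try std.testing.expect(y.toConst().orderAgainstScalar(-1234) == .eq);
    }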
pub fn indexOfPos(comptime T: type, haystack: []const T, start_index: usize, needle: []const T) ?usize { if (needle.len > haystack.len) return null; if (needle.len == 0) return start_index; @@ -1183,7 +1184,7 @@ test "indexOf" { test "indexOf multibyte" { { - // make haystack and needle long enough to trigger boyer-moore-horspool algorithm + // make haystack and needle long enough to trigger Boyer-Moore-Horspool algorithm const haystack = [1]u16{0} ** 100 ++ [_]u16{ 0xbbaa, 0xccbb, 0xddcc, 0xeedd, 0xffee, 0x00ff }; const needle = [_]u16{ 0xbbaa, 0xccbb, 0xddcc, 0xeedd, 0xffee }; try testing.expectEqual(indexOfPos(u16, &haystack, 0, &needle), 100); @@ -1196,7 +1197,7 @@ test "indexOf multibyte" { } { - // make haystack and needle long enough to trigger boyer-moore-horspool algorithm + // make haystack and needle long enough to trigger Boyer-Moore-Horspool algorithm const haystack = [_]u16{ 0xbbaa, 0xccbb, 0xddcc, 0xeedd, 0xffee, 0x00ff } ++ [1]u16{0} ** 100; const needle = [_]u16{ 0xbbaa, 0xccbb, 0xddcc, 0xeedd, 0xffee }; try testing.expectEqual(lastIndexOf(u16, &haystack, &needle), 0); @@ -1298,6 +1299,76 @@ pub fn readVarInt(comptime ReturnType: type, bytes: []const u8, endian: Endian) return result; } +/// Loads an integer from packed memory with provided bit_count, bit_offset, and signedness. +/// Asserts that T is large enough to store the read value. +/// +/// Example: +/// const T = packed struct(u16){ a: u3, b: u7, c: u6 }; +/// var st = T{ .a = 1, .b = 2, .c = 4 }; +/// const b_field = readVarPackedInt(u64, std.mem.asBytes(&st), @bitOffsetOf(T, "b"), 7, builtin.cpu.arch.endian(), .unsigned); +/// +pub fn readVarPackedInt( + comptime T: type, + bytes: []const u8, + bit_offset: usize, + bit_count: usize, + endian: std.builtin.Endian, + signedness: std.builtin.Signedness, +) T { + const uN = std.meta.Int(.unsigned, @bitSizeOf(T)); + const iN = std.meta.Int(.signed, @bitSizeOf(T)); + const Log2N = std.math.Log2Int(T); + + const read_size = (bit_count + (bit_offset % 8) + 7) / 8; + const bit_shift = @intCast(u3, bit_offset % 8); + const pad = @intCast(Log2N, @bitSizeOf(T) - bit_count); + + const lowest_byte = switch (endian) { + .Big => bytes.len - (bit_offset / 8) - read_size, + .Little => bit_offset / 8, + }; + const read_bytes = bytes[lowest_byte..][0..read_size]; + + if (@bitSizeOf(T) <= 8) { + // These are the same shifts/masks we perform below, but adds `@truncate`/`@intCast` + // where needed since int is smaller than a byte. + const value = if (read_size == 1) b: { + break :b @truncate(uN, read_bytes[0] >> bit_shift); + } else b: { + const i: u1 = @boolToInt(endian == .Big); + const head = @truncate(uN, read_bytes[i] >> bit_shift); + const tail_shift = @intCast(Log2N, @as(u4, 8) - bit_shift); + const tail = @truncate(uN, read_bytes[1 - i]); + break :b (tail << tail_shift) | head; + }; + switch (signedness) { + .signed => return @intCast(T, (@bitCast(iN, value) << pad) >> pad), + .unsigned => return @intCast(T, (@bitCast(uN, value) << pad) >> pad), + } + } + + // Copy the value out (respecting endianness), accounting for bit_shift + var int: uN = 0; + switch (endian) { + .Big => { + for (read_bytes[0 .. 
read_size - 1]) |elem| { + int = elem | (int << 8); + } + int = (read_bytes[read_size - 1] >> bit_shift) | (int << (@as(u4, 8) - bit_shift)); + }, + .Little => { + int = read_bytes[0] >> bit_shift; + for (read_bytes[1..]) |elem, i| { + int |= (@as(uN, elem) << @intCast(Log2N, (8 * (i + 1) - bit_shift))); + } + }, + } + switch (signedness) { + .signed => return @intCast(T, (@bitCast(iN, int) << pad) >> pad), + .unsigned => return @intCast(T, (@bitCast(uN, int) << pad) >> pad), + } +} + /// Reads an integer from memory with bit count specified by T. /// The bit count of T must be evenly divisible by 8. /// This function cannot fail and cannot cause undefined behavior. @@ -1365,6 +1436,84 @@ pub fn readInt(comptime T: type, bytes: *const [@divExact(@typeInfo(T).Int.bits, } } +fn readPackedIntLittle(comptime T: type, bytes: []const u8, bit_offset: usize) T { + const uN = std.meta.Int(.unsigned, @bitSizeOf(T)); + const Log2N = std.math.Log2Int(T); + + const bit_count = @as(usize, @bitSizeOf(T)); + const bit_shift = @intCast(u3, bit_offset % 8); + + const load_size = (bit_count + 7) / 8; + const load_tail_bits = @intCast(u3, (load_size * 8) - bit_count); + const LoadInt = std.meta.Int(.unsigned, load_size * 8); + + if (bit_count == 0) + return 0; + + // Read by loading a LoadInt, and then follow it up with a 1-byte read + // of the tail if bit_offset pushed us over a byte boundary. + const read_bytes = bytes[bit_offset / 8 ..]; + const val = @truncate(uN, readIntLittle(LoadInt, read_bytes[0..load_size]) >> bit_shift); + if (bit_shift > load_tail_bits) { + const tail_bits = @intCast(Log2N, bit_shift - load_tail_bits); + const tail_byte = read_bytes[load_size]; + const tail_truncated = if (bit_count < 8) @truncate(uN, tail_byte) else @as(uN, tail_byte); + return @bitCast(T, val | (tail_truncated << (@truncate(Log2N, bit_count) -% tail_bits))); + } else return @bitCast(T, val); +} + +fn readPackedIntBig(comptime T: type, bytes: []const u8, bit_offset: usize) T { + const uN = std.meta.Int(.unsigned, @bitSizeOf(T)); + const Log2N = std.math.Log2Int(T); + + const bit_count = @as(usize, @bitSizeOf(T)); + const bit_shift = @intCast(u3, bit_offset % 8); + const byte_count = (@as(usize, bit_shift) + bit_count + 7) / 8; + + const load_size = (bit_count + 7) / 8; + const load_tail_bits = @intCast(u3, (load_size * 8) - bit_count); + const LoadInt = std.meta.Int(.unsigned, load_size * 8); + + if (bit_count == 0) + return 0; + + // Read by loading a LoadInt, and then follow it up with a 1-byte read + // of the tail if bit_offset pushed us over a byte boundary. + const end = bytes.len - (bit_offset / 8); + const read_bytes = bytes[(end - byte_count)..end]; + const val = @truncate(uN, readIntBig(LoadInt, bytes[(end - load_size)..end][0..load_size]) >> bit_shift); + if (bit_shift > load_tail_bits) { + const tail_bits = @intCast(Log2N, bit_shift - load_tail_bits); + const tail_byte = if (bit_count < 8) @truncate(uN, read_bytes[0]) else @as(uN, read_bytes[0]); + return @bitCast(T, val | (tail_byte << (@truncate(Log2N, bit_count) -% tail_bits))); + } else return @bitCast(T, val); +} + +pub const readPackedIntNative = switch (native_endian) { + .Little => readPackedIntLittle, + .Big => readPackedIntBig, +}; + +pub const readPackedIntForeign = switch (native_endian) { + .Little => readPackedIntBig, + .Big => readPackedIntLittle, +}; + +/// Loads an integer from packed memory. +/// Asserts that buffer contains at least bit_offset + @bitSizeOf(T) bits. 
+/// +/// Example: +/// const T = packed struct(u16){ a: u3, b: u7, c: u6 }; +/// var st = T{ .a = 1, .b = 2, .c = 4 }; +/// const b_field = readPackedInt(u7, std.mem.asBytes(&st), @bitOffsetOf(T, "b"), builtin.cpu.arch.endian()); +/// +pub fn readPackedInt(comptime T: type, bytes: []const u8, bit_offset: usize, endian: Endian) T { + switch (endian) { + .Little => return readPackedIntLittle(T, bytes, bit_offset), + .Big => return readPackedIntBig(T, bytes, bit_offset), + } +} + /// Asserts that bytes.len >= @typeInfo(T).Int.bits / 8. Reads the integer starting from index 0 /// and ignores extra bytes. /// The bit count of T must be evenly divisible by 8. @@ -1447,6 +1596,100 @@ pub fn writeInt(comptime T: type, buffer: *[@divExact(@typeInfo(T).Int.bits, 8)] } } +pub fn writePackedIntLittle(comptime T: type, bytes: []u8, bit_offset: usize, value: T) void { + const uN = std.meta.Int(.unsigned, @bitSizeOf(T)); + const Log2N = std.math.Log2Int(T); + + const bit_count = @as(usize, @bitSizeOf(T)); + const bit_shift = @intCast(u3, bit_offset % 8); + + const store_size = (@bitSizeOf(T) + 7) / 8; + const store_tail_bits = @intCast(u3, (store_size * 8) - bit_count); + const StoreInt = std.meta.Int(.unsigned, store_size * 8); + + if (bit_count == 0) + return; + + // Write by storing a StoreInt, and then follow it up with a 1-byte tail + // if bit_offset pushed us over a byte boundary. + const write_bytes = bytes[bit_offset / 8 ..]; + const head = write_bytes[0] & ((@as(u8, 1) << bit_shift) - 1); + + var write_value = (@as(StoreInt, @bitCast(uN, value)) << bit_shift) | @intCast(StoreInt, head); + if (bit_shift > store_tail_bits) { + const tail_len = @intCast(Log2N, bit_shift - store_tail_bits); + write_bytes[store_size] &= ~((@as(u8, 1) << @intCast(u3, tail_len)) - 1); + write_bytes[store_size] |= @intCast(u8, (@bitCast(uN, value) >> (@truncate(Log2N, bit_count) -% tail_len))); + } else if (bit_shift < store_tail_bits) { + const tail_len = store_tail_bits - bit_shift; + const tail = write_bytes[store_size - 1] & (@as(u8, 0xfe) << (7 - tail_len)); + write_value |= @as(StoreInt, tail) << (8 * (store_size - 1)); + } + + writeIntLittle(StoreInt, write_bytes[0..store_size], write_value); +} + +pub fn writePackedIntBig(comptime T: type, bytes: []u8, bit_offset: usize, value: T) void { + const uN = std.meta.Int(.unsigned, @bitSizeOf(T)); + const Log2N = std.math.Log2Int(T); + + const bit_count = @as(usize, @bitSizeOf(T)); + const bit_shift = @intCast(u3, bit_offset % 8); + const byte_count = (bit_shift + bit_count + 7) / 8; + + const store_size = (@bitSizeOf(T) + 7) / 8; + const store_tail_bits = @intCast(u3, (store_size * 8) - bit_count); + const StoreInt = std.meta.Int(.unsigned, store_size * 8); + + if (bit_count == 0) + return; + + // Write by storing a StoreInt, and then follow it up with a 1-byte tail + // if bit_offset pushed us over a byte boundary. 
+ const end = bytes.len - (bit_offset / 8); + const write_bytes = bytes[(end - byte_count)..end]; + const head = write_bytes[byte_count - 1] & ((@as(u8, 1) << bit_shift) - 1); + + var write_value = (@as(StoreInt, @bitCast(uN, value)) << bit_shift) | @intCast(StoreInt, head); + if (bit_shift > store_tail_bits) { + const tail_len = @intCast(Log2N, bit_shift - store_tail_bits); + write_bytes[0] &= ~((@as(u8, 1) << @intCast(u3, tail_len)) - 1); + write_bytes[0] |= @intCast(u8, (@bitCast(uN, value) >> (@truncate(Log2N, bit_count) -% tail_len))); + } else if (bit_shift < store_tail_bits) { + const tail_len = store_tail_bits - bit_shift; + const tail = write_bytes[0] & (@as(u8, 0xfe) << (7 - tail_len)); + write_value |= @as(StoreInt, tail) << (8 * (store_size - 1)); + } + + writeIntBig(StoreInt, write_bytes[(byte_count - store_size)..][0..store_size], write_value); +} + +pub const writePackedIntNative = switch (native_endian) { + .Little => writePackedIntLittle, + .Big => writePackedIntBig, +}; + +pub const writePackedIntForeign = switch (native_endian) { + .Little => writePackedIntBig, + .Big => writePackedIntLittle, +}; + +/// Stores an integer to packed memory. +/// Asserts that buffer contains at least bit_offset + @bitSizeOf(T) bits. +/// +/// Example: +/// const T = packed struct(u16){ a: u3, b: u7, c: u6 }; +/// var st = T{ .a = 1, .b = 2, .c = 4 }; +/// // st.b = 0x7f; +/// writePackedInt(u7, std.mem.asBytes(&st), @bitOffsetOf(T, "b"), 0x7f, builtin.cpu.arch.endian()); +/// +pub fn writePackedInt(comptime T: type, bytes: []u8, bit_offset: usize, value: T, endian: Endian) void { + switch (endian) { + .Little => writePackedIntLittle(T, bytes, bit_offset, value), + .Big => writePackedIntBig(T, bytes, bit_offset, value), + } +} + /// Writes a twos-complement little-endian integer to memory. /// Asserts that buf.len >= @typeInfo(T).Int.bits / 8. /// The bit count of T must be divisible by 8. @@ -1523,6 +1766,69 @@ pub fn writeIntSlice(comptime T: type, buffer: []u8, value: T, endian: Endian) v }; } +/// Stores an integer to packed memory with provided bit_count, bit_offset, and signedness. +/// If negative, the written value is sign-extended. +/// +/// Example: +/// const T = packed struct(u16){ a: u3, b: u7, c: u6 }; +/// var st = T{ .a = 1, .b = 2, .c = 4 }; +/// // st.b = 0x7f; +/// var value: u64 = 0x7f; +/// writeVarPackedInt(std.mem.asBytes(&st), @bitOffsetOf(T, "b"), 7, value, builtin.cpu.arch.endian()); +/// +pub fn writeVarPackedInt(bytes: []u8, bit_offset: usize, bit_count: usize, value: anytype, endian: std.builtin.Endian) void { + const T = @TypeOf(value); + const uN = std.meta.Int(.unsigned, @bitSizeOf(T)); + const Log2N = std.math.Log2Int(T); + + const bit_shift = @intCast(u3, bit_offset % 8); + const write_size = (bit_count + bit_shift + 7) / 8; + const lowest_byte = switch (endian) { + .Big => bytes.len - (bit_offset / 8) - write_size, + .Little => bit_offset / 8, + }; + const write_bytes = bytes[lowest_byte..][0..write_size]; + + if (write_size == 1) { + // Single byte writes are handled specially, since we need to mask bits + // on both ends of the byte. 
+        const mask = (@as(u8, 0xff) >> @intCast(u3, 8 - bit_count));
+        const new_bits = @intCast(u8, @bitCast(uN, value) & mask) << bit_shift;
+        write_bytes[0] = (write_bytes[0] & ~(mask << bit_shift)) | new_bits;
+        return;
+    }
+
+    var remaining: T = value;
+
+    // Iterate bytes forward for Little-endian, backward for Big-endian
+    const delta: i2 = if (endian == .Big) -1 else 1;
+    const start = if (endian == .Big) @intCast(isize, write_bytes.len - 1) else 0;
+
+    var i: isize = start; // isize for signed index arithmetic
+
+    // Write first byte, using a mask to protect bits preceding bit_offset
+    const head_mask = @as(u8, 0xff) >> bit_shift;
+    write_bytes[@intCast(usize, i)] &= ~(head_mask << bit_shift);
+    write_bytes[@intCast(usize, i)] |= @intCast(u8, @bitCast(uN, remaining) & head_mask) << bit_shift;
+    remaining >>= @intCast(Log2N, @as(u4, 8) - bit_shift);
+    i += delta;
+
+    // Write bytes[1..bytes.len - 1]
+    if (@bitSizeOf(T) > 8) {
+        const loop_end = start + delta * (@intCast(isize, write_size) - 1);
+        while (i != loop_end) : (i += delta) {
+            write_bytes[@intCast(usize, i)] = @truncate(u8, @bitCast(uN, remaining));
+            remaining >>= 8;
+        }
+    }
+
+    // Write last byte, using a mask to protect bits following bit_offset + bit_count
+    const following_bits = -%@truncate(u3, bit_shift + bit_count);
+    const tail_mask = (@as(u8, 0xff) << following_bits) >> following_bits;
+    write_bytes[@intCast(usize, i)] &= ~tail_mask;
+    write_bytes[@intCast(usize, i)] |= @intCast(u8, @bitCast(uN, remaining) & tail_mask);
+}
+
 test "writeIntBig and writeIntLittle" {
     var buf0: [0]u8 = undefined;
     var buf1: [1]u8 = undefined;
@@ -3393,3 +3699,165 @@ pub fn alignInSlice(slice: anytype, comptime new_alignment: usize) ?AlignedSlice
     const aligned_slice = bytesAsSlice(Element, aligned_bytes[0..slice_length_bytes]);
     return @alignCast(new_alignment, aligned_slice);
 }
+
+test "read/write(Var)PackedInt" {
+    switch (builtin.cpu.arch) {
+        // This test generates too much code to execute on WASI.
+ // LLVM backend fails with "too many locals: locals exceed maximum" + .wasm32, .wasm64 => return error.SkipZigTest, + else => {}, + } + + const foreign_endian: Endian = if (native_endian == .Big) .Little else .Big; + const expect = std.testing.expect; + var prng = std.rand.DefaultPrng.init(1234); + const random = prng.random(); + + @setEvalBranchQuota(10_000); + inline for ([_]type{ u8, u16, u32, u128 }) |BackingType| { + for ([_]BackingType{ + @as(BackingType, 0), // all zeros + -%@as(BackingType, 1), // all ones + random.int(BackingType), // random + random.int(BackingType), // random + random.int(BackingType), // random + }) |init_value| { + const uTs = [_]type{ u1, u3, u7, u8, u9, u10, u15, u16, u86 }; + const iTs = [_]type{ i1, i3, i7, i8, i9, i10, i15, i16, i86 }; + inline for (uTs ++ iTs) |PackedType| { + if (@bitSizeOf(PackedType) > @bitSizeOf(BackingType)) + continue; + + const iPackedType = std.meta.Int(.signed, @bitSizeOf(PackedType)); + const uPackedType = std.meta.Int(.unsigned, @bitSizeOf(PackedType)); + const Log2T = std.math.Log2Int(BackingType); + + const offset_at_end = @bitSizeOf(BackingType) - @bitSizeOf(PackedType); + for ([_]usize{ 0, 1, 7, 8, 9, 10, 15, 16, 86, offset_at_end }) |offset| { + if (offset > offset_at_end or offset == @bitSizeOf(BackingType)) + continue; + + for ([_]PackedType{ + ~@as(PackedType, 0), // all ones: -1 iN / maxInt uN + @as(PackedType, 0), // all zeros: 0 iN / 0 uN + @bitCast(PackedType, @as(iPackedType, math.maxInt(iPackedType))), // maxInt iN + @bitCast(PackedType, @as(iPackedType, math.minInt(iPackedType))), // maxInt iN + random.int(PackedType), // random + random.int(PackedType), // random + }) |write_value| { + { // Fixed-size Read/Write (Native-endian) + + // Initialize Value + var value: BackingType = init_value; + + // Read + const read_value1 = readPackedInt(PackedType, asBytes(&value), offset, native_endian); + try expect(read_value1 == @bitCast(PackedType, @truncate(uPackedType, value >> @intCast(Log2T, offset)))); + + // Write + writePackedInt(PackedType, asBytes(&value), offset, write_value, native_endian); + try expect(write_value == @bitCast(PackedType, @truncate(uPackedType, value >> @intCast(Log2T, offset)))); + + // Read again + const read_value2 = readPackedInt(PackedType, asBytes(&value), offset, native_endian); + try expect(read_value2 == write_value); + + // Verify bits outside of the target integer are unmodified + const diff_bits = init_value ^ value; + if (offset != offset_at_end) + try expect(diff_bits >> @intCast(Log2T, offset + @bitSizeOf(PackedType)) == 0); + if (offset != 0) + try expect(diff_bits << @intCast(Log2T, @bitSizeOf(BackingType) - offset) == 0); + } + + { // Fixed-size Read/Write (Foreign-endian) + + // Initialize Value + var value: BackingType = @byteSwap(init_value); + + // Read + const read_value1 = readPackedInt(PackedType, asBytes(&value), offset, foreign_endian); + try expect(read_value1 == @bitCast(PackedType, @truncate(uPackedType, @byteSwap(value) >> @intCast(Log2T, offset)))); + + // Write + writePackedInt(PackedType, asBytes(&value), offset, write_value, foreign_endian); + try expect(write_value == @bitCast(PackedType, @truncate(uPackedType, @byteSwap(value) >> @intCast(Log2T, offset)))); + + // Read again + const read_value2 = readPackedInt(PackedType, asBytes(&value), offset, foreign_endian); + try expect(read_value2 == write_value); + + // Verify bits outside of the target integer are unmodified + const diff_bits = init_value ^ @byteSwap(value); + if (offset != offset_at_end) + try 
expect(diff_bits >> @intCast(Log2T, offset + @bitSizeOf(PackedType)) == 0); + if (offset != 0) + try expect(diff_bits << @intCast(Log2T, @bitSizeOf(BackingType) - offset) == 0); + } + + const signedness = @typeInfo(PackedType).Int.signedness; + const NextPowerOfTwoInt = std.meta.Int(signedness, comptime try std.math.ceilPowerOfTwo(u16, @bitSizeOf(PackedType))); + const ui64 = std.meta.Int(signedness, 64); + inline for ([_]type{ PackedType, NextPowerOfTwoInt, ui64 }) |U| { + { // Variable-size Read/Write (Native-endian) + + if (@bitSizeOf(U) < @bitSizeOf(PackedType)) + continue; + + // Initialize Value + var value: BackingType = init_value; + + // Read + const read_value1 = readVarPackedInt(U, asBytes(&value), offset, @bitSizeOf(PackedType), native_endian, signedness); + try expect(read_value1 == @bitCast(PackedType, @truncate(uPackedType, value >> @intCast(Log2T, offset)))); + + // Write + writeVarPackedInt(asBytes(&value), offset, @bitSizeOf(PackedType), @as(U, write_value), native_endian); + try expect(write_value == @bitCast(PackedType, @truncate(uPackedType, value >> @intCast(Log2T, offset)))); + + // Read again + const read_value2 = readVarPackedInt(U, asBytes(&value), offset, @bitSizeOf(PackedType), native_endian, signedness); + try expect(read_value2 == write_value); + + // Verify bits outside of the target integer are unmodified + const diff_bits = init_value ^ value; + if (offset != offset_at_end) + try expect(diff_bits >> @intCast(Log2T, offset + @bitSizeOf(PackedType)) == 0); + if (offset != 0) + try expect(diff_bits << @intCast(Log2T, @bitSizeOf(BackingType) - offset) == 0); + } + + { // Variable-size Read/Write (Foreign-endian) + + if (@bitSizeOf(U) < @bitSizeOf(PackedType)) + continue; + + // Initialize Value + var value: BackingType = @byteSwap(init_value); + + // Read + const read_value1 = readVarPackedInt(U, asBytes(&value), offset, @bitSizeOf(PackedType), foreign_endian, signedness); + try expect(read_value1 == @bitCast(PackedType, @truncate(uPackedType, @byteSwap(value) >> @intCast(Log2T, offset)))); + + // Write + writeVarPackedInt(asBytes(&value), offset, @bitSizeOf(PackedType), @as(U, write_value), foreign_endian); + try expect(write_value == @bitCast(PackedType, @truncate(uPackedType, @byteSwap(value) >> @intCast(Log2T, offset)))); + + // Read again + const read_value2 = readVarPackedInt(U, asBytes(&value), offset, @bitSizeOf(PackedType), foreign_endian, signedness); + try expect(read_value2 == write_value); + + // Verify bits outside of the target integer are unmodified + const diff_bits = init_value ^ @byteSwap(value); + if (offset != offset_at_end) + try expect(diff_bits >> @intCast(Log2T, offset + @bitSizeOf(PackedType)) == 0); + if (offset != 0) + try expect(diff_bits << @intCast(Log2T, @bitSizeOf(BackingType) - offset) == 0); + } + } + } + } + } + } + } +} diff --git a/src/Compilation.zig b/src/Compilation.zig index 5c3db25555..be9e82cd87 100644 --- a/src/Compilation.zig +++ b/src/Compilation.zig @@ -1109,6 +1109,7 @@ pub fn create(gpa: Allocator, options: InitOptions) !*Compilation { const root_name = try arena.dupeZ(u8, options.root_name); const use_stage1 = options.use_stage1 orelse false; + if (use_stage1 and !build_options.have_stage1) return error.ZigCompilerBuiltWithoutStage1; // Make a decision on whether to use LLVM or our own backend. 
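Before moving on to the compiler-side changes, a short sketch that combines the fixed- and variable-width accessors added to std.mem above, mirroring the doc-comment examples (the field values are arbitrary):

    const std = @import("std");
    const builtin = @import("builtin");

    test "readPackedInt/writePackedInt sketch" {
        const T = packed struct(u16) { a: u3, b: u7, c: u6 };
        var st = T{ .a = 1, .b = 2, .c = 4 };
        const endian = builtin.cpu.arch.endian();

        // Overwrite the 7-bit `b` field through the byte representation...
        std.mem.writePackedInt(u7, std.mem.asBytes(&st), @bitOffsetOf(T, "b"), 0x7f, endian);
        // ...then read it back, both at its exact width and widened to u64.
        try std.testing.expectEqual(@as(u7, 0x7f), std.mem.readPackedInt(u7, std.mem.asBytes(&st), @bitOffsetOf(T, "b"), endian));
        try std.testing.expectEqual(@as(u64, 0x7f), std.mem.readVarPackedInt(u64, std.mem.asBytes(&st), @bitOffsetOf(T, "b"), 7, endian, .unsigned));
        // Neighboring fields are untouched.
        try std.testing.expectEqual(@as(u3, 1), st.a);
        try std.testing.expectEqual(@as(u6, 4), st.c);
    }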
const use_llvm = build_options.have_llvm and blk: { diff --git a/src/Sema.zig b/src/Sema.zig index d31c0745ed..a73c1eedcb 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -26490,48 +26490,6 @@ fn bitCastVal( const target = sema.mod.getTarget(); if (old_ty.eql(new_ty, sema.mod)) return val; - // Some conversions have a bitwise definition that ignores in-memory layout, - // such as converting between f80 and u80. - - if (old_ty.eql(Type.f80, sema.mod) and new_ty.isAbiInt()) { - const float = val.toFloat(f80); - switch (new_ty.intInfo(target).signedness) { - .signed => { - const int = @bitCast(i80, float); - const limbs = try sema.arena.alloc(std.math.big.Limb, 2); - const big_int = std.math.big.int.Mutable.init(limbs, int); - return Value.fromBigInt(sema.arena, big_int.toConst()); - }, - .unsigned => { - const int = @bitCast(u80, float); - const limbs = try sema.arena.alloc(std.math.big.Limb, 2); - const big_int = std.math.big.int.Mutable.init(limbs, int); - return Value.fromBigInt(sema.arena, big_int.toConst()); - }, - } - } - - if (new_ty.eql(Type.f80, sema.mod) and old_ty.isAbiInt()) { - var bigint_space: Value.BigIntSpace = undefined; - var bigint = try val.toBigIntAdvanced(&bigint_space, target, sema.kit(block, src)); - switch (old_ty.intInfo(target).signedness) { - .signed => { - // This conversion cannot fail because we already checked bit size before - // calling bitCastVal. - const int = bigint.to(i80) catch unreachable; - const float = @bitCast(f80, int); - return Value.Tag.float_80.create(sema.arena, float); - }, - .unsigned => { - // This conversion cannot fail because we already checked bit size before - // calling bitCastVal. - const int = bigint.to(u80) catch unreachable; - const float = @bitCast(f80, int); - return Value.Tag.float_80.create(sema.arena, float); - }, - } - } - // For types with well-defined memory layouts, we serialize them a byte buffer, // then deserialize to the new type. const abi_size = try sema.usizeCast(block, src, old_ty.abiSize(target)); diff --git a/src/codegen.zig b/src/codegen.zig index 757bd23b38..6acea5a509 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -470,7 +470,7 @@ pub fn generateSymbol( const abi_size = math.cast(usize, typed_value.ty.abiSize(target)) orelse return error.Overflow; const start = code.items.len; try code.resize(start + abi_size); - bigint.writeTwosComplement(code.items[start..][0..abi_size], info.bits, abi_size, endian); + bigint.writeTwosComplement(code.items[start..][0..abi_size], endian); return Result{ .appended = {} }; } switch (info.signedness) { diff --git a/src/value.zig b/src/value.zig index 7d01430103..a727df5d22 100644 --- a/src/value.zig +++ b/src/value.zig @@ -1206,8 +1206,13 @@ pub const Value = extern union { }; } + /// Write a Value's contents to `buffer`. + /// + /// Asserts that buffer.len >= ty.abiSize(). The buffer is allowed to extend past + /// the end of the value in memory. 
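The Float branch of writeToMemory below simply stores the float's bit pattern with std.mem.writeInt; a standalone equivalent outside the compiler (value and endianness chosen arbitrarily):

    const std = @import("std");

    test "float stored as its bit pattern" {
        var buffer: [4]u8 = undefined;
        const x: f32 = 1.5;
        std.mem.writeInt(u32, buffer[0..4], @bitCast(u32, x), .Little);
        // 1.5 is 0x3fc00000 as IEEE-754 binary32.
        try std.testing.expectEqual(@as(u32, 0x3fc00000), std.mem.readInt(u32, buffer[0..4], .Little));
    }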
pub fn writeToMemory(val: Value, ty: Type, mod: *Module, buffer: []u8) void { const target = mod.getTarget(); + const endian = target.cpu.arch.endian(); if (val.isUndef()) { const size = @intCast(usize, ty.abiSize(target)); std.mem.set(u8, buffer[0..size], 0xaa); @@ -1218,31 +1223,41 @@ pub const Value = extern union { .Bool => { buffer[0] = @boolToInt(val.toBool()); }, - .Int => { - var bigint_buffer: BigIntSpace = undefined; - const bigint = val.toBigInt(&bigint_buffer, target); - const bits = ty.intInfo(target).bits; - const abi_size = @intCast(usize, ty.abiSize(target)); - bigint.writeTwosComplement(buffer, bits, abi_size, target.cpu.arch.endian()); - }, - .Enum => { + .Int, .Enum => { + const int_info = ty.intInfo(target); + const bits = int_info.bits; + const byte_count = (bits + 7) / 8; + var enum_buffer: Payload.U64 = undefined; const int_val = val.enumToInt(ty, &enum_buffer); - var bigint_buffer: BigIntSpace = undefined; - const bigint = int_val.toBigInt(&bigint_buffer, target); - const bits = ty.intInfo(target).bits; - const abi_size = @intCast(usize, ty.abiSize(target)); - bigint.writeTwosComplement(buffer, bits, abi_size, target.cpu.arch.endian()); + + if (byte_count <= @sizeOf(u64)) { + const int: u64 = switch (int_val.tag()) { + .zero => 0, + .one => 1, + .int_u64 => int_val.castTag(.int_u64).?.data, + .int_i64 => @bitCast(u64, int_val.castTag(.int_i64).?.data), + else => unreachable, + }; + for (buffer[0..byte_count]) |_, i| switch (endian) { + .Little => buffer[i] = @truncate(u8, (int >> @intCast(u6, (8 * i)))), + .Big => buffer[byte_count - i - 1] = @truncate(u8, (int >> @intCast(u6, (8 * i)))), + }; + } else { + var bigint_buffer: BigIntSpace = undefined; + const bigint = int_val.toBigInt(&bigint_buffer, target); + bigint.writeTwosComplement(buffer[0..byte_count], endian); + } }, .Float => switch (ty.floatBits(target)) { - 16 => return floatWriteToMemory(f16, val.toFloat(f16), target, buffer), - 32 => return floatWriteToMemory(f32, val.toFloat(f32), target, buffer), - 64 => return floatWriteToMemory(f64, val.toFloat(f64), target, buffer), - 80 => return floatWriteToMemory(f80, val.toFloat(f80), target, buffer), - 128 => return floatWriteToMemory(f128, val.toFloat(f128), target, buffer), + 16 => std.mem.writeInt(u16, buffer[0..2], @bitCast(u16, val.toFloat(f16)), endian), + 32 => std.mem.writeInt(u32, buffer[0..4], @bitCast(u32, val.toFloat(f32)), endian), + 64 => std.mem.writeInt(u64, buffer[0..8], @bitCast(u64, val.toFloat(f64)), endian), + 80 => std.mem.writeInt(u80, buffer[0..10], @bitCast(u80, val.toFloat(f80)), endian), + 128 => std.mem.writeInt(u128, buffer[0..16], @bitCast(u128, val.toFloat(f128)), endian), else => unreachable, }, - .Array, .Vector => { + .Array => { const len = ty.arrayLen(); const elem_ty = ty.childType(); const elem_size = @intCast(usize, elem_ty.abiSize(target)); @@ -1251,10 +1266,16 @@ pub const Value = extern union { var buf_off: usize = 0; while (elem_i < len) : (elem_i += 1) { const elem_val = val.elemValueBuffer(mod, elem_i, &elem_value_buf); - writeToMemory(elem_val, elem_ty, mod, buffer[buf_off..]); + elem_val.writeToMemory(elem_ty, mod, buffer[buf_off..]); buf_off += elem_size; } }, + .Vector => { + // We use byte_count instead of abi_size here, so that any padding bytes + // follow the data bytes, on both big- and little-endian systems. 
+ const byte_count = (@intCast(usize, ty.bitSize(target)) + 7) / 8; + writeToPackedMemory(val, ty, mod, buffer[0..byte_count], 0); + }, .Struct => switch (ty.containerLayout()) { .Auto => unreachable, // Sema is supposed to have emitted a compile error already .Extern => { @@ -1266,122 +1287,113 @@ pub const Value = extern union { } }, .Packed => { - // TODO allocate enough heap space instead of using this buffer - // on the stack. - var buf: [16]std.math.big.Limb = undefined; - const host_int = packedStructToInt(val, ty, target, &buf); - const abi_size = @intCast(usize, ty.abiSize(target)); - const bit_size = @intCast(usize, ty.bitSize(target)); - host_int.writeTwosComplement(buffer, bit_size, abi_size, target.cpu.arch.endian()); + const byte_count = (@intCast(usize, ty.bitSize(target)) + 7) / 8; + writeToPackedMemory(val, ty, mod, buffer[0..byte_count], 0); }, }, .ErrorSet => { // TODO revisit this when we have the concept of the error tag type const Int = u16; const int = mod.global_error_set.get(val.castTag(.@"error").?.data.name).?; - std.mem.writeInt(Int, buffer[0..@sizeOf(Int)], @intCast(Int, int), target.cpu.arch.endian()); + std.mem.writeInt(Int, buffer[0..@sizeOf(Int)], @intCast(Int, int), endian); }, else => @panic("TODO implement writeToMemory for more types"), } } - fn packedStructToInt(val: Value, ty: Type, target: Target, buf: []std.math.big.Limb) BigIntConst { - var bigint = BigIntMutable.init(buf, 0); - const fields = ty.structFields().values(); - const field_vals = val.castTag(.aggregate).?.data; - var bits: u16 = 0; - // TODO allocate enough heap space instead of using this buffer - // on the stack. - var field_buf: [16]std.math.big.Limb = undefined; - var field_space: BigIntSpace = undefined; - var field_buf2: [16]std.math.big.Limb = undefined; - for (fields) |field, i| { - const field_val = field_vals[i]; - const field_bigint_const = switch (field.ty.zigTypeTag()) { - .Void => continue, - .Float => floatToBigInt(field_val, field.ty, target, &field_buf), - .Int, .Bool => intOrBoolToBigInt(field_val, field.ty, target, &field_buf, &field_space), - .Struct => switch (field.ty.containerLayout()) { - .Auto, .Extern => unreachable, // Sema should have error'd before this. - .Packed => packedStructToInt(field_val, field.ty, target, &field_buf), - }, - .Vector => vectorToBigInt(field_val, field.ty, target, &field_buf), - .Enum => enumToBigInt(field_val, field.ty, target, &field_space), - .Union => unreachable, // TODO: packed structs support packed unions - else => unreachable, - }; - var field_bigint = BigIntMutable.init(&field_buf2, 0); - field_bigint.shiftLeft(field_bigint_const, bits); - bits += @intCast(u16, field.ty.bitSize(target)); - bigint.bitOr(bigint.toConst(), field_bigint.toConst()); - } - return bigint.toConst(); - } - - fn intOrBoolToBigInt(val: Value, ty: Type, target: Target, buf: []std.math.big.Limb, space: *BigIntSpace) BigIntConst { - const big_int_const = val.toBigInt(space, target); - if (big_int_const.positive) return big_int_const; - - var big_int = BigIntMutable.init(buf, 0); - big_int.bitNotWrap(big_int_const.negate(), .unsigned, @intCast(u32, ty.bitSize(target))); - big_int.addScalar(big_int.toConst(), 1); - return big_int.toConst(); - } - - fn vectorToBigInt(val: Value, ty: Type, target: Target, buf: []std.math.big.Limb) BigIntConst { + /// Write a Value's contents to `buffer`. + /// + /// Both the start and the end of the provided buffer must be tight, since + /// big-endian packed memory layouts start at the end of the buffer. 
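A tiny illustration of why the buffer has to be tight: in the big-endian convention used here the packed bits live at the end of the buffer, so the same bit offset addresses different bytes depending on endianness (a sketch using the std.mem helpers from this change):

    const std = @import("std");

    test "packed layout endianness sketch" {
        var le = [_]u8{0} ** 4;
        var be = [_]u8{0} ** 4;
        std.mem.writePackedInt(u8, &le, 0, 0xff, .Little);
        std.mem.writePackedInt(u8, &be, 0, 0xff, .Big);
        // Bit offset 0 is the first byte little-endian, but the last byte big-endian.
        try std.testing.expectEqual(@as(u8, 0xff), le[0]);
        try std.testing.expectEqual(@as(u8, 0xff), be[3]);
    }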
+ pub fn writeToPackedMemory(val: Value, ty: Type, mod: *Module, buffer: []u8, bit_offset: usize) void { + const target = mod.getTarget(); const endian = target.cpu.arch.endian(); - var vec_bitint = BigIntMutable.init(buf, 0); - const vec_len = @intCast(usize, ty.arrayLen()); - const elem_ty = ty.childType(); - const elem_size = @intCast(usize, elem_ty.bitSize(target)); - - var elem_buf: [16]std.math.big.Limb = undefined; - var elem_space: BigIntSpace = undefined; - var elem_buf2: [16]std.math.big.Limb = undefined; - - var elem_i: usize = 0; - while (elem_i < vec_len) : (elem_i += 1) { - const elem_i_target = if (endian == .Big) vec_len - elem_i - 1 else elem_i; - const elem_val = val.indexVectorlike(elem_i_target); - const elem_bigint_const = switch (elem_ty.zigTypeTag()) { - .Int, .Bool => intOrBoolToBigInt(elem_val, elem_ty, target, &elem_buf, &elem_space), - .Float => floatToBigInt(elem_val, elem_ty, target, &elem_buf), - .Pointer => unreachable, // TODO - else => unreachable, // Sema should not let this happen - }; - var elem_bitint = BigIntMutable.init(&elem_buf2, 0); - elem_bitint.shiftLeft(elem_bigint_const, elem_size * elem_i); - vec_bitint.bitOr(vec_bitint.toConst(), elem_bitint.toConst()); + if (val.isUndef()) { + const bit_size = @intCast(usize, ty.bitSize(target)); + std.mem.writeVarPackedInt(buffer, bit_offset, bit_size, @as(u1, 0), endian); + return; } - return vec_bitint.toConst(); - } + switch (ty.zigTypeTag()) { + .Void => {}, + .Bool => { + const byte_index = switch (endian) { + .Little => bit_offset / 8, + .Big => buffer.len - bit_offset / 8 - 1, + }; + if (val.toBool()) { + buffer[byte_index] |= (@as(u8, 1) << @intCast(u3, bit_offset % 8)); + } else { + buffer[byte_index] &= ~(@as(u8, 1) << @intCast(u3, bit_offset % 8)); + } + }, + .Int, .Enum => { + const bits = ty.intInfo(target).bits; + const abi_size = @intCast(usize, ty.abiSize(target)); - fn enumToBigInt(val: Value, ty: Type, target: Target, space: *BigIntSpace) BigIntConst { - var enum_buf: Payload.U64 = undefined; - const int_val = val.enumToInt(ty, &enum_buf); - return int_val.toBigInt(space, target); - } + var enum_buffer: Payload.U64 = undefined; + const int_val = val.enumToInt(ty, &enum_buffer); - fn floatToBigInt(val: Value, ty: Type, target: Target, buf: []std.math.big.Limb) BigIntConst { - return switch (ty.floatBits(target)) { - 16 => bitcastFloatToBigInt(f16, val.toFloat(f16), buf), - 32 => bitcastFloatToBigInt(f32, val.toFloat(f32), buf), - 64 => bitcastFloatToBigInt(f64, val.toFloat(f64), buf), - 80 => bitcastFloatToBigInt(f80, val.toFloat(f80), buf), - 128 => bitcastFloatToBigInt(f128, val.toFloat(f128), buf), - else => unreachable, - }; - } + if (abi_size <= @sizeOf(u64)) { + const int: u64 = switch (int_val.tag()) { + .zero => 0, + .one => 1, + .int_u64 => int_val.castTag(.int_u64).?.data, + .int_i64 => @bitCast(u64, int_val.castTag(.int_i64).?.data), + else => unreachable, + }; + std.mem.writeVarPackedInt(buffer, bit_offset, bits, int, endian); + } else { + var bigint_buffer: BigIntSpace = undefined; + const bigint = int_val.toBigInt(&bigint_buffer, target); + bigint.writePackedTwosComplement(buffer, bit_offset, bits, endian); + } + }, + .Float => switch (ty.floatBits(target)) { + 16 => std.mem.writePackedInt(u16, buffer, bit_offset, @bitCast(u16, val.toFloat(f16)), endian), + 32 => std.mem.writePackedInt(u32, buffer, bit_offset, @bitCast(u32, val.toFloat(f32)), endian), + 64 => std.mem.writePackedInt(u64, buffer, bit_offset, @bitCast(u64, val.toFloat(f64)), endian), + 80 => 
std.mem.writePackedInt(u80, buffer, bit_offset, @bitCast(u80, val.toFloat(f80)), endian), + 128 => std.mem.writePackedInt(u128, buffer, bit_offset, @bitCast(u128, val.toFloat(f128)), endian), + else => unreachable, + }, + .Vector => { + const elem_ty = ty.childType(); + const elem_bit_size = @intCast(u16, elem_ty.bitSize(target)); + const len = @intCast(usize, ty.arrayLen()); - fn bitcastFloatToBigInt(comptime F: type, f: F, buf: []std.math.big.Limb) BigIntConst { - const Int = @Type(.{ .Int = .{ - .signedness = .unsigned, - .bits = @typeInfo(F).Float.bits, - } }); - const int = @bitCast(Int, f); - return BigIntMutable.init(buf, int).toConst(); + var bits: u16 = 0; + var elem_i: usize = 0; + var elem_value_buf: ElemValueBuffer = undefined; + while (elem_i < len) : (elem_i += 1) { + // On big-endian systems, LLVM reverses the element order of vectors by default + const tgt_elem_i = if (endian == .Big) len - elem_i - 1 else elem_i; + const elem_val = val.elemValueBuffer(mod, tgt_elem_i, &elem_value_buf); + elem_val.writeToPackedMemory(elem_ty, mod, buffer, bit_offset + bits); + bits += elem_bit_size; + } + }, + .Struct => switch (ty.containerLayout()) { + .Auto => unreachable, // Sema is supposed to have emitted a compile error already + .Extern => unreachable, // Handled in non-packed writeToMemory + .Packed => { + var bits: u16 = 0; + const fields = ty.structFields().values(); + const field_vals = val.castTag(.aggregate).?.data; + for (fields) |field, i| { + const field_bits = @intCast(u16, field.ty.bitSize(target)); + field_vals[i].writeToPackedMemory(field.ty, mod, buffer, bit_offset + bits); + bits += field_bits; + } + }, + }, + else => @panic("TODO implement writeToPackedMemory for more types"), + } } + /// Load a Value from the contents of `buffer`. + /// + /// Asserts that buffer.len >= ty.abiSize(). The buffer is allowed to extend past + /// the end of the value in memory. 
pub fn readFromMemory( ty: Type, mod: *Module, @@ -1389,6 +1401,7 @@ pub const Value = extern union { arena: Allocator, ) Allocator.Error!Value { const target = mod.getTarget(); + const endian = target.cpu.arch.endian(); switch (ty.zigTypeTag()) { .Void => return Value.@"void", .Bool => { @@ -1398,27 +1411,40 @@ pub const Value = extern union { return Value.@"true"; } }, - .Int => { - if (buffer.len == 0) return Value.zero; + .Int, .Enum => { const int_info = ty.intInfo(target); - const endian = target.cpu.arch.endian(); - const Limb = std.math.big.Limb; - const limb_count = (buffer.len + @sizeOf(Limb) - 1) / @sizeOf(Limb); - const limbs_buffer = try arena.alloc(Limb, limb_count); - const abi_size = @intCast(usize, ty.abiSize(target)); - var bigint = BigIntMutable.init(limbs_buffer, 0); - bigint.readTwosComplement(buffer, int_info.bits, abi_size, endian, int_info.signedness); - return fromBigInt(arena, bigint.toConst()); + const bits = int_info.bits; + const byte_count = (bits + 7) / 8; + if (bits == 0 or buffer.len == 0) return Value.zero; + + if (bits <= 64) switch (int_info.signedness) { // Fast path for integers <= u64 + .signed => { + const val = std.mem.readVarInt(i64, buffer[0..byte_count], endian); + return Value.Tag.int_i64.create(arena, (val << @intCast(u6, 64 - bits)) >> @intCast(u6, 64 - bits)); + }, + .unsigned => { + const val = std.mem.readVarInt(u64, buffer[0..byte_count], endian); + return Value.Tag.int_u64.create(arena, (val << @intCast(u6, 64 - bits)) >> @intCast(u6, 64 - bits)); + }, + } else { // Slow path, we have to construct a big-int + const Limb = std.math.big.Limb; + const limb_count = (byte_count + @sizeOf(Limb) - 1) / @sizeOf(Limb); + const limbs_buffer = try arena.alloc(Limb, limb_count); + + var bigint = BigIntMutable.init(limbs_buffer, 0); + bigint.readTwosComplement(buffer[0..byte_count], bits, endian, int_info.signedness); + return fromBigInt(arena, bigint.toConst()); + } }, .Float => switch (ty.floatBits(target)) { - 16 => return Value.Tag.float_16.create(arena, floatReadFromMemory(f16, target, buffer)), - 32 => return Value.Tag.float_32.create(arena, floatReadFromMemory(f32, target, buffer)), - 64 => return Value.Tag.float_64.create(arena, floatReadFromMemory(f64, target, buffer)), - 80 => return Value.Tag.float_80.create(arena, floatReadFromMemory(f80, target, buffer)), - 128 => return Value.Tag.float_128.create(arena, floatReadFromMemory(f128, target, buffer)), + 16 => return Value.Tag.float_16.create(arena, @bitCast(f16, std.mem.readInt(u16, buffer[0..2], endian))), + 32 => return Value.Tag.float_32.create(arena, @bitCast(f32, std.mem.readInt(u32, buffer[0..4], endian))), + 64 => return Value.Tag.float_64.create(arena, @bitCast(f64, std.mem.readInt(u64, buffer[0..8], endian))), + 80 => return Value.Tag.float_80.create(arena, @bitCast(f80, std.mem.readInt(u80, buffer[0..10], endian))), + 128 => return Value.Tag.float_128.create(arena, @bitCast(f128, std.mem.readInt(u128, buffer[0..16], endian))), else => unreachable, }, - .Array, .Vector => { + .Array => { const elem_ty = ty.childType(); const elem_size = elem_ty.abiSize(target); const elems = try arena.alloc(Value, @intCast(usize, ty.arrayLen())); @@ -1429,6 +1455,12 @@ pub const Value = extern union { } return Tag.aggregate.create(arena, elems); }, + .Vector => { + // We use byte_count instead of abi_size here, so that any padding bytes + // follow the data bytes, on both big- and little-endian systems. 
+ const byte_count = (@intCast(usize, ty.bitSize(target)) + 7) / 8; + return readFromPackedMemory(ty, mod, buffer[0..byte_count], 0, arena); + }, .Struct => switch (ty.containerLayout()) { .Auto => unreachable, // Sema is supposed to have emitted a compile error already .Extern => { @@ -1436,26 +1468,20 @@ pub const Value = extern union { const field_vals = try arena.alloc(Value, fields.len); for (fields) |field, i| { const off = @intCast(usize, ty.structFieldOffset(i, target)); - field_vals[i] = try readFromMemory(field.ty, mod, buffer[off..], arena); + const sz = @intCast(usize, ty.structFieldType(i).abiSize(target)); + field_vals[i] = try readFromMemory(field.ty, mod, buffer[off..(off + sz)], arena); } return Tag.aggregate.create(arena, field_vals); }, .Packed => { - const endian = target.cpu.arch.endian(); - const Limb = std.math.big.Limb; - const abi_size = @intCast(usize, ty.abiSize(target)); - const bit_size = @intCast(usize, ty.bitSize(target)); - const limb_count = (buffer.len + @sizeOf(Limb) - 1) / @sizeOf(Limb); - const limbs_buffer = try arena.alloc(Limb, limb_count); - var bigint = BigIntMutable.init(limbs_buffer, 0); - bigint.readTwosComplement(buffer, bit_size, abi_size, endian, .unsigned); - return intToPackedStruct(ty, target, bigint.toConst(), arena); + const byte_count = (@intCast(usize, ty.bitSize(target)) + 7) / 8; + return readFromPackedMemory(ty, mod, buffer[0..byte_count], 0, arena); }, }, .ErrorSet => { // TODO revisit this when we have the concept of the error tag type const Int = u16; - const int = std.mem.readInt(Int, buffer[0..@sizeOf(Int)], target.cpu.arch.endian()); + const int = std.mem.readInt(Int, buffer[0..@sizeOf(Int)], endian); const payload = try arena.create(Value.Payload.Error); payload.* = .{ @@ -1468,115 +1494,90 @@ pub const Value = extern union { } } - fn intToPackedStruct( + /// Load a Value from the contents of `buffer`. + /// + /// Both the start and the end of the provided buffer must be tight, since + /// big-endian packed memory layouts start at the end of the buffer. 
+ pub fn readFromPackedMemory( ty: Type, - target: Target, - bigint: BigIntConst, + mod: *Module, + buffer: []const u8, + bit_offset: usize, arena: Allocator, ) Allocator.Error!Value { - const limbs_buffer = try arena.alloc(std.math.big.Limb, bigint.limbs.len); - var bigint_mut = bigint.toMutable(limbs_buffer); - const fields = ty.structFields().values(); - const field_vals = try arena.alloc(Value, fields.len); - var bits: u16 = 0; - for (fields) |field, i| { - const field_bits = @intCast(u16, field.ty.bitSize(target)); - bigint_mut.shiftRight(bigint, bits); - bigint_mut.truncate(bigint_mut.toConst(), .unsigned, field_bits); - bits += field_bits; - const field_bigint = bigint_mut.toConst(); - - field_vals[i] = switch (field.ty.zigTypeTag()) { - .Float => switch (field.ty.floatBits(target)) { - 16 => try bitCastBigIntToFloat(f16, .float_16, field_bigint, arena), - 32 => try bitCastBigIntToFloat(f32, .float_32, field_bigint, arena), - 64 => try bitCastBigIntToFloat(f64, .float_64, field_bigint, arena), - 80 => try bitCastBigIntToFloat(f80, .float_80, field_bigint, arena), - 128 => try bitCastBigIntToFloat(f128, .float_128, field_bigint, arena), - else => unreachable, - }, - .Bool => makeBool(!field_bigint.eqZero()), - .Int => try Tag.int_big_positive.create( - arena, - try arena.dupe(std.math.big.Limb, field_bigint.limbs), - ), - .Struct => try intToPackedStruct(field.ty, target, field_bigint, arena), - else => unreachable, - }; - } - return Tag.aggregate.create(arena, field_vals); - } - - fn bitCastBigIntToFloat( - comptime F: type, - comptime float_tag: Tag, - bigint: BigIntConst, - arena: Allocator, - ) !Value { - const Int = @Type(.{ .Int = .{ - .signedness = .unsigned, - .bits = @typeInfo(F).Float.bits, - } }); - const int = bigint.to(Int) catch |err| switch (err) { - error.NegativeIntoUnsigned => unreachable, - error.TargetTooSmall => unreachable, - }; - const f = @bitCast(F, int); - return float_tag.create(arena, f); - } - - fn floatWriteToMemory(comptime F: type, f: F, target: Target, buffer: []u8) void { + const target = mod.getTarget(); const endian = target.cpu.arch.endian(); - if (F == f80) { - const repr = std.math.break_f80(f); - std.mem.writeInt(u64, buffer[0..8], repr.fraction, endian); - std.mem.writeInt(u16, buffer[8..10], repr.exp, endian); - std.mem.set(u8, buffer[10..], 0); - return; - } - const Int = @Type(.{ .Int = .{ - .signedness = .unsigned, - .bits = @typeInfo(F).Float.bits, - } }); - const int = @bitCast(Int, f); - std.mem.writeInt(Int, buffer[0..@sizeOf(Int)], int, endian); - } + switch (ty.zigTypeTag()) { + .Void => return Value.@"void", + .Bool => { + const byte = switch (endian) { + .Big => buffer[buffer.len - bit_offset / 8 - 1], + .Little => buffer[bit_offset / 8], + }; + if (((byte >> @intCast(u3, bit_offset % 8)) & 1) == 0) { + return Value.@"false"; + } else { + return Value.@"true"; + } + }, + .Int, .Enum => { + if (buffer.len == 0) return Value.zero; + const int_info = ty.intInfo(target); + const abi_size = @intCast(usize, ty.abiSize(target)); - fn floatReadFromMemory(comptime F: type, target: Target, buffer: []const u8) F { - const endian = target.cpu.arch.endian(); - if (F == f80) { - return std.math.make_f80(.{ - .fraction = readInt(u64, buffer[0..8], endian), - .exp = readInt(u16, buffer[8..10], endian), - }); - } - const Int = @Type(.{ .Int = .{ - .signedness = .unsigned, - .bits = @typeInfo(F).Float.bits, - } }); - const int = readInt(Int, buffer[0..@sizeOf(Int)], endian); - return @bitCast(F, int); - } - - fn readInt(comptime Int: type, buffer: 
*const [@sizeOf(Int)]u8, endian: std.builtin.Endian) Int { - var result: Int = 0; - switch (endian) { - .Big => { - for (buffer) |byte| { - result <<= 8; - result |= byte; + const bits = int_info.bits; + if (bits <= 64) switch (int_info.signedness) { // Fast path for integers <= u64 + .signed => return Value.Tag.int_i64.create(arena, std.mem.readVarPackedInt(i64, buffer, bit_offset, bits, endian, .signed)), + .unsigned => return Value.Tag.int_u64.create(arena, std.mem.readVarPackedInt(u64, buffer, bit_offset, bits, endian, .unsigned)), + } else { // Slow path, we have to construct a big-int + const Limb = std.math.big.Limb; + const limb_count = (abi_size + @sizeOf(Limb) - 1) / @sizeOf(Limb); + const limbs_buffer = try arena.alloc(Limb, limb_count); + + var bigint = BigIntMutable.init(limbs_buffer, 0); + bigint.readPackedTwosComplement(buffer, bit_offset, bits, endian, int_info.signedness); + return fromBigInt(arena, bigint.toConst()); } }, - .Little => { - var i: usize = buffer.len; - while (i != 0) { - i -= 1; - result <<= 8; - result |= buffer[i]; + .Float => switch (ty.floatBits(target)) { + 16 => return Value.Tag.float_16.create(arena, @bitCast(f16, std.mem.readPackedInt(u16, buffer, bit_offset, endian))), + 32 => return Value.Tag.float_32.create(arena, @bitCast(f32, std.mem.readPackedInt(u32, buffer, bit_offset, endian))), + 64 => return Value.Tag.float_64.create(arena, @bitCast(f64, std.mem.readPackedInt(u64, buffer, bit_offset, endian))), + 80 => return Value.Tag.float_80.create(arena, @bitCast(f80, std.mem.readPackedInt(u80, buffer, bit_offset, endian))), + 128 => return Value.Tag.float_128.create(arena, @bitCast(f128, std.mem.readPackedInt(u128, buffer, bit_offset, endian))), + else => unreachable, + }, + .Vector => { + const elem_ty = ty.childType(); + const elems = try arena.alloc(Value, @intCast(usize, ty.arrayLen())); + + var bits: u16 = 0; + const elem_bit_size = @intCast(u16, elem_ty.bitSize(target)); + for (elems) |_, i| { + // On big-endian systems, LLVM reverses the element order of vectors by default + const tgt_elem_i = if (endian == .Big) elems.len - i - 1 else i; + elems[tgt_elem_i] = try readFromPackedMemory(elem_ty, mod, buffer, bit_offset + bits, arena); + bits += elem_bit_size; } + return Tag.aggregate.create(arena, elems); }, + .Struct => switch (ty.containerLayout()) { + .Auto => unreachable, // Sema is supposed to have emitted a compile error already + .Extern => unreachable, // Handled by non-packed readFromMemory + .Packed => { + var bits: u16 = 0; + const fields = ty.structFields().values(); + const field_vals = try arena.alloc(Value, fields.len); + for (fields) |field, i| { + const field_bits = @intCast(u16, field.ty.bitSize(target)); + field_vals[i] = try readFromPackedMemory(field.ty, mod, buffer, bit_offset + bits, arena); + bits += field_bits; + } + return Tag.aggregate.create(arena, field_vals); + }, + }, + else => @panic("TODO implement readFromPackedMemory for more types"), } - return result; } /// Asserts that the value is a float or an integer. 
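Editor's note: to make the packed layout that writeToPackedMemory and readFromPackedMemory share more concrete, here is a minimal, self-contained sketch, not part of this diff, of the same round trip done directly with the std.mem packed-int helpers this change builds on (writePackedInt/readPackedInt, whose signatures appear in the hunks above). The two-field layout is hypothetical: a u3 at bit offset 0 and a u5 at bit offset 3, mirroring how packed struct fields fill bits in declaration order.

const std = @import("std");
const builtin = @import("builtin");

test "packed bit layout round trip (sketch)" {
    const endian = builtin.cpu.arch.endian();
    // One byte backs the hypothetical `packed struct { a: u3, b: u5 }`.
    var buffer = [_]u8{0};
    // Write each field at its bit offset, as writeToPackedMemory does.
    std.mem.writePackedInt(u3, buffer[0..], 0, 5, endian);
    std.mem.writePackedInt(u5, buffer[0..], 3, 19, endian);
    // Read the fields back at the same offsets, as readFromPackedMemory does.
    try std.testing.expectEqual(@as(u3, 5), std.mem.readPackedInt(u3, buffer[0..], 0, endian));
    try std.testing.expectEqual(@as(u5, 19), std.mem.readPackedInt(u5, buffer[0..], 3, endian));
    // With a single backing byte both endians agree on the result:
    // 0b10011_101, i.e. `b` in bits 3..7 above `a` in bits 0..2.
    try std.testing.expectEqual(@as(u8, 0b10011101), buffer[0]);
}

With more than one backing byte the layouts diverge: big-endian packed values are addressed from the end of the buffer, which is why both functions require the buffer to be tight on both sides.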
diff --git a/test/behavior/bitcast.zig b/test/behavior/bitcast.zig
index c629a1a34b..728e8fe62a 100644
--- a/test/behavior/bitcast.zig
+++ b/test/behavior/bitcast.zig
@@ -63,6 +63,10 @@ fn testBitCast(comptime N: usize) !void {
     try expect(conv_iN(N, 0) == 0);
     try expect(conv_iN(N, -0) == 0);
+
+    if (N > 24) {
+        try expect(conv_uN(N, 0xf23456) == 0xf23456);
+    }
 }
 
 fn conv_iN(comptime N: usize, x: std.meta.Int(.signed, N)) std.meta.Int(.unsigned, N) {
@@ -73,6 +77,55 @@ fn conv_uN(comptime N: usize, x: std.meta.Int(.unsigned, N)) std.meta.Int(.signe
     return @bitCast(std.meta.Int(.signed, N), x);
 }
 
+test "bitcast uX to bytes" {
+    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
+
+    const bit_values = [_]usize{ 1, 48, 27, 512, 493, 293, 125, 204, 112 };
+    inline for (bit_values) |bits| {
+        try testBitCastuXToBytes(bits);
+        comptime try testBitCastuXToBytes(bits);
+    }
+}
+
+fn testBitCastuXToBytes(comptime N: usize) !void {
+
+    // The locations of padding bits in these layouts are technically not defined
+    // by LLVM, but we currently allow exotic integers to be cast (at comptime)
+    // to types that expose their padding bits anyway.
+    //
+    // This test at least makes sure those bits are matched by the runtime behavior
+    // on the platforms we target. If the above behavior is restricted after all,
+    // this test should be deleted.
+
+    const T = std.meta.Int(.unsigned, N);
+    for ([_]T{ 0, ~@as(T, 0) }) |init_value| {
+        var x: T = init_value;
+        const bytes = std.mem.asBytes(&x);
+        const val: u8 = if (init_value == 0) 0x00 else 0xff;
+
+        const byte_count = (N + 7) / 8;
+        switch (builtin.cpu.arch.endian()) {
+            .Little => {
+                var byte_i: usize = 0;
+                while (byte_i < (byte_count - 1)) : (byte_i += 1) {
+                    try expect(bytes[byte_i] == val);
+                }
+                try expect(((bytes[byte_i] ^ val) << -%@truncate(u3, N)) == 0);
+            },
+            .Big => {
+                var byte_i: usize = byte_count - 1;
+                while (byte_i > 0) : (byte_i -= 1) {
+                    try expect(bytes[byte_i] == val);
+                }
+                try expect(((bytes[byte_i] ^ val) << -%@truncate(u3, N)) == 0);
+            },
+        }
+    }
+}
+
 test "nested bitcast" {
     const S = struct {
         fn moo(x: isize) !void {
@@ -283,7 +336,8 @@ test "@bitCast packed struct of floats" {
     comptime try S.doTheTest();
 }
 
-test "comptime @bitCast packed struct to int" {
+test "comptime @bitCast packed struct to int and back" {
+    if (builtin.zig_backend == .stage1) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
@@ -304,6 +358,44 @@ test "comptime @bitCast packed struct to int" {
         vectorf: @Vector(2, f16) = .{ 3.14, 2.71 },
     };
     const Int = @typeInfo(S).Struct.backing_integer.?;
+
+    // S -> Int
     var s: S = .{};
     try expectEqual(@bitCast(Int, s), comptime @bitCast(Int, S{}));
+
+    // Int -> S
+    var i: Int = 0;
+    const rt_cast = @bitCast(S, i);
+    const ct_cast = comptime @bitCast(S, @as(Int, 0));
+    inline for (@typeInfo(S).Struct.fields) |field| {
+        if (@typeInfo(field.field_type) == .Vector)
+            continue; // TODO: https://github.com/ziglang/zig/issues/13201
+
+        try expectEqual(@field(rt_cast, field.name), @field(ct_cast, field.name));
+    }
+}
+
+test "comptime bitcast with fields following f80" {
+    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
+    if
(builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; + if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; + if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; + if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; + + const FloatT = extern struct { f: f80, x: u128 align(16) }; + const x: FloatT = .{ .f = 0.5, .x = 123 }; + var x_as_uint: u256 = comptime @bitCast(u256, x); + + try expect(x.f == @bitCast(FloatT, x_as_uint).f); + try expect(x.x == @bitCast(FloatT, x_as_uint).x); +} + +test "bitcast vector to integer and back" { + if (builtin.zig_backend != .stage1) return error.SkipZigTest; // TODO: https://github.com/ziglang/zig/issues/13220 + if (builtin.zig_backend == .stage1) return error.SkipZigTest; // stage1 gets the comptime cast wrong + + const arr: [16]bool = [_]bool{ true, false } ++ [_]bool{true} ** 14; + var x = @splat(16, true); + x[1] = false; + try expect(@bitCast(u16, x) == comptime @bitCast(u16, @as(@Vector(16, bool), arr))); } |
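Editor's note on the `-%@truncate(u3, N)` shift amount used in the byte checks above: wrapping negation on a u3 computes (8 - N % 8) % 8, the number of padding bits in the value's last partial byte, so shifting the XOR result left by that amount discards exactly the padding positions and keeps only the data bits. A standalone sketch of the idiom, with illustrative widths:

const std = @import("std");

test "wrapping-negate shift amount (sketch)" {
    const bit_widths = [_]usize{ 8, 27, 1, 48 };
    // (8 - N % 8) % 8 padding bits in the last byte for each width.
    const pad_bits = [_]u3{ 0, 5, 7, 0 };
    for (bit_widths) |n, i| {
        try std.testing.expectEqual(pad_bits[i], -%@truncate(u3, n));
    }
}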

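Editor's note: the fast path added to readFromMemory earlier in this diff sign-extends an arbitrary-width integer that arrives zero-extended in an i64 by a pair of shifts: `(val << (64 - bits)) >> (64 - bits)`. A minimal sketch of the trick with an illustrative 5-bit value:

const std = @import("std");

test "shift-pair sign extension (sketch)" {
    // 0b11011 is -5 when read as a 5-bit two's complement value, but it
    // arrives zero-extended in an i64, so the sign bit is not yet in place.
    const bits = 5;
    var val: i64 = 0b11011;
    const shift = @intCast(u6, 64 - bits);
    // The left shift moves bit (bits - 1) into the sign position; the
    // arithmetic right shift then copies it into every higher bit.
    val = (val << shift) >> shift;
    try std.testing.expectEqual(@as(i64, -5), val);
}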