diff options
Diffstat (limited to 'std/unicode.zig')
| -rw-r--r-- | std/unicode.zig | 163 |
1 files changed, 151 insertions, 12 deletions
diff --git a/std/unicode.zig b/std/unicode.zig index 356df824f0..300e129647 100644 --- a/std/unicode.zig +++ b/std/unicode.zig @@ -1,6 +1,16 @@ const std = @import("./index.zig"); const debug = std.debug; +/// Returns how many bytes the UTF-8 representation would require +/// for the given codepoint. +pub fn utf8CodepointSequenceLength(c: u32) !u3 { + if (c < 0x80) return u3(1); + if (c < 0x800) return u3(2); + if (c < 0x10000) return u3(3); + if (c < 0x110000) return u3(4); + return error.CodepointTooLarge; +} + /// Given the first byte of a UTF-8 codepoint, /// returns a number 1-4 indicating the total length of the codepoint in bytes. /// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte. @@ -12,11 +22,47 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { return error.Utf8InvalidStartByte; } +/// Encodes the given codepoint into a UTF-8 byte sequence. +/// c: the codepoint. +/// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c). +/// Errors: if c cannot be encoded in UTF-8. +/// Returns: the number of bytes written to out. +pub fn utf8Encode(c: u32, out: []u8) !u3 { + const length = try utf8CodepointSequenceLength(c); + debug.assert(out.len >= length); + switch (length) { + // The pattern for each is the same + // - Increasing the initial shift by 6 each time + // - Each time after the first shorten the shifted + // value to a max of 0b111111 (63) + 1 => out[0] = u8(c), // Can just do 0 + codepoint for initial range + 2 => { + out[0] = u8(0b11000000 | (c >> 6)); + out[1] = u8(0b10000000 | (c & 0b111111)); + }, + 3 => { + if (0xd800 <= c and c <= 0xdfff) return error.Utf8CannotEncodeSurrogateHalf; + out[0] = u8(0b11100000 | (c >> 12)); + out[1] = u8(0b10000000 | ((c >> 6) & 0b111111)); + out[2] = u8(0b10000000 | (c & 0b111111)); + }, + 4 => { + out[0] = u8(0b11110000 | (c >> 18)); + out[1] = u8(0b10000000 | ((c >> 12) & 0b111111)); + out[2] = u8(0b10000000 | ((c >> 6) & 0b111111)); + out[3] = u8(0b10000000 | (c & 0b111111)); + }, + else => unreachable, + } + return length; +} + +const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error; /// Decodes the UTF-8 codepoint encoded in the given slice of bytes. /// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. /// If you already know the length at comptime, you can call one of /// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function. -pub fn utf8Decode(bytes: []const u8) !u32 { +pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 { return switch (bytes.len) { 1 => u32(bytes[0]), 2 => utf8Decode2(bytes), @@ -25,7 +71,12 @@ pub fn utf8Decode(bytes: []const u8) !u32 { else => unreachable, }; } -pub fn utf8Decode2(bytes: []const u8) !u32 { + +const Utf8Decode2Error = error{ + Utf8ExpectedContinuation, + Utf8OverlongEncoding, +}; +pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 { debug.assert(bytes.len == 2); debug.assert(bytes[0] & 0b11100000 == 0b11000000); var value: u32 = bytes[0] & 0b00011111; @@ -38,7 +89,13 @@ pub fn utf8Decode2(bytes: []const u8) !u32 { return value; } -pub fn utf8Decode3(bytes: []const u8) !u32 { + +const Utf8Decode3Error = error{ + Utf8ExpectedContinuation, + Utf8OverlongEncoding, + Utf8EncodesSurrogateHalf, +}; +pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 { debug.assert(bytes.len == 3); debug.assert(bytes[0] & 0b11110000 == 0b11100000); var value: u32 = bytes[0] & 0b00001111; @@ -56,7 +113,13 @@ pub fn utf8Decode3(bytes: []const u8) !u32 { return value; } -pub fn utf8Decode4(bytes: []const u8) !u32 { + +const Utf8Decode4Error = error{ + Utf8ExpectedContinuation, + Utf8OverlongEncoding, + Utf8CodepointTooLarge, +}; +pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 { debug.assert(bytes.len == 4); debug.assert(bytes[0] & 0b11111000 == 0b11110000); var value: u32 = bytes[0] & 0b00000111; @@ -158,19 +221,67 @@ const Utf8Iterator = struct { pub fn nextCodepoint(it: &Utf8Iterator) ?u32 { const slice = it.nextCodepointSlice() ?? return null; - const r = switch (slice.len) { - 1 => u32(slice[0]), - 2 => utf8Decode2(slice), - 3 => utf8Decode3(slice), - 4 => utf8Decode4(slice), + switch (slice.len) { + 1 => return u32(slice[0]), + 2 => return utf8Decode2(slice) catch unreachable, + 3 => return utf8Decode3(slice) catch unreachable, + 4 => return utf8Decode4(slice) catch unreachable, else => unreachable, - }; - - return r catch unreachable; + } } }; +test "utf8 encode" { + comptime testUtf8Encode() catch unreachable; + try testUtf8Encode(); +} +fn testUtf8Encode() !void { + // A few taken from wikipedia a few taken elsewhere + var array: [4]u8 = undefined; + debug.assert((try utf8Encode(try utf8Decode("€"), array[0..])) == 3); + debug.assert(array[0] == 0b11100010); + debug.assert(array[1] == 0b10000010); + debug.assert(array[2] == 0b10101100); + + debug.assert((try utf8Encode(try utf8Decode("$"), array[0..])) == 1); + debug.assert(array[0] == 0b00100100); + + debug.assert((try utf8Encode(try utf8Decode("¢"), array[0..])) == 2); + debug.assert(array[0] == 0b11000010); + debug.assert(array[1] == 0b10100010); + + debug.assert((try utf8Encode(try utf8Decode("𐍈"), array[0..])) == 4); + debug.assert(array[0] == 0b11110000); + debug.assert(array[1] == 0b10010000); + debug.assert(array[2] == 0b10001101); + debug.assert(array[3] == 0b10001000); +} + +test "utf8 encode error" { + comptime testUtf8EncodeError(); + testUtf8EncodeError(); +} +fn testUtf8EncodeError() void { + var array: [4]u8 = undefined; + testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf); + testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf); + testErrorEncode(0x110000, array[0..], error.CodepointTooLarge); + testErrorEncode(0xffffffff, array[0..], error.CodepointTooLarge); +} + +fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void { + if (utf8Encode(codePoint, array)) |_| { + unreachable; + } else |err| { + debug.assert(err == expectedErr); + } +} + test "utf8 iterator on ascii" { + comptime testUtf8IteratorOnAscii(); + testUtf8IteratorOnAscii(); +} +fn testUtf8IteratorOnAscii() void { const s = Utf8View.initComptime("abc"); var it1 = s.iterator(); @@ -187,6 +298,10 @@ test "utf8 iterator on ascii" { } test "utf8 view bad" { + comptime testUtf8ViewBad(); + testUtf8ViewBad(); +} +fn testUtf8ViewBad() void { // Compile-time error. // const s3 = Utf8View.initComptime("\xfe\xf2"); @@ -195,6 +310,10 @@ test "utf8 view bad" { } test "utf8 view ok" { + comptime testUtf8ViewOk(); + testUtf8ViewOk(); +} +fn testUtf8ViewOk() void { const s = Utf8View.initComptime("東京市"); var it1 = s.iterator(); @@ -211,6 +330,10 @@ test "utf8 view ok" { } test "bad utf8 slice" { + comptime testBadUtf8Slice(); + testBadUtf8Slice(); +} +fn testBadUtf8Slice() void { debug.assert(utf8ValidateSlice("abc")); debug.assert(!utf8ValidateSlice("abc\xc0")); debug.assert(!utf8ValidateSlice("abc\xc0abc")); @@ -218,6 +341,10 @@ test "bad utf8 slice" { } test "valid utf8" { + comptime testValidUtf8(); + testValidUtf8(); +} +fn testValidUtf8() void { testValid("\x00", 0x0); testValid("\x20", 0x20); testValid("\x7f", 0x7f); @@ -233,6 +360,10 @@ test "valid utf8" { } test "invalid utf8 continuation bytes" { + comptime testInvalidUtf8ContinuationBytes(); + testInvalidUtf8ContinuationBytes(); +} +fn testInvalidUtf8ContinuationBytes() void { // unexpected continuation testError("\x80", error.Utf8InvalidStartByte); testError("\xbf", error.Utf8InvalidStartByte); @@ -261,6 +392,10 @@ test "invalid utf8 continuation bytes" { } test "overlong utf8 codepoint" { + comptime testOverlongUtf8Codepoint(); + testOverlongUtf8Codepoint(); +} +fn testOverlongUtf8Codepoint() void { testError("\xc0\x80", error.Utf8OverlongEncoding); testError("\xc1\xbf", error.Utf8OverlongEncoding); testError("\xe0\x80\x80", error.Utf8OverlongEncoding); @@ -270,6 +405,10 @@ test "overlong utf8 codepoint" { } test "misc invalid utf8" { + comptime testMiscInvalidUtf8(); + testMiscInvalidUtf8(); +} +fn testMiscInvalidUtf8() void { // codepoint out of bounds testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge); testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge); |
