From f6cbe9a9cca3718501272cea177ab1ad48852ffe Mon Sep 17 00:00:00 2001 From: Braedon Date: Wed, 25 Apr 2018 14:59:03 +1000 Subject: Utf8 Encode --- std/unicode.zig | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'std/unicode.zig') diff --git a/std/unicode.zig b/std/unicode.zig index 356df824f0..e8a82e7f04 100644 --- a/std/unicode.zig +++ b/std/unicode.zig @@ -1,6 +1,17 @@ const std = @import("./index.zig"); const debug = std.debug; +// Given a Utf8-Codepoint returns how many (1-4) +// bytes there are if represented as an array of bytes. +pub fn utf8CodepointSequenceLength(c: u32) !u3 { + if (c < 0x80) return u3(1); + if (c < 0x800) return u3(2); + if (c -% 0xd800 < 0x800) return error.InvalidCodepoint; + if (c < 0x10000) return u3(3); + if (c < 0x110000) return u3(4); + return error.CodepointTooLarge; +} + /// Given the first byte of a UTF-8 codepoint, /// returns a number 1-4 indicating the total length of the codepoint in bytes. /// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte. @@ -12,6 +23,47 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { return error.Utf8InvalidStartByte; } +/// Encodes a code point back into utf8 +/// c: the code point +/// out: the out buffer to write to +/// Notes: out has to have a len big enough for the bytes +/// however this limit is dependent on the code point +/// but giving it a minimum of 4 will ensure it will work +/// for all code points. +/// Errors: Will return an error if the code point is invalid. +pub fn utf8Encode(c: u32, out: []u8) !u3 { + if (utf8CodepointSequenceLength(c)) |length| { + debug.assert(out.len >= length); + switch (length) { + 1 => out[0] = u8(c), // Can just do 0 + codepoint for initial range + 2 => { + // 64 to convert the codepoint into its segments + out[0] = u8(0b11000000 + c / 64); + out[1] = u8(0b10000000 + c % 64); + }, + 3 => { + // Again using 64 as a conversion into their segments + // But using C / 4096 (64 * 64) as the first, (C/64) % 64 as the second, and just C % 64 as the last + out[0] = u8(0b11100000 + c / 4096); + out[1] = u8(0b10000000 + (c / 64) % 64); + out[2] = u8(0b10000000 + c % 64); + }, + 4 => { + // Same as previously but now its C / 64^3 (262144), (C / 4096) % 64, (C / 64) % 64 and C % 64 + out[0] = u8(0b11110000 + c / 262144); + out[1] = u8(0b10000000 + (c / 4096) % 64); + out[2] = u8(0b10000000 + (c / 64) % 64); + out[3] = u8(0b10000000 + c % 64); + }, + else => unreachable, + } + + return length; + } else |err| { + return err; + } +} + /// Decodes the UTF-8 codepoint encoded in the given slice of bytes. /// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. /// If you already know the length at comptime, you can call one of @@ -25,6 +77,7 @@ pub fn utf8Decode(bytes: []const u8) !u32 { else => unreachable, }; } + pub fn utf8Decode2(bytes: []const u8) !u32 { debug.assert(bytes.len == 2); debug.assert(bytes[0] & 0b11100000 == 0b11000000); @@ -38,6 +91,7 @@ pub fn utf8Decode2(bytes: []const u8) !u32 { return value; } + pub fn utf8Decode3(bytes: []const u8) !u32 { debug.assert(bytes.len == 3); debug.assert(bytes[0] & 0b11110000 == 0b11100000); @@ -56,6 +110,7 @@ pub fn utf8Decode3(bytes: []const u8) !u32 { return value; } + pub fn utf8Decode4(bytes: []const u8) !u32 { debug.assert(bytes.len == 4); debug.assert(bytes[0] & 0b11111000 == 0b11110000); @@ -170,6 +225,42 @@ const Utf8Iterator = struct { } }; +test "utf8 encode" { + // A few taken from wikipedia a few taken elsewhere + var array: [4]u8 = undefined; + debug.assert((try utf8Encode(try utf8Decode("€"), array[0..])) == 3); + debug.assert(array[0] == 0b11100010); + debug.assert(array[1] == 0b10000010); + debug.assert(array[2] == 0b10101100); + + debug.assert((try utf8Encode(try utf8Decode("$"), array[0..])) == 1); + debug.assert(array[0] == 0b00100100); + + debug.assert((try utf8Encode(try utf8Decode("¢"), array[0..])) == 2); + debug.assert(array[0] == 0b11000010); + debug.assert(array[1] == 0b10100010); + + debug.assert((try utf8Encode(try utf8Decode("𐍈"), array[0..])) == 4); + debug.assert(array[0] == 0b11110000); + debug.assert(array[1] == 0b10010000); + debug.assert(array[2] == 0b10001101); + debug.assert(array[3] == 0b10001000); +} + +test "utf8 encode error" { + var array: [4]u8 = undefined; + testErrorEncode(0xFFFFFF, array[0..], error.CodepointTooLarge); + testErrorEncode(0xd900, array[0..], error.InvalidCodepoint); +} + +fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void { + if (utf8Encode(codePoint, array)) |_| { + unreachable; + } else |err| { + assert(err == expectedErr); + } +} + test "utf8 iterator on ascii" { const s = Utf8View.initComptime("abc"); -- cgit v1.2.3 From 07af6559d8a883dcb21ff99cddb4a836d2bb66bd Mon Sep 17 00:00:00 2001 From: Braedon Date: Wed, 25 Apr 2018 16:26:57 +1000 Subject: Changed to use shifting and masking --- std/unicode.zig | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'std/unicode.zig') diff --git a/std/unicode.zig b/std/unicode.zig index e8a82e7f04..7650f83c83 100644 --- a/std/unicode.zig +++ b/std/unicode.zig @@ -35,25 +35,25 @@ pub fn utf8Encode(c: u32, out: []u8) !u3 { if (utf8CodepointSequenceLength(c)) |length| { debug.assert(out.len >= length); switch (length) { + // The pattern for each is the same + // - Increasing the initial shift by 6 each time + // - Each time after the first shorten the shifted + // value to a max of 0b111111 (63) 1 => out[0] = u8(c), // Can just do 0 + codepoint for initial range 2 => { - // 64 to convert the codepoint into its segments - out[0] = u8(0b11000000 + c / 64); - out[1] = u8(0b10000000 + c % 64); + out[0] = u8(0b11000000 | (c >> 6)); + out[1] = u8(0b10000000 | (c & 0b111111)); }, 3 => { - // Again using 64 as a conversion into their segments - // But using C / 4096 (64 * 64) as the first, (C/64) % 64 as the second, and just C % 64 as the last - out[0] = u8(0b11100000 + c / 4096); - out[1] = u8(0b10000000 + (c / 64) % 64); - out[2] = u8(0b10000000 + c % 64); + out[0] = u8(0b11100000 | (c >> 12)); + out[1] = u8(0b10000000 | ((c >> 6) & 0b111111)); + out[2] = u8(0b10000000 | (c & 0b111111)); }, 4 => { - // Same as previously but now its C / 64^3 (262144), (C / 4096) % 64, (C / 64) % 64 and C % 64 - out[0] = u8(0b11110000 + c / 262144); - out[1] = u8(0b10000000 + (c / 4096) % 64); - out[2] = u8(0b10000000 + (c / 64) % 64); - out[3] = u8(0b10000000 + c % 64); + out[0] = u8(0b11110000 | (c >> 18)); + out[1] = u8(0b10000000 | ((c >> 12) & 0b111111)); + out[2] = u8(0b10000000 | ((c >> 6) & 0b111111)); + out[3] = u8(0b10000000 | (c & 0b111111)); }, else => unreachable, } @@ -257,7 +257,7 @@ fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void { if (utf8Encode(codePoint, array)) |_| { unreachable; } else |err| { - assert(err == expectedErr); + debug.assert(err == expectedErr); } } -- cgit v1.2.3 From 2387292f204c59259fc64d7c960d201e808af5a9 Mon Sep 17 00:00:00 2001 From: Josh Wolfe Date: Sun, 29 Apr 2018 17:28:11 -0400 Subject: move some checks around in utf8Encode logic to be more zig idiomatic --- std/unicode.zig | 79 +++++++++++++++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 42 deletions(-) (limited to 'std/unicode.zig') diff --git a/std/unicode.zig b/std/unicode.zig index 7650f83c83..9548576785 100644 --- a/std/unicode.zig +++ b/std/unicode.zig @@ -1,12 +1,11 @@ const std = @import("./index.zig"); const debug = std.debug; -// Given a Utf8-Codepoint returns how many (1-4) -// bytes there are if represented as an array of bytes. +/// Returns how many bytes the UTF-8 representation would require +/// for the given codepoint. pub fn utf8CodepointSequenceLength(c: u32) !u3 { if (c < 0x80) return u3(1); if (c < 0x800) return u3(2); - if (c -% 0xd800 < 0x800) return error.InvalidCodepoint; if (c < 0x10000) return u3(3); if (c < 0x110000) return u3(4); return error.CodepointTooLarge; @@ -23,45 +22,39 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { return error.Utf8InvalidStartByte; } -/// Encodes a code point back into utf8 -/// c: the code point -/// out: the out buffer to write to -/// Notes: out has to have a len big enough for the bytes -/// however this limit is dependent on the code point -/// but giving it a minimum of 4 will ensure it will work -/// for all code points. -/// Errors: Will return an error if the code point is invalid. +/// Encodes the given codepoint into a UTF-8 byte sequence. +/// c: the codepoint. +/// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c). +/// Errors: if c cannot be encoded in UTF-8. +/// Returns: the number of bytes written to out. pub fn utf8Encode(c: u32, out: []u8) !u3 { - if (utf8CodepointSequenceLength(c)) |length| { - debug.assert(out.len >= length); - switch (length) { - // The pattern for each is the same - // - Increasing the initial shift by 6 each time - // - Each time after the first shorten the shifted - // value to a max of 0b111111 (63) - 1 => out[0] = u8(c), // Can just do 0 + codepoint for initial range - 2 => { - out[0] = u8(0b11000000 | (c >> 6)); - out[1] = u8(0b10000000 | (c & 0b111111)); - }, - 3 => { - out[0] = u8(0b11100000 | (c >> 12)); - out[1] = u8(0b10000000 | ((c >> 6) & 0b111111)); - out[2] = u8(0b10000000 | (c & 0b111111)); - }, - 4 => { - out[0] = u8(0b11110000 | (c >> 18)); - out[1] = u8(0b10000000 | ((c >> 12) & 0b111111)); - out[2] = u8(0b10000000 | ((c >> 6) & 0b111111)); - out[3] = u8(0b10000000 | (c & 0b111111)); - }, - else => unreachable, - } - - return length; - } else |err| { - return err; + const length = try utf8CodepointSequenceLength(c); + debug.assert(out.len >= length); + switch (length) { + // The pattern for each is the same + // - Increasing the initial shift by 6 each time + // - Each time after the first shorten the shifted + // value to a max of 0b111111 (63) + 1 => out[0] = u8(c), // Can just do 0 + codepoint for initial range + 2 => { + out[0] = u8(0b11000000 | (c >> 6)); + out[1] = u8(0b10000000 | (c & 0b111111)); + }, + 3 => { + if (0xd800 <= c and c <= 0xdfff) return error.Utf8CannotEncodeSurrogateHalf; + out[0] = u8(0b11100000 | (c >> 12)); + out[1] = u8(0b10000000 | ((c >> 6) & 0b111111)); + out[2] = u8(0b10000000 | (c & 0b111111)); + }, + 4 => { + out[0] = u8(0b11110000 | (c >> 18)); + out[1] = u8(0b10000000 | ((c >> 12) & 0b111111)); + out[2] = u8(0b10000000 | ((c >> 6) & 0b111111)); + out[3] = u8(0b10000000 | (c & 0b111111)); + }, + else => unreachable, } + return length; } /// Decodes the UTF-8 codepoint encoded in the given slice of bytes. @@ -249,8 +242,10 @@ test "utf8 encode" { test "utf8 encode error" { var array: [4]u8 = undefined; - testErrorEncode(0xFFFFFF, array[0..], error.CodepointTooLarge); - testErrorEncode(0xd900, array[0..], error.InvalidCodepoint); + testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf); + testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf); + testErrorEncode(0x110000, array[0..], error.CodepointTooLarge); + testErrorEncode(0xffffffff, array[0..], error.CodepointTooLarge); } fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void { -- cgit v1.2.3 From 9543c0a7cc0bbdccc405370e224b546c56b76a0f Mon Sep 17 00:00:00 2001 From: Josh Wolfe Date: Sun, 29 Apr 2018 17:38:41 -0400 Subject: use explicit error sets for utf8Decode functions and run unicode tests at comptime also --- std/unicode.zig | 77 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 12 deletions(-) (limited to 'std/unicode.zig') diff --git a/std/unicode.zig b/std/unicode.zig index 9548576785..300e129647 100644 --- a/std/unicode.zig +++ b/std/unicode.zig @@ -57,11 +57,12 @@ pub fn utf8Encode(c: u32, out: []u8) !u3 { return length; } +const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error; /// Decodes the UTF-8 codepoint encoded in the given slice of bytes. /// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. /// If you already know the length at comptime, you can call one of /// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function. -pub fn utf8Decode(bytes: []const u8) !u32 { +pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 { return switch (bytes.len) { 1 => u32(bytes[0]), 2 => utf8Decode2(bytes), @@ -71,7 +72,11 @@ pub fn utf8Decode(bytes: []const u8) !u32 { }; } -pub fn utf8Decode2(bytes: []const u8) !u32 { +const Utf8Decode2Error = error{ + Utf8ExpectedContinuation, + Utf8OverlongEncoding, +}; +pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 { debug.assert(bytes.len == 2); debug.assert(bytes[0] & 0b11100000 == 0b11000000); var value: u32 = bytes[0] & 0b00011111; @@ -85,7 +90,12 @@ pub fn utf8Decode2(bytes: []const u8) !u32 { return value; } -pub fn utf8Decode3(bytes: []const u8) !u32 { +const Utf8Decode3Error = error{ + Utf8ExpectedContinuation, + Utf8OverlongEncoding, + Utf8EncodesSurrogateHalf, +}; +pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 { debug.assert(bytes.len == 3); debug.assert(bytes[0] & 0b11110000 == 0b11100000); var value: u32 = bytes[0] & 0b00001111; @@ -104,7 +114,12 @@ pub fn utf8Decode3(bytes: []const u8) !u32 { return value; } -pub fn utf8Decode4(bytes: []const u8) !u32 { +const Utf8Decode4Error = error{ + Utf8ExpectedContinuation, + Utf8OverlongEncoding, + Utf8CodepointTooLarge, +}; +pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 { debug.assert(bytes.len == 4); debug.assert(bytes[0] & 0b11111000 == 0b11110000); var value: u32 = bytes[0] & 0b00000111; @@ -206,19 +221,21 @@ const Utf8Iterator = struct { pub fn nextCodepoint(it: &Utf8Iterator) ?u32 { const slice = it.nextCodepointSlice() ?? return null; - const r = switch (slice.len) { - 1 => u32(slice[0]), - 2 => utf8Decode2(slice), - 3 => utf8Decode3(slice), - 4 => utf8Decode4(slice), + switch (slice.len) { + 1 => return u32(slice[0]), + 2 => return utf8Decode2(slice) catch unreachable, + 3 => return utf8Decode3(slice) catch unreachable, + 4 => return utf8Decode4(slice) catch unreachable, else => unreachable, - }; - - return r catch unreachable; + } } }; test "utf8 encode" { + comptime testUtf8Encode() catch unreachable; + try testUtf8Encode(); +} +fn testUtf8Encode() !void { // A few taken from wikipedia a few taken elsewhere var array: [4]u8 = undefined; debug.assert((try utf8Encode(try utf8Decode("€"), array[0..])) == 3); @@ -241,6 +258,10 @@ test "utf8 encode" { } test "utf8 encode error" { + comptime testUtf8EncodeError(); + testUtf8EncodeError(); +} +fn testUtf8EncodeError() void { var array: [4]u8 = undefined; testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf); testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf); @@ -257,6 +278,10 @@ fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void { } test "utf8 iterator on ascii" { + comptime testUtf8IteratorOnAscii(); + testUtf8IteratorOnAscii(); +} +fn testUtf8IteratorOnAscii() void { const s = Utf8View.initComptime("abc"); var it1 = s.iterator(); @@ -273,6 +298,10 @@ test "utf8 iterator on ascii" { } test "utf8 view bad" { + comptime testUtf8ViewBad(); + testUtf8ViewBad(); +} +fn testUtf8ViewBad() void { // Compile-time error. // const s3 = Utf8View.initComptime("\xfe\xf2"); @@ -281,6 +310,10 @@ test "utf8 view bad" { } test "utf8 view ok" { + comptime testUtf8ViewOk(); + testUtf8ViewOk(); +} +fn testUtf8ViewOk() void { const s = Utf8View.initComptime("東京市"); var it1 = s.iterator(); @@ -297,6 +330,10 @@ test "utf8 view ok" { } test "bad utf8 slice" { + comptime testBadUtf8Slice(); + testBadUtf8Slice(); +} +fn testBadUtf8Slice() void { debug.assert(utf8ValidateSlice("abc")); debug.assert(!utf8ValidateSlice("abc\xc0")); debug.assert(!utf8ValidateSlice("abc\xc0abc")); @@ -304,6 +341,10 @@ test "bad utf8 slice" { } test "valid utf8" { + comptime testValidUtf8(); + testValidUtf8(); +} +fn testValidUtf8() void { testValid("\x00", 0x0); testValid("\x20", 0x20); testValid("\x7f", 0x7f); @@ -319,6 +360,10 @@ test "valid utf8" { } test "invalid utf8 continuation bytes" { + comptime testInvalidUtf8ContinuationBytes(); + testInvalidUtf8ContinuationBytes(); +} +fn testInvalidUtf8ContinuationBytes() void { // unexpected continuation testError("\x80", error.Utf8InvalidStartByte); testError("\xbf", error.Utf8InvalidStartByte); @@ -347,6 +392,10 @@ test "invalid utf8 continuation bytes" { } test "overlong utf8 codepoint" { + comptime testOverlongUtf8Codepoint(); + testOverlongUtf8Codepoint(); +} +fn testOverlongUtf8Codepoint() void { testError("\xc0\x80", error.Utf8OverlongEncoding); testError("\xc1\xbf", error.Utf8OverlongEncoding); testError("\xe0\x80\x80", error.Utf8OverlongEncoding); @@ -356,6 +405,10 @@ test "overlong utf8 codepoint" { } test "misc invalid utf8" { + comptime testMiscInvalidUtf8(); + testMiscInvalidUtf8(); +} +fn testMiscInvalidUtf8() void { // codepoint out of bounds testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge); testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge); -- cgit v1.2.3