diff options
| author | Josh Wolfe <thejoshwolfe@gmail.com> | 2018-04-29 17:38:41 -0400 |
|---|---|---|
| committer | Josh Wolfe <thejoshwolfe@gmail.com> | 2018-04-29 18:07:18 -0400 |
| commit | 9543c0a7cc0bbdccc405370e224b546c56b76a0f (patch) | |
| tree | 52eaed9efd6fb2b309bbe10a4a7284d0709a6492 /std | |
| parent | c03b9010db55e52dfa227b17e35203b93b5ee1df (diff) | |
| download | zig-9543c0a7cc0bbdccc405370e224b546c56b76a0f.tar.gz zig-9543c0a7cc0bbdccc405370e224b546c56b76a0f.zip | |
use explicit error sets for utf8Decode functions
and run unicode tests at comptime also
Diffstat (limited to 'std')
| -rw-r--r-- | std/unicode.zig | 77 |
1 files changed, 65 insertions, 12 deletions
diff --git a/std/unicode.zig b/std/unicode.zig index 9548576785..300e129647 100644 --- a/std/unicode.zig +++ b/std/unicode.zig @@ -57,11 +57,12 @@ pub fn utf8Encode(c: u32, out: []u8) !u3 { return length; } +const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error; /// Decodes the UTF-8 codepoint encoded in the given slice of bytes. /// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. /// If you already know the length at comptime, you can call one of /// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function. -pub fn utf8Decode(bytes: []const u8) !u32 { +pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 { return switch (bytes.len) { 1 => u32(bytes[0]), 2 => utf8Decode2(bytes), @@ -71,7 +72,11 @@ pub fn utf8Decode(bytes: []const u8) !u32 { }; } -pub fn utf8Decode2(bytes: []const u8) !u32 { +const Utf8Decode2Error = error{ + Utf8ExpectedContinuation, + Utf8OverlongEncoding, +}; +pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 { debug.assert(bytes.len == 2); debug.assert(bytes[0] & 0b11100000 == 0b11000000); var value: u32 = bytes[0] & 0b00011111; @@ -85,7 +90,12 @@ pub fn utf8Decode2(bytes: []const u8) !u32 { return value; } -pub fn utf8Decode3(bytes: []const u8) !u32 { +const Utf8Decode3Error = error{ + Utf8ExpectedContinuation, + Utf8OverlongEncoding, + Utf8EncodesSurrogateHalf, +}; +pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 { debug.assert(bytes.len == 3); debug.assert(bytes[0] & 0b11110000 == 0b11100000); var value: u32 = bytes[0] & 0b00001111; @@ -104,7 +114,12 @@ pub fn utf8Decode3(bytes: []const u8) !u32 { return value; } -pub fn utf8Decode4(bytes: []const u8) !u32 { +const Utf8Decode4Error = error{ + Utf8ExpectedContinuation, + Utf8OverlongEncoding, + Utf8CodepointTooLarge, +}; +pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 { debug.assert(bytes.len == 4); debug.assert(bytes[0] & 0b11111000 == 0b11110000); var value: u32 = bytes[0] & 0b00000111; @@ -206,19 +221,21 @@ const Utf8Iterator = struct { pub fn nextCodepoint(it: &Utf8Iterator) ?u32 { const slice = it.nextCodepointSlice() ?? return null; - const r = switch (slice.len) { - 1 => u32(slice[0]), - 2 => utf8Decode2(slice), - 3 => utf8Decode3(slice), - 4 => utf8Decode4(slice), + switch (slice.len) { + 1 => return u32(slice[0]), + 2 => return utf8Decode2(slice) catch unreachable, + 3 => return utf8Decode3(slice) catch unreachable, + 4 => return utf8Decode4(slice) catch unreachable, else => unreachable, - }; - - return r catch unreachable; + } } }; test "utf8 encode" { + comptime testUtf8Encode() catch unreachable; + try testUtf8Encode(); +} +fn testUtf8Encode() !void { // A few taken from wikipedia a few taken elsewhere var array: [4]u8 = undefined; debug.assert((try utf8Encode(try utf8Decode("€"), array[0..])) == 3); @@ -241,6 +258,10 @@ test "utf8 encode" { } test "utf8 encode error" { + comptime testUtf8EncodeError(); + testUtf8EncodeError(); +} +fn testUtf8EncodeError() void { var array: [4]u8 = undefined; testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf); testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf); @@ -257,6 +278,10 @@ fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void { } test "utf8 iterator on ascii" { + comptime testUtf8IteratorOnAscii(); + testUtf8IteratorOnAscii(); +} +fn testUtf8IteratorOnAscii() void { const s = Utf8View.initComptime("abc"); var it1 = s.iterator(); @@ -273,6 +298,10 @@ test "utf8 iterator on ascii" { } test "utf8 view bad" { + comptime testUtf8ViewBad(); + testUtf8ViewBad(); +} +fn testUtf8ViewBad() void { // Compile-time error. // const s3 = Utf8View.initComptime("\xfe\xf2"); @@ -281,6 +310,10 @@ test "utf8 view bad" { } test "utf8 view ok" { + comptime testUtf8ViewOk(); + testUtf8ViewOk(); +} +fn testUtf8ViewOk() void { const s = Utf8View.initComptime("東京市"); var it1 = s.iterator(); @@ -297,6 +330,10 @@ test "utf8 view ok" { } test "bad utf8 slice" { + comptime testBadUtf8Slice(); + testBadUtf8Slice(); +} +fn testBadUtf8Slice() void { debug.assert(utf8ValidateSlice("abc")); debug.assert(!utf8ValidateSlice("abc\xc0")); debug.assert(!utf8ValidateSlice("abc\xc0abc")); @@ -304,6 +341,10 @@ test "bad utf8 slice" { } test "valid utf8" { + comptime testValidUtf8(); + testValidUtf8(); +} +fn testValidUtf8() void { testValid("\x00", 0x0); testValid("\x20", 0x20); testValid("\x7f", 0x7f); @@ -319,6 +360,10 @@ test "valid utf8" { } test "invalid utf8 continuation bytes" { + comptime testInvalidUtf8ContinuationBytes(); + testInvalidUtf8ContinuationBytes(); +} +fn testInvalidUtf8ContinuationBytes() void { // unexpected continuation testError("\x80", error.Utf8InvalidStartByte); testError("\xbf", error.Utf8InvalidStartByte); @@ -347,6 +392,10 @@ test "invalid utf8 continuation bytes" { } test "overlong utf8 codepoint" { + comptime testOverlongUtf8Codepoint(); + testOverlongUtf8Codepoint(); +} +fn testOverlongUtf8Codepoint() void { testError("\xc0\x80", error.Utf8OverlongEncoding); testError("\xc1\xbf", error.Utf8OverlongEncoding); testError("\xe0\x80\x80", error.Utf8OverlongEncoding); @@ -356,6 +405,10 @@ test "overlong utf8 codepoint" { } test "misc invalid utf8" { + comptime testMiscInvalidUtf8(); + testMiscInvalidUtf8(); +} +fn testMiscInvalidUtf8() void { // codepoint out of bounds testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge); testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge); |
