From f6cbe9a9cca3718501272cea177ab1ad48852ffe Mon Sep 17 00:00:00 2001
From: Braedon <Braedonww@gmail.com>
Date: Wed, 25 Apr 2018 14:59:03 +1000
Subject: Utf8 Encode

---
 std/unicode.zig | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

(limited to 'std/unicode.zig')

diff --git a/std/unicode.zig b/std/unicode.zig
index 356df824f0..e8a82e7f04 100644
--- a/std/unicode.zig
+++ b/std/unicode.zig
@@ -1,6 +1,17 @@
 const std = @import("./index.zig");
 const debug = std.debug;
 
+// Given a Utf8-Codepoint returns how many (1-4)
+// bytes there are if represented as an array of bytes.
+pub fn utf8CodepointSequenceLength(c: u32) !u3 {
+    if (c < 0x80) return u3(1);
+    if (c < 0x800) return u3(2);
+    if (c -% 0xd800 < 0x800) return error.InvalidCodepoint;
+    if (c < 0x10000) return u3(3);
+    if (c < 0x110000) return u3(4);
+    return error.CodepointTooLarge;
+}
+
 /// Given the first byte of a UTF-8 codepoint,
 /// returns a number 1-4 indicating the total length of the codepoint in bytes.
 /// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
@@ -12,6 +23,47 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
     return error.Utf8InvalidStartByte;
 }
 
+/// Encodes a code point back into utf8
+/// c: the code point
+/// out: the out buffer to write to
+/// Notes: out has to have a len big enough for the bytes
+///        however this limit is dependent on the code point
+///        but giving it a minimum of 4 will ensure it will work
+///        for all code points.
+/// Errors: Will return an error if the code point is invalid.
+pub fn utf8Encode(c: u32, out: []u8) !u3 {
+    if (utf8CodepointSequenceLength(c)) |length| {
+        debug.assert(out.len >= length);
+        switch (length) {
+            1 => out[0] = u8(c), // Can just do 0 + codepoint for initial range
+            2 => {
+                // 64 to convert the codepoint into its segments
+                out[0] = u8(0b11000000 + c / 64);
+                out[1] = u8(0b10000000 + c % 64);
+            },
+            3 => {
+                // Again using 64 as a conversion into their segments
+                // But using C / 4096 (64 * 64) as the first, (C/64) % 64 as the second, and just C % 64 as the last
+                out[0] = u8(0b11100000 + c / 4096);
+                out[1] = u8(0b10000000 + (c / 64) % 64);
+                out[2] = u8(0b10000000 + c % 64);
+            },
+            4 => {
+                // Same as previously but now its C / 64^3 (262144), (C / 4096) % 64, (C / 64) % 64 and C % 64
+                out[0] = u8(0b11110000 + c / 262144);
+                out[1] = u8(0b10000000 + (c / 4096) % 64);
+                out[2] = u8(0b10000000 + (c / 64) % 64);
+                out[3] = u8(0b10000000 + c % 64);
+            },
+            else => unreachable,
+        }
+
+        return length;
+    } else |err| {
+        return err;
+    }
+}
+
 /// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
 /// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
 /// If you already know the length at comptime, you can call one of
@@ -25,6 +77,7 @@ pub fn utf8Decode(bytes: []const u8) !u32 {
         else => unreachable,
     };
 }
+
 pub fn utf8Decode2(bytes: []const u8) !u32 {
     debug.assert(bytes.len == 2);
     debug.assert(bytes[0] & 0b11100000 == 0b11000000);
@@ -38,6 +91,7 @@ pub fn utf8Decode2(bytes: []const u8) !u32 {
 
     return value;
 }
+
 pub fn utf8Decode3(bytes: []const u8) !u32 {
     debug.assert(bytes.len == 3);
     debug.assert(bytes[0] & 0b11110000 == 0b11100000);
@@ -56,6 +110,7 @@ pub fn utf8Decode3(bytes: []const u8) !u32 {
 
     return value;
 }
+
 pub fn utf8Decode4(bytes: []const u8) !u32 {
     debug.assert(bytes.len == 4);
     debug.assert(bytes[0] & 0b11111000 == 0b11110000);
@@ -170,6 +225,42 @@ const Utf8Iterator = struct {
     }
 };
 
+test "utf8 encode" {
+    // A few taken from wikipedia a few taken elsewhere
+    var array: [4]u8 = undefined;
+    debug.assert((try utf8Encode(try utf8Decode("€"), array[0..])) == 3);
+    debug.assert(array[0] == 0b11100010);
+    debug.assert(array[1] == 0b10000010);
+    debug.assert(array[2] == 0b10101100);
+
+    debug.assert((try utf8Encode(try utf8Decode("$"), array[0..])) == 1);
+    debug.assert(array[0] == 0b00100100);
+
+    debug.assert((try utf8Encode(try utf8Decode("¢"), array[0..])) == 2);
+    debug.assert(array[0] == 0b11000010);
+    debug.assert(array[1] == 0b10100010);
+
+    debug.assert((try utf8Encode(try utf8Decode("𐍈"), array[0..])) == 4);
+    debug.assert(array[0] == 0b11110000);
+    debug.assert(array[1] == 0b10010000);
+    debug.assert(array[2] == 0b10001101);
+    debug.assert(array[3] == 0b10001000);
+}
+
+test "utf8 encode error" {
+    var array: [4]u8 = undefined;
+    testErrorEncode(0xFFFFFF, array[0..], error.CodepointTooLarge);
+    testErrorEncode(0xd900, array[0..], error.InvalidCodepoint);
+}
+
+fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void {
+    if (utf8Encode(codePoint, array)) |_| {
+        unreachable;
+    } else |err| {
+        assert(err == expectedErr);
+    }
+}
+
 test "utf8 iterator on ascii" {
     const s = Utf8View.initComptime("abc");
 
-- 
cgit v1.2.3


From 07af6559d8a883dcb21ff99cddb4a836d2bb66bd Mon Sep 17 00:00:00 2001
From: Braedon <Braedonww@gmail.com>
Date: Wed, 25 Apr 2018 16:26:57 +1000
Subject: Changed to use shifting and masking

---
 std/unicode.zig | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'std/unicode.zig')

diff --git a/std/unicode.zig b/std/unicode.zig
index e8a82e7f04..7650f83c83 100644
--- a/std/unicode.zig
+++ b/std/unicode.zig
@@ -35,25 +35,25 @@ pub fn utf8Encode(c: u32, out: []u8) !u3 {
     if (utf8CodepointSequenceLength(c)) |length| {
         debug.assert(out.len >= length);
         switch (length) {
+            // The pattern for each is the same
+            // - Increasing the initial shift by 6 each time
+            // - Each time after the first shorten the shifted
+            //   value to a max of 0b111111 (63)
             1 => out[0] = u8(c), // Can just do 0 + codepoint for initial range
             2 => {
-                // 64 to convert the codepoint into its segments
-                out[0] = u8(0b11000000 + c / 64);
-                out[1] = u8(0b10000000 + c % 64);
+                out[0] = u8(0b11000000 | (c >> 6));
+                out[1] = u8(0b10000000 | (c & 0b111111));
             },
             3 => {
-                // Again using 64 as a conversion into their segments
-                // But using C / 4096 (64 * 64) as the first, (C/64) % 64 as the second, and just C % 64 as the last
-                out[0] = u8(0b11100000 + c / 4096);
-                out[1] = u8(0b10000000 + (c / 64) % 64);
-                out[2] = u8(0b10000000 + c % 64);
+                out[0] = u8(0b11100000 | (c >> 12));
+                out[1] = u8(0b10000000 | ((c >> 6) & 0b111111));
+                out[2] = u8(0b10000000 | (c & 0b111111));
             },
             4 => {
-                // Same as previously but now its C / 64^3 (262144), (C / 4096) % 64, (C / 64) % 64 and C % 64
-                out[0] = u8(0b11110000 + c / 262144);
-                out[1] = u8(0b10000000 + (c / 4096) % 64);
-                out[2] = u8(0b10000000 + (c / 64) % 64);
-                out[3] = u8(0b10000000 + c % 64);
+                out[0] = u8(0b11110000 | (c >> 18));
+                out[1] = u8(0b10000000 | ((c >> 12) & 0b111111));
+                out[2] = u8(0b10000000 | ((c >> 6) & 0b111111));
+                out[3] = u8(0b10000000 | (c & 0b111111));
             },
             else => unreachable,
         }
@@ -257,7 +257,7 @@ fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void {
     if (utf8Encode(codePoint, array)) |_| {
         unreachable;
     } else |err| {
-        assert(err == expectedErr);
+        debug.assert(err == expectedErr);
     }
 }
 
-- 
cgit v1.2.3


From 2387292f204c59259fc64d7c960d201e808af5a9 Mon Sep 17 00:00:00 2001
From: Josh Wolfe <thejoshwolfe@gmail.com>
Date: Sun, 29 Apr 2018 17:28:11 -0400
Subject: move some checks around in utf8Encode logic to be more zig idiomatic

---
 std/unicode.zig | 79 +++++++++++++++++++++++++++------------------------------
 1 file changed, 37 insertions(+), 42 deletions(-)

(limited to 'std/unicode.zig')

diff --git a/std/unicode.zig b/std/unicode.zig
index 7650f83c83..9548576785 100644
--- a/std/unicode.zig
+++ b/std/unicode.zig
@@ -1,12 +1,11 @@
 const std = @import("./index.zig");
 const debug = std.debug;
 
-// Given a Utf8-Codepoint returns how many (1-4)
-// bytes there are if represented as an array of bytes.
+/// Returns how many bytes (1-4) the UTF-8 representation would require
+/// for the given codepoint. Errors above 0x10ffff; surrogate halves are not rejected here.
 pub fn utf8CodepointSequenceLength(c: u32) !u3 {
     if (c < 0x80) return u3(1);
     if (c < 0x800) return u3(2);
-    if (c -% 0xd800 < 0x800) return error.InvalidCodepoint;
     if (c < 0x10000) return u3(3);
     if (c < 0x110000) return u3(4);
     return error.CodepointTooLarge;
@@ -23,45 +22,39 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
     return error.Utf8InvalidStartByte;
 }
 
-/// Encodes a code point back into utf8
-/// c: the code point
-/// out: the out buffer to write to
-/// Notes: out has to have a len big enough for the bytes
-///        however this limit is dependent on the code point
-///        but giving it a minimum of 4 will ensure it will work
-///        for all code points.
-/// Errors: Will return an error if the code point is invalid.
+/// Encodes the given codepoint into a UTF-8 byte sequence.
+/// c: the codepoint.
+/// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c).
+/// Errors: CodepointTooLarge if c > 0x10ffff, Utf8CannotEncodeSurrogateHalf if c is 0xd800 through 0xdfff.
+/// Returns: the number of bytes written to out.
 pub fn utf8Encode(c: u32, out: []u8) !u3 {
-    if (utf8CodepointSequenceLength(c)) |length| {
-        debug.assert(out.len >= length);
-        switch (length) {
-            // The pattern for each is the same
-            // - Increasing the initial shift by 6 each time
-            // - Each time after the first shorten the shifted
-            //   value to a max of 0b111111 (63)
-            1 => out[0] = u8(c), // Can just do 0 + codepoint for initial range
-            2 => {
-                out[0] = u8(0b11000000 | (c >> 6));
-                out[1] = u8(0b10000000 | (c & 0b111111));
-            },
-            3 => {
-                out[0] = u8(0b11100000 | (c >> 12));
-                out[1] = u8(0b10000000 | ((c >> 6) & 0b111111));
-                out[2] = u8(0b10000000 | (c & 0b111111));
-            },
-            4 => {
-                out[0] = u8(0b11110000 | (c >> 18));
-                out[1] = u8(0b10000000 | ((c >> 12) & 0b111111));
-                out[2] = u8(0b10000000 | ((c >> 6) & 0b111111));
-                out[3] = u8(0b10000000 | (c & 0b111111));
-            },
-            else => unreachable,
-        }
-
-        return length;
-    } else |err| {
-        return err;
+    const length = try utf8CodepointSequenceLength(c);
+    debug.assert(out.len >= length);
+    switch (length) {
+        // Every multi-byte case follows the same pattern:
+        // - the lead byte's shift grows by 6 per extra byte (6, 12, 18)
+        // - every byte after the lead masks its shifted value
+        //   down to the low 6 bits (0b111111)
+        1 => out[0] = u8(c), // Can just do 0 + codepoint for initial range
+        2 => {
+            out[0] = u8(0b11000000 | (c >> 6));
+            out[1] = u8(0b10000000 | (c & 0b111111));
+        },
+        3 => {
+            if (0xd800 <= c and c <= 0xdfff) return error.Utf8CannotEncodeSurrogateHalf;
+            out[0] = u8(0b11100000 | (c >> 12));
+            out[1] = u8(0b10000000 | ((c >> 6) & 0b111111));
+            out[2] = u8(0b10000000 | (c & 0b111111));
+        },
+        4 => {
+            out[0] = u8(0b11110000 | (c >> 18));
+            out[1] = u8(0b10000000 | ((c >> 12) & 0b111111));
+            out[2] = u8(0b10000000 | ((c >> 6) & 0b111111));
+            out[3] = u8(0b10000000 | (c & 0b111111));
+        },
+        else => unreachable,
     }
+    return length;
 }
 
 /// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
@@ -249,8 +242,10 @@ test "utf8 encode" {
 
 test "utf8 encode error" {
     var array: [4]u8 = undefined;
-    testErrorEncode(0xFFFFFF, array[0..], error.CodepointTooLarge);
-    testErrorEncode(0xd900, array[0..], error.InvalidCodepoint);
+    testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf);
+    testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf);
+    testErrorEncode(0x110000, array[0..], error.CodepointTooLarge);
+    testErrorEncode(0xffffffff, array[0..], error.CodepointTooLarge);
 }
 
 fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void {
-- 
cgit v1.2.3


From 9543c0a7cc0bbdccc405370e224b546c56b76a0f Mon Sep 17 00:00:00 2001
From: Josh Wolfe <thejoshwolfe@gmail.com>
Date: Sun, 29 Apr 2018 17:38:41 -0400
Subject: use explicit error sets for utf8Decode functions

and run unicode tests at comptime also
---
 std/unicode.zig | 77 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 65 insertions(+), 12 deletions(-)

(limited to 'std/unicode.zig')

diff --git a/std/unicode.zig b/std/unicode.zig
index 9548576785..300e129647 100644
--- a/std/unicode.zig
+++ b/std/unicode.zig
@@ -57,11 +57,12 @@ pub fn utf8Encode(c: u32, out: []u8) !u3 {
     return length;
 }
 
+const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;
 /// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
 /// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
 /// If you already know the length at comptime, you can call one of
 /// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
-pub fn utf8Decode(bytes: []const u8) !u32 {
+pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 {
     return switch (bytes.len) {
         1 => u32(bytes[0]),
         2 => utf8Decode2(bytes),
@@ -71,7 +72,11 @@ pub fn utf8Decode(bytes: []const u8) !u32 {
     };
 }
 
-pub fn utf8Decode2(bytes: []const u8) !u32 {
+const Utf8Decode2Error = error{
+    Utf8ExpectedContinuation,
+    Utf8OverlongEncoding,
+};
+pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 {
     debug.assert(bytes.len == 2);
     debug.assert(bytes[0] & 0b11100000 == 0b11000000);
     var value: u32 = bytes[0] & 0b00011111;
@@ -85,7 +90,12 @@ pub fn utf8Decode2(bytes: []const u8) !u32 {
     return value;
 }
 
-pub fn utf8Decode3(bytes: []const u8) !u32 {
+const Utf8Decode3Error = error{
+    Utf8ExpectedContinuation,
+    Utf8OverlongEncoding,
+    Utf8EncodesSurrogateHalf,
+};
+pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 {
     debug.assert(bytes.len == 3);
     debug.assert(bytes[0] & 0b11110000 == 0b11100000);
     var value: u32 = bytes[0] & 0b00001111;
@@ -104,7 +114,12 @@ pub fn utf8Decode3(bytes: []const u8) !u32 {
     return value;
 }
 
-pub fn utf8Decode4(bytes: []const u8) !u32 {
+const Utf8Decode4Error = error{
+    Utf8ExpectedContinuation,
+    Utf8OverlongEncoding,
+    Utf8CodepointTooLarge,
+};
+pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 {
     debug.assert(bytes.len == 4);
     debug.assert(bytes[0] & 0b11111000 == 0b11110000);
     var value: u32 = bytes[0] & 0b00000111;
@@ -206,19 +221,21 @@ const Utf8Iterator = struct {
     pub fn nextCodepoint(it: &Utf8Iterator) ?u32 {
         const slice = it.nextCodepointSlice() ?? return null;
 
-        const r = switch (slice.len) {
-            1 => u32(slice[0]),
-            2 => utf8Decode2(slice),
-            3 => utf8Decode3(slice),
-            4 => utf8Decode4(slice),
+        switch (slice.len) {
+            1 => return u32(slice[0]),
+            2 => return utf8Decode2(slice) catch unreachable,
+            3 => return utf8Decode3(slice) catch unreachable,
+            4 => return utf8Decode4(slice) catch unreachable,
             else => unreachable,
-        };
-
-        return r catch unreachable;
+        }
     }
 };
 
 test "utf8 encode" {
+    comptime testUtf8Encode() catch unreachable;
+    try testUtf8Encode();
+}
+fn testUtf8Encode() !void {
     // A few taken from wikipedia a few taken elsewhere
     var array: [4]u8 = undefined;
     debug.assert((try utf8Encode(try utf8Decode("€"), array[0..])) == 3);
@@ -241,6 +258,10 @@ test "utf8 encode" {
 }
 
 test "utf8 encode error" {
+    comptime testUtf8EncodeError();
+    testUtf8EncodeError();
+}
+fn testUtf8EncodeError() void {
     var array: [4]u8 = undefined;
     testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf);
     testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf);
@@ -257,6 +278,10 @@ fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void {
 }
 
 test "utf8 iterator on ascii" {
+    comptime testUtf8IteratorOnAscii();
+    testUtf8IteratorOnAscii();
+}
+fn testUtf8IteratorOnAscii() void {
     const s = Utf8View.initComptime("abc");
 
     var it1 = s.iterator();
@@ -273,6 +298,10 @@ test "utf8 iterator on ascii" {
 }
 
 test "utf8 view bad" {
+    comptime testUtf8ViewBad();
+    testUtf8ViewBad();
+}
+fn testUtf8ViewBad() void {
     // Compile-time error.
     // const s3 = Utf8View.initComptime("\xfe\xf2");
 
@@ -281,6 +310,10 @@ test "utf8 view bad" {
 }
 
 test "utf8 view ok" {
+    comptime testUtf8ViewOk();
+    testUtf8ViewOk();
+}
+fn testUtf8ViewOk() void {
     const s = Utf8View.initComptime("東京市");
 
     var it1 = s.iterator();
@@ -297,6 +330,10 @@ test "utf8 view ok" {
 }
 
 test "bad utf8 slice" {
+    comptime testBadUtf8Slice();
+    testBadUtf8Slice();
+}
+fn testBadUtf8Slice() void {
     debug.assert(utf8ValidateSlice("abc"));
     debug.assert(!utf8ValidateSlice("abc\xc0"));
     debug.assert(!utf8ValidateSlice("abc\xc0abc"));
@@ -304,6 +341,10 @@ test "bad utf8 slice" {
 }
 
 test "valid utf8" {
+    comptime testValidUtf8();
+    testValidUtf8();
+}
+fn testValidUtf8() void {
     testValid("\x00", 0x0);
     testValid("\x20", 0x20);
     testValid("\x7f", 0x7f);
@@ -319,6 +360,10 @@ test "valid utf8" {
 }
 
 test "invalid utf8 continuation bytes" {
+    comptime testInvalidUtf8ContinuationBytes();
+    testInvalidUtf8ContinuationBytes();
+}
+fn testInvalidUtf8ContinuationBytes() void {
     // unexpected continuation
     testError("\x80", error.Utf8InvalidStartByte);
     testError("\xbf", error.Utf8InvalidStartByte);
@@ -347,6 +392,10 @@ test "invalid utf8 continuation bytes" {
 }
 
 test "overlong utf8 codepoint" {
+    comptime testOverlongUtf8Codepoint();
+    testOverlongUtf8Codepoint();
+}
+fn testOverlongUtf8Codepoint() void {
     testError("\xc0\x80", error.Utf8OverlongEncoding);
     testError("\xc1\xbf", error.Utf8OverlongEncoding);
     testError("\xe0\x80\x80", error.Utf8OverlongEncoding);
@@ -356,6 +405,10 @@ test "overlong utf8 codepoint" {
 }
 
 test "misc invalid utf8" {
+    comptime testMiscInvalidUtf8();
+    testMiscInvalidUtf8();
+}
+fn testMiscInvalidUtf8() void {
     // codepoint out of bounds
     testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge);
     testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge);
-- 
cgit v1.2.3