Merge pull request #954 from BraedonWooding/patch-2

Utf8 Encoding from Codepoint to Bytes
author: Josh Wolfe <thejoshwolfe@gmail.com> 2018-04-29 16:57:29 -0400
committer: GitHub <noreply@github.com> 2018-04-29 16:57:29 -0400
commit: 8c567d84f1f14e06286897fe1b2408a1d2fd7d76 (patch)
tree: 6cfd3acaa871f88ceb27e502d92398425c9239b7 /std
parent: ad4ee47d9fec15945d445f637987d487405e7b22 (diff)
parent: 07af6559d8a883dcb21ff99cddb4a836d2bb66bd (diff)
download: zig-8c567d84f1f14e06286897fe1b2408a1d2fd7d76.tar.gz
zig-8c567d84f1f14e06286897fe1b2408a1d2fd7d76.zip
1 files changed, 91 insertions, 0 deletions
diff --git a/std/unicode.zig b/std/unicode.zig
index 356df824f0..7650f83c83 100644
--- a/std/unicode.zig
+++ b/std/unicode.zig
@@ -1,6 +1,17 @@
 const std = @import("./index.zig");
 const debug = std.debug;
 
+/// Returns how many bytes (1-4) the code point `c` occupies when encoded as UTF-8.
+/// Errors: InvalidCodepoint for 0xd800...0xdfff, CodepointTooLarge for c >= 0x110000.
+pub fn utf8CodepointSequenceLength(c: u32) !u3 {
+    if (c < 0x80) return u3(1);
+    if (c < 0x800) return u3(2);
+    if (c -% 0xd800 < 0x800) return error.InvalidCodepoint; // wrapping subtract: true only for 0xd800...0xdfff
+    if (c < 0x10000) return u3(3);
+    if (c < 0x110000) return u3(4);
+    return error.CodepointTooLarge;
+}
+
 /// Given the first byte of a UTF-8 codepoint,
 /// returns a number 1-4 indicating the total length of the codepoint in bytes.
 /// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
@@ -12,6 +23,47 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
     return error.Utf8InvalidStartByte;
 }
 
+/// Encodes the code point `c` as UTF-8 into `out` and returns the number of bytes written (1-4).
+/// c: the code point to encode
+/// out: the destination buffer; only the first `length` bytes are written
+/// Notes: out.len must be at least the encoded length; this is checked with
+///        debug.assert rather than reported as an error. The required length
+///        depends on the code point, so a buffer of at least 4 bytes is
+///        sufficient for every valid code point.
+/// Errors: Returns the error from utf8CodepointSequenceLength if the code point is invalid.
+pub fn utf8Encode(c: u32, out: []u8) !u3 {
+    if (utf8CodepointSequenceLength(c)) |length| {
+        debug.assert(out.len >= length);
+        switch (length) {
+            // Every multi-byte case follows the same pattern:
+            // - the lead byte is a length prefix OR'd with the top bits (shift grows by 6 per extra byte)
+            // - each following byte is 0b10000000 OR'd with the next 6 bits,
+            //   masked with 0b111111 (63)
+            1 => out[0] = u8(c), // c < 0x80 here, so the code point is the byte itself
+            2 => {
+                out[0] = u8(0b11000000 | (c >> 6));
+                out[1] = u8(0b10000000 | (c & 0b111111));
+            },
+            3 => {
+                out[0] = u8(0b11100000 | (c >> 12));
+                out[1] = u8(0b10000000 | ((c >> 6) & 0b111111));
+                out[2] = u8(0b10000000 | (c & 0b111111));
+            },
+            4 => {
+                out[0] = u8(0b11110000 | (c >> 18));
+                out[1] = u8(0b10000000 | ((c >> 12) & 0b111111));
+                out[2] = u8(0b10000000 | ((c >> 6) & 0b111111));
+                out[3] = u8(0b10000000 | (c & 0b111111));
+            },
+            else => unreachable, // utf8CodepointSequenceLength only returns 1-4
+        }
+
+        return length;
+    } else |err| {
+        return err;
+    }
+}
+
 /// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
 /// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
 /// If you already know the length at comptime, you can call one of
@@ -25,6 +77,7 @@ pub fn utf8Decode(bytes: []const u8) !u32 {
         else => unreachable,
     };
 }
+
 pub fn utf8Decode2(bytes: []const u8) !u32 {
     debug.assert(bytes.len == 2);
     debug.assert(bytes[0] & 0b11100000 == 0b11000000);
@@ -38,6 +91,7 @@ pub fn utf8Decode2(bytes: []const u8) !u32 {
 
     return value;
 }
+
 pub fn utf8Decode3(bytes: []const u8) !u32 {
     debug.assert(bytes.len == 3);
     debug.assert(bytes[0] & 0b11110000 == 0b11100000);
@@ -56,6 +110,7 @@ pub fn utf8Decode3(bytes: []const u8) !u32 {
 
     return value;
 }
+
 pub fn utf8Decode4(bytes: []const u8) !u32 {
     debug.assert(bytes.len == 4);
     debug.assert(bytes[0] & 0b11111000 == 0b11110000);
@@ -170,6 +225,42 @@ const Utf8Iterator = struct {
     }
 };
 
+test "utf8 encode" {
+    // Round-trips sample characters (some taken from Wikipedia, some from elsewhere) through decode and encode.
+    var array: [4]u8 = undefined;
+    debug.assert((try utf8Encode(try utf8Decode("€"), array[0..])) == 3); // U+20AC, 3-byte form
+    debug.assert(array[0] == 0b11100010);
+    debug.assert(array[1] == 0b10000010);
+    debug.assert(array[2] == 0b10101100);
+
+    debug.assert((try utf8Encode(try utf8Decode("$"), array[0..])) == 1); // U+0024, 1-byte form
+    debug.assert(array[0] == 0b00100100);
+
+    debug.assert((try utf8Encode(try utf8Decode("¢"), array[0..])) == 2); // U+00A2, 2-byte form
+    debug.assert(array[0] == 0b11000010);
+    debug.assert(array[1] == 0b10100010);
+
+    debug.assert((try utf8Encode(try utf8Decode("𐍈"), array[0..])) == 4); // U+10348, 4-byte form
+    debug.assert(array[0] == 0b11110000);
+    debug.assert(array[1] == 0b10010000);
+    debug.assert(array[2] == 0b10001101);
+    debug.assert(array[3] == 0b10001000);
+}
+
+test "utf8 encode error" {
+    var array: [4]u8 = undefined;
+    testErrorEncode(0xFFFFFF, array[0..], error.CodepointTooLarge); // not below 0x110000
+    testErrorEncode(0xd900, array[0..], error.InvalidCodepoint); // inside the rejected 0xd800...0xdfff range
+}
+
+fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void { // asserts utf8Encode fails with expectedErr
+    if (utf8Encode(codePoint, array)) |_| {
+        unreachable; // encoding succeeded although an error was expected
+    } else |err| {
+        debug.assert(err == expectedErr);
+    }
+}
+
 test "utf8 iterator on ascii" {
     const s = Utf8View.initComptime("abc");
author	Josh Wolfe <thejoshwolfe@gmail.com>	2018-04-29 16:57:29 -0400
committer	GitHub <noreply@github.com>	2018-04-29 16:57:29 -0400
commit	8c567d84f1f14e06286897fe1b2408a1d2fd7d76 (patch)
tree	6cfd3acaa871f88ceb27e502d92398425c9239b7 /std
parent	ad4ee47d9fec15945d445f637987d487405e7b22 (diff)
parent	07af6559d8a883dcb21ff99cddb4a836d2bb66bd (diff)
download	zig-8c567d84f1f14e06286897fe1b2408a1d2fd7d76.tar.gz zig-8c567d84f1f14e06286897fe1b2408a1d2fd7d76.zip