diff options
| author | Josh Wolfe <thejoshwolfe@gmail.com> | 2018-04-29 16:57:29 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2018-04-29 16:57:29 -0400 |
| commit | 8c567d84f1f14e06286897fe1b2408a1d2fd7d76 (patch) | |
| tree | 6cfd3acaa871f88ceb27e502d92398425c9239b7 /std | |
| parent | ad4ee47d9fec15945d445f637987d487405e7b22 (diff) | |
| parent | 07af6559d8a883dcb21ff99cddb4a836d2bb66bd (diff) | |
| download | zig-8c567d84f1f14e06286897fe1b2408a1d2fd7d76.tar.gz zig-8c567d84f1f14e06286897fe1b2408a1d2fd7d76.zip | |
Merge pull request #954 from BraedonWooding/patch-2
Utf8 Encoding from Codepoint to Bytes
Diffstat (limited to 'std')
| -rw-r--r-- | std/unicode.zig | 91 |
1 file changed, 91 insertions, 0 deletions
diff --git a/std/unicode.zig b/std/unicode.zig index 356df824f0..7650f83c83 100644 --- a/std/unicode.zig +++ b/std/unicode.zig @@ -1,6 +1,17 @@ const std = @import("./index.zig"); const debug = std.debug; +// Given a Utf8-Codepoint returns how many (1-4) +// bytes there are if represented as an array of bytes. +pub fn utf8CodepointSequenceLength(c: u32) !u3 { + if (c < 0x80) return u3(1); + if (c < 0x800) return u3(2); + if (c -% 0xd800 < 0x800) return error.InvalidCodepoint; + if (c < 0x10000) return u3(3); + if (c < 0x110000) return u3(4); + return error.CodepointTooLarge; +} + /// Given the first byte of a UTF-8 codepoint, /// returns a number 1-4 indicating the total length of the codepoint in bytes. /// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte. @@ -12,6 +23,47 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { return error.Utf8InvalidStartByte; } +/// Encodes a code point back into utf8 +/// c: the code point +/// out: the out buffer to write to +/// Notes: out has to have a len big enough for the bytes +/// however this limit is dependent on the code point +/// but giving it a minimum of 4 will ensure it will work +/// for all code points. +/// Errors: Will return an error if the code point is invalid. 
+pub fn utf8Encode(c: u32, out: []u8) !u3 { + if (utf8CodepointSequenceLength(c)) |length| { + debug.assert(out.len >= length); + switch (length) { + // The pattern for each is the same + // - Increasing the initial shift by 6 each time + // - Each time after the first shorten the shifted + // value to a max of 0b111111 (63) + 1 => out[0] = u8(c), // Can just do 0 + codepoint for initial range + 2 => { + out[0] = u8(0b11000000 | (c >> 6)); + out[1] = u8(0b10000000 | (c & 0b111111)); + }, + 3 => { + out[0] = u8(0b11100000 | (c >> 12)); + out[1] = u8(0b10000000 | ((c >> 6) & 0b111111)); + out[2] = u8(0b10000000 | (c & 0b111111)); + }, + 4 => { + out[0] = u8(0b11110000 | (c >> 18)); + out[1] = u8(0b10000000 | ((c >> 12) & 0b111111)); + out[2] = u8(0b10000000 | ((c >> 6) & 0b111111)); + out[3] = u8(0b10000000 | (c & 0b111111)); + }, + else => unreachable, + } + + return length; + } else |err| { + return err; + } +} + /// Decodes the UTF-8 codepoint encoded in the given slice of bytes. /// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. 
/// If you already know the length at comptime, you can call one of @@ -25,6 +77,7 @@ pub fn utf8Decode(bytes: []const u8) !u32 { else => unreachable, }; } + pub fn utf8Decode2(bytes: []const u8) !u32 { debug.assert(bytes.len == 2); debug.assert(bytes[0] & 0b11100000 == 0b11000000); @@ -38,6 +91,7 @@ pub fn utf8Decode2(bytes: []const u8) !u32 { return value; } + pub fn utf8Decode3(bytes: []const u8) !u32 { debug.assert(bytes.len == 3); debug.assert(bytes[0] & 0b11110000 == 0b11100000); @@ -56,6 +110,7 @@ pub fn utf8Decode3(bytes: []const u8) !u32 { return value; } + pub fn utf8Decode4(bytes: []const u8) !u32 { debug.assert(bytes.len == 4); debug.assert(bytes[0] & 0b11111000 == 0b11110000); @@ -170,6 +225,42 @@ const Utf8Iterator = struct { } }; +test "utf8 encode" { + // A few taken from wikipedia a few taken elsewhere + var array: [4]u8 = undefined; + debug.assert((try utf8Encode(try utf8Decode("€"), array[0..])) == 3); + debug.assert(array[0] == 0b11100010); + debug.assert(array[1] == 0b10000010); + debug.assert(array[2] == 0b10101100); + + debug.assert((try utf8Encode(try utf8Decode("$"), array[0..])) == 1); + debug.assert(array[0] == 0b00100100); + + debug.assert((try utf8Encode(try utf8Decode("¢"), array[0..])) == 2); + debug.assert(array[0] == 0b11000010); + debug.assert(array[1] == 0b10100010); + + debug.assert((try utf8Encode(try utf8Decode("𐍈"), array[0..])) == 4); + debug.assert(array[0] == 0b11110000); + debug.assert(array[1] == 0b10010000); + debug.assert(array[2] == 0b10001101); + debug.assert(array[3] == 0b10001000); +} + +test "utf8 encode error" { + var array: [4]u8 = undefined; + testErrorEncode(0xFFFFFF, array[0..], error.CodepointTooLarge); + testErrorEncode(0xd900, array[0..], error.InvalidCodepoint); +} + +fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void { + if (utf8Encode(codePoint, array)) |_| { + unreachable; + } else |err| { + debug.assert(err == expectedErr); + } +} + test "utf8 iterator on ascii" { const s = 
Utf8View.initComptime("abc"); |
