about summary refs log tree commit diff
path: root/std/unicode.zig
diff options
context:
space:
mode:
Diffstat (limited to 'std/unicode.zig')
-rw-r--r-- std/unicode.zig 163
1 files changed, 151 insertions, 12 deletions
diff --git a/std/unicode.zig b/std/unicode.zig
index 356df824f0..300e129647 100644
--- a/std/unicode.zig
+++ b/std/unicode.zig
@@ -1,6 +1,16 @@
const std = @import("./index.zig");
const debug = std.debug;
+/// Returns the number of bytes (1-4) that the UTF-8 encoding of codepoint `c`
+/// requires, or error.CodepointTooLarge when `c` is 0x110000 or greater.
+pub fn utf8CodepointSequenceLength(c: u32) !u3 {
+ if (c < 0x80) return u3(1); // 0x00...0x7f
+ if (c < 0x800) return u3(2); // 0x80...0x7ff
+ if (c < 0x10000) return u3(3); // 0x800...0xffff; surrogate halves are not rejected here
+ if (c < 0x110000) return u3(4); // 0x10000...0x10ffff
+ return error.CodepointTooLarge;
+}
+
/// Given the first byte of a UTF-8 codepoint,
/// returns a number 1-4 indicating the total length of the codepoint in bytes.
/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
@@ -12,11 +22,47 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
return error.Utf8InvalidStartByte;
}
+/// Encodes the codepoint `c` as a UTF-8 byte sequence written to the start of `out`.
+/// c: the codepoint to encode.
+/// out: the buffer to write to; out.len >= utf8CodepointSequenceLength(c) is asserted, not reported as an error.
+/// Errors: CodepointTooLarge if c >= 0x110000; Utf8CannotEncodeSurrogateHalf if c is in 0xd800...0xdfff.
+/// Returns: the number of bytes written to out (1-4).
+pub fn utf8Encode(c: u32, out: []u8) !u3 {
+ const length = try utf8CodepointSequenceLength(c);
+ debug.assert(out.len >= length);
+ switch (length) {
+ // Every multi-byte case follows the same pattern:
+ // - the lead byte holds the top bits of c (shift grows by 6 per extra
+ // byte), OR'd with a prefix that differs for each sequence length;
+ // - each later byte holds the next 6 bits (mask 0b111111), OR'd with 0b10000000.
+ 1 => out[0] = u8(c), // c < 0x80 here, so the codepoint is stored as-is
+ 2 => {
+ out[0] = u8(0b11000000 | (c >> 6));
+ out[1] = u8(0b10000000 | (c & 0b111111));
+ },
+ 3 => {
+ if (0xd800 <= c and c <= 0xdfff) return error.Utf8CannotEncodeSurrogateHalf; // rejected before any byte is written
+ out[0] = u8(0b11100000 | (c >> 12));
+ out[1] = u8(0b10000000 | ((c >> 6) & 0b111111));
+ out[2] = u8(0b10000000 | (c & 0b111111));
+ },
+ 4 => {
+ out[0] = u8(0b11110000 | (c >> 18));
+ out[1] = u8(0b10000000 | ((c >> 12) & 0b111111));
+ out[2] = u8(0b10000000 | ((c >> 6) & 0b111111));
+ out[3] = u8(0b10000000 | (c & 0b111111));
+ },
+ else => unreachable, // utf8CodepointSequenceLength only returns 1 through 4
+ }
+ return length;
+}
+
+const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error; // merged error set; the error type of utf8Decode
/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
/// If you already know the length at comptime, you can call one of
/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
-pub fn utf8Decode(bytes: []const u8) !u32 {
+pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 {
return switch (bytes.len) {
1 => u32(bytes[0]),
2 => utf8Decode2(bytes),
@@ -25,7 +71,12 @@ pub fn utf8Decode(bytes: []const u8) !u32 {
else => unreachable,
};
}
-pub fn utf8Decode2(bytes: []const u8) !u32 {
+
+const Utf8Decode2Error = error{ // error set returned by utf8Decode2
+ Utf8ExpectedContinuation,
+ Utf8OverlongEncoding,
+};
+pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 {
debug.assert(bytes.len == 2);
debug.assert(bytes[0] & 0b11100000 == 0b11000000);
var value: u32 = bytes[0] & 0b00011111;
@@ -38,7 +89,13 @@ pub fn utf8Decode2(bytes: []const u8) !u32 {
return value;
}
-pub fn utf8Decode3(bytes: []const u8) !u32 {
+
+const Utf8Decode3Error = error{ // error set returned by utf8Decode3
+ Utf8ExpectedContinuation,
+ Utf8OverlongEncoding,
+ Utf8EncodesSurrogateHalf,
+};
+pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 {
debug.assert(bytes.len == 3);
debug.assert(bytes[0] & 0b11110000 == 0b11100000);
var value: u32 = bytes[0] & 0b00001111;
@@ -56,7 +113,13 @@ pub fn utf8Decode3(bytes: []const u8) !u32 {
return value;
}
-pub fn utf8Decode4(bytes: []const u8) !u32 {
+
+const Utf8Decode4Error = error{ // error set returned by utf8Decode4
+ Utf8ExpectedContinuation,
+ Utf8OverlongEncoding,
+ Utf8CodepointTooLarge,
+};
+pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 {
debug.assert(bytes.len == 4);
debug.assert(bytes[0] & 0b11111000 == 0b11110000);
var value: u32 = bytes[0] & 0b00000111;
@@ -158,19 +221,67 @@ const Utf8Iterator = struct {
pub fn nextCodepoint(it: &Utf8Iterator) ?u32 {
const slice = it.nextCodepointSlice() ?? return null; // no further slice: return null
- const r = switch (slice.len) {
- 1 => u32(slice[0]),
- 2 => utf8Decode2(slice),
- 3 => utf8Decode3(slice),
- 4 => utf8Decode4(slice),
+ switch (slice.len) {
+ 1 => return u32(slice[0]), // single byte: its value is the codepoint
+ 2 => return utf8Decode2(slice) catch unreachable, // NOTE(review): presumably cannot fail because the iterated bytes were validated earlier — confirm
+ 3 => return utf8Decode3(slice) catch unreachable,
+ 4 => return utf8Decode4(slice) catch unreachable,
else => unreachable,
- };
-
- return r catch unreachable;
+ }
}
};
+test "utf8 encode" {
+ comptime testUtf8Encode() catch unreachable; // evaluate the checks at compile time
+ try testUtf8Encode(); // and again at runtime
+}
+fn testUtf8Encode() !void {
+ // Decodes each literal, re-encodes it, and checks the bytes; a few samples are from Wikipedia, a few from elsewhere.
+ var array: [4]u8 = undefined; // 4 bytes: the longest sequence utf8Encode writes
+ debug.assert((try utf8Encode(try utf8Decode("€"), array[0..])) == 3); // U+20AC, three bytes
+ debug.assert(array[0] == 0b11100010);
+ debug.assert(array[1] == 0b10000010);
+ debug.assert(array[2] == 0b10101100);
+
+ debug.assert((try utf8Encode(try utf8Decode("$"), array[0..])) == 1); // U+0024, one byte
+ debug.assert(array[0] == 0b00100100);
+
+ debug.assert((try utf8Encode(try utf8Decode("¢"), array[0..])) == 2); // U+00A2, two bytes
+ debug.assert(array[0] == 0b11000010);
+ debug.assert(array[1] == 0b10100010);
+
+ debug.assert((try utf8Encode(try utf8Decode("𐍈"), array[0..])) == 4); // U+10348, four bytes
+ debug.assert(array[0] == 0b11110000);
+ debug.assert(array[1] == 0b10010000);
+ debug.assert(array[2] == 0b10001101);
+ debug.assert(array[3] == 0b10001000);
+}
+
+test "utf8 encode error" {
+ comptime testUtf8EncodeError(); // evaluate the checks at compile time
+ testUtf8EncodeError(); // and again at runtime
+}
+fn testUtf8EncodeError() void {
+ var array: [4]u8 = undefined;
+ testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf); // lowest value utf8Encode rejects as a surrogate half
+ testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf); // highest value utf8Encode rejects as a surrogate half
+ testErrorEncode(0x110000, array[0..], error.CodepointTooLarge); // first value past the 4-byte range
+ testErrorEncode(0xffffffff, array[0..], error.CodepointTooLarge); // largest u32 value
+}
+
+fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void { // asserts that utf8Encode(codePoint, array) fails with expectedErr
+ if (utf8Encode(codePoint, array)) |_| {
+ unreachable; // encoding succeeded, which this helper treats as a failure
+ } else |err| {
+ debug.assert(err == expectedErr);
+ }
+}
+
test "utf8 iterator on ascii" {
+ comptime testUtf8IteratorOnAscii(); // evaluate the checks at compile time
+ testUtf8IteratorOnAscii(); // and again at runtime
+}
+fn testUtf8IteratorOnAscii() void {
const s = Utf8View.initComptime("abc");
var it1 = s.iterator();
@@ -187,6 +298,10 @@ test "utf8 iterator on ascii" {
}
test "utf8 view bad" {
+ comptime testUtf8ViewBad(); // evaluate the checks at compile time
+ testUtf8ViewBad(); // and again at runtime
+}
+fn testUtf8ViewBad() void {
// Compile-time error.
// const s3 = Utf8View.initComptime("\xfe\xf2");
@@ -195,6 +310,10 @@ test "utf8 view bad" {
}
test "utf8 view ok" {
+ comptime testUtf8ViewOk(); // evaluate the checks at compile time
+ testUtf8ViewOk(); // and again at runtime
+}
+fn testUtf8ViewOk() void {
const s = Utf8View.initComptime("東京市");
var it1 = s.iterator();
@@ -211,6 +330,10 @@ test "utf8 view ok" {
}
test "bad utf8 slice" {
+ comptime testBadUtf8Slice(); // evaluate the checks at compile time
+ testBadUtf8Slice(); // and again at runtime
+}
+fn testBadUtf8Slice() void {
debug.assert(utf8ValidateSlice("abc"));
debug.assert(!utf8ValidateSlice("abc\xc0"));
debug.assert(!utf8ValidateSlice("abc\xc0abc"));
@@ -218,6 +341,10 @@ test "bad utf8 slice" {
}
test "valid utf8" {
+ comptime testValidUtf8(); // evaluate the checks at compile time
+ testValidUtf8(); // and again at runtime
+}
+fn testValidUtf8() void {
testValid("\x00", 0x0);
testValid("\x20", 0x20);
testValid("\x7f", 0x7f);
@@ -233,6 +360,10 @@ test "valid utf8" {
}
test "invalid utf8 continuation bytes" {
+ comptime testInvalidUtf8ContinuationBytes(); // evaluate the checks at compile time
+ testInvalidUtf8ContinuationBytes(); // and again at runtime
+}
+fn testInvalidUtf8ContinuationBytes() void {
// unexpected continuation
testError("\x80", error.Utf8InvalidStartByte);
testError("\xbf", error.Utf8InvalidStartByte);
@@ -261,6 +392,10 @@ test "invalid utf8 continuation bytes" {
}
test "overlong utf8 codepoint" {
+ comptime testOverlongUtf8Codepoint(); // evaluate the checks at compile time
+ testOverlongUtf8Codepoint(); // and again at runtime
+}
+fn testOverlongUtf8Codepoint() void {
testError("\xc0\x80", error.Utf8OverlongEncoding);
testError("\xc1\xbf", error.Utf8OverlongEncoding);
testError("\xe0\x80\x80", error.Utf8OverlongEncoding);
@@ -270,6 +405,10 @@ test "overlong utf8 codepoint" {
}
test "misc invalid utf8" {
+ comptime testMiscInvalidUtf8(); // evaluate the checks at compile time
+ testMiscInvalidUtf8(); // and again at runtime
+}
+fn testMiscInvalidUtf8() void {
// codepoint out of bounds
testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge);
testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge);