diff options
Diffstat (limited to 'lib/std/unicode.zig')
| -rw-r--r-- | lib/std/unicode.zig | 84 |
1 files changed, 78 insertions, 6 deletions
diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
index 18bd5ab0e2..2d4d4b40d9 100644
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@@ -23,11 +23,12 @@ pub fn utf8CodepointSequenceLength(c: u21) !u3 {
 /// returns a number 1-4 indicating the total length of the codepoint in bytes.
 /// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
 pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
-    return switch (@clz(u8, ~first_byte)) {
-        0 => 1,
-        2 => 2,
-        3 => 3,
-        4 => 4,
+    // The switch is optimized much better than a "smart" approach using @clz
+    return switch (first_byte) {
+        0b0000_0000 ... 0b0111_1111 => 1,
+        0b1100_0000 ... 0b1101_1111 => 2,
+        0b1110_0000 ... 0b1110_1111 => 3,
+        0b1111_0000 ... 0b1111_0111 => 4,
         else => error.Utf8InvalidStartByte,
     };
 }
@@ -153,6 +154,50 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
     return value;
 }
 
+/// Returns true if the given unicode codepoint can be encoded in UTF-8.
+pub fn utf8ValidCodepoint(value: u21) bool {
+    return switch (value) {
+        0xD800 ... 0xDFFF => false, // Surrogates range
+        0x110000 ... 0x1FFFFF => false, // Above the maximum codepoint value
+        else => true,
+    };
+}
+
+/// Returns the number of unicode codepoints encoded in the UTF-8 slice `s`.
+/// Returns error.TruncatedInput if `s` ends inside a multi-byte sequence and
+/// propagates any error from utf8ByteSequenceLength or utf8Decode.
+pub fn utf8CountCodepoints(s: []const u8) !usize {
+    var len: usize = 0;
+
+    const N = @sizeOf(usize);
+    const MASK = 0x80 * (std.math.maxInt(usize) / 0xff);
+
+    var i: usize = 0;
+    while (i < s.len) {
+        // Fast path for ASCII sequences
+        while (i + N <= s.len) : (i += N) {
+            const v = mem.readIntNative(usize, s[i..][0..N]);
+            if (v & MASK != 0) break;
+            len += N;
+        }
+
+        if (i < s.len) {
+            const n = try utf8ByteSequenceLength(s[i]);
+            if (i + n > s.len) return error.TruncatedInput;
+
+            switch (n) {
+                1 => {}, // ASCII, no validation needed
+                else => _ = try utf8Decode(s[i .. i + n]),
+            }
+
+            i += n;
+            len += 1;
+        }
+    }
+
+    return len;
+}
+
 pub fn utf8ValidateSlice(s: []const u8) bool {
     var i: usize = 0;
     while (i < s.len) {
@@ -687,7 +732,6 @@ pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16Le
     }
 }
 
-/// Returns length of a supplied UTF-8 string literal. Asserts that the data is valid UTF-8.
 fn calcUtf16LeLen(utf8: []const u8) usize {
     var src_i: usize = 0;
     var dest_len: usize = 0;
@@ -757,3 +801,31 @@ test "utf8ToUtf16LeStringLiteral" {
         testing.expect(utf16[2] == 0);
     }
 }
+
+fn testUtf8CountCodepoints() !void {
+    testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("abcdefghij"));
+    testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("äåéëþüúíóö"));
+    testing.expectEqual(@as(usize, 5), try utf8CountCodepoints("こんにちは"));
+    // testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80"));
+}
+
+test "utf8 count codepoints" {
+    try testUtf8CountCodepoints();
+    comptime testUtf8CountCodepoints() catch unreachable;
+}
+
+fn testUtf8ValidCodepoint() !void {
+    testing.expect(utf8ValidCodepoint('e'));
+    testing.expect(utf8ValidCodepoint('ë'));
+    testing.expect(utf8ValidCodepoint('は'));
+    testing.expect(utf8ValidCodepoint(0xe000));
+    testing.expect(utf8ValidCodepoint(0x10ffff));
+    testing.expect(!utf8ValidCodepoint(0xd800));
+    testing.expect(!utf8ValidCodepoint(0xdfff));
+    testing.expect(!utf8ValidCodepoint(0x110000));
+}
+
+test "utf8 valid codepoint" {
+    try testUtf8ValidCodepoint();
+    comptime testUtf8ValidCodepoint() catch unreachable;
+}
