aboutsummaryrefslogtreecommitdiff
path: root/lib/std/unicode.zig
diff options
context:
space:
mode:
author: Andrew Kelley <andrew@ziglang.org> 2022-09-14 19:25:15 -0400
committer: GitHub <noreply@github.com> 2022-09-14 19:25:15 -0400
commit: 0931dda9a95e14b97a84e60aed424fd8bb5e1232 (patch)
tree: 5e18e74a3128b36ae807b6f108623e7567a1bf4e /lib/std/unicode.zig
parent: d7a0fe67b38a60a4f294d6d9034c7a342bed7094 (diff)
parent: cf744cf04f8cad148ae93e5c5c7d8c5f5f62c164 (diff)
download: zig-0931dda9a95e14b97a84e60aed424fd8bb5e1232.tar.gz
zig-0931dda9a95e14b97a84e60aed424fd8bb5e1232.zip
Merge pull request #11663 from matu3ba/utf16
std.unicode: add utf16 byte length and codepoints counting routines
Diffstat (limited to 'lib/std/unicode.zig')
-rw-r--r-- lib/std/unicode.zig 62
1 files changed, 56 insertions, 6 deletions
diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
index 81a7ed838f..e8ad0d5e5b 100644
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@@ -164,7 +164,6 @@ pub fn utf8ValidCodepoint(value: u21) bool {
/// Returns the length of a supplied UTF-8 string literal in terms of unicode
/// codepoints.
-/// Asserts that the data is valid UTF-8.
pub fn utf8CountCodepoints(s: []const u8) !usize {
var len: usize = 0;
@@ -325,6 +324,41 @@ pub const Utf16LeIterator = struct {
}
};
+/// Returns the number of Unicode codepoints in the UTF-16LE slice `utf16le`.
+/// The slice is walked with `Utf16LeIterator`; any error returned by its
+/// `nextCodepoint` is propagated to the caller.
+pub fn utf16CountCodepoints(utf16le: []const u16) !usize {
+ var len: usize = 0;
+ var it = Utf16LeIterator.init(utf16le);
+ // One increment per codepoint yielded; the decoded value itself is unused.
+ while (try it.nextCodepoint()) |_| len += 1;
+ return len;
+}
+
+/// Shared checks for `utf16CountCodepoints`, kept in a helper so the test that
+/// calls it can run them both at runtime and at comptime.
+/// Each input is converted with `utf8ToUtf16LeStringLiteral`, and the expected
+/// value is the number of codepoints in the original text.
+/// NOTE(review): every sample appears to stay within the Basic Multilingual
+/// Plane, so surrogate pairs are not exercised here -- confirm and extend.
+fn testUtf16CountCodepoints() !void {
+ try testing.expectEqual(
+ @as(usize, 1),
+ try utf16CountCodepoints(utf8ToUtf16LeStringLiteral("a")),
+ );
+ try testing.expectEqual(
+ @as(usize, 10),
+ try utf16CountCodepoints(utf8ToUtf16LeStringLiteral("abcdefghij")),
+ );
+ try testing.expectEqual(
+ @as(usize, 10),
+ try utf16CountCodepoints(utf8ToUtf16LeStringLiteral("äåéëþüúíóö")),
+ );
+ try testing.expectEqual(
+ @as(usize, 5),
+ try utf16CountCodepoints(utf8ToUtf16LeStringLiteral("こんにちは")),
+ );
+}
+
+test "utf16 count codepoints" {
+ // Runtime evaluation of the shared checks.
+ try testUtf16CountCodepoints();
+ // TODO: the comptime evaluation below is skipped on the stage1 backend, where
+ // it fails with "out of bounds slice"; drop the guard once that is resolved.
+ if (@import("builtin").zig_backend != .stage1)
+ comptime try testUtf16CountCodepoints();
+}
+
test "utf8 encode" {
comptime try testUtf8Encode();
try testUtf8Encode();
@@ -748,9 +782,9 @@ test "utf8ToUtf16LeWithNull" {
}
/// Converts a UTF-8 string literal into a UTF-16LE string literal.
-pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8):0]u16 {
+pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch unreachable:0]u16 {
comptime {
- const len: usize = calcUtf16LeLen(utf8);
+ const len: usize = calcUtf16LeLen(utf8) catch |err| @compileError(err);
var utf16le: [len:0]u16 = [_:0]u16{0} ** len;
const utf16le_len = utf8ToUtf16Le(&utf16le, utf8[0..]) catch |err| @compileError(err);
assert(len == utf16le_len);
@@ -758,13 +792,17 @@ pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16Le
}
}
-fn calcUtf16LeLen(utf8: []const u8) usize {
+const CalcUtf16LeLenError = Utf8DecodeError || error{Utf8InvalidStartByte};
+
+/// Returns the number of `u16` code units needed to encode the UTF-8 slice
+/// `utf8` as UTF-16. The size in bytes (`[]u8`) is twice the returned value.
+pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize {
var src_i: usize = 0;
var dest_len: usize = 0;
while (src_i < utf8.len) {
- const n = utf8ByteSequenceLength(utf8[src_i]) catch unreachable;
+ const n = try utf8ByteSequenceLength(utf8[src_i]);
const next_src_i = src_i + n;
- const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch unreachable;
+ const codepoint = try utf8Decode(utf8[src_i..next_src_i]);
if (codepoint < 0x10000) {
dest_len += 1;
} else {
@@ -775,6 +813,18 @@ fn calcUtf16LeLen(utf8: []const u8) usize {
return dest_len;
}
+/// Shared checks for `calcUtf16LeLen`, kept in a helper so the test that calls
+/// it can run them both at runtime and at comptime.
+/// Expected values are counts of `u16` elements, not bytes.
+/// NOTE(review): every sample appears to decode to codepoints below 0x10000,
+/// so the `else` branch of `calcUtf16LeLen` is not exercised -- confirm.
+fn testCalcUtf16LeLen() !void {
+ try testing.expectEqual(@as(usize, 1), try calcUtf16LeLen("a"));
+ try testing.expectEqual(@as(usize, 10), try calcUtf16LeLen("abcdefghij"));
+ try testing.expectEqual(@as(usize, 10), try calcUtf16LeLen("äåéëþüúíóö"));
+ try testing.expectEqual(@as(usize, 5), try calcUtf16LeLen("こんにちは"));
+}
+
+test "calculate utf16 string length of given utf8 string in u16" {
+ // Run the shared checks once at runtime and once at comptime.
+ try testCalcUtf16LeLen();
+ comptime try testCalcUtf16LeLen();
+}
+
/// Print the given `utf16le` string
fn formatUtf16le(
utf16le: []const u16,