diff options
| author | Andrew Kelley <andrew@ziglang.org> | 2019-09-26 01:54:45 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2019-09-26 01:54:45 -0400 |
| commit | 68bb3945708c43109c48bda3664176307d45b62c (patch) | |
| tree | afb9731e10cef9d192560b52cd9ae2cf179775c4 /lib/std/unicode.zig | |
| parent | 6128bc728d1e1024a178c16c2149f5b1a167a013 (diff) | |
| parent | 4637e8f9699af9c3c6cf4df50ef5bb67c7a318a4 (diff) | |
| download | zig-68bb3945708c43109c48bda3664176307d45b62c.tar.gz zig-68bb3945708c43109c48bda3664176307d45b62c.zip | |
Merge pull request #3315 from ziglang/mv-std-lib
Move std/ to lib/std/
Diffstat (limited to 'lib/std/unicode.zig')
| -rw-r--r-- | lib/std/unicode.zig | 593 |
1 files changed, 593 insertions, 0 deletions
// File: lib/std/unicode.zig (new file, mode 100644, blob 2e96147166)
const std = @import("./std.zig");
const builtin = @import("builtin");
const assert = std.debug.assert;
const testing = std.testing;
const mem = std.mem;

/// Returns how many bytes the UTF-8 representation would require
/// for the given codepoint.
/// Returns error.CodepointTooLarge when c is 0x110000 or greater.
/// Note: surrogate halves (0xd800...0xdfff) are reported as 3 bytes here;
/// utf8Encode is the function that rejects them.
pub fn utf8CodepointSequenceLength(c: u32) !u3 {
    if (c < 0x80) return u3(1);
    if (c < 0x800) return u3(2);
    if (c < 0x10000) return u3(3);
    if (c < 0x110000) return u3(4);
    return error.CodepointTooLarge;
}

/// Given the first byte of a UTF-8 codepoint,
/// returns a number 1-4 indicating the total length of the codepoint in bytes.
/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
    if (first_byte < 0b10000000) return u3(1);
    if (first_byte & 0b11100000 == 0b11000000) return u3(2);
    if (first_byte & 0b11110000 == 0b11100000) return u3(3);
    if (first_byte & 0b11111000 == 0b11110000) return u3(4);
    return error.Utf8InvalidStartByte;
}

/// Encodes the given codepoint into a UTF-8 byte sequence.
/// c: the codepoint.
/// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c).
/// Errors: if c cannot be encoded in UTF-8 (error.CodepointTooLarge for values
/// of 0x110000 and above, error.Utf8CannotEncodeSurrogateHalf for 0xd800...0xdfff).
/// Returns: the number of bytes written to out.
pub fn utf8Encode(c: u32, out: []u8) !u3 {
    const length = try utf8CodepointSequenceLength(c);
    assert(out.len >= length);
    switch (length) {
        // The pattern for each is the same
        // - Increasing the initial shift by 6 each time
        // - Each time after the first shorten the shifted
        //   value to a max of 0b111111 (63)
        1 => out[0] = @intCast(u8, c), // Can just do 0 + codepoint for initial range
        2 => {
            out[0] = @intCast(u8, 0b11000000 | (c >> 6));
            out[1] = @intCast(u8, 0b10000000 | (c & 0b111111));
        },
        3 => {
            // Surrogate halves only exist in the 3-byte range, so this is the
            // only branch that needs to check for them.
            if (0xd800 <= c and c <= 0xdfff) return error.Utf8CannotEncodeSurrogateHalf;
            out[0] = @intCast(u8, 0b11100000 | (c >> 12));
            out[1] = @intCast(u8, 0b10000000 | ((c >> 6) & 0b111111));
            out[2] = @intCast(u8, 0b10000000 | (c & 0b111111));
        },
        4 => {
            out[0] = @intCast(u8, 0b11110000 | (c >> 18));
            out[1] = @intCast(u8, 0b10000000 | ((c >> 12) & 0b111111));
            out[2] = @intCast(u8, 0b10000000 | ((c >> 6) & 0b111111));
            out[3] = @intCast(u8, 0b10000000 | (c & 0b111111));
        },
        else => unreachable,
    }
    return length;
}

/// Union of every error the fixed-length decoders below can return.
const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;

/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
/// If you already know the length at comptime, you can call one of
/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 {
    return switch (bytes.len) {
        1 => u32(bytes[0]),
        2 => utf8Decode2(bytes),
        3 => utf8Decode3(bytes),
        4 => utf8Decode4(bytes),
        else => unreachable,
    };
}

const Utf8Decode2Error = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
};

/// Decodes a 2-byte UTF-8 sequence.
/// Asserts that bytes.len == 2 and that bytes[0] has the 2-byte start pattern.
/// Errors: the second byte is not a continuation byte, or the decoded value
/// is below 0x80 (overlong encoding).
pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 {
    assert(bytes.len == 2);
    assert(bytes[0] & 0b11100000 == 0b11000000);
    var value: u32 = bytes[0] & 0b00011111;

    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    value <<= 6;
    value |= bytes[1] & 0b00111111;

    if (value < 0x80) return error.Utf8OverlongEncoding;

    return value;
}

const Utf8Decode3Error = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
    Utf8EncodesSurrogateHalf,
};

/// Decodes a 3-byte UTF-8 sequence.
/// Asserts that bytes.len == 3 and that bytes[0] has the 3-byte start pattern.
/// Errors: a trailing byte is not a continuation byte, the decoded value is
/// below 0x800 (overlong encoding), or it lies in 0xd800...0xdfff.
pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 {
    assert(bytes.len == 3);
    assert(bytes[0] & 0b11110000 == 0b11100000);
    var value: u32 = bytes[0] & 0b00001111;

    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    value <<= 6;
    value |= bytes[1] & 0b00111111;

    if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    value <<= 6;
    value |= bytes[2] & 0b00111111;

    if (value < 0x800) return error.Utf8OverlongEncoding;
    if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf;

    return value;
}

const Utf8Decode4Error = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
    Utf8CodepointTooLarge,
};

/// Decodes a 4-byte UTF-8 sequence.
/// Asserts that bytes.len == 4 and that bytes[0] has the 4-byte start pattern.
/// Errors: a trailing byte is not a continuation byte, the decoded value is
/// below 0x10000 (overlong encoding), or it is above 0x10FFFF.
pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 {
    assert(bytes.len == 4);
    assert(bytes[0] & 0b11111000 == 0b11110000);
    var value: u32 = bytes[0] & 0b00000111;

    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    value <<= 6;
    value |= bytes[1] & 0b00111111;

    if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    value <<= 6;
    value |= bytes[2] & 0b00111111;

    if (bytes[3] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    value <<= 6;
    value |= bytes[3] & 0b00111111;

    if (value < 0x10000) return error.Utf8OverlongEncoding;
    if (value > 0x10FFFF) return error.Utf8CodepointTooLarge;

    return value;
}

/// Returns true if s consists entirely of well-formed UTF-8 sequences.
/// An invalid start byte, a sequence that runs past the end of s, or any
/// decode error makes the result false. The empty slice is valid.
pub fn utf8ValidateSlice(s: []const u8) bool {
    var i: usize = 0;
    while (i < s.len) {
        if (utf8ByteSequenceLength(s[i])) |cp_len| {
            // The start byte promises more bytes than the slice has left.
            if (i + cp_len > s.len) {
                return false;
            }

            if (utf8Decode(s[i .. i + cp_len])) |_| {} else |_| {
                return false;
            }
            i += cp_len;
        } else |_| {
            return false;
        }
    }
    return true;
}

/// Utf8View iterates the code points of a utf-8 encoded string.
///
/// ```
/// var utf8 = (try std.unicode.Utf8View.init("hi there")).iterator();
/// while (utf8.nextCodepointSlice()) |codepoint| {
///     std.debug.warn("got codepoint {}\n", codepoint);
/// }
/// ```
pub const Utf8View = struct {
    bytes: []const u8,

    /// Validates s with utf8ValidateSlice and wraps it.
    /// Returns error.InvalidUtf8 if validation fails.
    pub fn init(s: []const u8) !Utf8View {
        if (!utf8ValidateSlice(s)) {
            return error.InvalidUtf8;
        }

        return initUnchecked(s);
    }

    /// Wraps s without validating it. Utf8Iterator treats decode failures as
    /// unreachable, so the caller is responsible for s being valid UTF-8.
    pub fn initUnchecked(s: []const u8) Utf8View {
        return Utf8View{ .bytes = s };
    }

    /// Like init, but evaluated at compile time; invalid input is a compile error.
    /// TODO: https://github.com/ziglang/zig/issues/425
    pub fn initComptime(comptime s: []const u8) Utf8View {
        if (comptime init(s)) |r| {
            return r;
        } else |err| switch (err) {
            error.InvalidUtf8 => {
                @compileError("invalid utf8");
                unreachable;
            },
        }
    }

    /// Returns an iterator positioned at the first byte of the view.
    pub fn iterator(s: Utf8View) Utf8Iterator {
        return Utf8Iterator{
            .bytes = s.bytes,
            .i = 0,
        };
    }
};

/// Iterates over the codepoints of a byte slice that is assumed to be valid
/// UTF-8 (every decode failure is treated as unreachable).
pub const Utf8Iterator = struct {
    bytes: []const u8,
    i: usize,

    /// Returns the bytes of the next codepoint and advances past them,
    /// or null once the end of the input has been reached.
    pub fn nextCodepointSlice(it: *Utf8Iterator) ?[]const u8 {
        if (it.i >= it.bytes.len) {
            return null;
        }

        const cp_len = utf8ByteSequenceLength(it.bytes[it.i]) catch unreachable;
        it.i += cp_len;
        return it.bytes[it.i - cp_len .. it.i];
    }

    /// Returns the next decoded codepoint and advances past it,
    /// or null once the end of the input has been reached.
    pub fn nextCodepoint(it: *Utf8Iterator) ?u32 {
        const slice = it.nextCodepointSlice() orelse return null;

        switch (slice.len) {
            1 => return u32(slice[0]),
            2 => return utf8Decode2(slice) catch unreachable,
            3 => return utf8Decode3(slice) catch unreachable,
            4 => return utf8Decode4(slice) catch unreachable,
            else => unreachable,
        }
    }
};

/// Iterates over the codepoints of little-endian UTF-16 data, combining
/// surrogate pairs into a single codepoint.
pub const Utf16LeIterator = struct {
    bytes: []const u8,
    i: usize,

    /// Creates an iterator over s, reinterpreted as bytes.
    pub fn init(s: []const u16) Utf16LeIterator {
        return Utf16LeIterator{
            .bytes = @sliceToBytes(s),
            .i = 0,
        };
    }

    /// Returns the next codepoint, or null at the end of the input.
    /// Errors:
    ///  * DanglingSurrogateHalf: a high surrogate is the last code unit.
    ///  * ExpectedSecondSurrogateHalf: a high surrogate is not followed by a low one.
    ///  * UnexpectedSecondSurrogateHalf: a low surrogate appears on its own.
    pub fn nextCodepoint(it: *Utf16LeIterator) !?u32 {
        assert(it.i <= it.bytes.len);
        if (it.i == it.bytes.len) return null;
        const c0: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]);
        // Masking off the low 10 bits leaves the surrogate range marker.
        if (c0 & ~u32(0x03ff) == 0xd800) {
            // surrogate pair
            it.i += 2;
            if (it.i >= it.bytes.len) return error.DanglingSurrogateHalf;
            const c1: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]);
            if (c1 & ~u32(0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf;
            it.i += 2;
            return 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff));
        } else if (c0 & ~u32(0x03ff) == 0xdc00) {
            return error.UnexpectedSecondSurrogateHalf;
        } else {
            it.i += 2;
            return c0;
        }
    }
};

test "utf8 encode" {
    comptime testUtf8Encode() catch unreachable;
    try testUtf8Encode();
}
fn testUtf8Encode() !void {
    // A few taken from wikipedia a few taken elsewhere
    var array: [4]u8 = undefined;
    testing.expect((try utf8Encode(try utf8Decode("€"), array[0..])) == 3);
    testing.expect(array[0] == 0b11100010);
    testing.expect(array[1] == 0b10000010);
    testing.expect(array[2] == 0b10101100);

    testing.expect((try utf8Encode(try utf8Decode("$"), array[0..])) == 1);
    testing.expect(array[0] == 0b00100100);

    testing.expect((try utf8Encode(try utf8Decode("¢"), array[0..])) == 2);
    testing.expect(array[0] == 0b11000010);
    testing.expect(array[1] == 0b10100010);

    testing.expect((try utf8Encode(try utf8Decode("𐍈"), array[0..])) == 4);
    testing.expect(array[0] == 0b11110000);
    testing.expect(array[1] == 0b10010000);
    testing.expect(array[2] == 0b10001101);
    testing.expect(array[3] == 0b10001000);
}

test "utf8 encode error" {
    comptime testUtf8EncodeError();
    testUtf8EncodeError();
}
fn testUtf8EncodeError() void {
    var array: [4]u8 = undefined;
    testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf);
    testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf);
    testErrorEncode(0x110000, array[0..], error.CodepointTooLarge);
    testErrorEncode(0xffffffff, array[0..], error.CodepointTooLarge);
}

fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: anyerror) void {
    testing.expectError(expectedErr, utf8Encode(codePoint, array));
}

test "utf8 iterator on ascii" {
    comptime testUtf8IteratorOnAscii();
    testUtf8IteratorOnAscii();
}
fn testUtf8IteratorOnAscii() void {
    const s = Utf8View.initComptime("abc");

    var it1 = s.iterator();
    testing.expect(std.mem.eql(u8, "a", it1.nextCodepointSlice().?));
    testing.expect(std.mem.eql(u8, "b", it1.nextCodepointSlice().?));
    testing.expect(std.mem.eql(u8, "c", it1.nextCodepointSlice().?));
    testing.expect(it1.nextCodepointSlice() == null);

    var it2 = s.iterator();
    testing.expect(it2.nextCodepoint().? == 'a');
    testing.expect(it2.nextCodepoint().? == 'b');
    testing.expect(it2.nextCodepoint().? == 'c');
    testing.expect(it2.nextCodepoint() == null);
}

test "utf8 view bad" {
    comptime testUtf8ViewBad();
    testUtf8ViewBad();
}
fn testUtf8ViewBad() void {
    // Compile-time error.
    // const s3 = Utf8View.initComptime("\xfe\xf2");
    testing.expectError(error.InvalidUtf8, Utf8View.init("hel\xadlo"));
}

test "utf8 view ok" {
    comptime testUtf8ViewOk();
    testUtf8ViewOk();
}
fn testUtf8ViewOk() void {
    const s = Utf8View.initComptime("東京市");

    var it1 = s.iterator();
    testing.expect(std.mem.eql(u8, "東", it1.nextCodepointSlice().?));
    testing.expect(std.mem.eql(u8, "京", it1.nextCodepointSlice().?));
    testing.expect(std.mem.eql(u8, "市", it1.nextCodepointSlice().?));
    testing.expect(it1.nextCodepointSlice() == null);

    var it2 = s.iterator();
    testing.expect(it2.nextCodepoint().? == 0x6771);
    testing.expect(it2.nextCodepoint().? == 0x4eac);
    testing.expect(it2.nextCodepoint().? == 0x5e02);
    testing.expect(it2.nextCodepoint() == null);
}

test "bad utf8 slice" {
    comptime testBadUtf8Slice();
    testBadUtf8Slice();
}
fn testBadUtf8Slice() void {
    testing.expect(utf8ValidateSlice("abc"));
    testing.expect(!utf8ValidateSlice("abc\xc0"));
    testing.expect(!utf8ValidateSlice("abc\xc0abc"));
    testing.expect(utf8ValidateSlice("abc\xdf\xbf"));
}

test "valid utf8" {
    comptime testValidUtf8();
    testValidUtf8();
}
fn testValidUtf8() void {
    testValid("\x00", 0x0);
    testValid("\x20", 0x20);
    testValid("\x7f", 0x7f);
    testValid("\xc2\x80", 0x80);
    testValid("\xdf\xbf", 0x7ff);
    testValid("\xe0\xa0\x80", 0x800);
    testValid("\xe1\x80\x80", 0x1000);
    testValid("\xef\xbf\xbf", 0xffff);
    testValid("\xf0\x90\x80\x80", 0x10000);
    testValid("\xf1\x80\x80\x80", 0x40000);
    testValid("\xf3\xbf\xbf\xbf", 0xfffff);
    testValid("\xf4\x8f\xbf\xbf", 0x10ffff);
}

test "invalid utf8 continuation bytes" {
    comptime testInvalidUtf8ContinuationBytes();
    testInvalidUtf8ContinuationBytes();
}
fn testInvalidUtf8ContinuationBytes() void {
    // unexpected continuation
    testError("\x80", error.Utf8InvalidStartByte);
    testError("\xbf", error.Utf8InvalidStartByte);
    // too many leading 1's
    testError("\xf8", error.Utf8InvalidStartByte);
    testError("\xff", error.Utf8InvalidStartByte);
    // expected continuation for 2 byte sequences
    testError("\xc2", error.UnexpectedEof);
    testError("\xc2\x00", error.Utf8ExpectedContinuation);
    testError("\xc2\xc0", error.Utf8ExpectedContinuation);
    // expected continuation for 3 byte sequences
    testError("\xe0", error.UnexpectedEof);
    testError("\xe0\x00", error.UnexpectedEof);
    testError("\xe0\xc0", error.UnexpectedEof);
    testError("\xe0\xa0", error.UnexpectedEof);
    testError("\xe0\xa0\x00", error.Utf8ExpectedContinuation);
    testError("\xe0\xa0\xc0", error.Utf8ExpectedContinuation);
    // expected continuation for 4 byte sequences
    testError("\xf0", error.UnexpectedEof);
    testError("\xf0\x00", error.UnexpectedEof);
    testError("\xf0\xc0", error.UnexpectedEof);
    testError("\xf0\x90\x00", error.UnexpectedEof);
    testError("\xf0\x90\xc0", error.UnexpectedEof);
    testError("\xf0\x90\x80\x00", error.Utf8ExpectedContinuation);
    testError("\xf0\x90\x80\xc0", error.Utf8ExpectedContinuation);
}

test "overlong utf8 codepoint" {
    comptime testOverlongUtf8Codepoint();
    testOverlongUtf8Codepoint();
}
fn testOverlongUtf8Codepoint() void {
    testError("\xc0\x80", error.Utf8OverlongEncoding);
    testError("\xc1\xbf", error.Utf8OverlongEncoding);
    testError("\xe0\x80\x80", error.Utf8OverlongEncoding);
    testError("\xe0\x9f\xbf", error.Utf8OverlongEncoding);
    testError("\xf0\x80\x80\x80", error.Utf8OverlongEncoding);
    testError("\xf0\x8f\xbf\xbf", error.Utf8OverlongEncoding);
}

test "misc invalid utf8" {
    comptime testMiscInvalidUtf8();
    testMiscInvalidUtf8();
}
fn testMiscInvalidUtf8() void {
    // codepoint out of bounds
    testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge);
    testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge);
    // surrogate halves
    testValid("\xed\x9f\xbf", 0xd7ff);
    testError("\xed\xa0\x80", error.Utf8EncodesSurrogateHalf);
    testError("\xed\xbf\xbf", error.Utf8EncodesSurrogateHalf);
    testValid("\xee\x80\x80", 0xe000);
}

fn testError(bytes: []const u8, expected_err: anyerror) void {
    testing.expectError(expected_err, testDecode(bytes));
}

fn testValid(bytes: []const u8, expected_codepoint: u32) void {
    testing.expect((testDecode(bytes) catch unreachable) == expected_codepoint);
}

fn testDecode(bytes: []const u8) !u32 {
    const length = try utf8ByteSequenceLength(bytes[0]);
    if (bytes.len < length) return error.UnexpectedEof;
    testing.expect(bytes.len == length);
    return utf8Decode(bytes);
}

/// Converts little-endian UTF-16 to a newly allocated UTF-8 slice.
/// Caller must free returned memory.
/// Errors: allocation failure, or any error from Utf16LeIterator.nextCodepoint.
/// Nothing is leaked when an error is returned.
pub fn utf16leToUtf8Alloc(allocator: *mem.Allocator, utf16le: []const u16) ![]u8 {
    var result = std.ArrayList(u8).init(allocator);
    // Release the partially built output if allocation or decoding fails below.
    errdefer result.deinit();
    // optimistically guess that it will all be ascii.
    try result.ensureCapacity(utf16le.len);
    var out_index: usize = 0;
    var it = Utf16LeIterator.init(utf16le);
    while (try it.nextCodepoint()) |codepoint| {
        // The iterator never yields surrogate halves or values above 0x10FFFF,
        // so neither the length computation nor the encoding can fail.
        const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
        try result.resize(result.len + utf8_len);
        const written = utf8Encode(codepoint, result.items[out_index..]) catch unreachable;
        assert(written == utf8_len);
        out_index += utf8_len;
    }

    return result.toOwnedSlice();
}

/// Converts little-endian UTF-16 to UTF-8, writing into the utf8 buffer.
/// Asserts that the output buffer is big enough.
/// Returns end byte index into utf8.
pub fn utf16leToUtf8(utf8: []u8, utf16le: []const u16) !usize {
    var end_index: usize = 0;
    var it = Utf16LeIterator.init(utf16le);
    while (try it.nextCodepoint()) |codepoint| {
        end_index += try utf8Encode(codepoint, utf8[end_index..]);
    }
    return end_index;
}

test "utf16leToUtf8" {
    var utf16le: [2]u16 = undefined;
    const utf16le_as_bytes = @sliceToBytes(utf16le[0..]);

    {
        mem.writeIntSliceLittle(u16, utf16le_as_bytes[0..], 'A');
        mem.writeIntSliceLittle(u16, utf16le_as_bytes[2..], 'a');
        const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le);
        testing.expect(mem.eql(u8, utf8, "Aa"));
    }

    {
        mem.writeIntSliceLittle(u16, utf16le_as_bytes[0..], 0x80);
        mem.writeIntSliceLittle(u16, utf16le_as_bytes[2..], 0xffff);
        const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le);
        testing.expect(mem.eql(u8, utf8, "\xc2\x80" ++ "\xef\xbf\xbf"));
    }

    {
        // the values just outside the surrogate half range
        mem.writeIntSliceLittle(u16, utf16le_as_bytes[0..], 0xd7ff);
        mem.writeIntSliceLittle(u16, utf16le_as_bytes[2..], 0xe000);
        const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le);
        testing.expect(mem.eql(u8, utf8, "\xed\x9f\xbf" ++ "\xee\x80\x80"));
    }

    {
        // smallest surrogate pair
        mem.writeIntSliceLittle(u16, utf16le_as_bytes[0..], 0xd800);
        mem.writeIntSliceLittle(u16, utf16le_as_bytes[2..], 0xdc00);
        const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le);
        testing.expect(mem.eql(u8, utf8, "\xf0\x90\x80\x80"));
    }

    {
        // largest surrogate pair
        mem.writeIntSliceLittle(u16, utf16le_as_bytes[0..], 0xdbff);
        mem.writeIntSliceLittle(u16, utf16le_as_bytes[2..], 0xdfff);
        const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le);
        testing.expect(mem.eql(u8, utf8, "\xf4\x8f\xbf\xbf"));
    }

    {
        mem.writeIntSliceLittle(u16, utf16le_as_bytes[0..], 0xdbff);
        mem.writeIntSliceLittle(u16, utf16le_as_bytes[2..], 0xdc00);
        const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le);
        testing.expect(mem.eql(u8, utf8, "\xf4\x8f\xb0\x80"));
    }
}

/// Converts a native-endian UTF-16 code unit to little-endian storage order.
fn nativeToLittleU16(unit: u16) u16 {
    return switch (builtin.endian) {
        .Little => unit,
        .Big => @byteSwap(u16, unit),
    };
}

/// Converts UTF-8 to a newly allocated, zero-terminated little-endian UTF-16
/// slice. The terminating 0 is included in the returned slice.
/// Codepoints above 0xffff are written as a surrogate pair.
/// Caller must free returned memory.
/// Errors: allocation failure, or error.InvalidUtf8 if utf8 is not valid.
/// Nothing is leaked when an error is returned.
/// TODO type for null terminated pointer
pub fn utf8ToUtf16LeWithNull(allocator: *mem.Allocator, utf8: []const u8) ![]u16 {
    var result = std.ArrayList(u16).init(allocator);
    // Release the partially built output if validation or allocation fails below.
    errdefer result.deinit();
    // optimistically guess that it will not require surrogate pairs
    try result.ensureCapacity(utf8.len + 1);

    const view = try Utf8View.init(utf8);
    var it = view.iterator();
    while (it.nextCodepoint()) |codepoint| {
        if (codepoint < 0x10000) {
            try result.append(nativeToLittleU16(@intCast(u16, codepoint)));
        } else {
            // Split the 20 bits above 0x10000 into a high and a low surrogate.
            const reduced = codepoint - 0x10000;
            try result.append(nativeToLittleU16(@intCast(u16, 0xd800 + (reduced >> 10))));
            try result.append(nativeToLittleU16(@intCast(u16, 0xdc00 + (reduced & 0x03ff))));
        }
    }

    try result.append(0);
    return result.toOwnedSlice();
}

/// Converts UTF-8 to little-endian UTF-16, writing into the utf16le buffer.
/// Returns index of next character. If exact fit, returned index equals output slice length.
/// Assumes there is enough space for the output. Codepoints above 0xffff are
/// written as a surrogate pair and therefore occupy two elements of utf16le.
/// Returns error.InvalidUtf8 for an invalid start byte, a sequence that is
/// cut off by the end of the input, or a sequence that fails to decode.
pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
    var dest_i: usize = 0;
    var src_i: usize = 0;
    while (src_i < utf8.len) {
        const byte = utf8[src_i];
        // The number of leading 1 bits in the start byte is the sequence length
        // (0 for ASCII; 1 means a stray continuation byte).
        const n = @clz(u8, ~byte);
        switch (n) {
            0 => {
                utf16le[dest_i] = nativeToLittleU16(byte);
                dest_i += 1;
                src_i += 1;
                continue;
            },
            2, 3, 4 => {
                const next_src_i = src_i + n;
                // A truncated sequence must not be sliced past the end of the input.
                if (next_src_i > utf8.len) return error.InvalidUtf8;
                const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch return error.InvalidUtf8;
                if (codepoint < 0x10000) {
                    utf16le[dest_i] = nativeToLittleU16(@intCast(u16, codepoint));
                    dest_i += 1;
                } else {
                    // Split the 20 bits above 0x10000 into a high and a low surrogate.
                    const reduced = codepoint - 0x10000;
                    utf16le[dest_i] = nativeToLittleU16(@intCast(u16, 0xd800 + (reduced >> 10)));
                    utf16le[dest_i + 1] = nativeToLittleU16(@intCast(u16, 0xdc00 + (reduced & 0x03ff)));
                    dest_i += 2;
                }
                src_i = next_src_i;
            },
            else => return error.InvalidUtf8,
        }
    }
    return dest_i;
}

test "utf8ToUtf16Le" {
    var utf16le: [2]u16 = undefined;

    {
        const length = try utf8ToUtf16Le(utf16le[0..], "Aa");
        testing.expect(length == 2);
        testing.expect(mem.eql(u8, @sliceToBytes(utf16le[0..]), "A\x00a\x00"));
    }

    {
        // a codepoint above 0xffff becomes a surrogate pair
        const length = try utf8ToUtf16Le(utf16le[0..], "𐍈");
        testing.expect(length == 2);
        testing.expect(mem.eql(u8, @sliceToBytes(utf16le[0..]), "\x00\xd8\x48\xdf"));
    }

    {
        // a sequence cut off by the end of the input is an error
        testing.expectError(error.InvalidUtf8, utf8ToUtf16Le(utf16le[0..], "a\xc2"));
        testing.expectError(error.InvalidUtf8, utf8ToUtf16Le(utf16le[0..], "\xe2\x82"));
    }
}

test "utf8ToUtf16LeWithNull" {
    {
        const utf16 = try utf8ToUtf16LeWithNull(std.debug.global_allocator, "𐍈");
        testing.expect(utf16.len == 3);
        testing.expect(mem.eql(u8, @sliceToBytes(utf16), "\x00\xd8\x48\xdf\x00\x00"));
    }

    {
        testing.expectError(error.InvalidUtf8, utf8ToUtf16LeWithNull(std.debug.global_allocator, "hel\xadlo"));
    }
}
