diff options
| author | Josh Wolfe <thejoshwolfe@gmail.com> | 2018-07-18 03:01:01 -0400 |
|---|---|---|
| committer | Josh Wolfe <thejoshwolfe@gmail.com> | 2018-07-18 03:01:01 -0400 |
| commit | 843529d23415efcf3fe894c48b3d08c0925edc08 (patch) | |
| tree | c8b30608b2d853cce0619f928c6de0e9d1df56ec /src-self-hosted | |
| parent | cbfe9a407776d3b56a4da2a3bc3108a2f31b9212 (diff) | |
| download | zig-843529d23415efcf3fe894c48b3d08c0925edc08.tar.gz zig-843529d23415efcf3fe894c48b3d08c0925edc08.zip | |
implement proper utf16leToUtf8
Diffstat (limited to 'src-self-hosted')
| -rw-r--r-- | src-self-hosted/compilation.zig | 89 |
1 files changed, 83 insertions, 6 deletions
diff --git a/src-self-hosted/compilation.zig b/src-self-hosted/compilation.zig index 5cb7565a98..5ffb43667d 100644 --- a/src-self-hosted/compilation.zig +++ b/src-self-hosted/compilation.zig @@ -4,6 +4,7 @@ const io = std.io; const mem = std.mem; const Allocator = mem.Allocator; const Buffer = std.Buffer; +const unicode = std.unicode; const llvm = @import("llvm.zig"); const c = @import("c.zig"); const builtin = @import("builtin"); @@ -998,14 +999,90 @@ test "getAppDataDir" { std.debug.warn("{}...", result); } -// TODO full utf-16 LE support +// TODO: put general purpose stuff in std.unicode fn utf16leToUtf8(allocator: *mem.Allocator, utf16le: []const u16) ![]u8 { - const utf8_bytes = try allocator.alloc(u8, utf16le.len); - for (utf16le) |codepoint, i| { - assert(codepoint < 127); // TODO full utf-16 LE support - utf8_bytes[i] = @intCast(u8, codepoint); + var result = ArrayList(u8).init(allocator); + // optimistically guess that it will all be ascii. + try result.ensureCapacity(utf16le.len); + + const utf16le_as_bytes = @sliceToBytes(utf16le); + var i: usize = 0; + var out_index: usize = 0; + while (i < utf16le_as_bytes.len) : (i += 2) { + // decode + const c0: u32 = mem.readIntLE(u16, utf16le_as_bytes[i..i + 2]); + var codepoint: u32 = undefined; + if (c0 & ~u32(0x03ff) == 0xd800) { + // surrogate pair + i += 2; + if (i >= utf16le_as_bytes.len) return error.DanglingSurrogateHalf; + const c1: u32 = mem.readIntLE(u16, utf16le_as_bytes[i..i + 2]); + if (c1 & ~u32(0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf; + codepoint = 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)); + } else if (c0 & ~u32(0x03ff) == 0xdc00) { + return error.UnexpectedSecondSurrogateHalf; + } else { + codepoint = c0; + } + + // encode + const utf8_len = unicode.utf8CodepointSequenceLength(codepoint) catch unreachable; + try result.resize(result.len + utf8_len); + _ = unicode.utf8Encode(codepoint, result.items[out_index..]) catch unreachable; + out_index += utf8_len; + } + + return result.toOwnedSlice(); +} + +test "utf16leToUtf8" { + var utf16le: [2]u16 = undefined; + const utf16le_as_bytes = @sliceToBytes(utf16le[0..]); + + { + mem.writeInt(utf16le_as_bytes[0..], u16('A'), builtin.Endian.Little); + mem.writeInt(utf16le_as_bytes[2..], u16('a'), builtin.Endian.Little); + const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le); + assert(mem.eql(u8, utf8, "Aa")); + } + + { + mem.writeInt(utf16le_as_bytes[0..], u16(0x80), builtin.Endian.Little); + mem.writeInt(utf16le_as_bytes[2..], u16(0xffff), builtin.Endian.Little); + const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le); + assert(mem.eql(u8, utf8, "\xc2\x80" ++ "\xef\xbf\xbf")); + } + + { + // the values just outside the surrogate half range + mem.writeInt(utf16le_as_bytes[0..], u16(0xd7ff), builtin.Endian.Little); + mem.writeInt(utf16le_as_bytes[2..], u16(0xe000), builtin.Endian.Little); + const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le); + assert(mem.eql(u8, utf8, "\xed\x9f\xbf" ++ "\xee\x80\x80")); + } + + { + // smallest surrogate pair + mem.writeInt(utf16le_as_bytes[0..], u16(0xd800), builtin.Endian.Little); + mem.writeInt(utf16le_as_bytes[2..], u16(0xdc00), builtin.Endian.Little); + const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le); + assert(mem.eql(u8, utf8, "\xf0\x90\x80\x80")); + } + + { + // largest surrogate pair + mem.writeInt(utf16le_as_bytes[0..], u16(0xdbff), builtin.Endian.Little); + mem.writeInt(utf16le_as_bytes[2..], u16(0xdfff), builtin.Endian.Little); + const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le); + assert(mem.eql(u8, utf8, "\xf4\x8f\xbf\xbf")); + } + + { + mem.writeInt(utf16le_as_bytes[0..], u16(0xdbff), builtin.Endian.Little); + mem.writeInt(utf16le_as_bytes[2..], u16(0xdc00), builtin.Endian.Little); + const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le); + assert(mem.eql(u8, utf8, "\xf4\x8f\xb0\x80")); } - return utf8_bytes; } fn utf16lePtrSlice(ptr: [*]const u16) []const u16 { |
