diff options
| author | Andrew Kelley <superjoe30@gmail.com> | 2018-08-21 21:02:01 -0400 |
|---|---|---|
| committer | Andrew Kelley <superjoe30@gmail.com> | 2018-08-21 21:02:01 -0400 |
| commit | 3d780cf2ef8391b6b48124f599858ee99ddc4cdc (patch) | |
| tree | 5e073a9784a6fa4699e0eca9a3eb0148756e6722 /std/unicode.zig | |
| parent | b2917e6be09138adcf7cfdab51a1909a30eec320 (diff) | |
| parent | 3dd1026c8bcb438228c336add7cc4014552aa05c (diff) | |
| download | zig-3d780cf2ef8391b6b48124f599858ee99ddc4cdc.tar.gz zig-3d780cf2ef8391b6b48124f599858ee99ddc4cdc.zip | |
Merge branch 'shawnl-path_max'
This does a proof of concept of changing most file system APIs to not
require an allocator and remove the possibility of failure via
OutOfMemory.
This also does most of the work of #534.
Diffstat (limited to 'std/unicode.zig')
| -rw-r--r-- | std/unicode.zig | 102 |
1 files changed, 71 insertions, 31 deletions
diff --git a/std/unicode.zig b/std/unicode.zig index 0e7b4cdc3e..105c38627f 100644 --- a/std/unicode.zig +++ b/std/unicode.zig @@ -218,7 +218,6 @@ const Utf8Iterator = struct { } const cp_len = utf8ByteSequenceLength(it.bytes[it.i]) catch unreachable; - it.i += cp_len; return it.bytes[it.i - cp_len .. it.i]; } @@ -236,6 +235,38 @@ const Utf8Iterator = struct { } }; +pub const Utf16LeIterator = struct { + bytes: []const u8, + i: usize, + + pub fn init(s: []const u16) Utf16LeIterator { + return Utf16LeIterator{ + .bytes = @sliceToBytes(s), + .i = 0, + }; + } + + pub fn nextCodepoint(it: *Utf16LeIterator) !?u32 { + assert(it.i <= it.bytes.len); + if (it.i == it.bytes.len) return null; + const c0: u32 = mem.readIntLE(u16, it.bytes[it.i .. it.i + 2]); + if (c0 & ~u32(0x03ff) == 0xd800) { + // surrogate pair + it.i += 2; + if (it.i >= it.bytes.len) return error.DanglingSurrogateHalf; + const c1: u32 = mem.readIntLE(u16, it.bytes[it.i .. it.i + 2]); + if (c1 & ~u32(0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf; + it.i += 2; + return 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)); + } else if (c0 & ~u32(0x03ff) == 0xdc00) { + return error.UnexpectedSecondSurrogateHalf; + } else { + it.i += 2; + return c0; + } + } +}; + test "utf8 encode" { comptime testUtf8Encode() catch unreachable; try testUtf8Encode(); @@ -446,42 +477,34 @@ fn testDecode(bytes: []const u8) !u32 { return utf8Decode(bytes); } -// TODO: make this API on top of a non-allocating Utf16LeView -pub fn utf16leToUtf8(allocator: *mem.Allocator, utf16le: []const u16) ![]u8 { +/// Caller must free returned memory. +pub fn utf16leToUtf8Alloc(allocator: *mem.Allocator, utf16le: []const u16) ![]u8 { var result = std.ArrayList(u8).init(allocator); // optimistically guess that it will all be ascii. try result.ensureCapacity(utf16le.len); - - const utf16le_as_bytes = @sliceToBytes(utf16le); - var i: usize = 0; var out_index: usize = 0; - while (i < utf16le_as_bytes.len) : (i += 2) { - // decode - const c0: u32 = mem.readIntLE(u16, utf16le_as_bytes[i..i + 2]); - var codepoint: u32 = undefined; - if (c0 & ~u32(0x03ff) == 0xd800) { - // surrogate pair - i += 2; - if (i >= utf16le_as_bytes.len) return error.DanglingSurrogateHalf; - const c1: u32 = mem.readIntLE(u16, utf16le_as_bytes[i..i + 2]); - if (c1 & ~u32(0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf; - codepoint = 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)); - } else if (c0 & ~u32(0x03ff) == 0xdc00) { - return error.UnexpectedSecondSurrogateHalf; - } else { - codepoint = c0; - } - - // encode + var it = Utf16LeIterator.init(utf16le); + while (try it.nextCodepoint()) |codepoint| { const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable; try result.resize(result.len + utf8_len); - _ = utf8Encode(codepoint, result.items[out_index..]) catch unreachable; + assert((utf8Encode(codepoint, result.items[out_index..]) catch unreachable) == utf8_len); out_index += utf8_len; } return result.toOwnedSlice(); } +/// Asserts that the output buffer is big enough. +/// Returns end byte index into utf8. +pub fn utf16leToUtf8(utf8: []u8, utf16le: []const u16) !usize { + var end_index: usize = 0; + var it = Utf16LeIterator.init(utf16le); + while (try it.nextCodepoint()) |codepoint| { + end_index += try utf8Encode(codepoint, utf8[end_index..]); + } + return end_index; +} + test "utf16leToUtf8" { var utf16le: [2]u16 = undefined; const utf16le_as_bytes = @sliceToBytes(utf16le[0..]); @@ -489,14 +512,14 @@ test "utf16leToUtf8" { { mem.writeInt(utf16le_as_bytes[0..], u16('A'), builtin.Endian.Little); mem.writeInt(utf16le_as_bytes[2..], u16('a'), builtin.Endian.Little); - const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le); + const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le); assert(mem.eql(u8, utf8, "Aa")); } { mem.writeInt(utf16le_as_bytes[0..], u16(0x80), builtin.Endian.Little); mem.writeInt(utf16le_as_bytes[2..], u16(0xffff), builtin.Endian.Little); - const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le); + const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le); assert(mem.eql(u8, utf8, "\xc2\x80" ++ "\xef\xbf\xbf")); } @@ -504,7 +527,7 @@ test "utf16leToUtf8" { // the values just outside the surrogate half range mem.writeInt(utf16le_as_bytes[0..], u16(0xd7ff), builtin.Endian.Little); mem.writeInt(utf16le_as_bytes[2..], u16(0xe000), builtin.Endian.Little); - const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le); + const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le); assert(mem.eql(u8, utf8, "\xed\x9f\xbf" ++ "\xee\x80\x80")); } @@ -512,7 +535,7 @@ test "utf16leToUtf8" { // smallest surrogate pair mem.writeInt(utf16le_as_bytes[0..], u16(0xd800), builtin.Endian.Little); mem.writeInt(utf16le_as_bytes[2..], u16(0xdc00), builtin.Endian.Little); - const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le); + const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le); assert(mem.eql(u8, utf8, "\xf0\x90\x80\x80")); } @@ -520,14 +543,14 @@ test "utf16leToUtf8" { // largest surrogate pair mem.writeInt(utf16le_as_bytes[0..], u16(0xdbff), builtin.Endian.Little); mem.writeInt(utf16le_as_bytes[2..], u16(0xdfff), builtin.Endian.Little); - const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le); + const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le); assert(mem.eql(u8, utf8, "\xf4\x8f\xbf\xbf")); } { mem.writeInt(utf16le_as_bytes[0..], u16(0xdbff), builtin.Endian.Little); mem.writeInt(utf16le_as_bytes[2..], u16(0xdc00), builtin.Endian.Little); - const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le); + const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le); assert(mem.eql(u8, utf8, "\xf4\x8f\xb0\x80")); } } @@ -548,3 +571,20 @@ pub fn utf8ToUtf16LeWithNull(allocator: *mem.Allocator, utf8: []const u8) ![]u16 try result.append(0); return result.toOwnedSlice(); } + +/// Returns index of next character. If exact fit, returned index equals output slice length. +/// If ran out of room, returned index equals output slice length + 1. +/// TODO support codepoints bigger than 16 bits +pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize { + const utf16le_as_bytes = @sliceToBytes(utf16le[0..]); + var end_index: usize = 0; + + var it = (try Utf8View.init(utf8)).iterator(); + while (it.nextCodepoint()) |codepoint| { + if (end_index == utf16le_as_bytes.len) return (end_index / 2) + 1; + // TODO surrogate pairs + mem.writeInt(utf16le_as_bytes[end_index..], @intCast(u16, codepoint), builtin.Endian.Little); + end_index += 2; + } + return end_index / 2; +} |
