Merge remote-tracking branch 'origin/master' into llvm14

author: Andrew Kelley <andrew@ziglang.org> 2022-07-01 15:52:54 -0700
committer: Andrew Kelley <andrew@ziglang.org> 2022-07-01 15:52:54 -0700
commit: c89dd15e1be4959800dc7092d7dd4375253db7bc (patch)
tree: ca184ae53592efa21e67128a5f891d642d7f1118 /lib/std/unicode.zig
parent: 5466e87fce581f2ef90ac23bb80b1dbc05836fc6 (diff)
parent: 2360f8c490f3ec684ed64ff28e8c1fade249070b (diff)
download: zig-c89dd15e1be4959800dc7092d7dd4375253db7bc.tar.gz
zig-c89dd15e1be4959800dc7092d7dd4375253db7bc.zip
1 file changed, 8 insertions, 11 deletions
diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
index e0a000dfe5..81a7ed838f 100644
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@@ -3,6 +3,11 @@ const assert = std.debug.assert;
 const testing = std.testing;
 const mem = std.mem;
 
+/// Use this to replace an unknown, unrecognized, or unrepresentable character.
+///
+/// See also: https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character
+pub const replacement_character: u21 = 0xFFFD;
+
 /// Returns how many bytes the UTF-8 representation would require
 /// for the given codepoint.
 pub fn utf8CodepointSequenceLength(c: u21) !u3 {
@@ -269,14 +274,7 @@ pub const Utf8Iterator = struct {
 
     pub fn nextCodepoint(it: *Utf8Iterator) ?u21 {
         const slice = it.nextCodepointSlice() orelse return null;
-
-        switch (slice.len) {
-            1 => return @as(u21, slice[0]),
-            2 => return utf8Decode2(slice) catch unreachable,
-            3 => return utf8Decode3(slice) catch unreachable,
-            4 => return utf8Decode4(slice) catch unreachable,
-            else => unreachable,
-        }
+        return utf8Decode(slice) catch unreachable;
     }
 
     /// Look ahead at the next n codepoints without advancing the iterator.
@@ -784,15 +782,14 @@ fn formatUtf16le(
     options: std.fmt.FormatOptions,
     writer: anytype,
 ) !void {
-    const unknown_codepoint = 0xfffd;
     _ = fmt;
     _ = options;
     var buf: [300]u8 = undefined; // just a random size I chose
     var it = Utf16LeIterator.init(utf16le);
     var u8len: usize = 0;
-    while (it.nextCodepoint() catch unknown_codepoint) |codepoint| {
+    while (it.nextCodepoint() catch replacement_character) |codepoint| {
         u8len += utf8Encode(codepoint, buf[u8len..]) catch
-            utf8Encode(unknown_codepoint, buf[u8len..]) catch unreachable;
+            utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
         if (u8len + 3 >= buf.len) {
             try writer.writeAll(buf[0..u8len]);
             u8len = 0;
author	Andrew Kelley <andrew@ziglang.org>	2022-07-01 15:52:54 -0700
committer	Andrew Kelley <andrew@ziglang.org>	2022-07-01 15:52:54 -0700
commit	c89dd15e1be4959800dc7092d7dd4375253db7bc (patch)
tree	ca184ae53592efa21e67128a5f891d642d7f1118 /lib/std/unicode.zig
parent	5466e87fce581f2ef90ac23bb80b1dbc05836fc6 (diff)
parent	2360f8c490f3ec684ed64ff28e8c1fade249070b (diff)
download	zig-c89dd15e1be4959800dc7092d7dd4375253db7bc.tar.gz zig-c89dd15e1be4959800dc7092d7dd4375253db7bc.zip