diff options
| author | Ersikan <julien.philippon@epitech.eu> | 2021-03-14 18:07:09 +0100 |
|---|---|---|
| committer | Ersikan <julien.philippon@epitech.eu> | 2021-03-17 10:27:26 +0100 |
| commit | 8942243f7a825e42c16c8d210f5f9dc3baa76b2f (patch) | |
| tree | ae67b24aeacd992e1a382e349d9b51d474bf3307 /src | |
| parent | a3540000909bdc6a59ba07c85d21afeb3a7e54e2 (diff) | |
| download | zig-8942243f7a825e42c16c8d210f5f9dc3baa76b2f.tar.gz zig-8942243f7a825e42c16c8d210f5f9dc3baa76b2f.zip | |
zig fmt: factorize source file reading and decoding
Now reading a source file and decoding it from UTF-16LE to UTF-8 is
done in a single function. Error messages are improved, and an error is
emitted when the source file has an unsupported BOM (UTF-16BE, UTF-32).
Please note that the BOM of UTF-32 is composed of the same bytes as the
BOM of UTF-16 followed by a null character. Therefore a source file in
UTF-16LE starting with a null byte will be interpreted as UTF-32, and
rejected because of an invalid format. In practice this is not a problem,
as the code would have been rejected later anyway because of the null
character.
Diffstat (limited to 'src')
| -rw-r--r-- | src/main.zig | 102 |
1 files changed, 55 insertions, 47 deletions
diff --git a/src/main.zig b/src/main.zig index 1625b7213f..be227a2895 100644 --- a/src/main.zig +++ b/src/main.zig @@ -2637,6 +2637,50 @@ fn argvCmd(allocator: *Allocator, argv: []const []const u8) ![]u8 { return cmd.toOwnedSlice(); } +fn readSourceFileToEndAlloc(allocator: *mem.Allocator, input: *const fs.File, size_hint: ?usize) ![]const u8 { + const source_code = input.readToEndAllocOptions( + allocator, + max_src_size, + size_hint, + @alignOf(u16), + null, + ) catch |err| switch (err) { + error.ConnectionResetByPeer => unreachable, + error.ConnectionTimedOut => unreachable, + error.NotOpenForReading => unreachable, + else => |e| return e, + }; + errdefer allocator.free(source_code); + + // Detect unsupported file types with their Byte Order Mark + const unsupported_boms = [_][]const u8{ + "\xff\xfe\x00\x00", // UTF-32 little endian + "\xfe\xff\x00\x00", // UTF-32 big endian + "\xfe\xff", // UTF-16 big endian + }; + for (unsupported_boms) |bom| { + if (mem.startsWith(u8, source_code, bom)) { + return error.UnsupportedEncoding; + } + } + + // If the file starts with a UTF-16 little endian BOM, translate it to UTF-8 + if (mem.startsWith(u8, source_code, "\xff\xfe")) { + const source_code_utf16_le = mem.bytesAsSlice(u16, source_code); + const source_code_utf8 = std.unicode.utf16leToUtf8Alloc(allocator, source_code_utf16_le) catch |err| switch (err) { + error.DanglingSurrogateHalf => error.UnsupportedEncoding, + error.ExpectedSecondSurrogateHalf => error.UnsupportedEncoding, + error.UnexpectedSecondSurrogateHalf => error.UnsupportedEncoding, + else => |e| return e, + }; + + allocator.free(source_code); + return source_code_utf8; + } + + return source_code; +} + pub const usage_fmt = \\Usage: zig fmt [file]... 
\\ @@ -2709,20 +2753,8 @@ pub fn cmdFmt(gpa: *Allocator, args: []const []const u8) !void { } const stdin = io.getStdIn(); - - const source_code = blk: { - const source_code = try stdin.readToEndAllocOptions(gpa, max_src_size, null, @alignOf(u16), null); - errdefer gpa.free(source_code); - - // If the file starts with a UTF-16 BOM, translate it to UTF-8 - if (mem.startsWith(u8, source_code, "\xff\xfe")) { - const source_code_utf16_le = mem.bytesAsSlice(u16, source_code); - const source_code_utf8 = try std.unicode.utf16leToUtf8Alloc(gpa, source_code_utf16_le); - gpa.free(source_code); - break :blk source_code_utf8; - } else { - break :blk source_code; - } + const source_code = readSourceFileToEndAlloc(gpa, &stdin, null) catch |err| { + fatal("unable to read stdin: {s}", .{err}); }; defer gpa.free(source_code); @@ -2798,7 +2830,7 @@ const FmtError = error{ EndOfStream, Unseekable, NotOpenForWriting, - UnknownTextFormat, + UnsupportedEncoding, } || fs.File.OpenError; fn fmtPath(fmt: *Fmt, file_path: []const u8, check_mode: bool, dir: fs.Dir, sub_path: []const u8) FmtError!void { @@ -2864,40 +2896,16 @@ fn fmtPathFile( if (stat.kind == .Directory) return error.IsDir; - const source_code = blk: { - const source_code = source_file.readToEndAllocOptions( - fmt.gpa, - max_src_size, - std.math.cast(usize, stat.size) catch return error.FileTooBig, - @alignOf(u16), - null, - ) catch |err| switch (err) { - error.ConnectionResetByPeer => unreachable, - error.ConnectionTimedOut => unreachable, - error.NotOpenForReading => unreachable, - else => |e| return e, - }; - source_file.close(); - file_closed = true; - errdefer fmt.gpa.free(source_code); - - // If the file starts with a UTF-16 BOM, translate it to UTF-8 - if (mem.eql(u8, source_code[0..2], "\xff\xfe")) { - const source_code_utf16_le = mem.bytesAsSlice(u16, source_code); - const source_code_utf8 = std.unicode.utf16leToUtf8Alloc(fmt.gpa, source_code_utf16_le) catch |err| return switch (err) { - error.DanglingSurrogateHalf => 
FmtError.UnknownTextFormat, - error.ExpectedSecondSurrogateHalf => FmtError.UnknownTextFormat, - error.UnexpectedSecondSurrogateHalf => FmtError.UnknownTextFormat, - else => |e| e, - }; - fmt.gpa.free(source_code); - break :blk source_code_utf8; - } else { - break :blk source_code; - } - }; + const source_code = try readSourceFileToEndAlloc( + fmt.gpa, + &source_file, + std.math.cast(usize, stat.size) catch return error.FileTooBig, + ); defer fmt.gpa.free(source_code); + source_file.close(); + file_closed = true; + // Add to set after no longer possible to get error.IsDir. if (try fmt.seen.fetchPut(stat.inode, {})) |_| return; |
