lib/compiler/resinator/disjoint_code_page.zig


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99

const std = @import("std");
const lex = @import("lex.zig");
const SourceMappings = @import("source_mapping.zig").SourceMappings;
const SupportedCodePage = @import("code_pages.zig").SupportedCodePage;

/// Reports whether `source` begins with a "disjoint" code page pragma, i.e. a
/// `#pragma code_page(...)` line for which `lex.parsePragmaCodePage` yields a
/// code page, and which is preceded only by ignorable content.
///
/// Ignorable content is: spaces, tabs, NBSP, line breaks, and `#` lines that
/// `lex.parsePragmaCodePage` rejects with anything other than `error.NotPragma`
/// (for example a `#pragma` that is not a code page pragma, or one that names
/// an unsupported code page).
///
/// The result is `false` when:
///  - any other character is encountered first,
///  - a `#` line is not a pragma at all (`error.NotPragma`),
///  - the pragma parses to `null` (the DEFAULT code page),
///  - `source_mappings` is provided and a scanned line is not from the root file,
///  - no codepoint can be read at the current offset using `default_code_page`,
///  - or the end of `source` is reached without finding such a pragma.
pub fn hasDisjointCodePage(source: []const u8, source_mappings: ?*const SourceMappings, default_code_page: SupportedCodePage) bool {
    var line_handler = lex.LineHandler{ .buffer = source };
    var offset: usize = 0;
    while (offset < source.len) {
        const decoded = default_code_page.codepointAt(offset, source) orelse break;
        switch (decoded.value) {
            '\r', '\n' => {
                _ = line_handler.incrementLineNumber(offset);
                // A line that does not originate from the root file interrupts
                // the disjoint code page.
                if (source_mappings) |mappings| {
                    if (!mappings.isRootFile(line_handler.line_number)) return false;
                }
            },

            // Plain whitespace is skipped. NBSP (U+00A0) is included here because the
            // lexer special-cases it as whitespace (see the TODO in lex.zig), even though
            // it arguably belongs with the codepoints listed in the TODO below.
            ' ', '\t', '\u{A0}' => {},

            // TODO: The Win32 RC preprocessor treats the following codepoints as whitespace
            //       and therefore trims them from the file during preprocessing:
            //         U+1680, U+180E, U+2001 through U+200A, U+2028, U+2029,
            //         U+202F, U+205F, U+3000
            //       Strictly speaking they should be skipped just like ' ' and '\t' above.
            //       However, the resinator preprocessor neither treats them as whitespace
            //       nor trims whitespace, so files containing them are likely to error
            //       anyway. Emulating (or rejecting) the Win32 behavior may make special
            //       handling worthwhile in the future; for now they are deliberately left
            //       to the catch-all prong for simplicity's sake.

            '#' => {
                if (source_mappings) |mappings| {
                    if (!mappings.isRootFile(line_handler.line_number)) return false;
                }

                // Consume the rest of the line. `offset` is left pointing at the line
                // terminator (or at the end of `source`), so that the `continue`s below
                // resume scanning from there.
                const directive_start = offset;
                while (offset < source.len) : (offset += 1) {
                    const byte = source[offset];
                    if (byte == '\r' or byte == '\n') break;
                }
                const directive = source[directive_start..offset];

                const parsed = lex.parsePragmaCodePage(directive) catch |err| switch (err) {
                    // Some other preprocessor directive: interrupts the disjoint code page.
                    error.NotPragma => return false,
                    // Unrelated or unusable pragmas are skipped over.
                    error.NotCodePagePragma,
                    error.CodePagePragmaUnsupportedCodePage,
                    => continue,
                    else => continue,
                };

                // An actual code page means this is a disjoint code page pragma;
                // null (DEFAULT) interrupts the disjoint code page.
                return parsed != null;
            },

            // Any other character interrupts the disjoint code page.
            else => return false,
        }

        offset += decoded.byte_len;
    }
    return false;
}

test hasDisjointCodePage {
    // Each case is scanned with no source mappings; `code_page` is the default
    // code page used to decode `source`.
    const Case = struct {
        source: []const u8,
        code_page: SupportedCodePage = .windows1252,
        disjoint: bool,
    };

    const cases = [_]Case{
        .{ .source = "#pragma code_page(65001)\n", .disjoint = true },
        // NBSP is a special case
        .{ .source = "\xA0\n#pragma code_page(65001)\n", .disjoint = true },
        .{ .source = "\u{A0}\n#pragma code_page(1252)\n", .code_page = .utf8, .disjoint = true },
        // other preprocessor commands don't interrupt
        .{ .source = "#pragma foo\n#pragma code_page(65001)\n", .disjoint = true },
        // invalid code page doesn't interrupt
        .{ .source = "#pragma code_page(1234567)\n#pragma code_page(65001)\n", .disjoint = true },

        // non-pragma directives and comments do interrupt
        .{ .source = "#if 1\n#endif\n#pragma code_page(65001)", .disjoint = false },
        .{ .source = "// comment\n#pragma code_page(65001)", .disjoint = false },
        .{ .source = "/* comment */\n#pragma code_page(65001)", .disjoint = false },
    };

    for (cases) |case| {
        const actual = hasDisjointCodePage(case.source, null, case.code_page);
        try std.testing.expect(actual == case.disjoint);
    }
}

test "multiline comment edge case" {
    // TODO: This test is intentionally skipped. It expects a code page pragma that
    //       directly follows a block comment on the same line to count as a disjoint
    //       code page, but hasDisjointCodePage currently returns false as soon as it
    //       reaches the `/` that opens the comment (it falls into the catch-all prong).
    // `if (true)` keeps the expectation below compiled even though it never runs.
    if (true) return error.SkipZigTest;

    try std.testing.expect(hasDisjointCodePage("/* comment */#pragma code_page(65001)", null, .windows1252));
}