| field | value | date |
|---|---|---|
| author | Veikka Tuominen <git@vexu.eu> | 2023-10-02 07:08:53 +0300 |
| committer | GitHub <noreply@github.com> | 2023-10-02 07:08:53 +0300 |
| commit | fc4d53e2ea6b41440e37caf32d2fd236d0f58c93 | |
| tree | be400bc7033d3f198978ad04c05c14f15b8c5324 /deps/aro/Preprocessor.zig | |
| parent | 0f1652dc603ad43be733cfdd721cedf38d9e45d9 | |
| parent | 5792570197f44b2c7599fb756f5c1e9d59bd0a9a | |
Merge pull request #17221 from Vexu/aro-translate-c
Aro translate-c
Diffstat (limited to 'deps/aro/Preprocessor.zig')
| mode | path | changes |
|---|---|---|
| -rw-r--r-- | deps/aro/Preprocessor.zig | 2691 |

1 file changed, 2691 insertions(+), 0 deletions(-)
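The public surface of the new file, as visible in this diff, is small: `init`, `addBuiltinMacros`, `preprocess`, `deinit`, plus token helpers such as `tokSlice` and `expandedSlice`. Below is a minimal driver sketch using only those declarations; how the `*Compilation` and `Source` values are obtained is not part of this diff, so that wiring is assumed.

```zig
const std = @import("std");
const Compilation = @import("Compilation.zig");
const Source = @import("Source.zig");
const Preprocessor = @import("Preprocessor.zig");

/// Hypothetical driver: run the preprocessor over one already-loaded source file.
/// `comp` and `source` are assumed to come from surrounding compiler setup that
/// this diff does not show.
fn preprocessOne(comp: *Compilation, source: Source) !void {
    var pp = Preprocessor.init(comp);
    defer pp.deinit();

    // Registers __FILE__, __LINE__, __COUNTER__, __has_attribute, _Pragma, etc.
    try pp.addBuiltinMacros();

    // Appends the expanded tokens to pp.tokens and returns the eof token.
    const eof = try pp.preprocess(source);
    _ = eof;
}
```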
diff --git a/deps/aro/Preprocessor.zig b/deps/aro/Preprocessor.zig
new file mode 100644
index 0000000000..95758ae374
--- /dev/null
+++ b/deps/aro/Preprocessor.zig
@@ -0,0 +1,2691 @@
+const std = @import("std");
+const mem = std.mem;
+const Allocator = mem.Allocator;
+const assert = std.debug.assert;
+const Compilation = @import("Compilation.zig");
+const Error = Compilation.Error;
+const Source = @import("Source.zig");
+const Tokenizer = @import("Tokenizer.zig");
+const RawToken = Tokenizer.Token;
+const Parser = @import("Parser.zig");
+const Diagnostics = @import("Diagnostics.zig");
+const Token = @import("Tree.zig").Token;
+const Attribute = @import("Attribute.zig");
+const features = @import("features.zig");
+
+const Preprocessor = @This();
+const DefineMap = std.StringHashMap(Macro);
+const RawTokenList = std.ArrayList(RawToken);
+const max_include_depth = 200;
+
+/// Errors that can be returned when expanding a macro.
+/// error.UnknownPragma can occur within Preprocessor.pragma() but
+/// it is handled there and doesn't escape that function
+const MacroError = Error || error{StopPreprocessing};
+
+const Macro = struct {
+    /// Parameters of the function type macro
+    params: []const []const u8,
+
+    /// Token constituting the macro body
+    tokens: []const RawToken,
+
+    /// If the function type macro has variable number of arguments
+    var_args: bool,
+
+    /// Is a function type macro
+    is_func: bool,
+
+    /// Is a predefined macro
+    is_builtin: bool = false,
+
+    /// Location of macro in the source
+    /// `byte_offset` and `line` are used to define the range of tokens included
+    /// in the macro.
+    loc: Source.Location,
+
+    fn eql(a: Macro, b: Macro, pp: *Preprocessor) bool {
+        if (a.tokens.len != b.tokens.len) return false;
+        if (a.is_builtin != b.is_builtin) return false;
+        for (a.tokens, b.tokens) |a_tok, b_tok| if (!tokEql(pp, a_tok, b_tok)) return false;
+
+        if (a.is_func and b.is_func) {
+            if (a.var_args != b.var_args) return false;
+            if (a.params.len != b.params.len) return false;
+            for (a.params, b.params) |a_param, b_param| if (!mem.eql(u8, a_param, b_param)) return false;
+        }
+
+        return true;
+    }
+
+    fn tokEql(pp: *Preprocessor, a: RawToken, b: RawToken) bool {
+        return mem.eql(u8, pp.tokSlice(a), pp.tokSlice(b));
+    }
+};
+
+comp: *Compilation,
+gpa: mem.Allocator,
+arena: std.heap.ArenaAllocator,
+defines: DefineMap,
+tokens: Token.List = .{},
+token_buf: RawTokenList,
+char_buf: std.ArrayList(u8),
+/// Counter that is incremented each time preprocess() is called
+/// Can be used to distinguish multiple preprocessings of the same file
+preprocess_count: u32 = 0,
+generated_line: u32 = 1,
+add_expansion_nl: u32 = 0,
+include_depth: u8 = 0,
+counter: u32 = 0,
+expansion_source_loc: Source.Location = undefined,
+poisoned_identifiers: std.StringHashMap(void),
+/// Map from Source.Id to macro name in the `#ifndef` condition which guards the source, if any
+include_guards: std.AutoHashMapUnmanaged(Source.Id, []const u8) = .{},
+
+/// Memory is retained to avoid allocation on every single token.
+top_expansion_buf: ExpandBuf,
+
+/// Dump current state to stderr.
+verbose: bool = false, +preserve_whitespace: bool = false, + +pub fn init(comp: *Compilation) Preprocessor { + const pp = Preprocessor{ + .comp = comp, + .gpa = comp.gpa, + .arena = std.heap.ArenaAllocator.init(comp.gpa), + .defines = DefineMap.init(comp.gpa), + .token_buf = RawTokenList.init(comp.gpa), + .char_buf = std.ArrayList(u8).init(comp.gpa), + .poisoned_identifiers = std.StringHashMap(void).init(comp.gpa), + .top_expansion_buf = ExpandBuf.init(comp.gpa), + }; + comp.pragmaEvent(.before_preprocess); + return pp; +} + +const builtin_macros = struct { + const args = [1][]const u8{"X"}; + + const has_attribute = [1]RawToken{.{ + .id = .macro_param_has_attribute, + .source = .generated, + }}; + const has_warning = [1]RawToken{.{ + .id = .macro_param_has_warning, + .source = .generated, + }}; + const has_feature = [1]RawToken{.{ + .id = .macro_param_has_feature, + .source = .generated, + }}; + const has_extension = [1]RawToken{.{ + .id = .macro_param_has_extension, + .source = .generated, + }}; + const has_builtin = [1]RawToken{.{ + .id = .macro_param_has_builtin, + .source = .generated, + }}; + const has_include = [1]RawToken{.{ + .id = .macro_param_has_include, + .source = .generated, + }}; + const has_include_next = [1]RawToken{.{ + .id = .macro_param_has_include_next, + .source = .generated, + }}; + + const is_identifier = [1]RawToken{.{ + .id = .macro_param_is_identifier, + .source = .generated, + }}; + + const pragma_operator = [1]RawToken{.{ + .id = .macro_param_pragma_operator, + .source = .generated, + }}; + + const file = [1]RawToken{.{ + .id = .macro_file, + .source = .generated, + }}; + const line = [1]RawToken{.{ + .id = .macro_line, + .source = .generated, + }}; + const counter = [1]RawToken{.{ + .id = .macro_counter, + .source = .generated, + }}; +}; + +fn addBuiltinMacro(pp: *Preprocessor, name: []const u8, is_func: bool, tokens: []const RawToken) !void { + try pp.defines.putNoClobber(name, .{ + .params = &builtin_macros.args, + .tokens = tokens, + .var_args = false, + .is_func = is_func, + .loc = .{ .id = .generated }, + .is_builtin = true, + }); +} + +pub fn addBuiltinMacros(pp: *Preprocessor) !void { + try pp.addBuiltinMacro("__has_attribute", true, &builtin_macros.has_attribute); + try pp.addBuiltinMacro("__has_warning", true, &builtin_macros.has_warning); + try pp.addBuiltinMacro("__has_feature", true, &builtin_macros.has_feature); + try pp.addBuiltinMacro("__has_extension", true, &builtin_macros.has_extension); + try pp.addBuiltinMacro("__has_builtin", true, &builtin_macros.has_builtin); + try pp.addBuiltinMacro("__has_include", true, &builtin_macros.has_include); + try pp.addBuiltinMacro("__has_include_next", true, &builtin_macros.has_include_next); + try pp.addBuiltinMacro("__is_identifier", true, &builtin_macros.is_identifier); + try pp.addBuiltinMacro("_Pragma", true, &builtin_macros.pragma_operator); + + try pp.addBuiltinMacro("__FILE__", false, &builtin_macros.file); + try pp.addBuiltinMacro("__LINE__", false, &builtin_macros.line); + try pp.addBuiltinMacro("__COUNTER__", false, &builtin_macros.counter); +} + +pub fn deinit(pp: *Preprocessor) void { + pp.defines.deinit(); + for (pp.tokens.items(.expansion_locs)) |loc| Token.free(loc, pp.gpa); + pp.tokens.deinit(pp.gpa); + pp.arena.deinit(); + pp.token_buf.deinit(); + pp.char_buf.deinit(); + pp.poisoned_identifiers.deinit(); + pp.include_guards.deinit(pp.gpa); + pp.top_expansion_buf.deinit(); +} + +/// Preprocess a source file, returns eof token. 
+pub fn preprocess(pp: *Preprocessor, source: Source) Error!Token { + return pp.preprocessExtra(source) catch |er| switch (er) { + // This cannot occur in the main file and is handled in `include`. + error.StopPreprocessing => unreachable, + else => |e| return e, + }; +} + +/// Return the name of the #ifndef guard macro that starts a source, if any. +fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 { + var tokenizer = Tokenizer{ + .buf = source.buf, + .comp = pp.comp, + .source = source.id, + }; + var hash = tokenizer.nextNoWS(); + while (hash.id == .nl) hash = tokenizer.nextNoWS(); + if (hash.id != .hash) return null; + const ifndef = tokenizer.nextNoWS(); + if (ifndef.id != .keyword_ifndef) return null; + const guard = tokenizer.nextNoWS(); + if (guard.id != .identifier) return null; + return pp.tokSlice(guard); +} + +fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!Token { + if (pp.comp.invalid_utf8_locs.get(source.id)) |offset| { + try pp.comp.diag.add(.{ + .tag = .invalid_utf8, + // Todo: compute line number + .loc = .{ .id = source.id, .byte_offset = offset }, + }, &.{}); + return error.FatalError; + } + var guard_name = pp.findIncludeGuard(source); + + pp.preprocess_count += 1; + var tokenizer = Tokenizer{ + .buf = source.buf, + .comp = pp.comp, + .source = source.id, + }; + + // Estimate how many new tokens this source will contain. + const estimated_token_count = source.buf.len / 8; + try pp.tokens.ensureTotalCapacity(pp.gpa, pp.tokens.len + estimated_token_count); + + var if_level: u8 = 0; + var if_kind = std.PackedIntArray(u2, 256).init([1]u2{0} ** 256); + const until_else = 0; + const until_endif = 1; + const until_endif_seen_else = 2; + + var start_of_line = true; + while (true) { + var tok = tokenizer.next(); + switch (tok.id) { + .hash => if (!start_of_line) try pp.tokens.append(pp.gpa, tokFromRaw(tok)) else { + const directive = tokenizer.nextNoWS(); + switch (directive.id) { + .keyword_error, .keyword_warning => { + // #error tokens.. + pp.top_expansion_buf.items.len = 0; + const char_top = pp.char_buf.items.len; + defer pp.char_buf.items.len = char_top; + + while (true) { + tok = tokenizer.next(); + if (tok.id == .nl or tok.id == .eof) break; + if (tok.id == .whitespace) tok.id = .macro_ws; + try pp.top_expansion_buf.append(tokFromRaw(tok)); + } + try pp.stringify(pp.top_expansion_buf.items); + const slice = pp.char_buf.items[char_top + 1 .. 
pp.char_buf.items.len - 2]; + const duped = try pp.comp.diag.arena.allocator().dupe(u8, slice); + + try pp.comp.diag.add(.{ + .tag = if (directive.id == .keyword_error) .error_directive else .warning_directive, + .loc = .{ .id = tok.source, .byte_offset = directive.start, .line = directive.line }, + .extra = .{ .str = duped }, + }, &.{}); + }, + .keyword_if => { + const sum, const overflowed = @addWithOverflow(if_level, 1); + if (overflowed != 0) + return pp.fatal(directive, "too many #if nestings", .{}); + if_level = sum; + + if (try pp.expr(&tokenizer)) { + if_kind.set(if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #if", .{}); + } + } else { + if_kind.set(if_level, until_else); + try pp.skip(&tokenizer, .until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #if", .{}); + } + } + }, + .keyword_ifdef => { + const sum, const overflowed = @addWithOverflow(if_level, 1); + if (overflowed != 0) + return pp.fatal(directive, "too many #if nestings", .{}); + if_level = sum; + + const macro_name = (try pp.expectMacroName(&tokenizer)) orelse continue; + try pp.expectNl(&tokenizer); + if (pp.defines.get(macro_name) != null) { + if_kind.set(if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #ifdef", .{}); + } + } else { + if_kind.set(if_level, until_else); + try pp.skip(&tokenizer, .until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #ifdef", .{}); + } + } + }, + .keyword_ifndef => { + const sum, const overflowed = @addWithOverflow(if_level, 1); + if (overflowed != 0) + return pp.fatal(directive, "too many #if nestings", .{}); + if_level = sum; + + const macro_name = (try pp.expectMacroName(&tokenizer)) orelse continue; + try pp.expectNl(&tokenizer); + if (pp.defines.get(macro_name) == null) { + if_kind.set(if_level, until_endif); + } else { + if_kind.set(if_level, until_else); + try pp.skip(&tokenizer, .until_else); + } + }, + .keyword_elif => { + if (if_level == 0) { + try pp.err(directive, .elif_without_if); + if_level += 1; + if_kind.set(if_level, until_else); + } else if (if_level == 1) { + guard_name = null; + } + switch (if_kind.get(if_level)) { + until_else => if (try pp.expr(&tokenizer)) { + if_kind.set(if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #elif", .{}); + } + } else { + try pp.skip(&tokenizer, .until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elif", .{}); + } + }, + until_endif => try pp.skip(&tokenizer, .until_endif), + until_endif_seen_else => { + try pp.err(directive, .elif_after_else); + skipToNl(&tokenizer); + }, + else => unreachable, + } + }, + .keyword_elifdef => { + if (if_level == 0) { + try pp.err(directive, .elifdef_without_if); + if_level += 1; + if_kind.set(if_level, until_else); + } else if (if_level == 1) { + guard_name = null; + } + switch (if_kind.get(if_level)) { + until_else => { + const macro_name = try pp.expectMacroName(&tokenizer); + if (macro_name == null) { + if_kind.set(if_level, until_else); + try pp.skip(&tokenizer, .until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifdef", .{}); + } + } else { + try pp.expectNl(&tokenizer); + if (pp.defines.get(macro_name.?) 
!= null) { + if_kind.set(if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #elifdef", .{}); + } + } else { + if_kind.set(if_level, until_else); + try pp.skip(&tokenizer, .until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifdef", .{}); + } + } + } + }, + until_endif => try pp.skip(&tokenizer, .until_endif), + until_endif_seen_else => { + try pp.err(directive, .elifdef_after_else); + skipToNl(&tokenizer); + }, + else => unreachable, + } + }, + .keyword_elifndef => { + if (if_level == 0) { + try pp.err(directive, .elifdef_without_if); + if_level += 1; + if_kind.set(if_level, until_else); + } else if (if_level == 1) { + guard_name = null; + } + switch (if_kind.get(if_level)) { + until_else => { + const macro_name = try pp.expectMacroName(&tokenizer); + if (macro_name == null) { + if_kind.set(if_level, until_else); + try pp.skip(&tokenizer, .until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifndef", .{}); + } + } else { + try pp.expectNl(&tokenizer); + if (pp.defines.get(macro_name.?) == null) { + if_kind.set(if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #elifndef", .{}); + } + } else { + if_kind.set(if_level, until_else); + try pp.skip(&tokenizer, .until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifndef", .{}); + } + } + } + }, + until_endif => try pp.skip(&tokenizer, .until_endif), + until_endif_seen_else => { + try pp.err(directive, .elifdef_after_else); + skipToNl(&tokenizer); + }, + else => unreachable, + } + }, + .keyword_else => { + try pp.expectNl(&tokenizer); + if (if_level == 0) { + try pp.err(directive, .else_without_if); + continue; + } else if (if_level == 1) { + guard_name = null; + } + switch (if_kind.get(if_level)) { + until_else => { + if_kind.set(if_level, until_endif_seen_else); + if (pp.verbose) { + pp.verboseLog(directive, "#else branch here", .{}); + } + }, + until_endif => try pp.skip(&tokenizer, .until_endif_seen_else), + until_endif_seen_else => { + try pp.err(directive, .else_after_else); + skipToNl(&tokenizer); + }, + else => unreachable, + } + }, + .keyword_endif => { + try pp.expectNl(&tokenizer); + if (if_level == 0) { + guard_name = null; + try pp.err(directive, .endif_without_if); + continue; + } else if (if_level == 1) { + const saved_tokenizer = tokenizer; + defer tokenizer = saved_tokenizer; + + var next = tokenizer.nextNoWS(); + while (next.id == .nl) : (next = tokenizer.nextNoWS()) {} + if (next.id != .eof) guard_name = null; + } + if_level -= 1; + }, + .keyword_define => try pp.define(&tokenizer), + .keyword_undef => { + const macro_name = (try pp.expectMacroName(&tokenizer)) orelse continue; + + _ = pp.defines.remove(macro_name); + try pp.expectNl(&tokenizer); + }, + .keyword_include => try pp.include(&tokenizer, .first), + .keyword_include_next => { + try pp.comp.diag.add(.{ + .tag = .include_next, + .loc = .{ .id = tok.source, .byte_offset = directive.start, .line = directive.line }, + }, &.{}); + if (pp.include_depth == 0) { + try pp.comp.diag.add(.{ + .tag = .include_next_outside_header, + .loc = .{ .id = tok.source, .byte_offset = directive.start, .line = directive.line }, + }, &.{}); + try pp.include(&tokenizer, .first); + } else { + try pp.include(&tokenizer, .next); + } + }, + .keyword_embed => try pp.embed(&tokenizer), + .keyword_pragma => try pp.pragma(&tokenizer, directive, null, &.{}), + .keyword_line => { + // #line number "file" + 
const digits = tokenizer.nextNoWS(); + if (digits.id != .pp_num) try pp.err(digits, .line_simple_digit); + // TODO: validate that the pp_num token is solely digits + + if (digits.id == .eof or digits.id == .nl) continue; + const name = tokenizer.nextNoWS(); + if (name.id == .eof or name.id == .nl) continue; + if (name.id != .string_literal) try pp.err(name, .line_invalid_filename); + try pp.expectNl(&tokenizer); + }, + .pp_num => { + // # number "file" flags + // TODO: validate that the pp_num token is solely digits + // if not, emit `GNU line marker directive requires a simple digit sequence` + const name = tokenizer.nextNoWS(); + if (name.id == .eof or name.id == .nl) continue; + if (name.id != .string_literal) try pp.err(name, .line_invalid_filename); + + const flag_1 = tokenizer.nextNoWS(); + if (flag_1.id == .eof or flag_1.id == .nl) continue; + const flag_2 = tokenizer.nextNoWS(); + if (flag_2.id == .eof or flag_2.id == .nl) continue; + const flag_3 = tokenizer.nextNoWS(); + if (flag_3.id == .eof or flag_3.id == .nl) continue; + const flag_4 = tokenizer.nextNoWS(); + if (flag_4.id == .eof or flag_4.id == .nl) continue; + try pp.expectNl(&tokenizer); + }, + .nl => {}, + .eof => { + if (if_level != 0) try pp.err(tok, .unterminated_conditional_directive); + return tokFromRaw(directive); + }, + else => { + try pp.err(tok, .invalid_preprocessing_directive); + skipToNl(&tokenizer); + }, + } + }, + .whitespace => if (pp.preserve_whitespace) try pp.tokens.append(pp.gpa, tokFromRaw(tok)), + .nl => { + start_of_line = true; + if (pp.preserve_whitespace) try pp.tokens.append(pp.gpa, tokFromRaw(tok)); + }, + .eof => { + if (if_level != 0) try pp.err(tok, .unterminated_conditional_directive); + // The following check needs to occur here and not at the top of the function + // because a pragma may change the level during preprocessing + if (source.buf.len > 0 and source.buf[source.buf.len - 1] != '\n') { + try pp.err(tok, .newline_eof); + } + if (guard_name) |name| { + if (try pp.include_guards.fetchPut(pp.gpa, source.id, name)) |prev| { + assert(mem.eql(u8, name, prev.value)); + } + } + return tokFromRaw(tok); + }, + else => { + if (tok.id.isMacroIdentifier() and pp.poisoned_identifiers.get(pp.tokSlice(tok)) != null) { + try pp.err(tok, .poisoned_identifier); + } + // Add the token to the buffer doing any necessary expansions. + start_of_line = false; + try pp.expandMacro(&tokenizer, tok); + }, + } + } +} + +/// Get raw token source string. +/// Returned slice is invalidated when comp.generated_buf is updated. +pub fn tokSlice(pp: *Preprocessor, token: RawToken) []const u8 { + if (token.id.lexeme()) |some| return some; + const source = pp.comp.getSource(token.source); + return source.buf[token.start..token.end]; +} + +/// Convert a token from the Tokenizer into a token used by the parser. 
+fn tokFromRaw(raw: RawToken) Token { + return .{ + .id = raw.id, + .loc = .{ + .id = raw.source, + .byte_offset = raw.start, + .line = raw.line, + }, + }; +} + +fn err(pp: *Preprocessor, raw: RawToken, tag: Diagnostics.Tag) !void { + try pp.comp.diag.add(.{ + .tag = tag, + .loc = .{ + .id = raw.source, + .byte_offset = raw.start, + .line = raw.line, + }, + }, &.{}); +} + +fn fatal(pp: *Preprocessor, raw: RawToken, comptime fmt: []const u8, args: anytype) Compilation.Error { + const source = pp.comp.getSource(raw.source); + const line_col = source.lineCol(.{ .id = raw.source, .line = raw.line, .byte_offset = raw.start }); + return pp.comp.diag.fatal(source.path, line_col.line, raw.line, line_col.col, fmt, args); +} + +fn verboseLog(pp: *Preprocessor, raw: RawToken, comptime fmt: []const u8, args: anytype) void { + const source = pp.comp.getSource(raw.source); + const line_col = source.lineCol(.{ .id = raw.source, .line = raw.line, .byte_offset = raw.start }); + + const stderr = std.io.getStdErr().writer(); + var buf_writer = std.io.bufferedWriter(stderr); + const writer = buf_writer.writer(); + defer buf_writer.flush() catch {}; + writer.print("{s}:{d}:{d}: ", .{ source.path, line_col.line_no, line_col.col }) catch return; + writer.print(fmt, args) catch return; + writer.writeByte('\n') catch return; + writer.writeAll(line_col.line) catch return; + writer.writeByte('\n') catch return; +} + +/// Consume next token, error if it is not an identifier. +fn expectMacroName(pp: *Preprocessor, tokenizer: *Tokenizer) Error!?[]const u8 { + const macro_name = tokenizer.nextNoWS(); + if (!macro_name.id.isMacroIdentifier()) { + try pp.err(macro_name, .macro_name_missing); + skipToNl(tokenizer); + return null; + } + return pp.tokSlice(macro_name); +} + +/// Skip until after a newline, error if extra tokens before it. +fn expectNl(pp: *Preprocessor, tokenizer: *Tokenizer) Error!void { + var sent_err = false; + while (true) { + const tok = tokenizer.next(); + if (tok.id == .nl or tok.id == .eof) return; + if (tok.id == .whitespace) continue; + if (!sent_err) { + sent_err = true; + try pp.err(tok, .extra_tokens_directive_end); + } + } +} + +/// Consume all tokens until a newline and parse the result into a boolean. 
+fn expr(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!bool { + const start = pp.tokens.len; + defer { + for (pp.top_expansion_buf.items) |tok| Token.free(tok.expansion_locs, pp.gpa); + pp.tokens.len = start; + } + + pp.top_expansion_buf.items.len = 0; + const eof = while (true) { + var tok = tokenizer.next(); + switch (tok.id) { + .nl, .eof => break tok, + .whitespace => if (pp.top_expansion_buf.items.len == 0) continue, + else => {}, + } + try pp.top_expansion_buf.append(tokFromRaw(tok)); + } else unreachable; + if (pp.top_expansion_buf.items.len != 0) { + pp.expansion_source_loc = pp.top_expansion_buf.items[0].loc; + try pp.expandMacroExhaustive(tokenizer, &pp.top_expansion_buf, 0, pp.top_expansion_buf.items.len, false, .expr); + } + for (pp.top_expansion_buf.items) |tok| { + if (tok.id == .macro_ws) continue; + if (!tok.id.validPreprocessorExprStart()) { + try pp.comp.diag.add(.{ + .tag = .invalid_preproc_expr_start, + .loc = tok.loc, + }, tok.expansionSlice()); + return false; + } + break; + } else { + try pp.err(eof, .expected_value_in_expr); + return false; + } + + // validate the tokens in the expression + try pp.tokens.ensureUnusedCapacity(pp.gpa, pp.top_expansion_buf.items.len); + var i: usize = 0; + const items = pp.top_expansion_buf.items; + while (i < items.len) : (i += 1) { + var tok = items[i]; + switch (tok.id) { + .string_literal, + .string_literal_utf_16, + .string_literal_utf_8, + .string_literal_utf_32, + .string_literal_wide, + => { + try pp.comp.diag.add(.{ + .tag = .string_literal_in_pp_expr, + .loc = tok.loc, + }, tok.expansionSlice()); + return false; + }, + .plus_plus, + .minus_minus, + .plus_equal, + .minus_equal, + .asterisk_equal, + .slash_equal, + .percent_equal, + .angle_bracket_angle_bracket_left_equal, + .angle_bracket_angle_bracket_right_equal, + .ampersand_equal, + .caret_equal, + .pipe_equal, + .l_bracket, + .r_bracket, + .l_brace, + .r_brace, + .ellipsis, + .semicolon, + .hash, + .hash_hash, + .equal, + .arrow, + .period, + => { + try pp.comp.diag.add(.{ + .tag = .invalid_preproc_operator, + .loc = tok.loc, + }, tok.expansionSlice()); + return false; + }, + .macro_ws, .whitespace => continue, + .keyword_false => tok.id = .zero, + .keyword_true => tok.id = .one, + else => if (tok.id.isMacroIdentifier()) { + if (tok.id == .keyword_defined) { + const tokens_consumed = try pp.handleKeywordDefined(&tok, items[i + 1 ..], eof); + i += tokens_consumed; + } else { + try pp.comp.diag.add(.{ + .tag = .undefined_macro, + .loc = tok.loc, + .extra = .{ .str = pp.expandedSlice(tok) }, + }, tok.expansionSlice()); + + if (i + 1 < pp.top_expansion_buf.items.len and + pp.top_expansion_buf.items[i + 1].id == .l_paren) + { + try pp.comp.diag.add(.{ + .tag = .fn_macro_undefined, + .loc = tok.loc, + .extra = .{ .str = pp.expandedSlice(tok) }, + }, tok.expansionSlice()); + return false; + } + + tok.id = .zero; // undefined macro + } + }, + } + pp.tokens.appendAssumeCapacity(tok); + } + try pp.tokens.append(pp.gpa, .{ + .id = .eof, + .loc = tokFromRaw(eof).loc, + }); + + // Actually parse it. 
+ var parser = Parser{ + .pp = pp, + .comp = pp.comp, + .gpa = pp.gpa, + .tok_ids = pp.tokens.items(.id), + .tok_i = @intCast(start), + .arena = pp.arena.allocator(), + .in_macro = true, + .data = undefined, + .strings = undefined, + .retained_strings = undefined, + .value_map = undefined, + .labels = undefined, + .decl_buf = undefined, + .list_buf = undefined, + .param_buf = undefined, + .enum_buf = undefined, + .record_buf = undefined, + .attr_buf = undefined, + .field_attr_buf = undefined, + .string_ids = undefined, + }; + return parser.macroExpr(); +} + +/// Turns macro_tok from .keyword_defined into .zero or .one depending on whether the argument is defined +/// Returns the number of tokens consumed +fn handleKeywordDefined(pp: *Preprocessor, macro_tok: *Token, tokens: []const Token, eof: RawToken) !usize { + std.debug.assert(macro_tok.id == .keyword_defined); + var it = TokenIterator.init(tokens); + const first = it.nextNoWS() orelse { + try pp.err(eof, .macro_name_missing); + return it.i; + }; + switch (first.id) { + .l_paren => {}, + else => { + if (!first.id.isMacroIdentifier()) { + try pp.comp.diag.add(.{ + .tag = .macro_name_must_be_identifier, + .loc = first.loc, + .extra = .{ .str = pp.expandedSlice(first) }, + }, first.expansionSlice()); + } + macro_tok.id = if (pp.defines.contains(pp.expandedSlice(first))) .one else .zero; + return it.i; + }, + } + const second = it.nextNoWS() orelse { + try pp.err(eof, .macro_name_missing); + return it.i; + }; + if (!second.id.isMacroIdentifier()) { + try pp.comp.diag.add(.{ + .tag = .macro_name_must_be_identifier, + .loc = second.loc, + }, second.expansionSlice()); + return it.i; + } + macro_tok.id = if (pp.defines.contains(pp.expandedSlice(second))) .one else .zero; + + const last = it.nextNoWS(); + if (last == null or last.?.id != .r_paren) { + const tok = last orelse tokFromRaw(eof); + try pp.comp.diag.add(.{ + .tag = .closing_paren, + .loc = tok.loc, + }, tok.expansionSlice()); + try pp.comp.diag.add(.{ + .tag = .to_match_paren, + .loc = first.loc, + }, first.expansionSlice()); + } + + return it.i; +} + +/// Skip until #else #elif #endif, return last directive token id. +/// Also skips nested #if ... #endifs. 
+fn skip( + pp: *Preprocessor, + tokenizer: *Tokenizer, + cont: enum { until_else, until_endif, until_endif_seen_else }, +) Error!void { + var ifs_seen: u32 = 0; + var line_start = true; + while (tokenizer.index < tokenizer.buf.len) { + if (line_start) { + const saved_tokenizer = tokenizer.*; + const hash = tokenizer.nextNoWS(); + if (hash.id == .nl) continue; + line_start = false; + if (hash.id != .hash) continue; + const directive = tokenizer.nextNoWS(); + switch (directive.id) { + .keyword_else => { + if (ifs_seen != 0) continue; + if (cont == .until_endif_seen_else) { + try pp.err(directive, .else_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_elif => { + if (ifs_seen != 0 or cont == .until_endif) continue; + if (cont == .until_endif_seen_else) { + try pp.err(directive, .elif_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_elifdef => { + if (ifs_seen != 0 or cont == .until_endif) continue; + if (cont == .until_endif_seen_else) { + try pp.err(directive, .elifdef_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_elifndef => { + if (ifs_seen != 0 or cont == .until_endif) continue; + if (cont == .until_endif_seen_else) { + try pp.err(directive, .elifndef_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_endif => { + if (ifs_seen == 0) { + tokenizer.* = saved_tokenizer; + return; + } + ifs_seen -= 1; + }, + .keyword_if, .keyword_ifdef, .keyword_ifndef => ifs_seen += 1, + else => {}, + } + } else if (tokenizer.buf[tokenizer.index] == '\n') { + line_start = true; + tokenizer.index += 1; + tokenizer.line += 1; + } else { + line_start = false; + tokenizer.index += 1; + } + } else { + const eof = tokenizer.next(); + return pp.err(eof, .unterminated_conditional_directive); + } +} + +// Skip until newline, ignore other tokens. +fn skipToNl(tokenizer: *Tokenizer) void { + while (true) { + const tok = tokenizer.next(); + if (tok.id == .nl or tok.id == .eof) return; + } +} + +const ExpandBuf = std.ArrayList(Token); +fn removePlacemarkers(buf: *ExpandBuf) void { + var i: usize = buf.items.len -% 1; + while (i < buf.items.len) : (i -%= 1) { + if (buf.items[i].id == .placemarker) { + const placemarker = buf.orderedRemove(i); + Token.free(placemarker.expansion_locs, buf.allocator); + } + } +} + +const MacroArguments = std.ArrayList([]const Token); +fn deinitMacroArguments(allocator: Allocator, args: *const MacroArguments) void { + for (args.items) |item| { + for (item) |tok| Token.free(tok.expansion_locs, allocator); + allocator.free(item); + } + args.deinit(); +} + +fn expandObjMacro(pp: *Preprocessor, simple_macro: *const Macro) Error!ExpandBuf { + var buf = ExpandBuf.init(pp.gpa); + errdefer buf.deinit(); + try buf.ensureTotalCapacity(simple_macro.tokens.len); + + // Add all of the simple_macros tokens to the new buffer handling any concats. 
+ var i: usize = 0; + while (i < simple_macro.tokens.len) : (i += 1) { + const raw = simple_macro.tokens[i]; + const tok = tokFromRaw(raw); + switch (raw.id) { + .hash_hash => { + var rhs = tokFromRaw(simple_macro.tokens[i + 1]); + i += 1; + while (rhs.id == .whitespace) { + rhs = tokFromRaw(simple_macro.tokens[i + 1]); + i += 1; + } + try pp.pasteTokens(&buf, &.{rhs}); + }, + .whitespace => if (pp.preserve_whitespace) buf.appendAssumeCapacity(tok), + .macro_file => { + const start = pp.comp.generated_buf.items.len; + const source = pp.comp.getSource(pp.expansion_source_loc.id); + try pp.comp.generated_buf.writer().print("\"{s}\"\n", .{source.path}); + + buf.appendAssumeCapacity(try pp.makeGeneratedToken(start, .string_literal, tok)); + }, + .macro_line => { + const start = pp.comp.generated_buf.items.len; + const source = pp.comp.getSource(pp.expansion_source_loc.id); + try pp.comp.generated_buf.writer().print("{d}\n", .{source.physicalLine(pp.expansion_source_loc)}); + + buf.appendAssumeCapacity(try pp.makeGeneratedToken(start, .pp_num, tok)); + }, + .macro_counter => { + defer pp.counter += 1; + const start = pp.comp.generated_buf.items.len; + try pp.comp.generated_buf.writer().print("{d}\n", .{pp.counter}); + + buf.appendAssumeCapacity(try pp.makeGeneratedToken(start, .pp_num, tok)); + }, + else => buf.appendAssumeCapacity(tok), + } + } + + return buf; +} + +/// Join a possibly-parenthesized series of string literal tokens into a single string without +/// leading or trailing quotes. The returned slice is invalidated if pp.char_buf changes. +/// Returns error.ExpectedStringLiteral if parentheses are not balanced, a non-string-literal +/// is encountered, or if no string literals are encountered +/// TODO: destringize (replace all '\\' with a single `\` and all '\"' with a '"') +fn pasteStringsUnsafe(pp: *Preprocessor, toks: []const Token) ![]const u8 { + const char_top = pp.char_buf.items.len; + defer pp.char_buf.items.len = char_top; + var unwrapped = toks; + if (toks.len >= 2 and toks[0].id == .l_paren and toks[toks.len - 1].id == .r_paren) { + unwrapped = toks[1 .. toks.len - 1]; + } + if (unwrapped.len == 0) return error.ExpectedStringLiteral; + + for (unwrapped) |tok| { + if (tok.id == .macro_ws) continue; + if (tok.id != .string_literal) return error.ExpectedStringLiteral; + const str = pp.expandedSlice(tok); + try pp.char_buf.appendSlice(str[1 .. str.len - 1]); + } + return pp.char_buf.items[char_top..]; +} + +/// Handle the _Pragma operator (implemented as a builtin macro) +fn pragmaOperator(pp: *Preprocessor, arg_tok: Token, operator_loc: Source.Location) !void { + const arg_slice = pp.expandedSlice(arg_tok); + const content = arg_slice[1 .. 
arg_slice.len - 1]; + const directive = "#pragma "; + + pp.char_buf.clearRetainingCapacity(); + const total_len = directive.len + content.len + 1; // destringify can never grow the string, + 1 for newline + try pp.char_buf.ensureUnusedCapacity(total_len); + pp.char_buf.appendSliceAssumeCapacity(directive); + pp.destringify(content); + pp.char_buf.appendAssumeCapacity('\n'); + + const start = pp.comp.generated_buf.items.len; + try pp.comp.generated_buf.appendSlice(pp.char_buf.items); + var tmp_tokenizer = Tokenizer{ + .buf = pp.comp.generated_buf.items, + .comp = pp.comp, + .index = @intCast(start), + .source = .generated, + .line = pp.generated_line, + }; + pp.generated_line += 1; + const hash_tok = tmp_tokenizer.next(); + assert(hash_tok.id == .hash); + const pragma_tok = tmp_tokenizer.next(); + assert(pragma_tok.id == .keyword_pragma); + try pp.pragma(&tmp_tokenizer, pragma_tok, operator_loc, arg_tok.expansionSlice()); +} + +/// Inverts the output of the preprocessor stringify (#) operation +/// (except all whitespace is condensed to a single space) +/// writes output to pp.char_buf; assumes capacity is sufficient +/// backslash backslash -> backslash +/// backslash doublequote -> doublequote +/// All other characters remain the same +fn destringify(pp: *Preprocessor, str: []const u8) void { + var state: enum { start, backslash_seen } = .start; + for (str) |c| { + switch (c) { + '\\' => { + if (state == .backslash_seen) pp.char_buf.appendAssumeCapacity(c); + state = if (state == .start) .backslash_seen else .start; + }, + else => { + if (state == .backslash_seen and c != '"') pp.char_buf.appendAssumeCapacity('\\'); + pp.char_buf.appendAssumeCapacity(c); + state = .start; + }, + } + } +} + +/// Stringify `tokens` into pp.char_buf. +/// See https://gcc.gnu.org/onlinedocs/gcc-11.2.0/cpp/Stringizing.html#Stringizing +fn stringify(pp: *Preprocessor, tokens: []const Token) !void { + try pp.char_buf.append('"'); + var ws_state: enum { start, need, not_needed } = .start; + for (tokens) |tok| { + if (tok.id == .macro_ws) { + if (ws_state == .start) continue; + ws_state = .need; + continue; + } + if (ws_state == .need) try pp.char_buf.append(' '); + ws_state = .not_needed; + + // backslashes not inside strings are not escaped + const is_str = switch (tok.id) { + .string_literal, + .string_literal_utf_16, + .string_literal_utf_8, + .string_literal_utf_32, + .string_literal_wide, + .char_literal, + .char_literal_utf_16, + .char_literal_utf_32, + .char_literal_wide, + => true, + else => false, + }; + + for (pp.expandedSlice(tok)) |c| { + if (c == '"') + try pp.char_buf.appendSlice("\\\"") + else if (c == '\\' and is_str) + try pp.char_buf.appendSlice("\\\\") + else + try pp.char_buf.append(c); + } + } + if (pp.char_buf.items[pp.char_buf.items.len - 1] == '\\') { + const tok = tokens[tokens.len - 1]; + try pp.comp.diag.add(.{ + .tag = .invalid_pp_stringify_escape, + .loc = tok.loc, + }, tok.expansionSlice()); + pp.char_buf.items.len -= 1; + } + try pp.char_buf.appendSlice("\"\n"); +} + +fn reconstructIncludeString(pp: *Preprocessor, param_toks: []const Token) !?[]const u8 { + const char_top = pp.char_buf.items.len; + defer pp.char_buf.items.len = char_top; + + // Trim leading/trailing whitespace + var begin: usize = 0; + var end: usize = param_toks.len; + while (begin < end and param_toks[begin].id == .macro_ws) : (begin += 1) {} + while (end > begin and param_toks[end - 1].id == .macro_ws) : (end -= 1) {} + const params = param_toks[begin..end]; + + if (params.len == 0) { + try pp.comp.diag.add(.{ + 
.tag = .expected_filename, + .loc = param_toks[0].loc, + }, param_toks[0].expansionSlice()); + return null; + } + // no string pasting + if (params[0].id == .string_literal and params.len > 1) { + try pp.comp.diag.add(.{ + .tag = .closing_paren, + .loc = params[1].loc, + }, params[1].expansionSlice()); + return null; + } + + for (params) |tok| { + const str = pp.expandedSliceExtra(tok, .preserve_macro_ws); + try pp.char_buf.appendSlice(str); + } + + const include_str = pp.char_buf.items[char_top..]; + if (include_str.len < 3) { + try pp.comp.diag.add(.{ + .tag = .empty_filename, + .loc = params[0].loc, + }, params[0].expansionSlice()); + return null; + } + + switch (include_str[0]) { + '<' => { + if (include_str[include_str.len - 1] != '>') { + // Ugly hack to find out where the '>' should go, since we don't have the closing ')' location + const start = params[0].loc; + try pp.comp.diag.add(.{ + .tag = .header_str_closing, + .loc = .{ .id = start.id, .byte_offset = start.byte_offset + @as(u32, @intCast(include_str.len)) + 1, .line = start.line }, + }, params[0].expansionSlice()); + try pp.comp.diag.add(.{ + .tag = .header_str_match, + .loc = params[0].loc, + }, params[0].expansionSlice()); + return null; + } + return include_str; + }, + '"' => return include_str, + else => { + try pp.comp.diag.add(.{ + .tag = .expected_filename, + .loc = params[0].loc, + }, params[0].expansionSlice()); + return null; + }, + } +} + +fn handleBuiltinMacro(pp: *Preprocessor, builtin: RawToken.Id, param_toks: []const Token, src_loc: Source.Location) Error!bool { + switch (builtin) { + .macro_param_has_attribute, + .macro_param_has_feature, + .macro_param_has_extension, + .macro_param_has_builtin, + => { + var invalid: ?Token = null; + var identifier: ?Token = null; + for (param_toks) |tok| { + if (tok.id == .macro_ws) continue; + if (!tok.id.isMacroIdentifier()) { + invalid = tok; + break; + } + if (identifier) |_| invalid = tok else identifier = tok; + } + if (identifier == null and invalid == null) invalid = .{ .id = .eof, .loc = src_loc }; + if (invalid) |some| { + try pp.comp.diag.add( + .{ .tag = .feature_check_requires_identifier, .loc = some.loc }, + some.expansionSlice(), + ); + return false; + } + + const ident_str = pp.expandedSlice(identifier.?); + return switch (builtin) { + .macro_param_has_attribute => Attribute.fromString(.gnu, null, ident_str) != null, + .macro_param_has_feature => features.hasFeature(pp.comp, ident_str), + .macro_param_has_extension => features.hasExtension(pp.comp, ident_str), + .macro_param_has_builtin => pp.comp.hasBuiltin(ident_str), + else => unreachable, + }; + }, + .macro_param_has_warning => { + const actual_param = pp.pasteStringsUnsafe(param_toks) catch |er| switch (er) { + error.ExpectedStringLiteral => { + try pp.comp.diag.add(.{ + .tag = .expected_str_literal_in, + .loc = param_toks[0].loc, + .extra = .{ .str = "__has_warning" }, + }, param_toks[0].expansionSlice()); + return false; + }, + else => |e| return e, + }; + if (!mem.startsWith(u8, actual_param, "-W")) { + try pp.comp.diag.add(.{ + .tag = .malformed_warning_check, + .loc = param_toks[0].loc, + .extra = .{ .str = "__has_warning" }, + }, param_toks[0].expansionSlice()); + return false; + } + const warning_name = actual_param[2..]; + return Diagnostics.warningExists(warning_name); + }, + .macro_param_is_identifier => { + var invalid: ?Token = null; + var identifier: ?Token = null; + for (param_toks) |tok| switch (tok.id) { + .macro_ws => continue, + else => { + if (identifier) |_| invalid = tok else 
identifier = tok; + }, + }; + if (identifier == null and invalid == null) invalid = .{ .id = .eof, .loc = src_loc }; + if (invalid) |some| { + try pp.comp.diag.add(.{ + .tag = .missing_tok_builtin, + .loc = some.loc, + .extra = .{ .tok_id_expected = .r_paren }, + }, some.expansionSlice()); + return false; + } + + const id = identifier.?.id; + return id == .identifier or id == .extended_identifier; + }, + .macro_param_has_include, .macro_param_has_include_next => { + const include_str = (try pp.reconstructIncludeString(param_toks)) orelse return false; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, + else => unreachable, + }; + const filename = include_str[1 .. include_str.len - 1]; + if (builtin == .macro_param_has_include or pp.include_depth == 0) { + if (builtin == .macro_param_has_include_next) { + try pp.comp.diag.add(.{ + .tag = .include_next_outside_header, + .loc = src_loc, + }, &.{}); + } + return pp.comp.hasInclude(filename, src_loc.id, include_type, .first); + } + return pp.comp.hasInclude(filename, src_loc.id, include_type, .next); + }, + else => unreachable, + } +} + +fn expandFuncMacro( + pp: *Preprocessor, + loc: Source.Location, + func_macro: *const Macro, + args: *const MacroArguments, + expanded_args: *const MacroArguments, +) MacroError!ExpandBuf { + var buf = ExpandBuf.init(pp.gpa); + try buf.ensureTotalCapacity(func_macro.tokens.len); + errdefer buf.deinit(); + + var expanded_variable_arguments = ExpandBuf.init(pp.gpa); + defer expanded_variable_arguments.deinit(); + var variable_arguments = ExpandBuf.init(pp.gpa); + defer variable_arguments.deinit(); + + if (func_macro.var_args) { + var i: usize = func_macro.params.len; + while (i < expanded_args.items.len) : (i += 1) { + try variable_arguments.appendSlice(args.items[i]); + try expanded_variable_arguments.appendSlice(expanded_args.items[i]); + if (i != expanded_args.items.len - 1) { + const comma = Token{ .id = .comma, .loc = .{ .id = .generated } }; + try variable_arguments.append(comma); + try expanded_variable_arguments.append(comma); + } + } + } + + // token concatenation and expansion phase + var tok_i: usize = 0; + while (tok_i < func_macro.tokens.len) : (tok_i += 1) { + const raw = func_macro.tokens[tok_i]; + switch (raw.id) { + .hash_hash => while (tok_i + 1 < func_macro.tokens.len) { + const raw_next = func_macro.tokens[tok_i + 1]; + tok_i += 1; + + const next = switch (raw_next.id) { + .macro_ws => continue, + .hash_hash => continue, + .macro_param, .macro_param_no_expand => if (args.items[raw_next.end].len > 0) + args.items[raw_next.end] + else + &[1]Token{tokFromRaw(.{ .id = .placemarker, .source = .generated })}, + .keyword_va_args => variable_arguments.items, + else => &[1]Token{tokFromRaw(raw_next)}, + }; + + try pp.pasteTokens(&buf, next); + if (next.len != 0) break; + }, + .macro_param_no_expand => { + const slice = if (args.items[raw.end].len > 0) + args.items[raw.end] + else + &[1]Token{tokFromRaw(.{ .id = .placemarker, .source = .generated })}; + const raw_loc = Source.Location{ .id = raw.source, .byte_offset = raw.start, .line = raw.line }; + try bufCopyTokens(&buf, slice, &.{raw_loc}); + }, + .macro_param => { + const arg = expanded_args.items[raw.end]; + const raw_loc = Source.Location{ .id = raw.source, .byte_offset = raw.start, .line = raw.line }; + try bufCopyTokens(&buf, arg, &.{raw_loc}); + }, + .keyword_va_args => { + const raw_loc = Source.Location{ .id = raw.source, .byte_offset = raw.start, .line = raw.line }; + try 
bufCopyTokens(&buf, expanded_variable_arguments.items, &.{raw_loc}); + }, + .stringify_param, .stringify_va_args => { + const arg = if (raw.id == .stringify_va_args) + variable_arguments.items + else + args.items[raw.end]; + + pp.char_buf.clearRetainingCapacity(); + try pp.stringify(arg); + + const start = pp.comp.generated_buf.items.len; + try pp.comp.generated_buf.appendSlice(pp.char_buf.items); + + try buf.append(try pp.makeGeneratedToken(start, .string_literal, tokFromRaw(raw))); + }, + .macro_param_has_attribute, + .macro_param_has_warning, + .macro_param_has_feature, + .macro_param_has_extension, + .macro_param_has_builtin, + .macro_param_has_include, + .macro_param_has_include_next, + .macro_param_is_identifier, + => { + const arg = expanded_args.items[0]; + const result = if (arg.len == 0) blk: { + const extra = Diagnostics.Message.Extra{ .arguments = .{ .expected = 1, .actual = 0 } }; + try pp.comp.diag.add(.{ .tag = .expected_arguments, .loc = loc, .extra = extra }, &.{}); + break :blk false; + } else try pp.handleBuiltinMacro(raw.id, arg, loc); + const start = pp.comp.generated_buf.items.len; + try pp.comp.generated_buf.writer().print("{}\n", .{@intFromBool(result)}); + try buf.append(try pp.makeGeneratedToken(start, .pp_num, tokFromRaw(raw))); + }, + .macro_param_pragma_operator => { + const param_toks = expanded_args.items[0]; + // Clang and GCC require exactly one token (so, no parentheses or string pasting) + // even though their error messages indicate otherwise. Ours is slightly more + // descriptive. + var invalid: ?Token = null; + var string: ?Token = null; + for (param_toks) |tok| switch (tok.id) { + .string_literal => { + if (string) |_| invalid = tok else string = tok; + }, + .macro_ws => continue, + else => { + invalid = tok; + break; + }, + }; + if (string == null and invalid == null) invalid = .{ .loc = loc, .id = .eof }; + if (invalid) |some| try pp.comp.diag.add( + .{ .tag = .pragma_operator_string_literal, .loc = some.loc }, + some.expansionSlice(), + ) else try pp.pragmaOperator(string.?, loc); + }, + .comma => { + if (tok_i + 2 < func_macro.tokens.len and func_macro.tokens[tok_i + 1].id == .hash_hash) { + const hash_hash = func_macro.tokens[tok_i + 1]; + var maybe_va_args = func_macro.tokens[tok_i + 2]; + var consumed: usize = 2; + if (maybe_va_args.id == .macro_ws and tok_i + 3 < func_macro.tokens.len) { + consumed = 3; + maybe_va_args = func_macro.tokens[tok_i + 3]; + } + if (maybe_va_args.id == .keyword_va_args) { + // GNU extension: `, ##__VA_ARGS__` deletes the comma if __VA_ARGS__ is empty + tok_i += consumed; + if (func_macro.params.len == expanded_args.items.len) { + // Empty __VA_ARGS__, drop the comma + try pp.err(hash_hash, .comma_deletion_va_args); + } else if (func_macro.params.len == 0 and expanded_args.items.len == 1 and expanded_args.items[0].len == 0) { + // Ambiguous whether this is "empty __VA_ARGS__" or "__VA_ARGS__ omitted" + if (pp.comp.langopts.standard.isGNU()) { + // GNU standard, drop the comma + try pp.err(hash_hash, .comma_deletion_va_args); + } else { + // C standard, retain the comma + try buf.append(tokFromRaw(raw)); + } + } else { + try buf.append(tokFromRaw(raw)); + if (expanded_variable_arguments.items.len > 0 or variable_arguments.items.len == func_macro.params.len) { + try pp.err(hash_hash, .comma_deletion_va_args); + } + const raw_loc = Source.Location{ + .id = maybe_va_args.source, + .byte_offset = maybe_va_args.start, + .line = maybe_va_args.line, + }; + try bufCopyTokens(&buf, expanded_variable_arguments.items, 
&.{raw_loc}); + } + continue; + } + } + // Regular comma, no token pasting with __VA_ARGS__ + try buf.append(tokFromRaw(raw)); + }, + else => try buf.append(tokFromRaw(raw)), + } + } + removePlacemarkers(&buf); + + return buf; +} + +fn shouldExpand(tok: Token, macro: *Macro) bool { + // macro.loc.line contains the macros end index + if (tok.loc.id == macro.loc.id and + tok.loc.byte_offset >= macro.loc.byte_offset and + tok.loc.byte_offset <= macro.loc.line) + return false; + for (tok.expansionSlice()) |loc| { + if (loc.id == macro.loc.id and + loc.byte_offset >= macro.loc.byte_offset and + loc.byte_offset <= macro.loc.line) + return false; + } + if (tok.flags.expansion_disabled) return false; + + return true; +} + +fn bufCopyTokens(buf: *ExpandBuf, tokens: []const Token, src: []const Source.Location) !void { + try buf.ensureUnusedCapacity(tokens.len); + for (tokens) |tok| { + var copy = try tok.dupe(buf.allocator); + errdefer Token.free(copy.expansion_locs, buf.allocator); + try copy.addExpansionLocation(buf.allocator, src); + buf.appendAssumeCapacity(copy); + } +} + +fn nextBufToken( + pp: *Preprocessor, + tokenizer: *Tokenizer, + buf: *ExpandBuf, + start_idx: *usize, + end_idx: *usize, + extend_buf: bool, +) Error!Token { + start_idx.* += 1; + if (start_idx.* == buf.items.len and start_idx.* >= end_idx.*) { + if (extend_buf) { + const raw_tok = tokenizer.next(); + if (raw_tok.id.isMacroIdentifier() and + pp.poisoned_identifiers.get(pp.tokSlice(raw_tok)) != null) + try pp.err(raw_tok, .poisoned_identifier); + + if (raw_tok.id == .nl) pp.add_expansion_nl += 1; + + const new_tok = tokFromRaw(raw_tok); + end_idx.* += 1; + try buf.append(new_tok); + return new_tok; + } else { + return Token{ .id = .eof, .loc = .{ .id = .generated } }; + } + } else { + return buf.items[start_idx.*]; + } +} + +fn collectMacroFuncArguments( + pp: *Preprocessor, + tokenizer: *Tokenizer, + buf: *ExpandBuf, + start_idx: *usize, + end_idx: *usize, + extend_buf: bool, + is_builtin: bool, +) !MacroArguments { + const name_tok = buf.items[start_idx.*]; + const saved_tokenizer = tokenizer.*; + const old_end = end_idx.*; + + while (true) { + const tok = try nextBufToken(pp, tokenizer, buf, start_idx, end_idx, extend_buf); + switch (tok.id) { + .nl, .whitespace, .macro_ws => {}, + .l_paren => break, + else => { + if (is_builtin) { + try pp.comp.diag.add(.{ + .tag = .missing_lparen_after_builtin, + .loc = name_tok.loc, + .extra = .{ .str = pp.expandedSlice(name_tok) }, + }, tok.expansionSlice()); + } + // Not a macro function call, go over normal identifier, rewind + tokenizer.* = saved_tokenizer; + end_idx.* = old_end; + return error.MissingLParen; + }, + } + } + + // collect the arguments. 
+ var parens: u32 = 0; + var args = MacroArguments.init(pp.gpa); + errdefer deinitMacroArguments(pp.gpa, &args); + var curArgument = std.ArrayList(Token).init(pp.gpa); + defer curArgument.deinit(); + while (true) { + var tok = try nextBufToken(pp, tokenizer, buf, start_idx, end_idx, extend_buf); + tok.flags.is_macro_arg = true; + switch (tok.id) { + .comma => { + if (parens == 0) { + const owned = try curArgument.toOwnedSlice(); + errdefer pp.gpa.free(owned); + try args.append(owned); + } else { + const duped = try tok.dupe(pp.gpa); + errdefer Token.free(duped.expansion_locs, pp.gpa); + try curArgument.append(duped); + } + }, + .l_paren => { + const duped = try tok.dupe(pp.gpa); + errdefer Token.free(duped.expansion_locs, pp.gpa); + try curArgument.append(duped); + parens += 1; + }, + .r_paren => { + if (parens == 0) { + const owned = try curArgument.toOwnedSlice(); + errdefer pp.gpa.free(owned); + try args.append(owned); + break; + } else { + const duped = try tok.dupe(pp.gpa); + errdefer Token.free(duped.expansion_locs, pp.gpa); + try curArgument.append(duped); + parens -= 1; + } + }, + .eof => { + { + const owned = try curArgument.toOwnedSlice(); + errdefer pp.gpa.free(owned); + try args.append(owned); + } + tokenizer.* = saved_tokenizer; + try pp.comp.diag.add( + .{ .tag = .unterminated_macro_arg_list, .loc = name_tok.loc }, + name_tok.expansionSlice(), + ); + return error.Unterminated; + }, + .nl, .whitespace => { + try curArgument.append(.{ .id = .macro_ws, .loc = tok.loc }); + }, + else => { + const duped = try tok.dupe(pp.gpa); + errdefer Token.free(duped.expansion_locs, pp.gpa); + try curArgument.append(duped); + }, + } + } + + return args; +} + +fn removeExpandedTokens(pp: *Preprocessor, buf: *ExpandBuf, start: usize, len: usize, moving_end_idx: *usize) !void { + for (buf.items[start .. start + len]) |tok| Token.free(tok.expansion_locs, pp.gpa); + try buf.replaceRange(start, len, &.{}); + moving_end_idx.* -|= len; +} + +/// The behavior of `defined` depends on whether we are in a preprocessor +/// expression context (#if or #elif) or not. +/// In a non-expression context it's just an identifier. Within a preprocessor +/// expression it is a unary operator or one-argument function. 
+const EvalContext = enum { + expr, + non_expr, +}; + +/// Helper for safely iterating over a slice of tokens while skipping whitespace +const TokenIterator = struct { + toks: []const Token, + i: usize, + + fn init(toks: []const Token) TokenIterator { + return .{ .toks = toks, .i = 0 }; + } + + fn nextNoWS(self: *TokenIterator) ?Token { + while (self.i < self.toks.len) : (self.i += 1) { + const tok = self.toks[self.i]; + if (tok.id == .whitespace or tok.id == .macro_ws) continue; + + self.i += 1; + return tok; + } + return null; + } +}; + +fn expandMacroExhaustive( + pp: *Preprocessor, + tokenizer: *Tokenizer, + buf: *ExpandBuf, + start_idx: usize, + end_idx: usize, + extend_buf: bool, + eval_ctx: EvalContext, +) MacroError!void { + var moving_end_idx = end_idx; + var advance_index: usize = 0; + // rescan loop + var do_rescan = true; + while (do_rescan) { + do_rescan = false; + // expansion loop + var idx: usize = start_idx + advance_index; + while (idx < moving_end_idx) { + const macro_tok = buf.items[idx]; + if (macro_tok.id == .keyword_defined and eval_ctx == .expr) { + idx += 1; + var it = TokenIterator.init(buf.items[idx..moving_end_idx]); + if (it.nextNoWS()) |tok| { + switch (tok.id) { + .l_paren => { + _ = it.nextNoWS(); // eat (what should be) identifier + _ = it.nextNoWS(); // eat (what should be) r paren + }, + .identifier, .extended_identifier => {}, + else => {}, + } + } + idx += it.i; + continue; + } + const macro_entry = pp.defines.getPtr(pp.expandedSlice(macro_tok)); + if (macro_entry == null or !shouldExpand(buf.items[idx], macro_entry.?)) { + idx += 1; + continue; + } + if (macro_entry) |macro| macro_handler: { + if (macro.is_func) { + var macro_scan_idx = idx; + // to be saved in case this doesn't turn out to be a call + const args = pp.collectMacroFuncArguments( + tokenizer, + buf, + ¯o_scan_idx, + &moving_end_idx, + extend_buf, + macro.is_builtin, + ) catch |er| switch (er) { + error.MissingLParen => { + if (!buf.items[idx].flags.is_macro_arg) buf.items[idx].flags.expansion_disabled = true; + idx += 1; + break :macro_handler; + }, + error.Unterminated => { + if (pp.comp.langopts.emulate == .gcc) idx += 1; + try pp.removeExpandedTokens(buf, idx, macro_scan_idx - idx, &moving_end_idx); + break :macro_handler; + }, + else => |e| return e, + }; + defer { + for (args.items) |item| { + pp.gpa.free(item); + } + args.deinit(); + } + + var args_count: u32 = @intCast(args.items.len); + // if the macro has zero arguments g() args_count is still 1 + // an empty token list g() and a whitespace-only token list g( ) + // counts as zero arguments for the purposes of argument-count validation + if (args_count == 1 and macro.params.len == 0) { + for (args.items[0]) |tok| { + if (tok.id != .macro_ws) break; + } else { + args_count = 0; + } + } + + // Validate argument count. 
+ const extra = Diagnostics.Message.Extra{ + .arguments = .{ .expected = @intCast(macro.params.len), .actual = args_count }, + }; + if (macro.var_args and args_count < macro.params.len) { + try pp.comp.diag.add( + .{ .tag = .expected_at_least_arguments, .loc = buf.items[idx].loc, .extra = extra }, + buf.items[idx].expansionSlice(), + ); + idx += 1; + try pp.removeExpandedTokens(buf, idx, macro_scan_idx - idx + 1, &moving_end_idx); + continue; + } + if (!macro.var_args and args_count != macro.params.len) { + try pp.comp.diag.add( + .{ .tag = .expected_arguments, .loc = buf.items[idx].loc, .extra = extra }, + buf.items[idx].expansionSlice(), + ); + idx += 1; + try pp.removeExpandedTokens(buf, idx, macro_scan_idx - idx + 1, &moving_end_idx); + continue; + } + var expanded_args = MacroArguments.init(pp.gpa); + defer deinitMacroArguments(pp.gpa, &expanded_args); + try expanded_args.ensureTotalCapacity(args.items.len); + for (args.items) |arg| { + var expand_buf = ExpandBuf.init(pp.gpa); + errdefer expand_buf.deinit(); + try expand_buf.appendSlice(arg); + + try pp.expandMacroExhaustive(tokenizer, &expand_buf, 0, expand_buf.items.len, false, eval_ctx); + + expanded_args.appendAssumeCapacity(try expand_buf.toOwnedSlice()); + } + + var res = try pp.expandFuncMacro(macro_tok.loc, macro, &args, &expanded_args); + defer res.deinit(); + const tokens_added = res.items.len; + + const macro_expansion_locs = macro_tok.expansionSlice(); + for (res.items) |*tok| { + try tok.addExpansionLocation(pp.gpa, &.{macro_tok.loc}); + try tok.addExpansionLocation(pp.gpa, macro_expansion_locs); + } + + const tokens_removed = macro_scan_idx - idx + 1; + for (buf.items[idx .. idx + tokens_removed]) |tok| Token.free(tok.expansion_locs, pp.gpa); + try buf.replaceRange(idx, tokens_removed, res.items); + + moving_end_idx += tokens_added; + // Overflow here means that we encountered an unterminated argument list + // while expanding the body of this macro. + moving_end_idx -|= tokens_removed; + idx += tokens_added; + do_rescan = true; + } else { + const res = try pp.expandObjMacro(macro); + defer res.deinit(); + + const macro_expansion_locs = macro_tok.expansionSlice(); + var increment_idx_by = res.items.len; + for (res.items, 0..) 
|*tok, i| { + tok.flags.is_macro_arg = macro_tok.flags.is_macro_arg; + try tok.addExpansionLocation(pp.gpa, &.{macro_tok.loc}); + try tok.addExpansionLocation(pp.gpa, macro_expansion_locs); + if (tok.id == .keyword_defined and eval_ctx == .expr) { + try pp.comp.diag.add(.{ + .tag = .expansion_to_defined, + .loc = tok.loc, + }, tok.expansionSlice()); + } + + if (i < increment_idx_by and (tok.id == .keyword_defined or pp.defines.contains(pp.expandedSlice(tok.*)))) { + increment_idx_by = i; + } + } + + Token.free(buf.items[idx].expansion_locs, pp.gpa); + try buf.replaceRange(idx, 1, res.items); + idx += increment_idx_by; + moving_end_idx = moving_end_idx + res.items.len - 1; + do_rescan = true; + } + } + if (idx - start_idx == advance_index + 1 and !do_rescan) { + advance_index += 1; + } + } // end of replacement phase + } + // end of scanning phase + + // trim excess buffer + for (buf.items[moving_end_idx..]) |item| { + Token.free(item.expansion_locs, pp.gpa); + } + buf.items.len = moving_end_idx; +} + +/// Try to expand a macro after a possible candidate has been read from the `tokenizer` +/// into the `raw` token passed as argument +fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroError!void { + var source_tok = tokFromRaw(raw); + if (!raw.id.isMacroIdentifier()) { + source_tok.id.simplifyMacroKeyword(); + return pp.tokens.append(pp.gpa, source_tok); + } + pp.top_expansion_buf.items.len = 0; + try pp.top_expansion_buf.append(source_tok); + pp.expansion_source_loc = source_tok.loc; + + try pp.expandMacroExhaustive(tokenizer, &pp.top_expansion_buf, 0, 1, true, .non_expr); + try pp.tokens.ensureUnusedCapacity(pp.gpa, pp.top_expansion_buf.items.len); + for (pp.top_expansion_buf.items) |*tok| { + if (tok.id == .macro_ws and !pp.preserve_whitespace) { + Token.free(tok.expansion_locs, pp.gpa); + continue; + } + tok.id.simplifyMacroKeywordExtra(true); + pp.tokens.appendAssumeCapacity(tok.*); + } + if (pp.preserve_whitespace) { + try pp.tokens.ensureUnusedCapacity(pp.gpa, pp.add_expansion_nl); + while (pp.add_expansion_nl > 0) : (pp.add_expansion_nl -= 1) { + pp.tokens.appendAssumeCapacity(.{ .id = .nl, .loc = .{ .id = .generated } }); + } + } +} + +fn expandedSliceExtra(pp: *const Preprocessor, tok: Token, macro_ws_handling: enum { single_macro_ws, preserve_macro_ws }) []const u8 { + if (tok.id.lexeme()) |some| { + if (!tok.id.allowsDigraphs(pp.comp) and !(tok.id == .macro_ws and macro_ws_handling == .preserve_macro_ws)) return some; + } + var tmp_tokenizer = Tokenizer{ + .buf = pp.comp.getSource(tok.loc.id).buf, + .comp = pp.comp, + .index = tok.loc.byte_offset, + .source = .generated, + }; + if (tok.id == .macro_string) { + while (true) : (tmp_tokenizer.index += 1) { + if (tmp_tokenizer.buf[tmp_tokenizer.index] == '>') break; + } + return tmp_tokenizer.buf[tok.loc.byte_offset .. tmp_tokenizer.index + 1]; + } + const res = tmp_tokenizer.next(); + return tmp_tokenizer.buf[res.start..res.end]; +} + +/// Get expanded token source string. 
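+/// Tokens with a fixed lexeme return their canonical spelling; all other tokens are
+/// re-tokenized from the buffer of the source recorded in their location.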
+pub fn expandedSlice(pp: *Preprocessor, tok: Token) []const u8 { + return pp.expandedSliceExtra(tok, .single_macro_ws); +} + +/// Concat two tokens and add the result to pp.generated +fn pasteTokens(pp: *Preprocessor, lhs_toks: *ExpandBuf, rhs_toks: []const Token) Error!void { + const lhs = while (lhs_toks.popOrNull()) |lhs| { + if (lhs.id == .macro_ws) + Token.free(lhs.expansion_locs, pp.gpa) + else + break lhs; + } else { + return bufCopyTokens(lhs_toks, rhs_toks, &.{}); + }; + + var rhs_rest: u32 = 1; + const rhs = for (rhs_toks) |rhs| { + if (rhs.id != .macro_ws) break rhs; + rhs_rest += 1; + } else { + return lhs_toks.appendAssumeCapacity(lhs); + }; + defer Token.free(lhs.expansion_locs, pp.gpa); + + const start = pp.comp.generated_buf.items.len; + const end = start + pp.expandedSlice(lhs).len + pp.expandedSlice(rhs).len; + try pp.comp.generated_buf.ensureTotalCapacity(end + 1); // +1 for a newline + // We cannot use the same slices here since they might be invalidated by `ensureCapacity` + pp.comp.generated_buf.appendSliceAssumeCapacity(pp.expandedSlice(lhs)); + pp.comp.generated_buf.appendSliceAssumeCapacity(pp.expandedSlice(rhs)); + pp.comp.generated_buf.appendAssumeCapacity('\n'); + + // Try to tokenize the result. + var tmp_tokenizer = Tokenizer{ + .buf = pp.comp.generated_buf.items, + .comp = pp.comp, + .index = @intCast(start), + .source = .generated, + }; + const pasted_token = tmp_tokenizer.nextNoWS(); + const next = tmp_tokenizer.nextNoWS().id; + if (next != .nl and next != .eof) { + try pp.comp.diag.add(.{ + .tag = .pasting_formed_invalid, + .loc = lhs.loc, + .extra = .{ .str = try pp.comp.diag.arena.allocator().dupe( + u8, + pp.comp.generated_buf.items[start..end], + ) }, + }, lhs.expansionSlice()); + } + + const pasted_id = if (lhs.id == .placemarker and rhs.id == .placemarker) + .placemarker + else + pasted_token.id; + try lhs_toks.append(try pp.makeGeneratedToken(start, pasted_id, lhs)); + try bufCopyTokens(lhs_toks, rhs_toks[rhs_rest..], &.{}); +} + +fn makeGeneratedToken(pp: *Preprocessor, start: usize, id: Token.Id, source: Token) !Token { + var pasted_token = Token{ .id = id, .loc = .{ + .id = .generated, + .byte_offset = @intCast(start), + .line = pp.generated_line, + } }; + pp.generated_line += 1; + try pasted_token.addExpansionLocation(pp.gpa, &.{source.loc}); + try pasted_token.addExpansionLocation(pp.gpa, source.expansionSlice()); + return pasted_token; +} + +/// Defines a new macro and warns if it is a duplicate +fn defineMacro(pp: *Preprocessor, name_tok: RawToken, macro: Macro) Error!void { + const name_str = pp.tokSlice(name_tok); + const gop = try pp.defines.getOrPut(name_str); + if (gop.found_existing and !gop.value_ptr.eql(macro, pp)) { + try pp.comp.diag.add(.{ + .tag = if (gop.value_ptr.is_builtin) .builtin_macro_redefined else .macro_redefined, + .loc = .{ .id = name_tok.source, .byte_offset = name_tok.start, .line = name_tok.line }, + .extra = .{ .str = name_str }, + }, &.{}); + // TODO add a previous definition note + } + if (pp.verbose) { + pp.verboseLog(name_tok, "macro {s} defined", .{name_str}); + } + gop.value_ptr.* = macro; +} + +/// Handle a #define directive. +fn define(pp: *Preprocessor, tokenizer: *Tokenizer) Error!void { + // Get macro name and validate it. 
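+ // e.g. in `#define FOO 1` the name token is FOO; `defined` and non-identifier names are rejected below.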
+ const macro_name = tokenizer.nextNoWS(); + if (macro_name.id == .keyword_defined) { + try pp.err(macro_name, .defined_as_macro_name); + return skipToNl(tokenizer); + } + if (!macro_name.id.isMacroIdentifier()) { + try pp.err(macro_name, .macro_name_must_be_identifier); + return skipToNl(tokenizer); + } + var macro_name_token_id = macro_name.id; + macro_name_token_id.simplifyMacroKeyword(); + switch (macro_name_token_id) { + .identifier, .extended_identifier => {}, + else => if (macro_name_token_id.isMacroIdentifier()) { + try pp.err(macro_name, .keyword_macro); + }, + } + + // Check for function macros and empty defines. + var first = tokenizer.next(); + switch (first.id) { + .nl, .eof => return pp.defineMacro(macro_name, .{ + .params = undefined, + .tokens = undefined, + .var_args = false, + .loc = undefined, + .is_func = false, + }), + .whitespace => first = tokenizer.next(), + .l_paren => return pp.defineFn(tokenizer, macro_name, first), + else => try pp.err(first, .whitespace_after_macro_name), + } + if (first.id == .hash_hash) { + try pp.err(first, .hash_hash_at_start); + return skipToNl(tokenizer); + } + first.id.simplifyMacroKeyword(); + + pp.token_buf.items.len = 0; // Safe to use since we can only be in one directive at a time. + + var need_ws = false; + // Collect the token body and validate any ## found. + var tok = first; + const end_index = while (true) { + tok.id.simplifyMacroKeyword(); + switch (tok.id) { + .hash_hash => { + const next = tokenizer.nextNoWS(); + switch (next.id) { + .nl, .eof => { + try pp.err(tok, .hash_hash_at_end); + return; + }, + .hash_hash => { + try pp.err(next, .hash_hash_at_end); + return; + }, + else => {}, + } + try pp.token_buf.append(tok); + try pp.token_buf.append(next); + }, + .nl, .eof => break tok.start, + .whitespace => need_ws = true, + else => { + if (tok.id != .whitespace and need_ws) { + need_ws = false; + try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); + } + try pp.token_buf.append(tok); + }, + } + tok = tokenizer.next(); + } else unreachable; + + const list = try pp.arena.allocator().dupe(RawToken, pp.token_buf.items); + try pp.defineMacro(macro_name, .{ + .loc = .{ + .id = macro_name.source, + .byte_offset = first.start, + .line = end_index, + }, + .tokens = list, + .params = undefined, + .is_func = false, + .var_args = false, + }); +} + +/// Handle a function like #define directive. +fn defineFn(pp: *Preprocessor, tokenizer: *Tokenizer, macro_name: RawToken, l_paren: RawToken) Error!void { + assert(macro_name.id.isMacroIdentifier()); + var params = std.ArrayList([]const u8).init(pp.gpa); + defer params.deinit(); + + // Parse the parameter list. 
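+ // e.g. `#define MAX(a, b) ...` collects the parameters "a" and "b"; a trailing `...` marks the
+ // macro as variadic, and the GNU named form `#define P(args...)` is tracked via `gnu_var_args`.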
+ var gnu_var_args: []const u8 = ""; + var var_args = false; + const start_index = while (true) { + var tok = tokenizer.nextNoWS(); + if (tok.id == .r_paren) break tok.end; + if (tok.id == .eof) return pp.err(tok, .unterminated_macro_param_list); + if (tok.id == .ellipsis) { + var_args = true; + const r_paren = tokenizer.nextNoWS(); + if (r_paren.id != .r_paren) { + try pp.err(r_paren, .missing_paren_param_list); + try pp.err(l_paren, .to_match_paren); + return skipToNl(tokenizer); + } + break r_paren.end; + } + if (!tok.id.isMacroIdentifier()) { + try pp.err(tok, .invalid_token_param_list); + return skipToNl(tokenizer); + } + + try params.append(pp.tokSlice(tok)); + + tok = tokenizer.nextNoWS(); + if (tok.id == .ellipsis) { + try pp.err(tok, .gnu_va_macro); + gnu_var_args = params.pop(); + const r_paren = tokenizer.nextNoWS(); + if (r_paren.id != .r_paren) { + try pp.err(r_paren, .missing_paren_param_list); + try pp.err(l_paren, .to_match_paren); + return skipToNl(tokenizer); + } + break r_paren.end; + } else if (tok.id == .r_paren) { + break tok.end; + } else if (tok.id != .comma) { + try pp.err(tok, .expected_comma_param_list); + return skipToNl(tokenizer); + } + } else unreachable; + + var need_ws = false; + // Collect the body tokens and validate # and ##'s found. + pp.token_buf.items.len = 0; // Safe to use since we can only be in one directive at a time. + const end_index = tok_loop: while (true) { + var tok = tokenizer.next(); + switch (tok.id) { + .nl, .eof => break tok.start, + .whitespace => need_ws = pp.token_buf.items.len != 0, + .hash => { + if (tok.id != .whitespace and need_ws) { + need_ws = false; + try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); + } + const param = tokenizer.nextNoWS(); + blk: { + if (var_args and param.id == .keyword_va_args) { + tok.id = .stringify_va_args; + try pp.token_buf.append(tok); + continue :tok_loop; + } + if (!param.id.isMacroIdentifier()) break :blk; + const s = pp.tokSlice(param); + if (mem.eql(u8, s, gnu_var_args)) { + tok.id = .stringify_va_args; + try pp.token_buf.append(tok); + continue :tok_loop; + } + for (params.items, 0..) |p, i| { + if (mem.eql(u8, p, s)) { + tok.id = .stringify_param; + tok.end = @intCast(i); + try pp.token_buf.append(tok); + continue :tok_loop; + } + } + } + try pp.err(param, .hash_not_followed_param); + return skipToNl(tokenizer); + }, + .hash_hash => { + need_ws = false; + // if ## appears at the beginning, the token buf is still empty + // in this case, error out + if (pp.token_buf.items.len == 0) { + try pp.err(tok, .hash_hash_at_start); + return skipToNl(tokenizer); + } + const saved_tokenizer = tokenizer.*; + const next = tokenizer.nextNoWS(); + if (next.id == .nl or next.id == .eof) { + try pp.err(tok, .hash_hash_at_end); + return; + } + tokenizer.* = saved_tokenizer; + // convert the previous token to .macro_param_no_expand if it was .macro_param + if (pp.token_buf.items[pp.token_buf.items.len - 1].id == .macro_param) { + pp.token_buf.items[pp.token_buf.items.len - 1].id = .macro_param_no_expand; + } + try pp.token_buf.append(tok); + }, + else => { + if (tok.id != .whitespace and need_ws) { + need_ws = false; + try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); + } + if (var_args and tok.id == .keyword_va_args) { + // do nothing + } else if (tok.id.isMacroIdentifier()) { + tok.id.simplifyMacroKeyword(); + const s = pp.tokSlice(tok); + if (mem.eql(u8, gnu_var_args, s)) { + tok.id = .keyword_va_args; + } else for (params.items, 0..) 
|param, i| { + if (mem.eql(u8, param, s)) { + // NOTE: it doesn't matter to assign .macro_param_no_expand + // here in case a ## was the previous token, because + // ## processing will eat this token with the same semantics + tok.id = .macro_param; + tok.end = @intCast(i); + break; + } + } + } + try pp.token_buf.append(tok); + }, + } + } else unreachable; + + const param_list = try pp.arena.allocator().dupe([]const u8, params.items); + const token_list = try pp.arena.allocator().dupe(RawToken, pp.token_buf.items); + try pp.defineMacro(macro_name, .{ + .is_func = true, + .params = param_list, + .var_args = var_args or gnu_var_args.len != 0, + .tokens = token_list, + .loc = .{ + .id = macro_name.source, + .byte_offset = start_index, + .line = end_index, + }, + }); +} + +/// Handle an #embed directive +fn embed(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!void { + const first = tokenizer.nextNoWS(); + const filename_tok = pp.findIncludeFilenameToken(first, tokenizer, .expect_nl_eof) catch |er| switch (er) { + error.InvalidInclude => return, + else => |e| return e, + }; + + // Check for empty filename. + const tok_slice = pp.expandedSlice(filename_tok); + if (tok_slice.len < 3) { + try pp.err(first, .empty_filename); + return; + } + const filename = tok_slice[1 .. tok_slice.len - 1]; + const include_type: Compilation.IncludeType = switch (filename_tok.id) { + .string_literal => .quotes, + .macro_string => .angle_brackets, + else => unreachable, + }; + + const embed_bytes = (try pp.comp.findEmbed(filename, first.source, include_type)) orelse return pp.fatal(first, "'{s}' not found", .{filename}); + defer pp.comp.gpa.free(embed_bytes); + + if (embed_bytes.len == 0) return; + + try pp.tokens.ensureUnusedCapacity(pp.comp.gpa, 2 * embed_bytes.len - 1); // N bytes and N-1 commas + + // TODO: We currently only support systems with CHAR_BIT == 8 + // If the target's CHAR_BIT is not 8, we need to write out correctly-sized embed_bytes + // and correctly account for the target's endianness + const writer = pp.comp.generated_buf.writer(); + + { + const byte = embed_bytes[0]; + const start = pp.comp.generated_buf.items.len; + try writer.print("{d}", .{byte}); + pp.tokens.appendAssumeCapacity(try pp.makeGeneratedToken(start, .embed_byte, filename_tok)); + } + + for (embed_bytes[1..]) |byte| { + const start = pp.comp.generated_buf.items.len; + try writer.print(",{d}", .{byte}); + pp.tokens.appendAssumeCapacity(.{ .id = .comma, .loc = .{ .id = .generated, .byte_offset = @intCast(start) } }); + pp.tokens.appendAssumeCapacity(try pp.makeGeneratedToken(start + 1, .embed_byte, filename_tok)); + } + try pp.comp.generated_buf.append('\n'); +} + +// Handle a #include directive. 
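+// Both the `"file.h"` and `<file.h>` forms are accepted; locating the file is delegated to
+// `Compilation.findInclude` via `findIncludeSource` below.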
+fn include(pp: *Preprocessor, tokenizer: *Tokenizer, which: Compilation.WhichInclude) MacroError!void { + const first = tokenizer.nextNoWS(); + const new_source = findIncludeSource(pp, tokenizer, first, which) catch |er| switch (er) { + error.InvalidInclude => return, + else => |e| return e, + }; + + // Prevent stack overflow + pp.include_depth += 1; + defer pp.include_depth -= 1; + if (pp.include_depth > max_include_depth) { + try pp.comp.diag.add(.{ + .tag = .too_many_includes, + .loc = .{ .id = first.source, .byte_offset = first.start, .line = first.line }, + }, &.{}); + return error.StopPreprocessing; + } + + if (pp.include_guards.get(new_source.id)) |guard| { + if (pp.defines.contains(guard)) return; + } + + if (pp.verbose) { + pp.verboseLog(first, "include file {s}", .{new_source.path}); + } + + _ = pp.preprocessExtra(new_source) catch |er| switch (er) { + error.StopPreprocessing => {}, + else => |e| return e, + }; +} + +/// tokens that are part of a pragma directive can happen in 3 ways: +/// 1. directly in the text via `#pragma ...` +/// 2. Via a string literal argument to `_Pragma` +/// 3. Via a stringified macro argument which is used as an argument to `_Pragma` +/// operator_loc: Location of `_Pragma`; null if this is from #pragma +/// arg_locs: expansion locations of the argument to _Pragma. empty if #pragma or a raw string literal was used +fn makePragmaToken(pp: *Preprocessor, raw: RawToken, operator_loc: ?Source.Location, arg_locs: []const Source.Location) !Token { + var tok = tokFromRaw(raw); + if (operator_loc) |loc| { + try tok.addExpansionLocation(pp.gpa, &.{loc}); + } + try tok.addExpansionLocation(pp.gpa, arg_locs); + return tok; +} + +/// Handle a pragma directive +fn pragma(pp: *Preprocessor, tokenizer: *Tokenizer, pragma_tok: RawToken, operator_loc: ?Source.Location, arg_locs: []const Source.Location) !void { + const name_tok = tokenizer.nextNoWS(); + if (name_tok.id == .nl or name_tok.id == .eof) return; + + const name = pp.tokSlice(name_tok); + try pp.tokens.append(pp.gpa, try pp.makePragmaToken(pragma_tok, operator_loc, arg_locs)); + const pragma_start: u32 = @intCast(pp.tokens.len); + + const pragma_name_tok = try pp.makePragmaToken(name_tok, operator_loc, arg_locs); + try pp.tokens.append(pp.gpa, pragma_name_tok); + while (true) { + const next_tok = tokenizer.next(); + if (next_tok.id == .whitespace) continue; + if (next_tok.id == .eof) { + try pp.tokens.append(pp.gpa, .{ + .id = .nl, + .loc = .{ .id = .generated }, + }); + break; + } + try pp.tokens.append(pp.gpa, try pp.makePragmaToken(next_tok, operator_loc, arg_locs)); + if (next_tok.id == .nl) break; + } + if (pp.comp.getPragma(name)) |prag| unknown: { + return prag.preprocessorCB(pp, pragma_start) catch |er| switch (er) { + error.UnknownPragma => break :unknown, + else => |e| return e, + }; + } + return pp.comp.diag.add(.{ + .tag = .unknown_pragma, + .loc = pragma_name_tok.loc, + }, pragma_name_tok.expansionSlice()); +} + +fn findIncludeFilenameToken( + pp: *Preprocessor, + first_token: RawToken, + tokenizer: *Tokenizer, + trailing_token_behavior: enum { ignore_trailing_tokens, expect_nl_eof }, +) !Token { + const start = pp.tokens.len; + defer pp.tokens.len = start; + var first = first_token; + + if (first.id == .angle_bracket_left) to_end: { + // The tokenizer does not handle <foo> include strings so do it here. 
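+ // Scan forward to the closing '>' so that e.g. `<stdio.h>` becomes a single .macro_string token.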
+ while (tokenizer.index < tokenizer.buf.len) : (tokenizer.index += 1) { + switch (tokenizer.buf[tokenizer.index]) { + '>' => { + tokenizer.index += 1; + first.end = tokenizer.index; + first.id = .macro_string; + break :to_end; + }, + '\n' => break, + else => {}, + } + } + try pp.comp.diag.add(.{ + .tag = .header_str_closing, + .loc = .{ .id = first.source, .byte_offset = tokenizer.index, .line = first.line }, + }, &.{}); + try pp.err(first, .header_str_match); + } + // Try to expand if the argument is a macro. + try pp.expandMacro(tokenizer, first); + + // Check that we actually got a string. + const filename_tok = pp.tokens.get(start); + switch (filename_tok.id) { + .string_literal, .macro_string => {}, + else => { + try pp.err(first, .expected_filename); + try pp.expectNl(tokenizer); + return error.InvalidInclude; + }, + } + switch (trailing_token_behavior) { + .expect_nl_eof => { + // Error on extra tokens. + const nl = tokenizer.nextNoWS(); + if ((nl.id != .nl and nl.id != .eof) or pp.tokens.len > start + 1) { + skipToNl(tokenizer); + try pp.err(first, .extra_tokens_directive_end); + } + }, + .ignore_trailing_tokens => {}, + } + return filename_tok; +} + +fn findIncludeSource(pp: *Preprocessor, tokenizer: *Tokenizer, first: RawToken, which: Compilation.WhichInclude) !Source { + const filename_tok = try pp.findIncludeFilenameToken(first, tokenizer, .expect_nl_eof); + + // Check for empty filename. + const tok_slice = pp.expandedSlice(filename_tok); + if (tok_slice.len < 3) { + try pp.err(first, .empty_filename); + return error.InvalidInclude; + } + + // Find the file. + const filename = tok_slice[1 .. tok_slice.len - 1]; + const include_type: Compilation.IncludeType = switch (filename_tok.id) { + .string_literal => .quotes, + .macro_string => .angle_brackets, + else => unreachable, + }; + + return (try pp.comp.findInclude(filename, first.source, include_type, which)) orelse + pp.fatal(first, "'{s}' not found", .{filename}); +} + +/// Pretty print tokens and try to preserve whitespace. 
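+/// Writes the token stream to `w`; #pragma directives are re-emitted unless a registered
+/// handler reports (via `shouldPreserveTokens`) that they should be dropped.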
+pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { + var i: u32 = 0; + while (true) : (i += 1) { + var cur: Token = pp.tokens.get(i); + switch (cur.id) { + .eof => { + if (pp.tokens.len > 1 and pp.tokens.items(.id)[i - 1] != .nl) try w.writeByte('\n'); + break; + }, + .nl => try w.writeAll("\n"), + .keyword_pragma => { + const pragma_name = pp.expandedSlice(pp.tokens.get(i + 1)); + const end_idx = mem.indexOfScalarPos(Token.Id, pp.tokens.items(.id), i, .nl) orelse i + 1; + const pragma_len = @as(u32, @intCast(end_idx)) - i; + + if (pp.comp.getPragma(pragma_name)) |prag| { + if (!prag.shouldPreserveTokens(pp, i + 1)) { + i += pragma_len; + cur = pp.tokens.get(i); + continue; + } + } + try w.writeAll("#pragma"); + i += 1; + while (true) : (i += 1) { + cur = pp.tokens.get(i); + if (cur.id == .nl) { + try w.writeByte('\n'); + break; + } + try w.writeByte(' '); + const slice = pp.expandedSlice(cur); + try w.writeAll(slice); + } + }, + .whitespace => { + var slice = pp.expandedSlice(cur); + while (mem.indexOfScalar(u8, slice, '\n')) |some| { + try w.writeByte('\n'); + slice = slice[some + 1 ..]; + } + for (slice) |_| try w.writeByte(' '); + }, + else => { + const slice = pp.expandedSlice(cur); + try w.writeAll(slice); + }, + } + } +} + +test "Preserve pragma tokens sometimes" { + const allocator = std.testing.allocator; + const Test = struct { + fn runPreprocessor(source_text: []const u8) ![]const u8 { + var buf = std.ArrayList(u8).init(allocator); + defer buf.deinit(); + + var comp = Compilation.init(allocator); + defer comp.deinit(); + + try comp.addDefaultPragmaHandlers(); + + var pp = Preprocessor.init(&comp); + defer pp.deinit(); + + pp.preserve_whitespace = true; + + const test_runner_macros = try comp.addSourceFromBuffer("<test_runner>", source_text); + const eof = try pp.preprocess(test_runner_macros); + try pp.tokens.append(pp.gpa, eof); + try pp.prettyPrintTokens(buf.writer()); + return allocator.dupe(u8, buf.items); + } + + fn check(source_text: []const u8, expected: []const u8) !void { + const output = try runPreprocessor(source_text); + defer allocator.free(output); + + try std.testing.expectEqualStrings(expected, output); + } + }; + const preserve_gcc_diagnostic = + \\#pragma GCC diagnostic error "-Wnewline-eof" + \\#pragma GCC warning error "-Wnewline-eof" + \\int x; + \\#pragma GCC ignored error "-Wnewline-eof" + \\ + ; + try Test.check(preserve_gcc_diagnostic, preserve_gcc_diagnostic); + + const omit_once = + \\#pragma once + \\int x; + \\#pragma once + \\ + ; + try Test.check(omit_once, "int x;\n"); + + const omit_poison = + \\#pragma GCC poison foobar + \\ + ; + try Test.check(omit_poison, ""); +} + +test "destringify" { + const allocator = std.testing.allocator; + const Test = struct { + fn testDestringify(pp: *Preprocessor, stringified: []const u8, destringified: []const u8) !void { + pp.char_buf.clearRetainingCapacity(); + try pp.char_buf.ensureUnusedCapacity(stringified.len); + pp.destringify(stringified); + try std.testing.expectEqualStrings(destringified, pp.char_buf.items); + } + }; + var comp = Compilation.init(allocator); + defer comp.deinit(); + var pp = Preprocessor.init(&comp); + defer pp.deinit(); + + try Test.testDestringify(&pp, "hello\tworld\n", "hello\tworld\n"); + try Test.testDestringify(&pp, + \\ \"FOO BAR BAZ\" + , + \\ "FOO BAR BAZ" + ); + try Test.testDestringify(&pp, + \\ \\t\\n + \\ + , + \\ \t\n + \\ + ); +} + +test "Include guards" { + const Test = struct { + /// This is here so that when #elifdef / #elifndef are added we don't forget + 
/// to test that they don't accidentally break include guard detection + fn pairsWithIfndef(tok_id: RawToken.Id) bool { + return switch (tok_id) { + .keyword_elif, + .keyword_elifdef, + .keyword_elifndef, + .keyword_else, + => true, + + .keyword_include, + .keyword_include_next, + .keyword_embed, + .keyword_define, + .keyword_defined, + .keyword_undef, + .keyword_ifdef, + .keyword_ifndef, + .keyword_error, + .keyword_warning, + .keyword_pragma, + .keyword_line, + .keyword_endif, + => false, + else => unreachable, + }; + } + + fn skippable(tok_id: RawToken.Id) bool { + return switch (tok_id) { + .keyword_defined, .keyword_va_args, .keyword_endif => true, + else => false, + }; + } + + fn testIncludeGuard(allocator: std.mem.Allocator, comptime template: []const u8, tok_id: RawToken.Id, expected_guards: u32) !void { + var comp = Compilation.init(allocator); + defer comp.deinit(); + var pp = Preprocessor.init(&comp); + defer pp.deinit(); + + const path = try std.fs.path.join(allocator, &.{ ".", "bar.h" }); + defer allocator.free(path); + + _ = try comp.addSourceFromBuffer(path, "int bar = 5;\n"); + + var buf = std.ArrayList(u8).init(allocator); + defer buf.deinit(); + + var writer = buf.writer(); + switch (tok_id) { + .keyword_include, .keyword_include_next => try writer.print(template, .{ tok_id.lexeme().?, " \"bar.h\"" }), + .keyword_define, .keyword_undef => try writer.print(template, .{ tok_id.lexeme().?, " BAR" }), + .keyword_ifndef, + .keyword_ifdef, + .keyword_elifdef, + .keyword_elifndef, + => try writer.print(template, .{ tok_id.lexeme().?, " BAR\n#endif" }), + else => try writer.print(template, .{ tok_id.lexeme().?, "" }), + } + const source = try comp.addSourceFromBuffer("test.h", buf.items); + _ = try pp.preprocess(source); + + try std.testing.expectEqual(expected_guards, pp.include_guards.count()); + } + }; + const tags = std.meta.tags(RawToken.Id); + for (tags) |tag| { + if (Test.skippable(tag)) continue; + var copy = tag; + copy.simplifyMacroKeyword(); + if (copy != tag or tag == .keyword_else) { + const inside_ifndef_template = + \\//Leading comment (should be ignored) + \\ + \\#ifndef FOO + \\#{s}{s} + \\#endif + ; + const expected_guards: u32 = if (Test.pairsWithIfndef(tag)) 0 else 1; + try Test.testIncludeGuard(std.testing.allocator, inside_ifndef_template, tag, expected_guards); + + const outside_ifndef_template = + \\#ifndef FOO + \\#endif + \\#{s}{s} + ; + try Test.testIncludeGuard(std.testing.allocator, outside_ifndef_template, tag, 0); + } + } +} |
