diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/grammar.txt | 26 | ||||
| -rw-r--r-- | src/parsergen.cpp | 171 | ||||
| -rw-r--r-- | src/tokenizer.hpp | 6 |
3 files changed, 171 insertions, 32 deletions
diff --git a/src/grammar.txt b/src/grammar.txt index 4c3584d313..4de2a95b3e 100644 --- a/src/grammar.txt +++ b/src/grammar.txt @@ -1,46 +1,50 @@ -Root : many(FnDecl) token(EOF) { +Root<node> : many(FnDecl) token(EOF) { $$ = ast_create_root($1); }; -FnDecl : token(Fn) token(Symbol) token(LParen) list(ParamDecl, token(Comma)) token(RParen) option(token(Arrow) Type) Block { +FnDecl<node> : token(Fn) token(Symbol) token(LParen) list(ParamDecl, token(Comma)) token(RParen) option(ReturnType) Block { $$ = ast_create_fn_decl($2, $4, $6, $7); }; -ParamDecl : token(Symbol) token(Colon) Type { +ReturnType<node> : token(Arrow) Type { + $$ = $2; +}; + +ParamDecl<node> : token(Symbol) token(Colon) Type { $$ = ast_create_param_decl($1, $2); }; -Type : token(Symbol) { +Type<node> : token(Symbol) { $$ = ast_create_symbol_type($1); } | PointerType { $$ = $1; }; -PointerType : token(Star) token(Const) Type { +PointerType<node> : token(Star) token(Const) Type { $$ = ast_create_pointer_type($2, $3); } | token(Star) token(Mut) Type { $$ = ast_create_pointer_type($2, $3); }; -Block : token(LBrace) many(Statement) option(Expression) token(RBrace) { +Block<node> : token(LBrace) many(Statement) option(Expression) token(RBrace) { $$ = ast_create_block($2, $3); }; -Statement : ExpressionStatement { +Statement<node> : ExpressionStatement { $$ = $1; } | ReturnStatement { $$ = $1; }; -ExpressionStatement : Expression token(Semicolon) { +ExpressionStatement<node> : Expression token(Semicolon) { $$ = ast_create_expression_statement($1); }; -ReturnStatement : token(Return) Expression token(Semicolon) { +ReturnStatement<node> : token(Return) Expression token(Semicolon) { $$ = ast_create_return_statement($2); }; -Expression : token(Number) { +Expression<node> : token(Number) { $$ = ast_create_number($1); } | token(String) { $$ = ast_create_string($1); @@ -48,6 +52,6 @@ Expression : token(Number) { $$ = $1; }; -FnCall : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) { +FnCall<node> : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) { $$ = ast_create_fn_call($1, $3); }; diff --git a/src/parsergen.cpp b/src/parsergen.cpp index 31bb1a8493..952b4592f2 100644 --- a/src/parsergen.cpp +++ b/src/parsergen.cpp @@ -136,6 +136,7 @@ struct RuleTuple { Buf name; ZigList<RuleNode *> children; Buf body; + Buf union_field_name; }; struct RuleMany { @@ -161,6 +162,9 @@ struct RuleList { struct RuleSubRule { RuleNode *child; + + // for lexer use only + Buf name; }; enum RuleNodeType { @@ -175,6 +179,8 @@ enum RuleNodeType { struct RuleNode { RuleNodeType type; + int lex_line; + int lex_column; union { RuleTuple tuple; RuleMany many; @@ -205,6 +211,7 @@ struct CodeGenCapture { Buf *body; bool is_root; Buf *field_names; + Buf *union_field_name; }; struct CodeGen { @@ -225,6 +232,8 @@ struct ParserState { enum LexState { LexStateStart, LexStateRuleName, + LexStateRuleFieldNameStart, + LexStateRuleFieldName, LexStateWaitForColon, LexStateTupleRule, LexStateFnName, @@ -232,6 +241,7 @@ enum LexState { LexStateToken, LexStateBody, LexStateEndOrOr, + LexStateSubTupleName, }; struct LexStack { @@ -258,6 +268,8 @@ struct Gen { int lex_token_name_begin; int lex_body_begin; int lex_body_end; + int lex_sub_tuple_begin; + int lex_field_name_begin; }; static ParserState *create_state(Gen *g) { @@ -303,12 +315,13 @@ static void state_add_push_node(ParserState *state) { state_add_code(state, code); } -static CodeGen *codegen_create_capture(Buf *body, bool is_root, int field_name_count) { +static CodeGen *codegen_create_capture(Buf *body, bool is_root, int field_name_count, Buf *union_field_name) { CodeGen *code = allocate<CodeGen>(1); code->type = CodeGenTypeCapture; code->capture.body = body; code->capture.is_root = is_root; code->capture.field_names = allocate<Buf>(field_name_count); + code->capture.union_field_name = union_field_name; return code; } @@ -325,6 +338,7 @@ static void state_add_eat_token(ParserState *state) { } static void gen(Gen *g, RuleNode *node, Buf *out_field_name) { + assert(node); switch (node->type) { case RuleNodeTypeToken: { @@ -346,13 +360,14 @@ static void gen(Gen *g, RuleNode *node, Buf *out_field_name) { break; case RuleNodeTypeTuple: { - buf_init_from_str(out_field_name, "node"); + buf_init_from_buf(out_field_name, &node->tuple.union_field_name); state_add_push_node(g->cur_state); bool is_root = (node == g->root); int field_name_count = node->tuple.children.length; - CodeGen *code = codegen_create_capture(&node->tuple.body, is_root, field_name_count); + CodeGen *code = codegen_create_capture(&node->tuple.body, is_root, field_name_count, + &node->tuple.union_field_name); for (int i = 0; i < node->tuple.children.length; i += 1) { RuleNode *child = node->tuple.children.at(i); @@ -411,7 +426,7 @@ static void lex_error(Gen *g, const char *format, ...) { va_list ap; va_start(ap, format); - fprintf(stderr, "Error: Line %d, column %d: ", line, column); + fprintf(stderr, "Grammar Error: Line %d, column %d: ", line, column); vfprintf(stderr, format, ap); fprintf(stderr, "\n"); va_end(ap); @@ -428,10 +443,16 @@ static void lex_pop_stack(Gen *g) { g->lex_stack.pop(); } +static RuleNode *create_rule_node(Gen *g) { + RuleNode *node = allocate<RuleNode>(1); + node->lex_line = g->lex_line; + node->lex_column = g->lex_column; + return node; +} static void begin_rule(Gen *g) { assert(!g->lex_cur_rule); - g->lex_cur_rule = allocate<RuleNode>(1); + g->lex_cur_rule = create_rule_node(g); g->lex_cur_rule->type = RuleNodeTypeTuple; g->lex_cur_rule_begin = g->lex_pos; @@ -452,6 +473,18 @@ static void end_rule_name(Gen *g) { buf_init_from_mem(&g->lex_cur_rule->tuple.name, ptr, len); } +static void begin_rule_field_name(Gen *g) { + assert(g->lex_cur_rule); + g->lex_field_name_begin = g->lex_pos; +} + +static void end_rule_field_name(Gen *g) { + assert(g->lex_cur_rule); + char *ptr = &buf_ptr(g->in_buf)[g->lex_field_name_begin]; + int len = g->lex_pos - g->lex_field_name_begin; + buf_init_from_mem(&g->lex_cur_rule->tuple.union_field_name, ptr, len); +} + static void begin_fn_name(Gen *g) { g->lex_fn_name_begin = g->lex_pos; lex_push_stack(g); @@ -478,15 +511,13 @@ static void end_token_name(Gen *g) { buf_init_from_mem(&token_name, ptr, len); Token *token = find_or_create_token(g, &token_name); - RuleNode *node = allocate<RuleNode>(1); + RuleNode *node = create_rule_node(g); node->type = RuleNodeTypeToken; node->token.token = token; assert(g->lex_cur_rule->type == RuleNodeTypeTuple); g->lex_cur_rule->tuple.children.append(node); - g->biggest_tuple_len = max(g->biggest_tuple_len, g->lex_cur_rule->tuple.children.length); - lex_pop_stack(g); } @@ -504,6 +535,36 @@ static void end_tuple_body(Gen *g) { buf_init_from_mem(&g->lex_cur_rule->tuple.body, ptr, len); } +static void begin_sub_tuple(Gen *g) { + g->lex_sub_tuple_begin = g->lex_pos; + lex_push_stack(g); +} + +static void end_sub_tuple(Gen *g) { + assert(g->lex_cur_rule->type == RuleNodeTypeTuple); + char *ptr = &buf_ptr(g->in_buf)[g->lex_sub_tuple_begin]; + int len = g->lex_pos - g->lex_sub_tuple_begin; + + RuleNode *node = create_rule_node(g); + node->type = RuleNodeTypeSubRule; + buf_init_from_mem(&node->sub_rule.name, ptr, len); + + g->lex_cur_rule->tuple.children.append(node); + + lex_pop_stack(g); +} + +static RuleNode *find_rule_node(Gen *g, Buf *name) { + for (int i = 0; i < g->rules.length; i += 1) { + RuleNode *node = g->rules.at(i); + assert(node->type == RuleNodeTypeTuple); + if (buf_eql_buf(&node->tuple.name, name)) { + return node; + } + } + return nullptr; +} + static void initialize_rules(Gen *g) { g->lex_state = LexStateStart; for (g->lex_pos = 0; g->lex_pos < buf_len(g->in_buf); g->lex_pos += 1) { @@ -524,18 +585,36 @@ static void initialize_rules(Gen *g) { break; case LexStateRuleName: switch (c) { - case WHITESPACE: + case '<': end_rule_name(g); - g->lex_state = LexStateWaitForColon; + g->lex_state = LexStateRuleFieldNameStart; break; - case ':': - end_rule_name(g); - g->lex_state = LexStateTupleRule; + case SYMBOL_CHAR: + // ok break; + default: + lex_error(g, "expected '<', not '%c'", c); + } + break; + case LexStateRuleFieldNameStart: + switch (c) { case SYMBOL_CHAR: + begin_rule_field_name(g); + g->lex_state = LexStateRuleFieldName; break; default: - lex_error(g, "invalid char: '%c'", c); + lex_error(g, "expected field name, not '%c'", c); + } + break; + case LexStateRuleFieldName: + switch (c) { + case SYMBOL_CHAR: + // ok + break; + case '>': + end_rule_field_name(g); + g->lex_state = LexStateWaitForColon; + break; } break; case LexStateWaitForColon: @@ -559,12 +638,16 @@ static void initialize_rules(Gen *g) { begin_fn_name(g); g->lex_state = LexStateFnName; break; + case UPPER_ALPHA: + begin_sub_tuple(g); + g->lex_state = LexStateSubTupleName; + break; case '{': begin_tuple_body(g); g->lex_state = LexStateBody; break; default: - lex_error(g, "invalid char: '%c'", c); + lex_error(g, "expected rule, not '%c'", c); } break; case LexStateFnName: @@ -589,7 +672,7 @@ static void initialize_rules(Gen *g) { g->lex_state = LexStateToken; break; default: - lex_error(g, "invalid char '%c'", c); + lex_error(g, "expected token name, not '%c'", c); } break; case LexStateToken: @@ -601,7 +684,7 @@ static void initialize_rules(Gen *g) { end_token_name(g); break; default: - lex_error(g, "invalid char '%c'", c); + lex_error(g, "expected token name or ')', not '%c'", c); } break; case LexStateBody: @@ -627,6 +710,20 @@ static void initialize_rules(Gen *g) { default: lex_error(g, "expected ';' or '|'"); } + break; + case LexStateSubTupleName: + switch (c) { + case ALPHA: + // ignore + break; + case WHITESPACE: + end_sub_tuple(g); + assert(g->lex_state == LexStateTupleRule); + break; + default: + lex_error(g, "expected rule name, not '%c'", c); + } + break; } if (c == '\n') { g->lex_line += 1; @@ -647,9 +744,40 @@ static void initialize_rules(Gen *g) { case LexStateTokenStart: case LexStateToken: case LexStateBody: + case LexStateSubTupleName: + case LexStateRuleFieldNameStart: + case LexStateRuleFieldName: lex_error(g, "unexpected EOF"); break; } + + // Resolve child references into pointers + for (int tuple_i = 0; tuple_i < g->rules.length; tuple_i += 1) { + RuleNode *node = g->rules.at(tuple_i); + assert(node->type == RuleNodeTypeTuple); + + for (int child_i = 0; child_i < node->tuple.children.length; child_i += 1) { + RuleNode *child = node->tuple.children.at(child_i); + if (child->type == RuleNodeTypeSubRule) { + int line = child->lex_line + 1; + int column = child->lex_column + 1; + RuleNode *referenced_node = find_rule_node(g, &child->sub_rule.name); + if (!referenced_node) { + fprintf(stderr, "Grammar Error: Line %d, column %d: Rule not defined: '%s'\n", + line, column, buf_ptr(&child->sub_rule.name)); + } + child->sub_rule.child = referenced_node; + } + } + } + + + // calculate the biggest tuple len + for (int i = 0; i < g->rules.length; i += 1) { + RuleNode *node = g->rules.at(i); + assert(node->type == RuleNodeTypeTuple); + g->biggest_tuple_len = max(g->biggest_tuple_len, node->tuple.children.length); + } } enum TemplateState { @@ -828,6 +956,8 @@ int main(int argc, char **argv) { fprintf(out_f, " state = transition[%d][token->id];\n", state->index); break; case CodeGenTypeError: + fprintf(out_f, " token_index -= 1;\n"); + fprintf(out_f, " token = &tokens->at(token_index);\n"); fprintf(out_f, " ast_error(token, \"%s\");\n", buf_ptr(code->error.msg)); break; case CodeGenTypeSave: @@ -843,7 +973,12 @@ int main(int argc, char **argv) { fprintf(out_f, "%s\n", buf_ptr(code_text)); fprintf(out_f, " return root;\n"); } else { - zig_panic("TODO capture non-root"); + fprintf(out_f, " ParserGenNode *parent_node = stack.at(stack.length - 2);\n"); + Buf *dest = buf_sprintf("parent_node->data[parent_node->next_index++].%s", + buf_ptr(code->capture.union_field_name)); + Buf *code_text = fill_template(code->capture.body, buf_ptr(dest), + code->capture.field_names); + fprintf(out_f, "%s\n", buf_ptr(code_text)); } break; case CodeGenTypePopNode: diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp index 5baf9bc231..177b6daaae 100644 --- a/src/tokenizer.hpp +++ b/src/tokenizer.hpp @@ -36,15 +36,15 @@ enum TokenId { // TODO: debug delete this enum TokenId { - TokenIdStar = 0, - TokenIdLParen = 1, + TokenIdLParen = 0, + TokenIdRParen = 1, TokenIdEof = 2, + TokenIdStar = 3, TokenIdSymbol, TokenIdKeywordFn, TokenIdKeywordReturn, TokenIdKeywordMut, TokenIdKeywordConst, - TokenIdRParen, TokenIdComma, TokenIdLBrace, TokenIdRBrace, |
