diff options
| author | Andrew Kelley <superjoe30@gmail.com> | 2016-04-21 15:48:13 -0700 |
|---|---|---|
| committer | Andrew Kelley <superjoe30@gmail.com> | 2016-04-21 15:48:13 -0700 |
| commit | 35362f8137b2c5109e6bc39cb12048c016b5b580 (patch) | |
| tree | 7c0506d8fdb92341cb412ee984288e23c98187b6 /src/c_tokenizer.cpp | |
| parent | a380b803ac8b4eefcde4d3d552cdcbc8010aa798 (diff) | |
| download | zig-35362f8137b2c5109e6bc39cb12048c016b5b580.tar.gz zig-35362f8137b2c5109e6bc39cb12048c016b5b580.zip | |
better parsing of C macros
See #88
Diffstat (limited to 'src/c_tokenizer.cpp')
| -rw-r--r-- | src/c_tokenizer.cpp | 651 |
1 files changed, 651 insertions, 0 deletions
diff --git a/src/c_tokenizer.cpp b/src/c_tokenizer.cpp new file mode 100644 index 0000000000..ddcb5ba152 --- /dev/null +++ b/src/c_tokenizer.cpp @@ -0,0 +1,651 @@ +/* + * Copyright (c) 2016 Andrew Kelley + * + * This file is part of zig, which is MIT licensed. + * See http://opensource.org/licenses/MIT + */ + +#include "c_tokenizer.hpp" +#include <inttypes.h> + +#define WHITESPACE_EXCEPT_N \ + ' ': \ + case '\t': \ + case '\v': \ + case '\f' + +#define DIGIT_NON_ZERO \ + '1': \ + case '2': \ + case '3': \ + case '4': \ + case '5': \ + case '6': \ + case '7': \ + case '8': \ + case '9' + +#define DIGIT \ + '0': \ + case DIGIT_NON_ZERO + +#define ALPHA \ + 'a': \ + case 'b': \ + case 'c': \ + case 'd': \ + case 'e': \ + case 'f': \ + case 'g': \ + case 'h': \ + case 'i': \ + case 'j': \ + case 'k': \ + case 'l': \ + case 'm': \ + case 'n': \ + case 'o': \ + case 'p': \ + case 'q': \ + case 'r': \ + case 's': \ + case 't': \ + case 'u': \ + case 'v': \ + case 'w': \ + case 'x': \ + case 'y': \ + case 'z': \ + case 'A': \ + case 'B': \ + case 'C': \ + case 'D': \ + case 'E': \ + case 'F': \ + case 'G': \ + case 'H': \ + case 'I': \ + case 'J': \ + case 'K': \ + case 'L': \ + case 'M': \ + case 'N': \ + case 'O': \ + case 'P': \ + case 'Q': \ + case 'R': \ + case 'S': \ + case 'T': \ + case 'U': \ + case 'V': \ + case 'W': \ + case 'X': \ + case 'Y': \ + case 'Z' + +#define IDENT_START \ + ALPHA: \ + case '_' + +#define IDENT \ + IDENT_START: \ + case DIGIT + + +static void begin_token(CTokenize *ctok, CTokId id) { + assert(ctok->cur_tok == nullptr); + ctok->tokens.add_one(); + ctok->cur_tok = &ctok->tokens.last(); + ctok->cur_tok->id = id; + + switch (id) { + case CTokIdStrLit: + memset(&ctok->cur_tok->data.str_lit, 0, sizeof(Buf)); + buf_resize(&ctok->cur_tok->data.str_lit, 0); + break; + case CTokIdSymbol: + memset(&ctok->cur_tok->data.symbol, 0, sizeof(Buf)); + buf_resize(&ctok->cur_tok->data.symbol, 0); + break; + case CTokIdCharLit: + case CTokIdNumLitInt: + case CTokIdNumLitFloat: + case CTokIdMinus: + break; + } +} + +static void end_token(CTokenize *ctok) { + ctok->cur_tok = nullptr; +} + +static void mark_error(CTokenize *ctok) { + ctok->error = true; +} + +static void add_char(CTokenize *ctok, uint8_t c) { + assert(ctok->cur_tok); + if (ctok->cur_tok->id == CTokIdCharLit) { + ctok->cur_tok->data.char_lit = c; + ctok->state = CTokStateExpectEndQuot; + } else if (ctok->cur_tok->id == CTokIdStrLit) { + buf_append_char(&ctok->cur_tok->data.str_lit, c); + ctok->state = CTokStateString; + } else { + zig_unreachable(); + } +} + +static void hex_digit(CTokenize *ctok, uint8_t value) { + // TODO @mul_with_overflow + ctok->cur_tok->data.num_lit_int *= 16; + // TODO @add_with_overflow + ctok->cur_tok->data.num_lit_int += value; + + static const uint8_t hex_digit[] = "0123456789abcdef"; + buf_append_char(&ctok->buf, hex_digit[value]); +} + +static void end_float(CTokenize *ctok) { + // TODO detect errors, overflow, and underflow + double value = strtod(buf_ptr(&ctok->buf), nullptr); + + ctok->cur_tok->data.num_lit_float = value; + + end_token(ctok); + ctok->state = CTokStateStart; + +} + +void tokenize_c_macro(CTokenize *ctok, const uint8_t *c) { + ctok->tokens.resize(0); + ctok->state = CTokStateStart; + ctok->error = false; + ctok->cur_tok = nullptr; + + buf_resize(&ctok->buf, 0); + + for (; *c; c += 1) { + switch (ctok->state) { + case CTokStateStart: + switch (*c) { + case WHITESPACE_EXCEPT_N: + break; + case '\'': + ctok->state = CTokStateExpectChar; + begin_token(ctok, CTokIdCharLit); + break; + case '\"': + ctok->state = CTokStateString; + begin_token(ctok, CTokIdStrLit); + break; + case '/': + ctok->state = CTokStateOpenComment; + break; + case '\\': + ctok->state = CTokStateBackslash; + break; + case '\n': + goto found_end_of_macro; + case IDENT_START: + ctok->state = CTokStateIdentifier; + begin_token(ctok, CTokIdSymbol); + buf_append_char(&ctok->cur_tok->data.symbol, *c); + break; + case DIGIT_NON_ZERO: + ctok->state = CTokStateDecimal; + ctok->unsigned_suffix = false; + ctok->long_suffix = false; + begin_token(ctok, CTokIdNumLitInt); + ctok->cur_tok->data.num_lit_int = *c - '0'; + buf_resize(&ctok->buf, 0); + buf_append_char(&ctok->buf, *c); + break; + case '0': + ctok->state = CTokStateGotZero; + ctok->unsigned_suffix = false; + ctok->long_suffix = false; + begin_token(ctok, CTokIdNumLitInt); + ctok->cur_tok->data.num_lit_int = 0; + buf_resize(&ctok->buf, 0); + buf_append_char(&ctok->buf, '0'); + break; + case '.': + begin_token(ctok, CTokIdNumLitFloat); + ctok->state = CTokStateFloat; + buf_init_from_str(&ctok->buf, "0."); + break; + default: + return mark_error(ctok); + } + break; + case CTokStateFloat: + switch (*c) { + case 'e': + case 'E': + buf_append_char(&ctok->buf, 'e'); + ctok->state = CTokStateExpSign; + break; + case 'f': + case 'F': + case 'l': + case 'L': + end_float(ctok); + break; + case DIGIT: + buf_append_char(&ctok->buf, *c); + break; + default: + c -= 1; + end_float(ctok); + continue; + } + break; + case CTokStateExpSign: + switch (*c) { + case '+': + case '-': + ctok->state = CTokStateFloatExpFirst; + buf_append_char(&ctok->buf, *c); + break; + case DIGIT: + ctok->state = CTokStateFloatExp; + buf_append_char(&ctok->buf, *c); + break; + default: + return mark_error(ctok); + } + break; + case CTokStateFloatExpFirst: + switch (*c) { + case DIGIT: + buf_append_char(&ctok->buf, *c); + ctok->state = CTokStateFloatExp; + break; + default: + return mark_error(ctok); + } + break; + case CTokStateFloatExp: + switch (*c) { + case DIGIT: + buf_append_char(&ctok->buf, *c); + break; + case 'f': + case 'F': + case 'l': + case 'L': + end_float(ctok); + break; + default: + c -= 1; + end_float(ctok); + continue; + } + break; + case CTokStateDecimal: + switch (*c) { + case DIGIT: + buf_append_char(&ctok->buf, *c); + + // TODO @mul_with_overflow + ctok->cur_tok->data.num_lit_int *= 10; + // TODO @add_with_overflow + ctok->cur_tok->data.num_lit_int += *c - '0'; + break; + case '\'': + break; + case 'u': + case 'U': + ctok->unsigned_suffix = true; + ctok->state = CTokStateIntSuffix; + break; + case 'l': + case 'L': + ctok->long_suffix = true; + ctok->state = CTokStateIntSuffixLong; + break; + case '.': + buf_append_char(&ctok->buf, '.'); + ctok->cur_tok->id = CTokIdNumLitFloat; + ctok->state = CTokStateFloat; + break; + default: + c -= 1; + end_token(ctok); + ctok->state = CTokStateStart; + continue; + } + break; + case CTokStateIntSuffix: + switch (*c) { + case 'l': + case 'L': + if (ctok->long_suffix) { + return mark_error(ctok); + } + ctok->long_suffix = true; + ctok->state = CTokStateIntSuffixLong; + break; + case 'u': + case 'U': + if (ctok->unsigned_suffix) { + return mark_error(ctok); + } + ctok->unsigned_suffix = true; + break; + default: + c -= 1; + end_token(ctok); + ctok->state = CTokStateStart; + continue; + } + break; + case CTokStateIntSuffixLong: + switch (*c) { + case 'l': + case 'L': + ctok->state = CTokStateIntSuffix; + break; + case 'u': + case 'U': + if (ctok->unsigned_suffix) { + return mark_error(ctok); + } + ctok->unsigned_suffix = true; + break; + default: + c -= 1; + end_token(ctok); + ctok->state = CTokStateStart; + continue; + } + break; + case CTokStateGotZero: + switch (*c) { + case 'x': + case 'X': + ctok->state = CTokStateHex; + break; + case '.': + ctok->state = CTokStateFloat; + ctok->cur_tok->id = CTokIdNumLitFloat; + buf_append_char(&ctok->buf, '.'); + break; + default: + c -= 1; + ctok->state = CTokStateOctal; + continue; + } + break; + case CTokStateOctal: + switch (*c) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + // TODO @mul_with_overflow + ctok->cur_tok->data.num_lit_int *= 8; + // TODO @add_with_overflow + ctok->cur_tok->data.num_lit_int += *c - '0'; + break; + case '8': + case '9': + return mark_error(ctok); + case '\'': + break; + default: + c -= 1; + end_token(ctok); + ctok->state = CTokStateStart; + continue; + } + break; + case CTokStateHex: + switch (*c) { + case '0': + hex_digit(ctok, 0); + break; + case '1': + hex_digit(ctok, 1); + break; + case '2': + hex_digit(ctok, 2); + break; + case '3': + hex_digit(ctok, 3); + break; + case '4': + hex_digit(ctok, 4); + break; + case '5': + hex_digit(ctok, 5); + break; + case '6': + hex_digit(ctok, 6); + break; + case '7': + hex_digit(ctok, 7); + break; + case '8': + hex_digit(ctok, 8); + break; + case '9': + hex_digit(ctok, 9); + break; + case 'a': + case 'A': + hex_digit(ctok, 10); + break; + case 'b': + case 'B': + hex_digit(ctok, 11); + break; + case 'c': + case 'C': + hex_digit(ctok, 12); + break; + case 'd': + case 'D': + hex_digit(ctok, 13); + break; + case 'e': + case 'E': + hex_digit(ctok, 14); + break; + case 'f': + case 'F': + hex_digit(ctok, 15); + break; + case 'p': + case 'P': + ctok->cur_tok->id = CTokIdNumLitFloat; + ctok->state = CTokStateExpSign; + break; + default: + c -= 1; + end_token(ctok); + ctok->state = CTokStateStart; + continue; + } + break; + case CTokStateIdentifier: + switch (*c) { + case IDENT: + buf_append_char(&ctok->cur_tok->data.symbol, *c); + break; + default: + c -= 1; + end_token(ctok); + ctok->state = CTokStateStart; + continue; + } + break; + case CTokStateString: + switch (*c) { + case '\\': + ctok->state = CTokStateCharEscape; + break; + case '\"': + end_token(ctok); + ctok->state = CTokStateStart; + break; + default: + buf_append_char(&ctok->cur_tok->data.str_lit, *c); + } + break; + case CTokStateExpectChar: + switch (*c) { + case '\\': + ctok->state = CTokStateCharEscape; + break; + case '\'': + return mark_error(ctok); + default: + ctok->cur_tok->data.char_lit = *c; + ctok->state = CTokStateExpectEndQuot; + } + break; + case CTokStateCharEscape: + switch (*c) { + case '\'': + case '"': + case '?': + case '\\': + add_char(ctok, *c); + break; + case 'a': + add_char(ctok, '\a'); + break; + case 'b': + add_char(ctok, '\b'); + break; + case 'f': + add_char(ctok, '\f'); + break; + case 'n': + add_char(ctok, '\n'); + break; + case 'r': + add_char(ctok, '\r'); + break; + case 't': + add_char(ctok, '\t'); + break; + case 'v': + add_char(ctok, '\v'); + break; + case DIGIT: + zig_panic("TODO octal"); + break; + case 'x': + zig_panic("TODO hex"); + break; + case 'u': + zig_panic("TODO unicode"); + break; + case 'U': + zig_panic("TODO Unicode"); + break; + default: + return mark_error(ctok); + } + break; + case CTokStateExpectEndQuot: + switch (*c) { + case '\'': + end_token(ctok); + ctok->state = CTokStateStart; + break; + default: + return mark_error(ctok); + } + break; + case CTokStateOpenComment: + switch (*c) { + case '/': + ctok->state = CTokStateLineComment; + break; + case '*': + ctok->state = CTokStateComment; + break; + default: + return mark_error(ctok); + } + break; + case CTokStateLineComment: + if (*c == '\n') { + ctok->state = CTokStateStart; + goto found_end_of_macro; + } + break; + case CTokStateComment: + switch (*c) { + case '*': + ctok->state = CTokStateCommentStar; + break; + default: + break; + } + break; + case CTokStateCommentStar: + switch (*c) { + case '/': + ctok->state = CTokStateStart; + break; + case '*': + break; + default: + ctok->state = CTokStateComment; + break; + } + break; + case CTokStateBackslash: + switch (*c) { + case '\n': + ctok->state = CTokStateStart; + break; + default: + return mark_error(ctok); + } + break; + } + } +found_end_of_macro: + + switch (ctok->state) { + case CTokStateStart: + break; + case CTokStateIdentifier: + case CTokStateDecimal: + case CTokStateHex: + case CTokStateOctal: + case CTokStateGotZero: + case CTokStateIntSuffix: + case CTokStateIntSuffixLong: + end_token(ctok); + break; + case CTokStateFloat: + case CTokStateFloatExp: + end_float(ctok); + break; + case CTokStateExpectChar: + case CTokStateExpectEndQuot: + case CTokStateOpenComment: + case CTokStateLineComment: + case CTokStateComment: + case CTokStateCommentStar: + case CTokStateCharEscape: + case CTokStateBackslash: + case CTokStateString: + case CTokStateExpSign: + case CTokStateFloatExpFirst: + return mark_error(ctok); + } + + assert(ctok->cur_tok == nullptr); +} |
