aboutsummaryrefslogtreecommitdiff
path: root/src/c_tokenizer.cpp
diff options
context:
space:
mode:
authorAndrew Kelley <superjoe30@gmail.com>2016-04-21 15:48:13 -0700
committerAndrew Kelley <superjoe30@gmail.com>2016-04-21 15:48:13 -0700
commit35362f8137b2c5109e6bc39cb12048c016b5b580 (patch)
tree7c0506d8fdb92341cb412ee984288e23c98187b6 /src/c_tokenizer.cpp
parenta380b803ac8b4eefcde4d3d552cdcbc8010aa798 (diff)
downloadzig-35362f8137b2c5109e6bc39cb12048c016b5b580.tar.gz
zig-35362f8137b2c5109e6bc39cb12048c016b5b580.zip
better parsing of C macros
See #88
Diffstat (limited to 'src/c_tokenizer.cpp')
-rw-r--r--src/c_tokenizer.cpp651
1 files changed, 651 insertions, 0 deletions
diff --git a/src/c_tokenizer.cpp b/src/c_tokenizer.cpp
new file mode 100644
index 0000000000..ddcb5ba152
--- /dev/null
+++ b/src/c_tokenizer.cpp
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2016 Andrew Kelley
+ *
+ * This file is part of zig, which is MIT licensed.
+ * See http://opensource.org/licenses/MIT
+ */
+
+#include "c_tokenizer.hpp"
+#include <inttypes.h>
+
+#define WHITESPACE_EXCEPT_N \
+ ' ': \
+ case '\t': \
+ case '\v': \
+ case '\f'
+
+#define DIGIT_NON_ZERO \
+ '1': \
+ case '2': \
+ case '3': \
+ case '4': \
+ case '5': \
+ case '6': \
+ case '7': \
+ case '8': \
+ case '9'
+
+#define DIGIT \
+ '0': \
+ case DIGIT_NON_ZERO
+
+#define ALPHA \
+ 'a': \
+ case 'b': \
+ case 'c': \
+ case 'd': \
+ case 'e': \
+ case 'f': \
+ case 'g': \
+ case 'h': \
+ case 'i': \
+ case 'j': \
+ case 'k': \
+ case 'l': \
+ case 'm': \
+ case 'n': \
+ case 'o': \
+ case 'p': \
+ case 'q': \
+ case 'r': \
+ case 's': \
+ case 't': \
+ case 'u': \
+ case 'v': \
+ case 'w': \
+ case 'x': \
+ case 'y': \
+ case 'z': \
+ case 'A': \
+ case 'B': \
+ case 'C': \
+ case 'D': \
+ case 'E': \
+ case 'F': \
+ case 'G': \
+ case 'H': \
+ case 'I': \
+ case 'J': \
+ case 'K': \
+ case 'L': \
+ case 'M': \
+ case 'N': \
+ case 'O': \
+ case 'P': \
+ case 'Q': \
+ case 'R': \
+ case 'S': \
+ case 'T': \
+ case 'U': \
+ case 'V': \
+ case 'W': \
+ case 'X': \
+ case 'Y': \
+ case 'Z'
+
+#define IDENT_START \
+ ALPHA: \
+ case '_'
+
+#define IDENT \
+ IDENT_START: \
+ case DIGIT
+
+
+static void begin_token(CTokenize *ctok, CTokId id) {
+ assert(ctok->cur_tok == nullptr);
+ ctok->tokens.add_one();
+ ctok->cur_tok = &ctok->tokens.last();
+ ctok->cur_tok->id = id;
+
+ switch (id) {
+ case CTokIdStrLit:
+ memset(&ctok->cur_tok->data.str_lit, 0, sizeof(Buf));
+ buf_resize(&ctok->cur_tok->data.str_lit, 0);
+ break;
+ case CTokIdSymbol:
+ memset(&ctok->cur_tok->data.symbol, 0, sizeof(Buf));
+ buf_resize(&ctok->cur_tok->data.symbol, 0);
+ break;
+ case CTokIdCharLit:
+ case CTokIdNumLitInt:
+ case CTokIdNumLitFloat:
+ case CTokIdMinus:
+ break;
+ }
+}
+
+static void end_token(CTokenize *ctok) {
+ ctok->cur_tok = nullptr;
+}
+
+static void mark_error(CTokenize *ctok) {
+ ctok->error = true;
+}
+
+static void add_char(CTokenize *ctok, uint8_t c) {
+ assert(ctok->cur_tok);
+ if (ctok->cur_tok->id == CTokIdCharLit) {
+ ctok->cur_tok->data.char_lit = c;
+ ctok->state = CTokStateExpectEndQuot;
+ } else if (ctok->cur_tok->id == CTokIdStrLit) {
+ buf_append_char(&ctok->cur_tok->data.str_lit, c);
+ ctok->state = CTokStateString;
+ } else {
+ zig_unreachable();
+ }
+}
+
+static void hex_digit(CTokenize *ctok, uint8_t value) {
+ // TODO @mul_with_overflow
+ ctok->cur_tok->data.num_lit_int *= 16;
+ // TODO @add_with_overflow
+ ctok->cur_tok->data.num_lit_int += value;
+
+ static const uint8_t hex_digit[] = "0123456789abcdef";
+ buf_append_char(&ctok->buf, hex_digit[value]);
+}
+
+static void end_float(CTokenize *ctok) {
+ // TODO detect errors, overflow, and underflow
+ double value = strtod(buf_ptr(&ctok->buf), nullptr);
+
+ ctok->cur_tok->data.num_lit_float = value;
+
+ end_token(ctok);
+ ctok->state = CTokStateStart;
+
+}
+
+void tokenize_c_macro(CTokenize *ctok, const uint8_t *c) {
+ ctok->tokens.resize(0);
+ ctok->state = CTokStateStart;
+ ctok->error = false;
+ ctok->cur_tok = nullptr;
+
+ buf_resize(&ctok->buf, 0);
+
+ for (; *c; c += 1) {
+ switch (ctok->state) {
+ case CTokStateStart:
+ switch (*c) {
+ case WHITESPACE_EXCEPT_N:
+ break;
+ case '\'':
+ ctok->state = CTokStateExpectChar;
+ begin_token(ctok, CTokIdCharLit);
+ break;
+ case '\"':
+ ctok->state = CTokStateString;
+ begin_token(ctok, CTokIdStrLit);
+ break;
+ case '/':
+ ctok->state = CTokStateOpenComment;
+ break;
+ case '\\':
+ ctok->state = CTokStateBackslash;
+ break;
+ case '\n':
+ goto found_end_of_macro;
+ case IDENT_START:
+ ctok->state = CTokStateIdentifier;
+ begin_token(ctok, CTokIdSymbol);
+ buf_append_char(&ctok->cur_tok->data.symbol, *c);
+ break;
+ case DIGIT_NON_ZERO:
+ ctok->state = CTokStateDecimal;
+ ctok->unsigned_suffix = false;
+ ctok->long_suffix = false;
+ begin_token(ctok, CTokIdNumLitInt);
+ ctok->cur_tok->data.num_lit_int = *c - '0';
+ buf_resize(&ctok->buf, 0);
+ buf_append_char(&ctok->buf, *c);
+ break;
+ case '0':
+ ctok->state = CTokStateGotZero;
+ ctok->unsigned_suffix = false;
+ ctok->long_suffix = false;
+ begin_token(ctok, CTokIdNumLitInt);
+ ctok->cur_tok->data.num_lit_int = 0;
+ buf_resize(&ctok->buf, 0);
+ buf_append_char(&ctok->buf, '0');
+ break;
+ case '.':
+ begin_token(ctok, CTokIdNumLitFloat);
+ ctok->state = CTokStateFloat;
+ buf_init_from_str(&ctok->buf, "0.");
+ break;
+ default:
+ return mark_error(ctok);
+ }
+ break;
+ case CTokStateFloat:
+ switch (*c) {
+ case 'e':
+ case 'E':
+ buf_append_char(&ctok->buf, 'e');
+ ctok->state = CTokStateExpSign;
+ break;
+ case 'f':
+ case 'F':
+ case 'l':
+ case 'L':
+ end_float(ctok);
+ break;
+ case DIGIT:
+ buf_append_char(&ctok->buf, *c);
+ break;
+ default:
+ c -= 1;
+ end_float(ctok);
+ continue;
+ }
+ break;
+ case CTokStateExpSign:
+ switch (*c) {
+ case '+':
+ case '-':
+ ctok->state = CTokStateFloatExpFirst;
+ buf_append_char(&ctok->buf, *c);
+ break;
+ case DIGIT:
+ ctok->state = CTokStateFloatExp;
+ buf_append_char(&ctok->buf, *c);
+ break;
+ default:
+ return mark_error(ctok);
+ }
+ break;
+ case CTokStateFloatExpFirst:
+ switch (*c) {
+ case DIGIT:
+ buf_append_char(&ctok->buf, *c);
+ ctok->state = CTokStateFloatExp;
+ break;
+ default:
+ return mark_error(ctok);
+ }
+ break;
+ case CTokStateFloatExp:
+ switch (*c) {
+ case DIGIT:
+ buf_append_char(&ctok->buf, *c);
+ break;
+ case 'f':
+ case 'F':
+ case 'l':
+ case 'L':
+ end_float(ctok);
+ break;
+ default:
+ c -= 1;
+ end_float(ctok);
+ continue;
+ }
+ break;
+ case CTokStateDecimal:
+ switch (*c) {
+ case DIGIT:
+ buf_append_char(&ctok->buf, *c);
+
+ // TODO @mul_with_overflow
+ ctok->cur_tok->data.num_lit_int *= 10;
+ // TODO @add_with_overflow
+ ctok->cur_tok->data.num_lit_int += *c - '0';
+ break;
+ case '\'':
+ break;
+ case 'u':
+ case 'U':
+ ctok->unsigned_suffix = true;
+ ctok->state = CTokStateIntSuffix;
+ break;
+ case 'l':
+ case 'L':
+ ctok->long_suffix = true;
+ ctok->state = CTokStateIntSuffixLong;
+ break;
+ case '.':
+ buf_append_char(&ctok->buf, '.');
+ ctok->cur_tok->id = CTokIdNumLitFloat;
+ ctok->state = CTokStateFloat;
+ break;
+ default:
+ c -= 1;
+ end_token(ctok);
+ ctok->state = CTokStateStart;
+ continue;
+ }
+ break;
+ case CTokStateIntSuffix:
+ switch (*c) {
+ case 'l':
+ case 'L':
+ if (ctok->long_suffix) {
+ return mark_error(ctok);
+ }
+ ctok->long_suffix = true;
+ ctok->state = CTokStateIntSuffixLong;
+ break;
+ case 'u':
+ case 'U':
+ if (ctok->unsigned_suffix) {
+ return mark_error(ctok);
+ }
+ ctok->unsigned_suffix = true;
+ break;
+ default:
+ c -= 1;
+ end_token(ctok);
+ ctok->state = CTokStateStart;
+ continue;
+ }
+ break;
+ case CTokStateIntSuffixLong:
+ switch (*c) {
+ case 'l':
+ case 'L':
+ ctok->state = CTokStateIntSuffix;
+ break;
+ case 'u':
+ case 'U':
+ if (ctok->unsigned_suffix) {
+ return mark_error(ctok);
+ }
+ ctok->unsigned_suffix = true;
+ break;
+ default:
+ c -= 1;
+ end_token(ctok);
+ ctok->state = CTokStateStart;
+ continue;
+ }
+ break;
+ case CTokStateGotZero:
+ switch (*c) {
+ case 'x':
+ case 'X':
+ ctok->state = CTokStateHex;
+ break;
+ case '.':
+ ctok->state = CTokStateFloat;
+ ctok->cur_tok->id = CTokIdNumLitFloat;
+ buf_append_char(&ctok->buf, '.');
+ break;
+ default:
+ c -= 1;
+ ctok->state = CTokStateOctal;
+ continue;
+ }
+ break;
+ case CTokStateOctal:
+ switch (*c) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ // TODO @mul_with_overflow
+ ctok->cur_tok->data.num_lit_int *= 8;
+ // TODO @add_with_overflow
+ ctok->cur_tok->data.num_lit_int += *c - '0';
+ break;
+ case '8':
+ case '9':
+ return mark_error(ctok);
+ case '\'':
+ break;
+ default:
+ c -= 1;
+ end_token(ctok);
+ ctok->state = CTokStateStart;
+ continue;
+ }
+ break;
+ case CTokStateHex:
+ switch (*c) {
+ case '0':
+ hex_digit(ctok, 0);
+ break;
+ case '1':
+ hex_digit(ctok, 1);
+ break;
+ case '2':
+ hex_digit(ctok, 2);
+ break;
+ case '3':
+ hex_digit(ctok, 3);
+ break;
+ case '4':
+ hex_digit(ctok, 4);
+ break;
+ case '5':
+ hex_digit(ctok, 5);
+ break;
+ case '6':
+ hex_digit(ctok, 6);
+ break;
+ case '7':
+ hex_digit(ctok, 7);
+ break;
+ case '8':
+ hex_digit(ctok, 8);
+ break;
+ case '9':
+ hex_digit(ctok, 9);
+ break;
+ case 'a':
+ case 'A':
+ hex_digit(ctok, 10);
+ break;
+ case 'b':
+ case 'B':
+ hex_digit(ctok, 11);
+ break;
+ case 'c':
+ case 'C':
+ hex_digit(ctok, 12);
+ break;
+ case 'd':
+ case 'D':
+ hex_digit(ctok, 13);
+ break;
+ case 'e':
+ case 'E':
+ hex_digit(ctok, 14);
+ break;
+ case 'f':
+ case 'F':
+ hex_digit(ctok, 15);
+ break;
+ case 'p':
+ case 'P':
+ ctok->cur_tok->id = CTokIdNumLitFloat;
+ ctok->state = CTokStateExpSign;
+ break;
+ default:
+ c -= 1;
+ end_token(ctok);
+ ctok->state = CTokStateStart;
+ continue;
+ }
+ break;
+ case CTokStateIdentifier:
+ switch (*c) {
+ case IDENT:
+ buf_append_char(&ctok->cur_tok->data.symbol, *c);
+ break;
+ default:
+ c -= 1;
+ end_token(ctok);
+ ctok->state = CTokStateStart;
+ continue;
+ }
+ break;
+ case CTokStateString:
+ switch (*c) {
+ case '\\':
+ ctok->state = CTokStateCharEscape;
+ break;
+ case '\"':
+ end_token(ctok);
+ ctok->state = CTokStateStart;
+ break;
+ default:
+ buf_append_char(&ctok->cur_tok->data.str_lit, *c);
+ }
+ break;
+ case CTokStateExpectChar:
+ switch (*c) {
+ case '\\':
+ ctok->state = CTokStateCharEscape;
+ break;
+ case '\'':
+ return mark_error(ctok);
+ default:
+ ctok->cur_tok->data.char_lit = *c;
+ ctok->state = CTokStateExpectEndQuot;
+ }
+ break;
+ case CTokStateCharEscape:
+ switch (*c) {
+ case '\'':
+ case '"':
+ case '?':
+ case '\\':
+ add_char(ctok, *c);
+ break;
+ case 'a':
+ add_char(ctok, '\a');
+ break;
+ case 'b':
+ add_char(ctok, '\b');
+ break;
+ case 'f':
+ add_char(ctok, '\f');
+ break;
+ case 'n':
+ add_char(ctok, '\n');
+ break;
+ case 'r':
+ add_char(ctok, '\r');
+ break;
+ case 't':
+ add_char(ctok, '\t');
+ break;
+ case 'v':
+ add_char(ctok, '\v');
+ break;
+ case DIGIT:
+ zig_panic("TODO octal");
+ break;
+ case 'x':
+ zig_panic("TODO hex");
+ break;
+ case 'u':
+ zig_panic("TODO unicode");
+ break;
+ case 'U':
+ zig_panic("TODO Unicode");
+ break;
+ default:
+ return mark_error(ctok);
+ }
+ break;
+ case CTokStateExpectEndQuot:
+ switch (*c) {
+ case '\'':
+ end_token(ctok);
+ ctok->state = CTokStateStart;
+ break;
+ default:
+ return mark_error(ctok);
+ }
+ break;
+ case CTokStateOpenComment:
+ switch (*c) {
+ case '/':
+ ctok->state = CTokStateLineComment;
+ break;
+ case '*':
+ ctok->state = CTokStateComment;
+ break;
+ default:
+ return mark_error(ctok);
+ }
+ break;
+ case CTokStateLineComment:
+ if (*c == '\n') {
+ ctok->state = CTokStateStart;
+ goto found_end_of_macro;
+ }
+ break;
+ case CTokStateComment:
+ switch (*c) {
+ case '*':
+ ctok->state = CTokStateCommentStar;
+ break;
+ default:
+ break;
+ }
+ break;
+ case CTokStateCommentStar:
+ switch (*c) {
+ case '/':
+ ctok->state = CTokStateStart;
+ break;
+ case '*':
+ break;
+ default:
+ ctok->state = CTokStateComment;
+ break;
+ }
+ break;
+ case CTokStateBackslash:
+ switch (*c) {
+ case '\n':
+ ctok->state = CTokStateStart;
+ break;
+ default:
+ return mark_error(ctok);
+ }
+ break;
+ }
+ }
+found_end_of_macro:
+
+ switch (ctok->state) {
+ case CTokStateStart:
+ break;
+ case CTokStateIdentifier:
+ case CTokStateDecimal:
+ case CTokStateHex:
+ case CTokStateOctal:
+ case CTokStateGotZero:
+ case CTokStateIntSuffix:
+ case CTokStateIntSuffixLong:
+ end_token(ctok);
+ break;
+ case CTokStateFloat:
+ case CTokStateFloatExp:
+ end_float(ctok);
+ break;
+ case CTokStateExpectChar:
+ case CTokStateExpectEndQuot:
+ case CTokStateOpenComment:
+ case CTokStateLineComment:
+ case CTokStateComment:
+ case CTokStateCommentStar:
+ case CTokStateCharEscape:
+ case CTokStateBackslash:
+ case CTokStateString:
+ case CTokStateExpSign:
+ case CTokStateFloatExpFirst:
+ return mark_error(ctok);
+ }
+
+ assert(ctok->cur_tok == nullptr);
+}