diff options
Diffstat (limited to 'src/tokenizer.c')
-rw-r--r-- | src/tokenizer.c | 186 |
1 files changed, 186 insertions, 0 deletions
diff --git a/src/tokenizer.c b/src/tokenizer.c new file mode 100644 index 0000000..f1763a5 --- /dev/null +++ b/src/tokenizer.c @@ -0,0 +1,186 @@ +#include "../include/tokenizer.h" +#include "../include/mem.h" +#include <assert.h> +#include <limits.h> +#include <stdio.h> +#include <stdarg.h> + +static int isAlpha(int c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +static int isDigit(int c) { + return c >= '0' && c <= '9'; +} + +static int isAlphaDigit(int c) { + return isAlpha(c) || isDigit(c); +} + +int tokenizer_init(Tokenizer *self, BufferView code) { + assert(code.size <= INT_MAX); + self->code = code; + self->index = 0; + self->prev_index = 0; + self->line = 1; + return 0; +} + +void tokenizer_deinit(Tokenizer *self) { + (void)self; +} + +static int tokenizer_get_char(Tokenizer *self) { + assert(self->index >= 0 && self->index < (int)self->code.size); + return self->code.data[self->index]; +} + +static Token tokenizer_skip_whitespace(Tokenizer *self) { + int c; + for(;;) { + if(self->index >= (int)self->code.size) + return TOK_END_OF_FILE; + + c = self->code.data[self->index]; + switch(c) { + case '\n': + ++self->line; + /* fallthrough */ + case ' ': + case '\t': + break; + default: + return TOK_NONE; + } + ++self->index; + } +} + +int tokenizer_next(Tokenizer *self, Token *token) { + Token last_token; + int c; + + last_token = tokenizer_skip_whitespace(self); + if(last_token == TOK_END_OF_FILE) { + *token = TOK_END_OF_FILE; + return TOKENIZER_OK; + } + + self->prev_index = self->index; + c = tokenizer_get_char(self); + if(isAlpha(c) || c == '_') { + int identifier_start; + identifier_start = self->index; + ++self->index; + + while(self->index < (int)self->code.size) { + c = tokenizer_get_char(self); + if(isAlphaDigit(c) || c == '_') + ++self->index; + else + break; + } + + self->value.identifier = create_buffer_view(self->code.data + identifier_start, self->index - identifier_start); + + if(am_memeql(self->value.identifier.data, "const", 5)) + *token = TOK_CONST; + else if(am_memeql(self->value.identifier.data, "var", 3)) + *token = TOK_VAR; + else + *token = TOK_IDENTIFIER; + } else if(c == '=') { + ++self->index; + *token = TOK_EQUALS; + } else if(c == '(') { + ++self->index; + *token = TOK_OPEN_PAREN; + } else if(c == ')') { + ++self->index; + *token = TOK_CLOSING_PAREN; + } else if(c == '{') { + ++self->index; + *token = TOK_OPEN_BRACE; + } else if(c == '}') { + ++self->index; + *token = TOK_CLOSING_BRACE; + } else { + /*self.printError("Unexpected symbol '{c}'", c);*/ + tokenizer_print_error(self, "Unexpected symbol '%c'", c); + return TOKENIZER_UNEXPECTED_TOKEN; + } + return TOKENIZER_OK; +} + +int tokenizer_accept(Tokenizer *self, Token expected_token) { + Token actual_token; + return_if_error(tokenizer_next(self, &actual_token)); + if(actual_token == expected_token) + return TOKENIZER_OK; + + /* Todo: convert token to string */ + tokenizer_print_error(self, "Expected %d, got %d", expected_token, actual_token); + return TOKENIZER_UNEXPECTED_TOKEN; +} + +int tokenizer_consume_if(Tokenizer *self, Token expected_token, bool *result) { + int index; + int line; + Token actual_token; + + index = self->index; + line = self->line; + return_if_error(tokenizer_next(self, &actual_token)); + if(actual_token == expected_token) { + *result = bool_true; + } else { + /* No need to restore self.prev_index as it's updated on the next call to tokenizer_next */ + self->index = index; + self->line = line; + *result = bool_false; + } + return TOKENIZER_OK; +} + +static int tokenizer_get_start_of_line_from_index(Tokenizer *self, int index) { + int c; + while(index >= 0) { + c = self->code.data[(usize)index]; + if(c == '\n' || c == '\r') { + return index + 1; + } + --index; + } + return 0; +} + +static int tokenizer_get_end_of_line_from_index(Tokenizer *self, int index) { + int c; + while(index < (int)self->code.size) { + c = self->code.data[(usize)index]; + if(c == '\n' || c == '\r') + break; + ++index; + } + return index; +} + +void tokenizer_print_error(Tokenizer *self, const char *fmt, ...) { + va_list args; + int line_start; + int line_end; + int prev_column; + int i; + + va_start(args, fmt); + line_start = tokenizer_get_start_of_line_from_index(self, self->prev_index); + line_end = tokenizer_get_end_of_line_from_index(self, self->prev_index); + prev_column = self->prev_index - line_start; + fprintf(stderr, "\x1b[1;37m%s:%d:%d:\x1b[0m \x1b[1;31merror:\x1b[0m ", "file.am", self->line, 1 + prev_column); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n%.*s\n", line_end - line_start, self->code.data + line_start); + for(i = 0; i < prev_column; ++i) + fprintf(stderr, " "); + fprintf(stderr, "\x1b[1;32m^\x1b[0m\n"); + va_end(args); +}
\ No newline at end of file |