#include "../include/tokenizer.h" #include "../include/std/mem.h" #include "../include/std/log.h" #include "../include/std/thread.h" #include #include #include #include static int isAlpha(int c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } static int isDigit(int c) { return c >= '0' && c <= '9'; } static int isAlphaDigit(int c) { return isAlpha(c) || isDigit(c); } int tokenizer_init(Tokenizer *self, BufferView code, BufferView code_name) { assert(code.size <= INT_MAX); self->code = code; self->index = 0; self->prev_index = 0; self->line = 1; self->token = TOK_NONE; self->needs_update = bool_true; self->code_name = code_name.data ? code_name : create_buffer_view("", 8); return 0; } static int tokenizer_get_char(Tokenizer *self) { assert(self->index >= 0 && self->index < (int)self->code.size); return self->code.data[self->index]; } static Token tokenizer_skip_whitespace(Tokenizer *self) { int c; for(;;) { if(self->index >= (int)self->code.size) return TOK_END_OF_FILE; c = self->code.data[self->index]; switch(c) { case '\n': ++self->line; /* fallthrough */ case ' ': case '\t': break; default: self->prev_index = self->index; return TOK_NONE; } ++self->index; } self->prev_index = self->index; } /* Returns -1 if end of string can't be found */ static int find_end_of_string(BufferView buf, int index) { int c; bool escape_quote; escape_quote = bool_false; for(; index < (int)buf.size; ++index) { c = buf.data[index]; if(c == '\\') escape_quote = !escape_quote; else if(!escape_quote && c == '"') return index; else escape_quote = bool_false; } return -1; } static CHECK_RESULT int tokenizer_next(Tokenizer *self, Token *token); static CHECK_RESULT int __tokenizer_next(Tokenizer *self, Token *token) { Token last_token; int c; int result; last_token = tokenizer_skip_whitespace(self); if(last_token == TOK_END_OF_FILE) { *token = TOK_END_OF_FILE; return TOKENIZER_OK; } c = tokenizer_get_char(self); if(isAlpha(c) || c == '_') { int identifier_start; identifier_start = self->index; ++self->index; while(self->index < (int)self->code.size) { c = tokenizer_get_char(self); if(isAlphaDigit(c) || c == '_') ++self->index; else break; } self->value.identifier = create_buffer_view(self->code.data + identifier_start, self->index - identifier_start); if(am_memeql(self->value.identifier.data, "const", 5)) *token = TOK_CONST; else if(am_memeql(self->value.identifier.data, "var", 3)) *token = TOK_VAR; else *token = TOK_IDENTIFIER; } else if(c == '"') { int string_end; ++self->index; string_end = find_end_of_string(self->code, self->index); if(string_end == -1) { tokenizer_print_error(self, "String end not found. 
            return TOKENIZER_ERR;
        }
        self->value.string.data = &self->code.data[self->index];
        self->value.string.size = string_end - self->index;
        self->index = string_end + 1;
        *token = TOK_STRING;
        return TOKENIZER_OK;
    } else if(c == '=') {
        ++self->index;
        *token = TOK_EQUALS;
    } else if(c == '(') {
        ++self->index;
        *token = TOK_OPEN_PAREN;
    } else if(c == ')') {
        ++self->index;
        *token = TOK_CLOSING_PAREN;
    } else if(c == '{') {
        ++self->index;
        *token = TOK_OPEN_BRACE;
    } else if(c == '}') {
        ++self->index;
        *token = TOK_CLOSING_BRACE;
    } else if(c == '@') {
        const char *err_msg;
        ++self->index;
        if(self->index + 6 >= (int)self->code.size || !am_memeql(self->code.data + self->index, "import", 6)) {
            err_msg = "Expected '@import(path)'";
            goto import_error;
        }
        self->index += 6;

        result = tokenizer_next(self, &last_token);
        if(result != 0 || last_token != TOK_OPEN_PAREN) {
            err_msg = "Expected '(' after @import";
            goto import_error;
        }

        result = tokenizer_next(self, &last_token);
        if(result != 0 || last_token != TOK_STRING) {
            err_msg = "Expected string after @import(";
            goto import_error;
        }

        if(self->value.string.size == 0) {
            err_msg = "Path in @import can't be empty";
            goto import_error;
        }

        result = tokenizer_next(self, &last_token);
        if(result != 0 || last_token != TOK_CLOSING_PAREN) {
            err_msg = "Expected ')' after @import(path";
            goto import_error;
        }

        *token = TOK_IMPORT;
        return TOKENIZER_OK;

        import_error:
        tokenizer_print_error(self, err_msg);
        return TOKENIZER_ERR;
    } else {
        tokenizer_print_error(self, "Unexpected symbol '%c'", c);
        return TOKENIZER_UNEXPECTED_TOKEN;
    }

    return TOKENIZER_OK;
}

/* Wrapper around __tokenizer_next to store last parsed token */
int tokenizer_next(Tokenizer *self, Token *token) {
    int result;
    result = __tokenizer_next(self, token);
    self->token = *token;
    return result;
}

int tokenizer_accept(Tokenizer *self, Token expected_token) {
    Token actual_token;
    return_if_error(tokenizer_next(self, &actual_token));
    if(actual_token == expected_token) {
        self->needs_update = bool_true;
        return TOKENIZER_OK;
    }

    /* TODO: convert token to string */
    tokenizer_print_error(self, "Expected %d, got %d", expected_token, actual_token);
    return TOKENIZER_UNEXPECTED_TOKEN;
}

int tokenizer_consume_if(Tokenizer *self, Token expected_token, bool *result) {
    Token actual_token;
    if(!self->needs_update) {
        *result = (self->token == expected_token);
        self->needs_update = *result;
        return TOKENIZER_OK;
    }

    return_if_error(tokenizer_next(self, &actual_token));
    *result = (actual_token == expected_token);
    self->needs_update = *result;
    return TOKENIZER_OK;
}

static int tokenizer_get_start_of_line_from_index(Tokenizer *self, int index) {
    int c;
    while(index >= 0) {
        c = self->code.data[(usize)index];
        if(c == '\n' || c == '\r')
            return index + 1;
        --index;
    }
    return 0;
}

static int tokenizer_get_end_of_line_from_index(Tokenizer *self, int index) {
    int c;
    while(index < (int)self->code.size) {
        c = self->code.data[(usize)index];
        if(c == '\n' || c == '\r')
            break;
        ++index;
    }
    return index;
}

void tokenizer_print_error(Tokenizer *self, const char *fmt, ...) {
    va_list args;
    int line_start;
    int line_end;
    int prev_column;
    int i;
    amal_mutex *mutex;

    mutex = amal_log_get_mutex();
    ignore_result_int(amal_mutex_lock(mutex, "tokenizer_print_error"));
    va_start(args, fmt);
    line_start = tokenizer_get_start_of_line_from_index(self, self->prev_index);
    line_end = tokenizer_get_end_of_line_from_index(self, self->prev_index);
    prev_column = self->prev_index - line_start;
    fprintf(stderr, "\x1b[1;37m%.*s:%d:%d:\x1b[0m \x1b[1;31merror:\x1b[0m ", (int)self->code_name.size, self->code_name.data, self->line, 1 + prev_column);
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\n%.*s\n", line_end - line_start, self->code.data + line_start);
    for(i = 0; i < prev_column; ++i)
        fprintf(stderr, " ");
    fprintf(stderr, "\x1b[1;32m^\x1b[0m\n");
    va_end(args);
    ignore_result_int(amal_mutex_unlock(mutex));
}
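
/*
 * Usage sketch (illustrative only, not part of the original file): drives the
 * tokenizer over a small hard-coded buffer and prints each token id until
 * TOK_END_OF_FILE. The create_buffer_view(data, size) signature is assumed
 * from its use above, the source string and "example.amal" name are made up,
 * and tokens are printed with %d the same way tokenizer_accept reports them.
 * Wrapped in #if 0 so it is never compiled into the library.
 */
#if 0
static int example_tokenize_all(void) {
    Tokenizer tokenizer;
    Token token;
    const char *source = "const message = \"hello\"";
    BufferView code = create_buffer_view(source, 23);
    BufferView name = create_buffer_view("example.amal", 12);

    if(tokenizer_init(&tokenizer, code, name) != 0)
        return 1;

    /* Pull tokens until end of file; stop on the first tokenizer error */
    for(;;) {
        if(tokenizer_next(&tokenizer, &token) != TOKENIZER_OK)
            return 1;
        if(token == TOK_END_OF_FILE)
            break;
        fprintf(stderr, "token: %d\n", token);
    }
    return 0;
}
#endif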