#include "../include/tokenizer.h"
#include "../include/std/mem.h"
#include "../include/std/log.h"
#include "../include/std/thread.h"
#include "../include/std/arena_allocator.h"
#include <assert.h>
#include <float.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>

/* ASCII-only character classification (independent of the C locale) */
static int isAlpha(int c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

static int isDigit(int c) {
    return c >= '0' && c <= '9';
}

static int isAlphaDigit(int c) {
    return isAlpha(c) || isDigit(c);
}

static int tokenizer_get_start_of_line_from_index(Tokenizer *self, int index);
static int tokenizer_get_end_of_line_from_index(Tokenizer *self, int index);
/* Returns -1 if end of multiline comment was not found */
static int tokenizer_get_end_of_multiline_comment(Tokenizer *self, int index);

/*
    Prepares @self for tokenizing @code.
    @code_name is used as the file name in error messages; when it has no data
    the placeholder name "<buffer>" is used instead.
    A leading UTF-8 BOM and a leading shebang line ("#!...") are skipped.
    Always returns 0.
*/
int tokenizer_init(Tokenizer *self, ArenaAllocator *allocator, BufferView code, BufferView code_name, const amal_compiler_options *compiler_options) {
    assert(code.size <= INT_MAX);
    assert(compiler_options);
    self->code = code;
    self->index = 0;
    self->prev_index = 0;
    self->token = TOK_NONE;
    self->needs_update = bool_true;
    /* The size passed here must match the length of the literal */
    self->code_name = code_name.data ? code_name : create_buffer_view("<buffer>", 8);
    self->allocator = allocator;
    self->compiler_options = compiler_options;

    /* Skip UTF-8 BOM */
    if(self->code.size >= 3 && am_memcmp(self->code.data, "\xEF\xBB\xBF", 3) == 0) {
        self->code.data += 3;
        self->code.size -= 3;
    }

    /* Skip shebang */
    if(self->code.size >= 2 && am_memcmp(self->code.data, "#!", 2) == 0) {
        self->index = tokenizer_get_end_of_line_from_index(self, self->index + 2);
    }

    return 0;
}

/* Returns the character at the current index. The index must be inside the code */
static int tokenizer_get_char(Tokenizer *self) {
    assert(self->index >= 0 && self->index < (int)self->code.size);
    return self->code.data[self->index];
}

/* Advances past the current character if it equals @expected. Returns true if it did */
static bool tokenizer_consume_char_if(Tokenizer *self, char expected) {
    if(self->index < (int)self->code.size && self->code.data[self->index] == expected) {
        ++self->index;
        return bool_true;
    }
    return bool_false;
}

/*
    Skips spaces, tabs and line terminators.
    Returns TOK_END_OF_FILE if the end of the code was reached, otherwise
    TOK_NONE after storing the position of the first non-whitespace character
    in prev_index.
*/
static Token tokenizer_skip_whitespace(Tokenizer *self) {
    for(;;) {
        int c;
        if(self->index >= (int)self->code.size)
            return TOK_END_OF_FILE;

        c = self->code.data[self->index];
        switch(c) {
            case '\n':
            /* '\r' is a line terminator for the line helpers too, so files with CRLF line endings must tokenize */
            case '\r':
            case ' ':
            case '\t':
                break;
            default:
                self->prev_index = self->index;
                return TOK_NONE;
        }
        ++self->index;
    }
}

/*
    Searches @buf from @index for a '"' that is not escaped by a backslash.
    Returns the index of that quote, or -1 if end of string can't be found.
*/
static CHECK_RESULT int find_end_of_string(BufferView buf, int index) {
    bool escape_quote;
    escape_quote = bool_false;

    for(; index < (int)buf.size; ++index) {
        int c;
        c = buf.data[index];
        if(c == '\\')
            escape_quote = !escape_quote;
        else if(!escape_quote && c == '"')
            return index;
        else
            escape_quote = bool_false;
    }
    return -1;
}

/* TODO: Optimize string to integer and string to float */

/* Largest value of i64, computed without overflowing and without assuming a specific literal suffix */
#define I64_MAX_VALUE ((i64)((((i64)1 << (sizeof(i64) * CHAR_BIT - 2)) - 1) * 2 + 1))

#define I64_OVERFLOW_ERROR -1
/*
    Converts a string of decimal digits to an integer.
    "unchecked" means the characters are not validated, they are assumed to be '0'-'9'.
    Returns 0 on success, or I64_OVERFLOW_ERROR if the value doesn't fit in i64
    (in which case @result is not written).
*/
static CHECK_RESULT int string_to_integer_unchecked(BufferView str, i64 *result) {
    int i;
    i64 value;

    value = 0;
    for(i = 0; i < (int)str.size; ++i) {
        int digit;
        digit = str.data[i] - '0';
        /* Check before multiplying. Signed overflow is undefined behavior and can't be detected afterwards */
        if(value > (I64_MAX_VALUE - digit) / 10)
            return I64_OVERFLOW_ERROR;
        value = value * 10 + digit;
    }

    *result = value;
    return 0;
}

#define F64_OVERFLOW_ERROR -1
/*
    Converts a string of the form "digits.digits" to a float. @dot_index is the
    index of the '.' inside @str. The characters are not validated.
    Returns 0 on success, or F64_OVERFLOW_ERROR if the part before the dot is
    too large to be represented as a finite value.
    NOTE(review): assumes f64 is double (DBL_MAX is used as its limit) - confirm against the typedef.
*/
static CHECK_RESULT int string_to_float_unchecked(BufferView str, int dot_index, f64 *result) {
    int i;
    f64 value_before_dot;
    f64 value_after_dot;
    f64 div_factor;

    value_before_dot = 0.0;
    for(i = 0; i < dot_index; ++i) {
        value_before_dot = value_before_dot * 10.0 + (str.data[i] - '0');
        /* Floating point overflow saturates to infinity instead of wrapping */
        if(value_before_dot > DBL_MAX)
            return F64_OVERFLOW_ERROR;
    }

    value_after_dot = 0.0;
    div_factor = 1.0;
    for(i = dot_index + 1; i < (int)str.size; ++i) {
        /*
            Stop before div_factor becomes infinite, otherwise the division below
            could be inf/inf. value_after_dot is always smaller than div_factor,
            so it stays finite as well. Digits this far out are not representable anyway.
        */
        if(div_factor > DBL_MAX / 10.0)
            break;
        value_after_dot = value_after_dot * 10.0 + (str.data[i] - '0');
        div_factor *= 10.0;
    }

    *result = value_before_dot + (value_after_dot / div_factor);
    return 0;
}

/* Expects variables called "self" and "token" to be in scope */
#define SET_BINOP(_binop_type) do { *token = TOK_BINOP; self->value.binop_type = (_binop_type); } while(0)

static CHECK_RESULT int tokenizer_next(Tokenizer *self, Token *token);

/*
    Reads an identifier starting at the current index into value.identifier
    and returns the keyword token that matches it, or TOK_IDENTIFIER.
    For "true" and "false" bool_value is set and TOK_BOOL is returned.
*/
static Token tokenizer_read_identifier(Tokenizer *self) {
    static const struct {
        const char *name;
        usize size;
        Token token;
    } keywords[] = {
        { "fn", 2, TOK_FN },
        { "if", 2, TOK_IF },
        { "var", 3, TOK_VAR },
        { "pub", 3, TOK_PUB },
        { "else", 4, TOK_ELSE },
        { "const", 5, TOK_CONST },
        { "while", 5, TOK_WHILE },
        { "struct", 6, TOK_STRUCT },
        { "extern", 6, TOK_EXTERN },
        { "export", 6, TOK_EXPORT },
        { "return", 6, TOK_RETURN }
    };
    BufferView identifier;
    int identifier_start;
    usize i;

    identifier_start = self->index;
    ++self->index;
    while(self->index < (int)self->code.size) {
        int c;
        c = tokenizer_get_char(self);
        if(!isAlphaDigit(c) && c != '_')
            break;
        ++self->index;
    }

    identifier = create_buffer_view(self->code.data + identifier_start, self->index - identifier_start);
    self->value.identifier = identifier;

    for(i = 0; i < sizeof(keywords) / sizeof(keywords[0]); ++i) {
        if(identifier.size == keywords[i].size && am_memeql(identifier.data, keywords[i].name, keywords[i].size))
            return keywords[i].token;
    }

    if(identifier.size == 4 && am_memeql(identifier.data, "true", 4)) {
        self->bool_value = bool_true;
        return TOK_BOOL;
    }

    if(identifier.size == 5 && am_memeql(identifier.data, "false", 5)) {
        self->bool_value = bool_false;
        return TOK_BOOL;
    }

    return TOK_IDENTIFIER;
}

/*
    Reads an optional number suffix at the current index: 'i', 'u' or 'f'
    followed by the number of bits (at most 256), for example "i64".
    If the current character is not a suffix character then nothing is consumed.
    @is_float tells if the number contains a dot, in which case only 'f' is accepted.
    On success number.type and number.bits are updated.
*/
static CHECK_RESULT int tokenizer_read_number_suffix(Tokenizer *self, bool is_float) {
    AmalNumberType number_type;
    BufferView suffix_str;
    i64 suffix_num;
    int suffix_num_start;
    char suffix_char;

    if(self->index >= (int)self->code.size)
        return TOKENIZER_OK;

    suffix_char = self->code.data[self->index];
    switch(suffix_char) {
        case 'i': number_type = AMAL_NUMBER_SIGNED_INTEGER; break;
        case 'u': number_type = AMAL_NUMBER_UNSIGNED_INTEGER; break;
        case 'f': number_type = AMAL_NUMBER_FLOAT; break;
        default: return TOKENIZER_OK;
    }

    if(is_float && number_type != AMAL_NUMBER_FLOAT) {
        tokenizer_print_error(self, self->index, "Floating point number has invalid suffix '%c'. Expected 'f' or nothing", suffix_char);
        return TOKENIZER_ERR;
    }

    ++self->index;
    suffix_num_start = self->index;
    while(self->index < (int)self->code.size && isDigit(tokenizer_get_char(self)))
        ++self->index;

    if(self->index == suffix_num_start) {
        tokenizer_print_error(self, self->index, "Missing number suffix");
        return TOKENIZER_ERR;
    }

    suffix_str = create_buffer_view(self->code.data + suffix_num_start, self->index - suffix_num_start);
    /* TODO: Allow larger than 256 bits suffix */
    if(string_to_integer_unchecked(suffix_str, &suffix_num) != 0 || suffix_num > 256) {
        tokenizer_print_error(self, suffix_num_start, "Number suffix can't be larger than 256");
        return TOKENIZER_ERR;
    }

    self->number.type = number_type;
    self->number.bits = (int)suffix_num;
    return TOKENIZER_OK;
}

/*
    Reads an integer or a float (digits with at most one '.') and its optional
    suffix. The value is stored in number and the text of the whole number
    (including the suffix) in value.identifier.
*/
static CHECK_RESULT int tokenizer_read_number(Tokenizer *self, Token *token) {
    BufferView number_str;
    int number_start;
    int dot_index;
    int result;

    number_start = self->index;
    dot_index = -1;
    ++self->index;
    while(self->index < (int)self->code.size) {
        int c;
        c = tokenizer_get_char(self);
        if(isDigit(c)) {
            ++self->index;
        } else if(c == '.' && dot_index == -1) {
            dot_index = self->index - number_start;
            ++self->index;
        } else {
            break;
        }
    }

    number_str = create_buffer_view(self->code.data + number_start, self->index - number_start);
    /*
        TODO: Support octal (0o...), hex (0x...) and binary (0b...) syntax.
        A leading zero is only valid for the number 0 itself and when it's
        directly followed by the dot (for example 0.5).
    */
    if(number_str.size > 1 && number_str.data[0] == '0' && dot_index != 1) {
        tokenizer_print_error(self, number_start, "Invalid number %.*s", (int)number_str.size, number_str.data);
        return TOKENIZER_ERR;
    }

    /* TODO: Check if the number fits in the result bits */
    if(dot_index == -1) {
        result = string_to_integer_unchecked(number_str, &self->number.value.integer);
        if(result != 0) {
            /* The precision argument of %.*s has to be an int */
            tokenizer_print_error(self, self->prev_index, "Integer value %.*s is too large to fit in signed 64-bit. Support for large integers is not supported yet", (int)number_str.size, number_str.data);
            return TOKENIZER_ERR;
        }
        self->number.type = AMAL_NUMBER_SIGNED_INTEGER;
    } else {
        result = string_to_float_unchecked(number_str, dot_index, &self->number.value.floating);
        if(result != 0) {
            tokenizer_print_error(self, self->prev_index, "Float value %.*s is too large to fit in 64-bit. Support for large floating numbers is not supported yet", (int)number_str.size, number_str.data);
            return TOKENIZER_ERR;
        }
        self->number.type = AMAL_NUMBER_FLOAT;
    }
    /* Number of bits used when there is no suffix */
    self->number.bits = 32;

    result = tokenizer_read_number_suffix(self, dot_index != -1);
    if(result != TOKENIZER_OK)
        return result;

    self->value.identifier.data = self->code.data + number_start;
    self->value.identifier.size = self->index - number_start;
    *token = TOK_NUMBER;
    return TOKENIZER_OK;
}

/*
    Reads a string. The current index has to be at the opening '"'.
    value.string is set to the content between the quotes (escape sequences are not decoded).
*/
static CHECK_RESULT int tokenizer_read_string(Tokenizer *self, Token *token) {
    int string_end;

    ++self->index;
    string_end = find_end_of_string(self->code, self->index);
    if(string_end == -1) {
        tokenizer_print_error(self, self->prev_index, "String end not found. Did you forget '\"' or did you have a mismatch of number of '\"'?");
        return TOKENIZER_ERR;
    }

    self->value.string.data = &self->code.data[self->index];
    self->value.string.size = string_end - self->index;
    self->index = string_end + 1;
    *token = TOK_STRING;
    return TOKENIZER_OK;
}

/*
    Reads @import("path"). The current index has to be at the '@'.
    The path is available in value.string afterwards.
    TODO: This should be moved to the parser
*/
static CHECK_RESULT int tokenizer_read_import(Tokenizer *self, Token *token) {
    const char *err_msg;
    Token next_token;

    err_msg = NULL;
    ++self->index;
    if(self->index + 6 > (int)self->code.size || !am_memeql(self->code.data + self->index, "import", 6)) {
        err_msg = "Expected '@import(path)'";
    } else {
        self->index += 6;
        if(tokenizer_next(self, &next_token) != 0 || next_token != TOK_OPEN_PAREN)
            err_msg = "Expected '(' after @import";
        else if(tokenizer_next(self, &next_token) != 0 || next_token != TOK_STRING)
            err_msg = "Expected string after @import(";
        else if(self->value.string.size == 0)
            err_msg = "Path in @import can't be empty";
        else if(tokenizer_next(self, &next_token) != 0 || next_token != TOK_CLOSING_PAREN)
            err_msg = "Expected ')' after @import(path";
    }

    if(err_msg) {
        /* The message is passed as an argument and not as the format string */
        tokenizer_print_error(self, self->prev_index, "%s", err_msg);
        return TOKENIZER_ERR;
    }

    *token = TOK_IMPORT;
    return TOKENIZER_OK;
}

/*
    Reads the next token, skipping whitespace and comments.
    Returns TOKENIZER_OK and sets @token on success. On failure an error is
    printed and TOKENIZER_ERR or TOKENIZER_UNEXPECTED_TOKEN is returned.
*/
static CHECK_RESULT int tokenizer_next_impl(Tokenizer *self, Token *token) {
    int c;

    if(tokenizer_skip_whitespace(self) == TOK_END_OF_FILE) {
        *token = TOK_END_OF_FILE;
        return TOKENIZER_OK;
    }

    c = tokenizer_get_char(self);
    if(isAlpha(c) || c == '_') {
        *token = tokenizer_read_identifier(self);
        return TOKENIZER_OK;
    }

    if(isDigit(c))
        return tokenizer_read_number(self, token);

    switch(c) {
        case '"':
            return tokenizer_read_string(self, token);
        case '@':
            return tokenizer_read_import(self, token);
        case '.': {
            const char *start;
            start = self->code.data + self->index;
            ++self->index;
            /* "..." The first dot is already consumed, two more characters have to be available */
            if(self->index + 2 <= (int)self->code.size && am_memcmp(self->code.data + self->index, "..", 2) == 0) {
                self->index += 2;
                self->value.identifier.data = start;
                self->value.identifier.size = 3;
                *token = TOK_C_VARARGS;
            } else {
                SET_BINOP(BINOP_DOT);
            }
            break;
        }
        case '+':
            ++self->index;
            SET_BINOP(BINOP_ADD);
            break;
        case '-':
            ++self->index;
            SET_BINOP(BINOP_SUB);
            break;
        case '*':
            ++self->index;
            SET_BINOP(BINOP_MUL);
            break;
        case '/': {
            int comment_end;
            ++self->index;
            /* Single line comment */
            if(tokenizer_consume_char_if(self, '/')) {
                self->index = tokenizer_get_end_of_line_from_index(self, self->index);
                return tokenizer_next_impl(self, token);
            }
            /* Multiline comment */
            if(tokenizer_consume_char_if(self, '*')) {
                /* Keep the result in a local so the index is never set to -1 */
                comment_end = tokenizer_get_end_of_multiline_comment(self, self->index);
                if(comment_end == -1) {
                    tokenizer_print_error(self, self->prev_index, "End of multiline comment not found");
                    return TOKENIZER_ERR;
                }
                self->index = comment_end;
                return tokenizer_next_impl(self, token);
            }
            SET_BINOP(BINOP_DIV);
            break;
        }
        case '=':
            ++self->index;
            if(tokenizer_consume_char_if(self, '='))
                SET_BINOP(BINOP_EQUALS);
            else
                *token = TOK_EQUALS;
            break;
        case '!':
            ++self->index;
            if(tokenizer_consume_char_if(self, '='))
                SET_BINOP(BINOP_NOT_EQUAL);
            else
                *token = TOK_NOT;
            break;
        case '&':
            ++self->index;
            if(tokenizer_consume_char_if(self, '&'))
                SET_BINOP(BINOP_AND);
            else
                *token = TOK_AMPERSAND;
            break;
        case '<':
            ++self->index;
            if(tokenizer_consume_char_if(self, '='))
                SET_BINOP(BINOP_LESS_EQUAL);
            else
                SET_BINOP(BINOP_LESS);
            break;
        case '>':
            ++self->index;
            if(tokenizer_consume_char_if(self, '='))
                SET_BINOP(BINOP_GREATER_EQUAL);
            else
                SET_BINOP(BINOP_GREATER);
            break;
        case '(':
            ++self->index;
            *token = TOK_OPEN_PAREN;
            break;
        case ')':
            ++self->index;
            *token = TOK_CLOSING_PAREN;
            break;
        case ',':
            ++self->index;
            *token = TOK_COMMA;
            break;
        case '{':
            ++self->index;
            *token = TOK_OPEN_BRACE;
            break;
        case '}':
            ++self->index;
            *token = TOK_CLOSING_BRACE;
            break;
        case ';':
            ++self->index;
            *token = TOK_SEMICOLON;
            break;
        case ':':
            ++self->index;
            *token = TOK_COLON;
            break;
        case '?':
            ++self->index;
            *token = TOK_QUESTION_MARK;
            break;
        default:
            tokenizer_print_error(self, self->prev_index, "Unexpected symbol '%c'", c);
            return TOKENIZER_UNEXPECTED_TOKEN;
    }
    return TOKENIZER_OK;
}

/* Wrapper around tokenizer_next_impl to store last parsed token */
int tokenizer_next(Tokenizer *self, Token *token) {
    int result;
    result = tokenizer_next_impl(self, token);
    self->token = *token;
    return result;
}

/* Returns the text used for @token in "Expected ..., got ..." error messages */
static BufferView tokenizer_expected_token_as_string(Token token) {
    const char *str;
    str = "";
    switch(token) {
        case TOK_NONE:          str = "none"; break;
        case TOK_END_OF_FILE:   str = "<eof>"; break;
        case TOK_IDENTIFIER:    str = "identifier"; break;
        case TOK_CONST:         str = "const"; break;
        case TOK_VAR:           str = "var"; break;
        case TOK_STRING:        str = "string"; break;
        case TOK_NUMBER:        str = "number"; break;
        case TOK_BOOL:          str = "bool"; break;
        case TOK_FN:            str = "fn"; break;
        case TOK_STRUCT:        str = "struct"; break;
        case TOK_EQUALS:        str = "="; break;
        case TOK_NOT:           str = "!"; break;
        case TOK_OPEN_PAREN:    str = "("; break;
        case TOK_CLOSING_PAREN: str = ")"; break;
        case TOK_COMMA:         str = ","; break;
        case TOK_OPEN_BRACE:    str = "{"; break;
        case TOK_CLOSING_BRACE: str = "}"; break;
        case TOK_IMPORT:        str = "import"; break;
        case TOK_BINOP:         /* TODO: binop_to_string */ str = "binop"; break;
        case TOK_SEMICOLON:     str = ";"; break;
        case TOK_COLON:         str = ":"; break;
        case TOK_PUB:           str = "pub"; break;
        case TOK_IF:            str = "if"; break;
        case TOK_ELSE:          str = "else"; break;
        case TOK_WHILE:         str = "while"; break;
        case TOK_EXTERN:        str = "extern"; break;
        case TOK_EXPORT:        str = "export"; break;
        case TOK_RETURN:        str = "return"; break;
        case TOK_QUESTION_MARK: str = "?"; break;
        case TOK_AMPERSAND:     str = "&"; break;
        case TOK_C_VARARGS:     str = "..."; break;
    }
    return create_buffer_view(str, strlen(str));
}

/* Returns the text used in error messages for the token that was read last */
static BufferView tokenizer_actual_token_as_string(Tokenizer *self) {
    return tokenizer_expected_token_as_string(self->token);
}

/*
    Requires the next token to be @expected_token.
    Returns TOKENIZER_OK if it is. Otherwise an error is printed and
    TOKENIZER_UNEXPECTED_TOKEN is returned (or the error from reading the token).
    If the previous call to tokenizer_accept/tokenizer_consume_if didn't match
    then the token that was read by that call is checked instead of reading a new one.
*/
int tokenizer_accept(Tokenizer *self, Token expected_token) {
    Token actual_token;
    BufferView actual_token_str;
    BufferView expected_token_str;

    if(!self->needs_update) {
        bool match;
        match = (self->token == expected_token);
        self->needs_update = match;
        if(match)
            return TOKENIZER_OK;
    } else {
        return_if_error(tokenizer_next(self, &actual_token));
        if(actual_token == expected_token) {
            self->needs_update = bool_true;
            return TOKENIZER_OK;
        }
    }

    expected_token_str = tokenizer_expected_token_as_string(expected_token);
    actual_token_str = tokenizer_actual_token_as_string(self);
    /* The precision argument of %.*s has to be an int */
    tokenizer_print_error(self, self->prev_index, "Expected \"%.*s\", got \"%.*s\"", (int)expected_token_str.size, expected_token_str.data, (int)actual_token_str.size, actual_token_str.data);
    return TOKENIZER_UNEXPECTED_TOKEN;
}

/*
    Sets @result to true if the next token is @expected_token, in which case the
    token is consumed. If it's not, the token is kept and checked again by the next
    call to tokenizer_accept/tokenizer_consume_if.
    Returns TOKENIZER_OK unless reading the token failed.
*/
int tokenizer_consume_if(Tokenizer *self, Token expected_token, bool *result) {
    Token actual_token;

    if(!self->needs_update) {
        *result = (self->token == expected_token);
        self->needs_update = *result;
        return TOKENIZER_OK;
    }

    return_if_error(tokenizer_next(self, &actual_token));
    *result = (actual_token == expected_token);
    self->needs_update = *result;
    return TOKENIZER_OK;
}

/*
    Returns the index of the first character of the line that contains @index.
    @index can be equal to the size of the code (end of file). A line terminator
    belongs to the line it ends.
*/
int tokenizer_get_start_of_line_from_index(Tokenizer *self, int index) {
    /* Only characters before @index are read, so @index itself is never accessed out of bounds */
    while(index > 0) {
        char c;
        c = self->code.data[index - 1];
        if(c == '\n' || c == '\r')
            return index;
        --index;
    }
    return 0;
}

/* Returns the index of the first line terminator at or after @index, or the size of the code if there is none */
int tokenizer_get_end_of_line_from_index(Tokenizer *self, int index) {
    while(index < (int)self->code.size) {
        char c;
        c = self->code.data[index];
        if(c == '\n' || c == '\r')
            break;
        ++index;
    }
    return index;
}

/*
    @index has to be the index directly after the opening of a multiline comment.
    Nested multiline comments are counted.
    Returns the index after the end of the comment, or -1 if the end was not found.
*/
int tokenizer_get_end_of_multiline_comment(Tokenizer *self, int index) {
    int comment_count;

    comment_count = 1;
    /* Skip one character so the '*' of the opening can't be used as the start of the closing */
    ++index;
    while(index < (int)self->code.size) {
        char c;
        c = self->code.data[index];
        if(c == '*' && self->code.data[index - 1] == '/') {
            ++comment_count;
        } else if(c == '/' && self->code.data[index - 1] == '*') {
            --comment_count;
            if(comment_count == 0)
                return index + 1;
        }
        ++index;
    }
    return -1;
}

/*
    TODO: Optimize
    Returns the line number (starting from 1) of @index, or -1 if @index is outside the code.
    @index can be equal to the size of the code (end of file).
*/
static int tokenizer_get_line_by_index(Tokenizer *self, int index) {
    int i;
    int line;
    if(index < 0 || index > (int)self->code.size)
        return -1;

    line = 1;
    for(i = 0; i < index; ++i) {
        if(self->code.data[i] == '\n')
            ++line;
    }
    return line;
}

/*
    Appends formatted text to @buffer at offset @length and advances @length by
    the number of characters that were stored. Text that doesn't fit in
    @capacity is dropped. The buffer is always kept null terminated.
*/
static void tokenizer_append_vformat(char *buffer, int capacity, int *length, const char *fmt, va_list args) {
    int remaining;
    int written;

    remaining = capacity - *length;
    if(remaining <= 1)
        return;

    written = vsnprintf(buffer + *length, remaining, fmt, args);
    if(written < 0) {
        buffer[*length] = '\0';
        return;
    }

    /* vsnprintf returns the length the text would have had without truncation */
    if(written >= remaining)
        written = remaining - 1;
    *length += written;
}

/* Same as tokenizer_append_vformat but with variadic arguments */
static void tokenizer_append_format(char *buffer, int capacity, int *length, const char *fmt, ...) {
    va_list args;
    va_start(args, fmt);
    tokenizer_append_vformat(buffer, capacity, length, fmt, args);
    va_end(args);
}

/*
    Reports an error at @index in the code: "name:line:column: error: message",
    followed by the line of code and a '^' below the column.
    The error is passed to the error callback in the compiler options if there
    is one (limited to the size of an internal buffer), otherwise it's written
    to stderr while holding the log mutex.
    An @index outside the code is moved to the closest valid position.
*/
void tokenizer_print_error_args(Tokenizer *self, int index, const char *fmt, va_list args) {
    int line;
    int line_start;
    int line_end;
    int prev_column;
    int i;

    if(index < 0)
        index = 0;
    else if(index > (int)self->code.size)
        index = (int)self->code.size;

    line = tokenizer_get_line_by_index(self, index);
    line_start = tokenizer_get_start_of_line_from_index(self, index);
    line_end = tokenizer_get_end_of_line_from_index(self, index);
    prev_column = index - line_start;

    if(self->compiler_options->error_callback) {
        char buffer[2048];
        int length;

        length = 0;
        buffer[0] = '\0';
        tokenizer_append_format(buffer, (int)sizeof(buffer), &length, "%.*s:%d:%d: error: ", (int)self->code_name.size, self->code_name.data, line, 1 + prev_column);
        tokenizer_append_vformat(buffer, (int)sizeof(buffer), &length, fmt, args);
        tokenizer_append_format(buffer, (int)sizeof(buffer), &length, "\n%.*s\n", line_end - line_start, self->code.data + line_start);
        for(i = 0; i < prev_column; ++i)
            tokenizer_append_format(buffer, (int)sizeof(buffer), &length, " ");
        tokenizer_append_format(buffer, (int)sizeof(buffer), &length, "^\n");

        self->compiler_options->error_callback(buffer, length, self->compiler_options->error_callback_userdata);
    } else {
        amal_mutex *mutex;
        mutex = amal_log_get_mutex();
        ignore_result_int(amal_mutex_lock(mutex, "tokenizer_print_error"));
        fprintf(stderr, "\x1b[1;37m%.*s:%d:%d:\x1b[0m \x1b[1;31merror:\x1b[0m ", (int)self->code_name.size, self->code_name.data, line, 1 + prev_column);
        vfprintf(stderr, fmt, args);
        fprintf(stderr, "\n%.*s\n", line_end - line_start, self->code.data + line_start);
        for(i = 0; i < prev_column; ++i)
            fprintf(stderr, " ");
        fprintf(stderr, "\x1b[1;32m^\x1b[0m\n");
        ignore_result_int(amal_mutex_unlock(mutex));
    }
}

/* Variadic version of tokenizer_print_error_args */
void tokenizer_print_error(Tokenizer *self, int index, const char *fmt, ...) {
    va_list args;
    va_start(args, fmt);
    tokenizer_print_error_args(self, index, fmt, args);
    va_end(args);
}

/* Prints an error that was created with tokenizer_create_error */
void tokenizer_print_error_object(Tokenizer *self, TokenizerError *error) {
    tokenizer_print_error(self, error->index, "%s", error->str);
}

/*
    Creates an error with a formatted message that can be printed later with
    tokenizer_print_error_object. The message is allocated with the allocator
    of the tokenizer and is truncated if it doesn't fit in an internal buffer.
    The message is NULL if the allocation failed.
*/
TokenizerError tokenizer_create_error(Tokenizer *self, int index, const char *fmt, ...) {
    TokenizerError result;
    va_list args;
    char buffer[1024];
    int bytes_copied;

    va_start(args, fmt);
    bytes_copied = vsnprintf(buffer, sizeof(buffer), fmt, args);
    va_end(args);

    if(bytes_copied < 0) {
        bytes_copied = 0;
        buffer[0] = '\0';
    } else if(bytes_copied >= (int)sizeof(buffer)) {
        /* vsnprintf returns the untruncated length, only sizeof(buffer) - 1 characters were stored */
        bytes_copied = (int)sizeof(buffer) - 1;
    }

    result.index = index;
    result.str = NULL;
    ignore_result_int(arena_allocator_alloc(self->allocator, bytes_copied + 1, (void**)&result.str));
    /* Also copy when the message is empty so the string is always null terminated */
    if(result.str)
        am_memcpy(result.str, buffer, bytes_copied + 1);
    return result;
}

/* Returns the index where the last read token starts, which is where an error for that token should be reported */
int tokenizer_get_error_index(Tokenizer *self) {
    return self->prev_index;
}

/* Returns the offset of @ref from the start of the code, or -1 if @ref is NULL */
int tokenizer_get_code_reference_index(Tokenizer *self, const char *ref) {
    if(!ref)
        return -1;
    return ref - self->code.data;
}

/* Returns true if @code_ref points inside the code of this tokenizer */
bool tokenizer_contains_code_reference(Tokenizer *self, const char *code_ref) {
    return code_ref >= self->code.data && code_ref < self->code.data + self->code.size;
}