#include "../include/tokenizer.h"
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/*
    Prepares |self| to tokenize the buffer |code| of |code_size| bytes.
    Only the pointer is stored; the buffer is not copied. Token payload
    fields are reset and the one-token lookahead slot is marked empty
    (peek.token == -1).
*/
void tsl_tokenizer_init(TslTokenizer *self, char *code, size_t code_size) {
    self->code = code;
    self->code_size = code_size;
    self->code_index = 0;
    self->prev_code_index = 0;

    /* -1 is the "no token has been peeked" sentinel used throughout this file */
    self->peek.token = -1;
    self->peek.code_index = 0;
    self->peek.prev_code_index = 0;

    self->identifier.data = NULL;
    self->identifier.size = 0;
    self->string.data = NULL;
    self->string.size = 0;
    self->bool_value = 0;
    self->number_value = 0;
    self->arithmetic_symbol = '\0';
}

/*
    Returns the character at the current position without consuming it,
    or '\0' when the position is at/after the end of the buffer.
    A literal NUL byte inside the buffer is therefore indistinguishable
    from the end of input for the callers in this file.
*/
static char tsl_tokenizer_get_char(TslTokenizer *self) {
    if(self->code_index < self->code_size)
        return self->code[self->code_index];
    return '\0';
}

/* Returns 1 for space, newline, tab and carriage return; 0 otherwise */
static int is_whitespace(char c) {
    switch(c) {
        case ' ':
        case '\n':
        case '\t':
        case '\r':
            return 1;
        default:
            return 0;
    }
}

/*
    Advances past any run of whitespace. Terminates at the end of input
    because tsl_tokenizer_get_char returns '\0' there, which is not whitespace.
*/
static void tsl_tokenizer_skip_whitespace(TslTokenizer *self) {
    while(is_whitespace(tsl_tokenizer_get_char(self)))
        ++self->code_index;
}

/* ASCII letters only */
static int is_alpha(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

/* ASCII decimal digits only */
static int is_num(char c) {
    return c >= '0' && c <= '9';
}

static int is_identifier_start(char c) {
    return is_alpha(c) || c == '_';
}

static int is_identifier_continue(char c) {
    return is_alpha(c) || is_num(c) || c == '_';
}

/*
    num_multipliers[i] == 10^i. The last entry, 10^18, is the largest
    power of ten for which 9 * 10^i still fits in int64_t.
    NOTE(review): this table has external linkage in the original file;
    it is kept non-static in case another translation unit refers to it - confirm.
*/
const int64_t num_multipliers[] = {
    1,
    10,
    100,
    1000,
    10000,
    100000,
    1000000,
    10000000,
    100000000,
    1000000000,
    10000000000,
    100000000000,
    1000000000000,
    10000000000000,
    100000000000000,
    1000000000000000,
    10000000000000000,
    100000000000000000,
    1000000000000000000
};

/*
    Converts a run of ASCII decimal digits to an integer. The only caller
    in this file passes text that was scanned with is_num, so no validation
    of the characters is done here.

    If the value does not fit into int64_t the result saturates at INT64_MAX
    instead of indexing past the end of |num_multipliers| or overflowing a
    signed integer (both undefined behavior). Leading zeros never cause
    saturation, no matter how many there are.
*/
static int64_t string_to_int(TslStringView *str) {
    const size_t num_multipliers_count = sizeof(num_multipliers) / sizeof(num_multipliers[0]);
    int64_t num = 0;
    size_t i = 0;
    /* |i| counts digit positions from the least significant digit */
    for(; i < str->size; ++i) {
        int64_t digit = str->data[str->size - 1 - i] - '0';
        int64_t term;

        /* A zero contributes nothing, regardless of its position */
        if(digit == 0)
            continue;

        /* A non-zero digit at position 10^19 or above can't be represented */
        if(i >= num_multipliers_count)
            return INT64_MAX;

        /* Can't overflow: digit <= 9 and num_multipliers[i] <= 10^18 */
        term = digit * num_multipliers[i];
        if(num > INT64_MAX - term)
            return INT64_MAX;
        num += term;
    }
    return num;
}

/*
    Advances past the remainder of a string literal whose opening quote has
    already been consumed. A quote character preceded by an odd number of
    backslashes is treated as escaped and does not end the string.

    Returns 1 with the position just after the closing quote, or 0 if the
    end of input ('\0') was reached first. In the latter case the position
    is left at the end of input.
*/
static int tsl_tokenizer_goto_end_of_string(TslTokenizer *self, char string_start_symbol) {
    int string_escape = 0;
    for(;;) {
        char c = tsl_tokenizer_get_char(self);
        if(c == string_start_symbol) {
            ++self->code_index;
            if(!string_escape)
                return 1;
            string_escape = 0;
        } else if(c == '\\') {
            ++self->code_index;
            /* Toggle, so that an escaped backslash ("\\") doesn't escape the next character */
            string_escape = !string_escape;
        } else if(c == '\0') {
            return 0;
        } else {
            ++self->code_index;
            string_escape = 0;
        }
    }
}

/*
    Scans the next token starting at the current position, ignoring the
    lookahead slot. Depending on the token type the payload is stored in
    self->identifier, self->string, self->bool_value, self->number_value or
    self->arithmetic_symbol. For TSL_TOKEN_NUM the digits are also exposed
    through self->identifier.

    self->prev_code_index is set to the position before the whitespace that
    precedes the returned token (after any skipped comment lines).

    An unterminated string literal yields TSL_TOKEN_END_OF_FILE. An
    unrecognized character is reported on stderr, is NOT consumed, and
    yields TSL_TOKEN_UNEXPECTED_SYMBOL.
*/
static TslToken tsl_tokenizer_next_internal(TslTokenizer *self) {
    char c;

    /*
        Skip whitespace and '#' comments. This is a loop rather than a
        recursive call per comment so that the stack depth doesn't grow
        with the number of consecutive comment lines.
    */
    for(;;) {
        self->prev_code_index = self->code_index;
        tsl_tokenizer_skip_whitespace(self);

        c = tsl_tokenizer_get_char(self);
        if(c != '#')
            break;

        /* Comment: runs to (and includes) the newline, or to the end of input */
        ++self->code_index;
        for(;;) {
            c = tsl_tokenizer_get_char(self);
            if(c == '\n') {
                ++self->code_index;
                break;
            } else if(c == '\0') {
                break;
            }
            ++self->code_index;
        }
    }

    if(is_identifier_start(c)) {
        size_t identifier_start = self->code_index;
        ++self->code_index;
        while(is_identifier_continue(tsl_tokenizer_get_char(self)))
            ++self->code_index;

        self->identifier.data = self->code + identifier_start;
        self->identifier.size = self->code_index - identifier_start;

        /* Keywords are matched by length first so that memcmp never reads past the identifier */
        switch(self->identifier.size) {
            case 2: {
                if(memcmp(self->identifier.data, "fn", 2) == 0)
                    return TSL_TOKEN_FN;
                break;
            }
            case 4: {
                if(memcmp(self->identifier.data, "true", 4) == 0) {
                    self->bool_value = 1;
                    return TSL_TOKEN_BOOL;
                } else if(memcmp(self->identifier.data, "null", 4) == 0) {
                    return TSL_TOKEN_NULL;
                }
                break;
            }
            case 5: {
                if(memcmp(self->identifier.data, "false", 5) == 0) {
                    self->bool_value = 0;
                    return TSL_TOKEN_BOOL;
                }
                break;
            }
        }
        return TSL_TOKEN_IDENTIFIER;
    }

    if(is_num(c)) {
        size_t num_start = self->code_index;
        ++self->code_index;
        while(is_num(tsl_tokenizer_get_char(self)))
            ++self->code_index;

        self->identifier.data = self->code + num_start;
        self->identifier.size = self->code_index - num_start;
        /* string_to_int saturates at INT64_MAX for literals that don't fit into int64_t */
        /* TODO: Check if the result of string_to_int is too large to fit into double */
        self->number_value = string_to_int(&self->identifier);
        return TSL_TOKEN_NUM;
    }

    switch(c) {
        case '"': {
            char string_start_symbol = c;
            size_t string_start;
            ++self->code_index;
            string_start = self->code_index;
            if(tsl_tokenizer_goto_end_of_string(self, string_start_symbol)) {
                /* The string view excludes both quotes; escape sequences are left as written */
                self->string.data = self->code + string_start;
                self->string.size = self->code_index - 1 - string_start;
                return TSL_TOKEN_STRING;
            }
            /* Unterminated string */
            return TSL_TOKEN_END_OF_FILE;
        }
        case '=':
            ++self->code_index;
            return TSL_TOKEN_EQUAL;
        case '{':
            ++self->code_index;
            return TSL_TOKEN_LBRACE;
        case '}':
            ++self->code_index;
            return TSL_TOKEN_RBRACE;
        case '[':
            ++self->code_index;
            return TSL_TOKEN_LBRACKET;
        case ']':
            ++self->code_index;
            return TSL_TOKEN_RBRACKET;
        case '(':
            ++self->code_index;
            return TSL_TOKEN_LPAREN;
        case ')':
            ++self->code_index;
            return TSL_TOKEN_RPAREN;
        case ',':
            ++self->code_index;
            return TSL_TOKEN_COMMA;
        case ':':
            ++self->code_index;
            return TSL_TOKEN_COLON;
        case '$':
            ++self->code_index;
            return TSL_TOKEN_DOLLAR_SIGN;
        case '+':
        case '-':
        case '*':
        case '/':
            self->arithmetic_symbol = c;
            ++self->code_index;
            return TSL_TOKEN_ARITHMETIC;
        case '\0':
            return TSL_TOKEN_END_OF_FILE;
        default:
            fprintf(stderr, "Unexpected symbol '%c'\n", c);
            return TSL_TOKEN_UNEXPECTED_SYMBOL;
    }
}

/*
    Returns the next token and consumes it. If a token was previously
    peeked, that token is returned instead of scanning again.
*/
TslToken tsl_tokenizer_next(TslTokenizer *self) {
    if((int)self->peek.token == -1)
        return tsl_tokenizer_next_internal(self);
    return tsl_tokenizer_consume_peek(self);
}

/*
    Consumes the next token and checks that it is |expected_token|.
    Returns 1 on match. On mismatch an error is written to stderr and 0 is
    returned; the mismatching token has still been consumed.
*/
int tsl_tokenizer_accept(TslTokenizer *self, TslToken expected_token) {
    TslToken actual_token = tsl_tokenizer_next(self);
    if(actual_token != expected_token) {
        /* Cast: the underlying type of an enum isn't necessarily int, but %d requires int */
        fprintf(stderr, "Error: Expected TODO(%d), got TODO(%d)\n", (int)expected_token, (int)actual_token);
        return 0;
    }
    return 1;
}

/*
    Returns the next token without consuming it. The result is cached, so
    repeated calls return the same token until it is consumed with
    tsl_tokenizer_next, tsl_tokenizer_accept or tsl_tokenizer_consume_peek.

    Only the position fields are restored after the lookahead scan; the
    token payload fields (identifier, string, bool_value, number_value,
    arithmetic_symbol) hold the values of the peeked token.
*/
TslToken tsl_tokenizer_peek(TslTokenizer *self) {
    if((int)self->peek.token == -1) {
        size_t p_prev_code_index = self->prev_code_index;
        size_t p_code_index = self->code_index;

        self->peek.token = tsl_tokenizer_next_internal(self);
        /* Remember where the scan ended so consuming the peek doesn't have to scan again */
        self->peek.code_index = self->code_index;
        self->peek.prev_code_index = self->prev_code_index;

        self->prev_code_index = p_prev_code_index;
        self->code_index = p_code_index;
    }
    return self->peek.token;
}

/*
    Consumes the token stored by tsl_tokenizer_peek and returns it.
    A token must have been peeked before calling this (checked with assert).
*/
TslToken tsl_tokenizer_consume_peek(TslTokenizer *self) {
    TslToken token = self->peek.token;
    assert((int)token != -1);
    self->code_index = self->peek.code_index;
    self->prev_code_index = self->peek.prev_code_index;
    self->peek.token = -1;
    return token;
}

/*
    Scans the next argument of a command, using command rules instead of
    the regular token rules. Must not be called while a peeked token is
    pending (checked with assert).

    Returns:
        TSL_COMMAND_TOKEN_END          - a ')' was consumed.
        TSL_COMMAND_TOKEN_ARG          - |arg| refers to the argument text inside
                                         the code buffer (nothing is copied). For a
                                         quoted argument the quotes are excluded.
        TSL_COMMAND_TOKEN_END_OF_FILE  - end of input, or an unterminated
                                         quoted argument.
    |arg| is only written when TSL_COMMAND_TOKEN_ARG is returned.
*/
TslCommandToken tsl_tokenizer_next_command_arg(TslTokenizer *self, TslStringView *arg) {
    char c;
    assert((int)self->peek.token == -1);
    self->prev_code_index = self->code_index;
    tsl_tokenizer_skip_whitespace(self);

    c = tsl_tokenizer_get_char(self);
    if(c == ')') {
        ++self->code_index;
        return TSL_COMMAND_TOKEN_END;
    } else if(c == '"') {
        char string_start_symbol = c;
        size_t string_start;
        ++self->code_index;
        string_start = self->code_index;
        if(tsl_tokenizer_goto_end_of_string(self, string_start_symbol)) {
            arg->data = self->code + string_start;
            arg->size = self->code_index - 1 - string_start;
            return TSL_COMMAND_TOKEN_ARG;
        } else {
            return TSL_COMMAND_TOKEN_END_OF_FILE;
        }
    } else if(c == '\0') {
        return TSL_COMMAND_TOKEN_END_OF_FILE;
    } else {
        /*
            Unquoted argument: runs until whitespace, ')', end of input or an
            unescaped '"'. None of those terminators are consumed.
            TODO: When hitting ", parse to the end of it and make it part of this arg instead of separating them into two args
        */
        size_t arg_start = self->code_index;
        /* The first character is always part of the argument; only note whether it starts an escape */
        int escape_char = c == '\\';
        ++self->code_index;
        for(;;) {
            c = tsl_tokenizer_get_char(self);
            if(is_whitespace(c) || c == ')' || c == '\0') {
                break;
            } else if(c == '"') {
                if(!escape_char)
                    break;
                escape_char = 0;
            } else if(c == '\\') {
                escape_char = !escape_char;
            } else {
                escape_char = 0;
            }
            ++self->code_index;
        }
        arg->data = self->code + arg_start;
        arg->size = self->code_index - arg_start;
        return TSL_COMMAND_TOKEN_ARG;
    }
}

/*
    Returns the 1-based line number of the byte offset |index|, found by
    counting the newlines before it. An |index| past the end of the code
    is clamped to the end, so the buffer is never read out of bounds.
*/
int tsl_tokenizer_get_line_by_index(TslTokenizer *self, size_t index) {
    size_t i = 0;
    int line = 1;
    if(index > self->code_size)
        index = self->code_size;
    for(; i < index; ++i) {
        if(self->code[i] == '\n')
            ++line;
    }
    return line;
}