From e27bd78c8211532bf0d39d87d2051222f7e86e26 Mon Sep 17 00:00:00 2001
From: dec05eba
Date: Tue, 14 Jan 2020 07:27:47 +0100
Subject: start

---
 src/main.c      |  48 +++++++++++++++
 src/tokenizer.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 227 insertions(+)
 create mode 100644 src/main.c
 create mode 100644 src/tokenizer.c

(limited to 'src')

diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000..6653ba2
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,48 @@
+#include "../include/tokenizer.h"
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Tokenizes a hard-coded sample script and prints every identifier and number token.
+ * Returns 0 once the end of the script is reached, or 1 as soon as the tokenizer
+ * yields any token kind that is not printed here.
+ */
+int main(void) {
+    TslTokenizer tokenizer;
+    const char *code =
+"value1 = 1\n"
+"value2 = true\n"
+"value3 = null\n"
+"value4 = \"hello world\"\n"
+"value5 = {\"hello\", \"world\", 5}\n"
+"value6 = {\"hello\": \"world\", \"value\": 23}\n"
+"value7 = fn () {}\n"
+"value8 = fn (value) {}\n"
+"value9 = {\n"
+" \"hello\": \"world\",\n"
+" \"sayHello\": fn() {\n"
+" \n"
+" }\n"
+"}\n"
+"\n"
+"str = value9[\"hello\"]\n"
+"value9[\"sayHello\"]()";
+    tsl_tokenizer_init(&tokenizer, code, strlen(code));
+
+    for(;;) {
+        TslToken token = tsl_tokenizer_next(&tokenizer);
+        if(token == TSL_TOKEN_END_OF_FILE) {
+            break;
+        } else if(token == TSL_TOKEN_IDENTIFIER) {
+            printf("identifier: %.*s\n", (int)tokenizer.identifier.size, tokenizer.identifier.data);
+        } else if(token == TSL_TOKEN_NUM) {
+            /* The value comes from an int64_t, which "%ld" only matches on platforms where long is 64 bits */
+            printf("num: %" PRId64 "\n", (int64_t)tokenizer.number_value);
+        } else {
+            return 1;
+        }
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/src/tokenizer.c b/src/tokenizer.c
new file mode 100644
index 0000000..1ab34db
--- /dev/null
+++ b/src/tokenizer.c
@@ -0,0 +1,179 @@
+#include "../include/tokenizer.h"
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Prepares |self| to tokenize the |code_size| bytes starting at |code|.
+ * Only the pointer is stored, so |code| has to outlive the tokenizer.
+ */
+void tsl_tokenizer_init(TslTokenizer *self, const char *code, size_t code_size) {
+    self->code = code;
+    self->code_size = code_size;
+    self->code_index = 0;
+
+    self->identifier.data = NULL;
+    self->identifier.size = 0;
+    self->bool_value = 0;
+    self->number_value = 0;
+}
+
+/* Returns the character at the current position without consuming it, or '\0' past the end of the code */
+static char tsl_tokenizer_get_char(TslTokenizer *self) {
+    if(self->code_index < self->code_size)
+        return self->code[self->code_index];
+    return '\0';
+}
+
+/* Advances past any run of spaces, newlines, tabs and carriage returns */
+static void tsl_tokenizer_skip_whitespace(TslTokenizer *self) {
+    for(;;) {
+        char c = tsl_tokenizer_get_char(self);
+        switch(c) {
+            case ' ':
+            case '\n':
+            case '\t':
+            case '\r':
+                ++self->code_index;
+                break;
+            default:
+                return;
+        }
+    }
+}
+
+static int is_alpha(char c) {
+    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+}
+
+static int is_num(char c) {
+    return c >= '0' && c <= '9';
+}
+
+static int is_identifier_start(char c) {
+    return is_alpha(c) || c == '_';
+}
+
+static int is_identifier_continue(char c) {
+    return is_alpha(c) || is_num(c) || c == '_';
+}
+
+/* num_multipliers[i] is 10^i. The last entry, 10^18, is the largest power of ten an int64_t can hold */
+const int64_t num_multipliers[] = {
+    1,
+    10,
+    100,
+    1000,
+    10000,
+    100000,
+    1000000,
+    10000000,
+    100000000,
+    1000000000,
+    10000000000,
+    100000000000,
+    1000000000000,
+    10000000000000,
+    100000000000000,
+    1000000000000000,
+    10000000000000000,
+    100000000000000000,
+    1000000000000000000
+};
+
+/*
+ * Converts a string of decimal digits to its value.
+ * A value that does not fit in an int64_t is clamped to INT64_MAX.
+ */
+static int64_t string_to_int(const TslStringView *str) {
+    const size_t max_digits = sizeof(num_multipliers) / sizeof(num_multipliers[0]);
+    const char *digits = str->data;
+    size_t num_digits = str->size;
+    int64_t num = 0;
+
+    /* Leading zeros add nothing to the value but would otherwise count against |max_digits| */
+    while(num_digits > 1 && digits[0] == '0') {
+        ++digits;
+        --num_digits;
+    }
+
+    /* More digits than there are multipliers can never fit, and would index past the end of the table */
+    if(num_digits > max_digits)
+        return INT64_MAX;
+
+    for(size_t i = 0; i < num_digits; ++i) {
+        /* At most 9 * 10^18, which is still below INT64_MAX */
+        int64_t term = (int64_t)(digits[num_digits - 1 - i] - '0') * num_multipliers[i];
+        if(term > INT64_MAX - num)
+            return INT64_MAX;
+        num += term;
+    }
+    return num;
+}
+
+/*
+ * Skips whitespace and reads the next token.
+ * TSL_TOKEN_IDENTIFIER and TSL_TOKEN_NUM leave the token text in |self->identifier|,
+ * TSL_TOKEN_NUM also sets |self->number_value| and TSL_TOKEN_BOOL sets |self->bool_value|.
+ * On TSL_TOKEN_UNEXPECTED_SYMBOL the offending character is reported on stderr and not consumed.
+ */
+TslToken tsl_tokenizer_next(TslTokenizer *self) {
+    char c;
+    tsl_tokenizer_skip_whitespace(self);
+
+    c = tsl_tokenizer_get_char(self);
+    if(is_identifier_start(c)) {
+        size_t identifier_start = self->code_index;
+        ++self->code_index;
+        for(;;) {
+            c = tsl_tokenizer_get_char(self);
+            if(!is_identifier_continue(c))
+                break;
+            ++self->code_index;
+        }
+        self->identifier.data = self->code + identifier_start;
+        self->identifier.size = self->code_index - identifier_start;
+
+        /* Keywords are grouped by length so that memcmp never reads past the end of the identifier */
+        switch(self->identifier.size) {
+            case 4: {
+                if(memcmp(self->identifier.data, "null", 4) == 0) {
+                    return TSL_TOKEN_NULL;
+                } else if(memcmp(self->identifier.data, "true", 4) == 0) {
+                    self->bool_value = 1;
+                    return TSL_TOKEN_BOOL;
+                }
+                break;
+            }
+            case 5: {
+                if(memcmp(self->identifier.data, "false", 5) == 0) {
+                    self->bool_value = 0;
+                    return TSL_TOKEN_BOOL;
+                }
+                break;
+            }
+        }
+        return TSL_TOKEN_IDENTIFIER;
+    } else if(is_num(c)) {
+        size_t num_start = self->code_index;
+        ++self->code_index;
+        for(;;) {
+            c = tsl_tokenizer_get_char(self);
+            if(!is_num(c))
+                break;
+            ++self->code_index;
+        }
+        self->identifier.data = self->code + num_start;
+        self->identifier.size = self->code_index - num_start;
+        self->number_value = string_to_int(&self->identifier);
+        return TSL_TOKEN_NUM;
+    } else if(c == '=') {
+        ++self->code_index;
+        return TSL_TOKEN_EQUAL;
+    } else if(c == '\0') {
+        return TSL_TOKEN_END_OF_FILE;
+    } else {
+        fprintf(stderr, "Unexpected symbol '%c'\n", c);
+        return TSL_TOKEN_UNEXPECTED_SYMBOL;
+    }
+}
-- 
cgit v1.2.3