diff options
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | Makefile | 10 | ||||
-rw-r--r-- | SYNTAX.md | 19 | ||||
-rw-r--r-- | include/tokenizer.h | 37 | ||||
-rw-r--r-- | src/main.c | 41 | ||||
-rw-r--r-- | src/tokenizer.c | 148 |
6 files changed, 257 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c6275a9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.o
+tsl
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..9176922
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,10 @@
+CFLAGS=-Wall -Wextra -g3
+
+all: main.o tokenizer.o
+	cc -o tsl main.o tokenizer.o -fPIE
+
+main.o: src/main.c include/tokenizer.h
+	cc -c src/main.c $(CFLAGS)
+
+tokenizer.o: src/tokenizer.c include/tokenizer.h
+	cc -c src/tokenizer.c $(CFLAGS)
diff --git a/SYNTAX.md b/SYNTAX.md
new file mode 100644
index 0000000..149633e
--- /dev/null
+++ b/SYNTAX.md
@@ -0,0 +1,19 @@
+```
+value1 = 1
+value2 = true
+value3 = null
+value4 = "hello world"
+value5 = {"hello", "world", 5}
+value6 = {"hello": "world", "value": 23}
+value7 = fn () {}
+value8 = fn (value) {}
+value9 = {
+    "hello": "world",
+    "sayHello": fn() {
+
+    }
+}
+
+str = value9["hello"]
+value9["sayHello"]()
+```
diff --git a/include/tokenizer.h b/include/tokenizer.h
new file mode 100644
index 0000000..decdae4
--- /dev/null
+++ b/include/tokenizer.h
@@ -0,0 +1,45 @@
+#ifndef TSL_TOKENIZER_H
+#define TSL_TOKENIZER_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* A (pointer, length) view of a run of characters; the run is not required to be NUL-terminated. */
+typedef struct {
+    const char *data;
+    size_t size;
+} TslStringView;
+
+/* Token kinds returned by tsl_tokenizer_next(). */
+typedef enum {
+    TSL_TOKEN_END_OF_FILE,
+    TSL_TOKEN_UNEXPECTED_SYMBOL,
+    TSL_TOKEN_IDENTIFIER, /* The text is in TslTokenizer.identifier */
+    TSL_TOKEN_NUM, /* The value is in TslTokenizer.number_value */
+    TSL_TOKEN_BOOL, /* The value is in TslTokenizer.bool_value */
+    TSL_TOKEN_NULL,
+    TSL_TOKEN_EQUAL
+} TslToken;
+
+/* Tokenizer state: the input buffer, a cursor into it, and the payload of the most recent token. */
+typedef struct {
+    const char *code;
+    size_t code_size;
+    size_t code_index;
+
+    TslStringView identifier;
+    int bool_value;
+    int64_t number_value;
+} TslTokenizer;
+
+/* Sets up |self| to tokenize the |code_size| bytes at |code|. The buffer is referenced, not copied. */
+void tsl_tokenizer_init(TslTokenizer *self, const char *code, size_t code_size);
+
+/*
+ * Skips whitespace and returns the next token, storing its payload (if any) in |self|.
+ * Returns TSL_TOKEN_END_OF_FILE when the input is exhausted.
+ */
+TslToken tsl_tokenizer_next(TslTokenizer *self);
+
+#endif /* TSL_TOKENIZER_H */
+
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000..6653ba2
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,60 @@
+#include "../include/tokenizer.h"
+#include <inttypes.h>
+#include <string.h>
+#include <stdio.h>
+
+/*
+ * Tokenizes a built-in sample script and prints each token to stdout.
+ * Returns 0 if the end of the script is reached and 1 if the tokenizer
+ * reports a symbol it does not recognize.
+ */
+int main(void)
+{
+    TslTokenizer tokenizer;
+    const char *code =
+"value1 = 1\n"
+"value2 = true\n"
+"value3 = null\n"
+"value4 = \"hello world\"\n"
+"value5 = {\"hello\", \"world\", 5}\n"
+"value6 = {\"hello\": \"world\", \"value\": 23}\n"
+"value7 = fn () {}\n"
+"value8 = fn (value) {}\n"
+"value9 = {\n"
+" \"hello\": \"world\",\n"
+" \"sayHello\": fn() {\n"
+" \n"
+" }\n"
+"}\n"
+"\n"
+"str = value9[\"hello\"]\n"
+"value9[\"sayHello\"]()";
+    tsl_tokenizer_init(&tokenizer, code, strlen(code));
+
+    for(;;) {
+        TslToken token = tsl_tokenizer_next(&tokenizer);
+        switch(token) {
+            case TSL_TOKEN_END_OF_FILE:
+                return 0;
+            case TSL_TOKEN_IDENTIFIER:
+                printf("identifier: %.*s\n", (int)tokenizer.identifier.size, tokenizer.identifier.data);
+                break;
+            case TSL_TOKEN_NUM:
+                /* number_value is an int64_t, so "%ld" is only correct where long is 64 bits wide */
+                printf("num: %" PRId64 "\n", tokenizer.number_value);
+                break;
+            case TSL_TOKEN_BOOL:
+                printf("bool: %s\n", tokenizer.bool_value ? "true" : "false");
+                break;
+            case TSL_TOKEN_NULL:
+                printf("null\n");
+                break;
+            case TSL_TOKEN_EQUAL:
+                printf("equal\n");
+                break;
+            case TSL_TOKEN_UNEXPECTED_SYMBOL:
+            default:
+                return 1;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/tokenizer.c b/src/tokenizer.c
new file mode 100644
index 0000000..1ab34db
--- /dev/null
+++ b/src/tokenizer.c
@@ -0,0 +1,127 @@
+#include "../include/tokenizer.h"
+#include <string.h>
+#include <stdio.h>
+
+/* Points |self| at the |code_size| bytes starting at |code| and clears the token payload fields. */
+void tsl_tokenizer_init(TslTokenizer *self, const char *code, size_t code_size) {
+    self->code = code;
+    self->code_size = code_size;
+    self->code_index = 0;
+
+    self->identifier.data = NULL;
+    self->identifier.size = 0;
+    self->bool_value = 0;
+    self->number_value = 0;
+}
+
+/* Returns the character under the cursor, or '\0' when the cursor has reached the end of the input. */
+static char tsl_tokenizer_get_char(const TslTokenizer *self) {
+    if(self->code_index < self->code_size)
+        return self->code[self->code_index];
+    return '\0';
+}
+
+/* Moves the cursor past any spaces, tabs, carriage returns and newlines. */
+static void tsl_tokenizer_skip_whitespace(TslTokenizer *self) {
+    for(;;) {
+        switch(tsl_tokenizer_get_char(self)) {
+            case ' ':
+            case '\n':
+            case '\t':
+            case '\r':
+                ++self->code_index;
+                break;
+            default:
+                return;
+        }
+    }
+}
+
+static int is_alpha(char c) {
+    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+}
+
+static int is_num(char c) {
+    return c >= '0' && c <= '9';
+}
+
+static int is_identifier_start(char c) {
+    return is_alpha(c) || c == '_';
+}
+
+static int is_identifier_continue(char c) {
+    return is_alpha(c) || is_num(c) || c == '_';
+}
+
+/* Returns non-zero if |view| holds exactly the characters of the NUL-terminated string |keyword|. */
+static int string_view_equals(const TslStringView *view, const char *keyword) {
+    size_t keyword_size = strlen(keyword);
+    return view->size == keyword_size && memcmp(view->data, keyword, keyword_size) == 0;
+}
+
+/*
+ * Converts a run of ASCII digits to an integer.
+ * A value that does not fit into an int64_t is clamped to INT64_MAX.
+ * TODO: Report an error when the number is too large instead of clamping it
+ */
+static int64_t string_to_int(const TslStringView *str) {
+    int64_t num = 0;
+    for(size_t i = 0; i < str->size; ++i) {
+        int digit = str->data[i] - '0';
+        /* Equivalent to "num * 10 + digit > INT64_MAX", written so that it cannot overflow itself */
+        if(num > (INT64_MAX - digit) / 10)
+            return INT64_MAX;
+        num = num * 10 + digit;
+    }
+    return num;
+}
+
+/*
+ * Skips whitespace and returns the next token.
+ * For identifiers, keywords and numbers, |self->identifier| is set to the token's text inside the input buffer.
+ * Numbers additionally set |self->number_value| and the keywords "true"/"false" set |self->bool_value|.
+ * The end of the input, or a NUL byte inside it, yields TSL_TOKEN_END_OF_FILE.
+ * An unrecognized character is reported on stderr and the cursor is left on it.
+ */
+TslToken tsl_tokenizer_next(TslTokenizer *self) {
+    char c;
+    tsl_tokenizer_skip_whitespace(self);
+
+    c = tsl_tokenizer_get_char(self);
+    if(is_identifier_start(c)) {
+        size_t identifier_start = self->code_index;
+        ++self->code_index;
+        while(is_identifier_continue(tsl_tokenizer_get_char(self)))
+            ++self->code_index;
+        self->identifier.data = self->code + identifier_start;
+        self->identifier.size = self->code_index - identifier_start;
+
+        if(string_view_equals(&self->identifier, "null")) {
+            return TSL_TOKEN_NULL;
+        } else if(string_view_equals(&self->identifier, "true")) {
+            self->bool_value = 1;
+            return TSL_TOKEN_BOOL;
+        } else if(string_view_equals(&self->identifier, "false")) {
+            self->bool_value = 0;
+            return TSL_TOKEN_BOOL;
+        }
+        return TSL_TOKEN_IDENTIFIER;
+    } else if(is_num(c)) {
+        size_t num_start = self->code_index;
+        ++self->code_index;
+        while(is_num(tsl_tokenizer_get_char(self)))
+            ++self->code_index;
+        self->identifier.data = self->code + num_start;
+        self->identifier.size = self->code_index - num_start;
+        self->number_value = string_to_int(&self->identifier);
+        return TSL_TOKEN_NUM;
+    } else if(c == '=') {
+        ++self->code_index;
+        return TSL_TOKEN_EQUAL;
+    } else if(c == '\0') {
+        return TSL_TOKEN_END_OF_FILE;
+    } else {
+        fprintf(stderr, "Unexpected symbol '%c'\n", c);
+        return TSL_TOKEN_UNEXPECTED_SYMBOL;
+    }
+}