aboutsummaryrefslogtreecommitdiff
path: root/src/tokenizer.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizer.c')
-rw-r--r--src/tokenizer.c148
1 files changed, 148 insertions, 0 deletions
diff --git a/src/tokenizer.c b/src/tokenizer.c
new file mode 100644
index 0000000..1ab34db
--- /dev/null
+++ b/src/tokenizer.c
@@ -0,0 +1,148 @@
+#include "../include/tokenizer.h"
+#include <string.h>
+#include <stdio.h>
+
+/*
+ * Prepare |self| to scan |code|, a buffer of |code_size| bytes.
+ * Only the pointer is stored (the buffer is not copied). The read cursor
+ * starts at offset 0 and all token payload fields are cleared.
+ */
+void tsl_tokenizer_init(TslTokenizer *self, const char *code, size_t code_size) {
+    /* Payload of the most recent token: nothing has been scanned yet. */
+    self->number_value = 0;
+    self->bool_value = 0;
+    self->identifier.size = 0;
+    self->identifier.data = NULL;
+    /* Input buffer and read cursor. */
+    self->code_index = 0;
+    self->code_size = code_size;
+    self->code = code;
+}
+
+/*
+ * Peek at the byte under the read cursor without consuming it.
+ * Returns '\0' once the cursor is at or past the end of the buffer.
+ */
+static char tsl_tokenizer_get_char(TslTokenizer *self) {
+    return self->code_index < self->code_size ? self->code[self->code_index] : '\0';
+}
+
+/*
+ * Advance the read cursor past any run of spaces, newlines, tabs and
+ * carriage returns, stopping at the first other byte (or at end of input).
+ */
+static void tsl_tokenizer_skip_whitespace(TslTokenizer *self) {
+    char current = tsl_tokenizer_get_char(self);
+    while(current == ' ' || current == '\n' || current == '\t' || current == '\r') {
+        ++self->code_index;
+        current = tsl_tokenizer_get_char(self);
+    }
+}
+
+/* Returns 1 when |c| is an ASCII letter (a-z or A-Z), 0 otherwise. */
+static int is_alpha(char c) {
+    if(c >= 'a' && c <= 'z')
+        return 1;
+    if(c >= 'A' && c <= 'Z')
+        return 1;
+    return 0;
+}
+
+/* Returns 1 when |c| is an ASCII decimal digit (0-9), 0 otherwise. */
+static int is_num(char c) {
+    if(c < '0')
+        return 0;
+    return c <= '9';
+}
+
+/* Returns 1 when |c| may begin an identifier: an ASCII letter or '_'. */
+static int is_identifier_start(char c) {
+    if(is_alpha(c))
+        return 1;
+    return c == '_';
+}
+
+/* Returns 1 when |c| may appear after the first character of an identifier: an ASCII letter, a decimal digit or '_'. */
+static int is_identifier_continue(char c) {
+    if(is_alpha(c))
+        return 1;
+    if(is_num(c))
+        return 1;
+    return c == '_';
+}
+
+/*
+ * Powers of ten: num_multipliers[k] == 10^k for k = 0..18.
+ * 10^18 is the largest power of ten that fits in an int64_t, so the table
+ * has exactly 19 entries.
+ */
+const int64_t num_multipliers[] = {
+    /* 10^0  .. 10^3  */ 1, 10, 100, 1000,
+    /* 10^4  .. 10^7  */ 10000, 100000, 1000000, 10000000,
+    /* 10^8  .. 10^11 */ 100000000, 1000000000, 10000000000, 100000000000,
+    /* 10^12 .. 10^15 */ 1000000000000, 10000000000000, 100000000000000, 1000000000000000,
+    /* 10^16 .. 10^18 */ 10000000000000000, 100000000000000000, 1000000000000000000
+};
+
+/*
+ * Convert the run of ASCII decimal digits in |str| to an int64_t.
+ *
+ * |str| is expected to contain only the characters '0'..'9' (the tokenizer
+ * only passes digit runs). An empty view yields 0.
+ *
+ * Values that do not fit in an int64_t saturate to INT64_MAX. The return type
+ * leaves no room for an error code, and clamping avoids both signed overflow
+ * and any dependence on a fixed-size power-of-ten table, so arbitrarily long
+ * digit runs (including ones with many leading zeros) are safe.
+ */
+static int64_t string_to_int(TslStringView *str) {
+    int64_t num = 0;
+    for(size_t i = 0; i < str->size; ++i) {
+        int64_t digit = str->data[i] - '0';
+        /* num * 10 + digit would exceed INT64_MAX: clamp instead of overflowing. */
+        if(num > (INT64_MAX - digit) / 10)
+            return INT64_MAX;
+        num = num * 10 + digit;
+    }
+    return num;
+}
+
+/*
+ * Scan and return the next token from the input, skipping leading whitespace.
+ *
+ * Token payloads are left in |self|:
+ *  - TSL_TOKEN_IDENTIFIER: self->identifier views the identifier text.
+ *  - TSL_TOKEN_BOOL:       self->bool_value is 1 for "true", 0 for "false".
+ *  - TSL_TOKEN_NUM:        self->identifier views the digits and
+ *                          self->number_value holds their converted value.
+ *  - TSL_TOKEN_NULL, TSL_TOKEN_EQUAL: no extra payload.
+ * self->identifier points into the buffer given to tsl_tokenizer_init; it is
+ * not a copy and is not NUL-terminated.
+ *
+ * Returns TSL_TOKEN_END_OF_FILE when the cursor reaches the end of the buffer
+ * (a '\0' byte inside the buffer is treated the same way).
+ * Returns TSL_TOKEN_UNEXPECTED_SYMBOL, after printing a message to stderr, for
+ * any other byte; the offending byte is NOT consumed.
+ */
+TslToken tsl_tokenizer_next(TslTokenizer *self) {
+    char c;
+    tsl_tokenizer_skip_whitespace(self);
+
+    c = tsl_tokenizer_get_char(self);
+    if(is_identifier_start(c)) {
+        size_t identifier_start = self->code_index;
+        ++self->code_index;
+        for(;;) {
+            c = tsl_tokenizer_get_char(self);
+            if(!is_identifier_continue(c))
+                break;
+            ++self->code_index;
+        }
+        self->identifier.data = self->code + identifier_start;
+        self->identifier.size = self->code_index - identifier_start;
+
+        /* Keywords are matched by length first, then by exact bytes. */
+        switch(self->identifier.size) {
+            case 4: {
+                /* "null" is four characters long, so it belongs with "true". */
+                if(memcmp(self->identifier.data, "null", 4) == 0) {
+                    return TSL_TOKEN_NULL;
+                }
+                if(memcmp(self->identifier.data, "true", 4) == 0) {
+                    self->bool_value = 1;
+                    return TSL_TOKEN_BOOL;
+                }
+                break;
+            }
+            case 5: {
+                if(memcmp(self->identifier.data, "false", 5) == 0) {
+                    self->bool_value = 0;
+                    return TSL_TOKEN_BOOL;
+                }
+                break;
+            }
+            default:
+                break;
+        }
+        return TSL_TOKEN_IDENTIFIER;
+    } else if(is_num(c)) {
+        size_t num_start = self->code_index;
+        ++self->code_index;
+        for(;;) {
+            c = tsl_tokenizer_get_char(self);
+            if(!is_num(c))
+                break;
+            ++self->code_index;
+        }
+        /* The identifier view is reused to hold the raw digit text. */
+        self->identifier.data = self->code + num_start;
+        self->identifier.size = self->code_index - num_start;
+        self->number_value = string_to_int(&self->identifier);
+        return TSL_TOKEN_NUM;
+    } else if(c == '=') {
+        ++self->code_index;
+        return TSL_TOKEN_EQUAL;
+    } else if(c == '\0') {
+        return TSL_TOKEN_END_OF_FILE;
+    } else {
+        fprintf(stderr, "Unexpected symbol '%c'\n", c);
+        return TSL_TOKEN_UNEXPECTED_SYMBOL;
+    }
+}