diff options
-rw-r--r-- | include/HtmlParser.h | 6 | ||||
-rw-r--r-- | src/HtmlParser.c | 51 |
2 files changed, 37 insertions, 20 deletions
diff --git a/include/HtmlParser.h b/include/HtmlParser.h index 45dc1e1..7536777 100644 --- a/include/HtmlParser.h +++ b/include/HtmlParser.h @@ -9,7 +9,7 @@ #include <stddef.h> typedef struct { - const char *data; + char *data; size_t size; } HtmlStringView; @@ -28,7 +28,7 @@ typedef void (*HtmlParseCallback)(HtmlParser *html_parser, HtmlParseType parse_t #define UNCLOSED_TAGS_SIZE 2048 struct HtmlParser { - const char *source; + char *source; size_t source_len; size_t offset; HtmlParseCallback parse_callback; @@ -48,7 +48,7 @@ struct HtmlParser { }; /* Note: HTML_PARSE_TAG_START is guaranteed to be called for a tag before HTML_PARSE_TAG_END */ -void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata); +void html_parser_init(HtmlParser *self, char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata); void html_parser_deinit(HtmlParser *self); void html_parser_parse(HtmlParser *self); diff --git a/src/HtmlParser.c b/src/HtmlParser.c index 965368e..8b27d6d 100644 --- a/src/HtmlParser.c +++ b/src/HtmlParser.c @@ -46,7 +46,7 @@ static int is_newline(int c) { return c == '\n' || c == '\r'; } -static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) { +static void lstrip(char *str, size_t size, char **output_str, size_t *output_size, int(*strip_filter_func)(int)) { size_t i = 0; while(i < size && strip_filter_func(str[i])) { ++i; @@ -55,7 +55,7 @@ static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size = size - i; } -static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) { +static void rstrip(char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) { ssize_t i = size - 1; while(i >= 0 && strip_filter_func(str[i])) { --i; @@ -63,11 +63,20 @@ static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip *output_size = i + 1; } -static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) { +static void strip(char *str, size_t size, char **output_str, size_t *output_size, int(*strip_filter_func)(int)) { lstrip(str, size, output_str, output_size, strip_filter_func); rstrip(*output_str, *output_size, output_size, strip_filter_func); } +static void html_string_view_to_lowercase(HtmlStringView *string_view) { + size_t i = 0; + for(; i < string_view->size; ++i) { + char c = string_view->data[i]; + if(c >= 'A' && c <= 'Z') + string_view->data[i] += 32; + } +} + static int is_void_tag(HtmlStringView *tag_name) { HtmlStringView *tag_iter = &void_tags[0]; /* !DOCTYPE, !--, etc.... */ @@ -98,7 +107,7 @@ static void html_parser_reset(HtmlParser *self) { self->unclosed_tags_offset = 0; } -void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) { +void html_parser_init(HtmlParser *self, char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) { self->source = html_source; self->source_len = len; self->parse_callback = parse_callback; @@ -131,19 +140,7 @@ static void html_parser_advance_char(HtmlParser *self) { ++self->offset; } -static int is_alpha(char c) { - return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); -} - -static int is_digit(char c) { - return c >= '0' && c <= '9'; -} - -static int is_identifier_char(char c) { - return is_alpha(c) || is_digit(c) || c == '-' || c == '_' || c == '!' || c == ':'; -} - -static void html_parser_try_append_unclosed_tag(HtmlParser *self, const char *data, size_t size) { +static void html_parser_try_append_unclosed_tag(HtmlParser *self, char *data, size_t size) { if(self->unclosed_tags_offset == UNCLOSED_TAGS_SIZE) { fprintf(stderr, "Reached the maximum number of unclosed tags! the html source is too broken\n"); return; @@ -196,6 +193,24 @@ static int is_attribute_value_char(char c) { } } +static int is_identifier_char(char c) { + switch(c) { + case ' ': + case '\t': + case '\n': + case '\v': + case '"': + case '\'': + case '<': + case '>': + case '/': + case '=': + return 0; + default: + return 1; + } +} + /* TODO: Unescape html characters in attribute value */ static void html_parser_parse_attribute_value_quoted(HtmlParser *self, char quote_symbol) { self->attribute_value.data = self->source + self->offset; @@ -330,6 +345,7 @@ static void html_parser_parse_tag_start(HtmlParser *self) { if(tag_name_found) { /* attribute name */ self->attribute_key = identifier; + html_string_view_to_lowercase(&self->attribute_key); self->attribute_value.data = NULL; self->attribute_value.size = 0; @@ -351,6 +367,7 @@ static void html_parser_parse_tag_start(HtmlParser *self) { } else { /* tag name */ self->tag_name = identifier; + html_string_view_to_lowercase(&self->tag_name); tag_name_found = 1; if(self->tag_name.size == 3 && memcmp(self->tag_name.data, "!--", 3) == 0) { html_parser_goto_comment_end(self); |