#include "../include/HtmlParser.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>

/*
    Tags that are treated as having no content and no end tag.
    The list is terminated by an entry with a NULL data pointer.
*/
static const StringView void_tags[] = {
    {.data = "area", .size = 4},
    {.data = "base", .size = 4},
    {.data = "br", .size = 2},
    {.data = "col", .size = 3},
    {.data = "command", .size = 7},
    {.data = "embed", .size = 5},
    {.data = "hr", .size = 2},
    {.data = "img", .size = 3},
    {.data = "input", .size = 5},
    {.data = "keygen", .size = 6},
    {.data = "link", .size = 4},
    {.data = "meta", .size = 4},
    {.data = "param", .size = 5},
    {.data = "source", .size = 6},
    {.data = "track", .size = 5},
    {.data = "wbr", .size = 3},
    {.data = NULL, .size = 0}
};

static const StringView script_tag = {.data = "script", .size = 6};

/* The text that follows '<' in a script end tag */
static const char script_end_name[] = "/script";

/* Returns 1 if both views have the same length and the same bytes, otherwise 0 */
static int string_view_equals(const StringView *self, const StringView *other) {
    return self->size == other->size && memcmp(self->data, other->data, self->size) == 0;
}

static int is_whitespace(int c) {
    switch(c) {
        case ' ':
        case '\n':
        case '\r':
        case '\t':
        case '\v':
            return 1;
        default:
            return 0;
    }
}

static int is_newline(int c) {
    return c == '\n' || c == '\r';
}

/* Skips leading characters accepted by |strip_filter_func|. The result is a sub-range of |str| */
static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
    size_t i = 0;
    while(i < size && strip_filter_func(str[i])) {
        ++i;
    }
    *output_str = str + i;
    *output_size = size - i;
}

/* Drops trailing characters accepted by |strip_filter_func| by shrinking the size */
static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) {
    /* Counting down on the unsigned size avoids the need for a signed index (ssize_t is not standard C) */
    while(size > 0 && strip_filter_func(str[size - 1])) {
        --size;
    }
    *output_size = size;
}

/* lstrip followed by rstrip */
static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
    lstrip(str, size, output_str, output_size, strip_filter_func);
    rstrip(*output_str, *output_size, output_size, strip_filter_func);
}

/* Returns 1 if |tag_name| is in |void_tags| or starts with '!', otherwise 0 */
static int is_void_tag(const StringView *tag_name) {
    const StringView *tag_iter = &void_tags[0];
    /* !DOCTYPE, !--, etc.... */
    if(tag_name->size > 0 && tag_name->data[0] == '!')
        return 1;

    while(tag_iter->data) {
        if(string_view_equals(tag_name, tag_iter))
            return 1;
        ++tag_iter;
    }
    return 0;
}

/* Puts all parse state back to the start of the source. The source and callback are left untouched */
static void html_parser_reset(HtmlParser *self) {
    self->offset = 0;
    self->tag_name.data = NULL;
    self->tag_name.size = 0;
    self->attribute_key.data = NULL;
    self->attribute_key.size = 0;
    self->attribute_value.data = NULL;
    self->attribute_value.size = 0;
    self->text.data = NULL;
    self->text.size = 0;
    self->is_tag_void = 0;
    self->inside_script_tag = 0;
    self->unclosed_tags_offset = 0;
}

/*
    Stores the source range and the callback in |self|.
    Only the pointer |html_source| is stored, the data is not copied.
    The parse state is also reset here so that |self| has no uninitialized fields
    before |html_parser_parse| is called.
*/
void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) {
    self->source = html_source;
    self->source_len = len;
    self->parse_callback = parse_callback;
    self->callback_userdata = userdata;
    html_parser_reset(self);
}

/* Nothing to release, the parser doesn't allocate anything */
void html_parser_deinit(HtmlParser *self) {
    (void)self;
}

/* Returns the character at the current offset and moves past it, or '\0' at the end of the source */
static char html_parser_next_char(HtmlParser *self) {
    if(self->offset < self->source_len) {
        char c = self->source[self->offset];
        ++self->offset;
        return c;
    }
    return '\0';
}

/* Returns the character at the current offset without moving, or '\0' at the end of the source */
static char html_parser_peek_char(HtmlParser *self) {
    if(self->offset < self->source_len)
        return self->source[self->offset];
    return '\0';
}

/* Moves one character forward. The offset never goes past the end of the source */
static void html_parser_advance_char(HtmlParser *self) {
    if(self->offset < self->source_len)
        ++self->offset;
}

static int is_alpha(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

static int is_digit(char c) {
    return c >= '0' && c <= '9';
}

/* '!' is included so that !DOCTYPE and !-- are read as tag names */
static int is_identifier_char(char c) {
    return is_alpha(c) || is_digit(c) || c == '-' || c == '_' || c == '!';
}

/* Moves forward until the next character is not an identifier character */
static void html_parser_skip_identifier_chars(HtmlParser *self) {
    while(is_identifier_char(html_parser_peek_char(self))) {
        html_parser_advance_char(self);
    }
}

/* Pushes a tag name on the unclosed tags stack. If the stack is full then the tag is dropped and an error is printed */
static void html_parser_try_append_unclosed_tag(HtmlParser *self, const char *data, size_t size) {
    if(self->unclosed_tags_offset == UNCLOSED_TAGS_SIZE) {
        fprintf(stderr, "Reached the maximum number of unclosed tags! the html source is too broken\n");
        return;
    }
    self->unclosed_tags[self->unclosed_tags_offset].data = data;
    self->unclosed_tags[self->unclosed_tags_offset].size = size;
    ++self->unclosed_tags_offset;
}

/* The stack must not be empty */
static void html_parser_pop_unclosed_tag(HtmlParser *self) {
    assert(self->unclosed_tags_offset > 0);
    --self->unclosed_tags_offset;
}

/* Same as |html_parser_pop_unclosed_tag| but does nothing if the stack is empty */
static void html_parser_try_pop_unclosed_tag(HtmlParser *self) {
    if(self->unclosed_tags_offset > 0)
        --self->unclosed_tags_offset;
}

/* Copies the top of the unclosed tags stack to |result| and returns 1, or returns 0 if the stack is empty */
static int html_parser_try_get_top_unclosed_tag(HtmlParser *self, StringView *result) {
    if(self->unclosed_tags_offset > 0) {
        *result = self->unclosed_tags[self->unclosed_tags_offset - 1];
        return 1;
    }
    return 0;
}

static void html_parser_skip_whitespace(HtmlParser *self) {
    while(is_whitespace(html_parser_peek_char(self))) {
        html_parser_advance_char(self);
    }
}

/* Returns 0 for the characters that end an unquoted attribute value, otherwise 1 */
static int is_attribute_value_char(char c) {
    switch(c) {
        case '"':
        case '\'':
        case '`':
        case '<':
        case '>':
        case '&':
            return 0;
        default:
            return 1;
    }
}

/* TODO: Unescape html characters in attribute value */
/*
    Reads an attribute value up to |quote_symbol| or the end of the source.
    The offset must be at the first character after the opening quote.
    The closing quote is consumed but is not part of the value.
    Leading and trailing newlines are removed from the value.
*/
static void html_parser_parse_attribute_value_quoted(HtmlParser *self, char quote_symbol) {
    self->attribute_value.data = self->source + self->offset;
    for(;;) {
        char c = html_parser_peek_char(self);
        if(c == quote_symbol) {
            self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
            html_parser_advance_char(self);
            break;
        } else if(c == '\0') {
            self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
            break;
        } else {
            html_parser_advance_char(self);
        }
    }
    strip(self->attribute_value.data, self->attribute_value.size, &self->attribute_value.data, &self->attribute_value.size, is_newline);
}

/*
    Reads an unquoted attribute value.
    The offset must be at the first character of the value, the value starts at the current offset.
*/
static void html_parser_parse_attribute_value(HtmlParser *self) {
    self->attribute_value.data = self->source + self->offset;
    for(;;) {
        char c = html_parser_peek_char(self);
        if(!is_attribute_value_char(c) || c == '\0')
            break;
        else
            html_parser_advance_char(self);
    }
    self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
}

/*
    Moves past the end of a javascript string. The offset must be after the opening quote.
    A quote that comes directly after an unescaped backslash doesn't end the string.
*/
static void html_parser_goto_end_of_js_string(HtmlParser *self, char quote_symbol) {
    int escape_quote = 0;
    for(;;) {
        char c = html_parser_next_char(self);
        if(!escape_quote && c == quote_symbol) {
            return;
        } else if(c == '\\') {
            escape_quote = !escape_quote;
        } else if(c == '\0') {
            return;
        } else {
            escape_quote = 0;
        }
    }
}

/*
    Reads the content of a script tag. The offset must be after the '>' of the script start tag.
    "</script" inside a ' or " javascript string is skipped and doesn't end the content.
    HTML_PARSE_JAVASCRIPT_CODE is reported with the whitespace-stripped content in |text|, if it's not empty.
    If the script end tag is found then it's consumed, and if "script" is at the top of the
    unclosed tags stack then HTML_PARSE_TAG_END is reported for it and it's popped.
    Without that the script tag would stay open and the tags after it would be reported as its children.
*/
static void html_parser_goto_script_end_tag(HtmlParser *self) {
    const size_t end_name_len = sizeof(script_end_name) - 1;
    int end_tag_found = 0;

    self->text.data = self->source + self->offset;
    self->text.size = 0;
    for(;;) {
        char c = html_parser_peek_char(self);
        if(c == '"' || c == '\'') {
            html_parser_advance_char(self);
            html_parser_goto_end_of_js_string(self, c);
        } else if(c == '<' && self->offset + end_name_len < self->source_len
            && strncmp(self->source + self->offset + 1, script_end_name, end_name_len) == 0)
        {
            self->text.size = (self->source + self->offset) - self->text.data;
            /* The bounds check above guarantees that this doesn't go past the end of the source */
            self->offset += 1 + end_name_len;
            /* Skip the rest of the end tag, including the '>' */
            for(;;) {
                c = html_parser_peek_char(self);
                if(c == '\0')
                    break;
                html_parser_advance_char(self);
                if(c == '>')
                    break;
            }
            end_tag_found = 1;
            break;
        } else if(c == '\0') {
            self->text.size = (self->source + self->offset) - self->text.data;
            break;
        } else {
            html_parser_advance_char(self);
        }
    }

    strip(self->text.data, self->text.size, &self->text.data, &self->text.size, is_whitespace);
    if(self->text.size > 0)
        self->parse_callback(self, HTML_PARSE_JAVASCRIPT_CODE, self->callback_userdata);

    if(end_tag_found) {
        StringView top_unclosed_tag;
        /* The top is not "script" if the unclosed tags stack was full when the script tag started */
        if(html_parser_try_get_top_unclosed_tag(self, &top_unclosed_tag) && string_view_equals(&top_unclosed_tag, &script_tag)) {
            self->tag_name = top_unclosed_tag;
            self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
            html_parser_pop_unclosed_tag(self);
        }
    }
}

/*
    Parses a start tag. The offset must be after the '<'.
    The first identifier is the tag name and is reported with HTML_PARSE_TAG_START.
    Every identifier after that is an attribute name and is reported with HTML_PARSE_ATTRIBUTE,
    with |attribute_value| set to the value or to NULL/0 if the attribute has no value.
    HTML_PARSE_TAG_END is reported at '>' for void tags and at "/>" for any tag.
    Non-void tags are pushed on the unclosed tags stack.
*/
static void html_parser_parse_tag_start(HtmlParser *self) {
    int tag_name_found = 0;
    for(;;) {
        char c = html_parser_next_char(self);
        if(c == '>') {
            if(self->is_tag_void)
                self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
            self->is_tag_void = 0;

            if(self->inside_script_tag) {
                self->inside_script_tag = 0;
                /* Script content needs special handling, "</script" can be inside a javascript string */
                html_parser_goto_script_end_tag(self);
            }
            return;
        } else if(c == '/') {
            if(html_parser_peek_char(self) == '>') {
                html_parser_advance_char(self);
                if(tag_name_found) {
                    self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
                    /* Void tags are never pushed on the unclosed tags stack */
                    if(!self->is_tag_void)
                        html_parser_try_pop_unclosed_tag(self);
                }
                self->is_tag_void = 0;
                self->inside_script_tag = 0;
                return;
            }
        } else if(is_identifier_char(c)) {
            StringView identifier;
            /* The first character of the identifier has already been consumed by next_char */
            identifier.data = self->source + self->offset - 1;
            html_parser_skip_identifier_chars(self);
            identifier.size = (self->source + self->offset) - identifier.data;

            if(tag_name_found) {
                /* attribute name */
                self->attribute_key = identifier;
                self->attribute_value.data = NULL;
                self->attribute_value.size = 0;

                html_parser_skip_whitespace(self);
                c = html_parser_peek_char(self);
                if(c == '=') {
                    html_parser_advance_char(self);
                    html_parser_skip_whitespace(self);
                    c = html_parser_peek_char(self);
                    if(c == '"' || c == '\'' || c == '`') {
                        html_parser_advance_char(self);
                        html_parser_parse_attribute_value_quoted(self, c);
                    } else if(c != '\0' && is_attribute_value_char(c)) {
                        /* |c| is the first character of the value so it must not be skipped here */
                        html_parser_parse_attribute_value(self);
                    }
                }
                self->parse_callback(self, HTML_PARSE_ATTRIBUTE, self->callback_userdata);
            } else {
                /* tag name */
                self->tag_name = identifier;
                tag_name_found = 1;
                self->is_tag_void = is_void_tag(&self->tag_name);
                if(!self->is_tag_void) {
                    html_parser_try_append_unclosed_tag(self, self->tag_name.data, self->tag_name.size);
                    self->inside_script_tag = string_view_equals(&self->tag_name, &script_tag);
                }
                self->parse_callback(self, HTML_PARSE_TAG_START, self->callback_userdata);
            }
        } else if(c == '\0') {
            return;
        }
    }
}

/*
    Parses an end tag. The offset must be after the "</".
    Unclosed tags are popped, and HTML_PARSE_TAG_END is reported for each of them,
    until a tag with the same name as the end tag has been popped or the stack is empty.
    Only the first identifier is used as the tag name, everything after it up to '>' is skipped.
*/
static void html_parser_parse_tag_end(HtmlParser *self) {
    int tag_name_found = 0;
    for(;;) {
        char c = html_parser_peek_char(self);
        if(c == '>') {
            html_parser_advance_char(self);
            return;
        } else if(!tag_name_found && is_identifier_char(c)) {
            StringView tag_end_name;
            StringView top_unclosed_tag;

            tag_end_name.data = self->source + self->offset;
            html_parser_skip_identifier_chars(self);
            tag_end_name.size = (self->source + self->offset) - tag_end_name.data;
            /* Without this an identifier later in the same end tag would be used as another end tag name */
            tag_name_found = 1;

            /* void tags close themselves, this is probably invalid html but we choose to ignore it silently */
            if(is_void_tag(&tag_end_name)) {
                /* The precision argument of %.*s has to be an int */
                fprintf(stderr, "Warning: got end tag for void tag '%.*s'\n", (int)tag_end_name.size, tag_end_name.data);
                continue;
            }

            while(html_parser_try_get_top_unclosed_tag(self, &top_unclosed_tag)) {
                self->tag_name = top_unclosed_tag;
                self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
                html_parser_pop_unclosed_tag(self);
                if(string_view_equals(&top_unclosed_tag, &tag_end_name))
                    break;
            }
        } else if(c == '\0') {
            return;
        } else {
            html_parser_advance_char(self);
        }
    }
}

/*
    Parses the whole source from the start and reports what is found to the parse callback.
    Text between tags is reported with HTML_PARSE_TEXT after whitespace has been stripped
    from both ends, text that is empty after that is not reported.
    Tags that are still unclosed at the end of the source are reported with HTML_PARSE_TAG_END,
    starting with the one that was opened last.
*/
void html_parser_parse(HtmlParser *self) {
    StringView top_unclosed_tag;

    html_parser_reset(self);
    for(;;) {
        char c = html_parser_next_char(self);
        if(c == '<') {
            if(html_parser_peek_char(self) == '/') {
                html_parser_advance_char(self);
                html_parser_parse_tag_end(self);
            } else {
                html_parser_parse_tag_start(self);
            }
        } else if(c == '\0') {
            break;
        } else {
            /* The first character of the text has already been consumed by next_char */
            self->text.data = (self->source + self->offset) - 1;
            for(;;) {
                c = html_parser_peek_char(self);
                if(c == '<' || c == '\0')
                    break;
                else
                    html_parser_advance_char(self);
            }
            self->text.size = (self->source + self->offset) - self->text.data;
            strip(self->text.data, self->text.size, &self->text.data, &self->text.size, is_whitespace);
            if(self->text.size > 0)
                self->parse_callback(self, HTML_PARSE_TEXT, self->callback_userdata);
        }
    }

    while(html_parser_try_get_top_unclosed_tag(self, &top_unclosed_tag)) {
        self->tag_name = top_unclosed_tag;
        self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
        html_parser_pop_unclosed_tag(self);
    }
}