#include "../include/HtmlParser.h"
#include <assert.h>

/*
    Tags that never have a matching end tag ("void" elements).
    The list is terminated by an entry whose data is NULL.
*/
static HtmlStringView void_tags[] = {
    {"area", 4},
    {"base", 4},
    {"br", 2},
    {"col", 3},
    {"command", 7},
    {"embed", 5},
    {"hr", 2},
    {"img", 3},
    {"input", 5},
    {"keygen", 6},
    {"link", 4},
    {"meta", 4},
    {"param", 5},
    {"source", 6},
    {"track", 5},
    {"wbr", 3},
    {"xml", 3},
    {NULL, 0}
};

static HtmlStringView script_tag = {"script", 6};

/* ASCII-only uppercase conversion; every other byte is returned unchanged */
static char to_upper(char c) {
    if(c >= 'a' && c <= 'z')
        return c - 32;
    else
        return c;
}

/* Returns 1 if the first |size| bytes of |m1| and |m2| are equal when ASCII case is ignored, otherwise 0 */
static int memeql_case_insensitive(const char *m1, const char *m2, size_t size) {
    size_t i = 0;
    for(; i < size; ++i) {
        if(to_upper(m1[i]) != to_upper(m2[i]))
            return 0;
    }
    return 1;
}

/* Returns 1 if both views have the same size and the same content when ASCII case is ignored, otherwise 0 */
int html_string_view_equals_case_insensitive(HtmlStringView *self, HtmlStringView *other) {
    if(self->size != other->size)
        return 0;
    return memeql_case_insensitive(self->data, other->data, self->size);
}

/* Returns 1 if the first |size| bytes of |m1| and |m2| are identical, otherwise 0 */
static int memeql(const void *m1, const void *m2, size_t size) {
    const char *p1 = (const char*)m1;
    const char *p2 = (const char*)m2;
    size_t i = 0;
    for(; i < size; ++i) {
        if(p1[i] != p2[i])
            return 0;
    }
    return 1;
}

static int is_whitespace(int c) {
    switch(c) {
        case ' ':
        case '\n':
        case '\r':
        case '\t':
        case '\v':
            return 1;
        default:
            return 0;
    }
}

static int is_newline(int c) {
    return c == '\n' || c == '\r';
}

/*
    Skips leading bytes for which |strip_filter_func| returns non-zero.
    |output_str| and |output_size| receive the remaining range.
*/
static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
    size_t i = 0;
    while(i < size && strip_filter_func(str[i])) {
        ++i;
    }
    *output_str = str + i;
    *output_size = size - i;
}

/*
    Drops trailing bytes for which |strip_filter_func| returns non-zero.
    |output_size| receives the new length. The index is kept as size_t so
    lengths that do not fit in a long are not truncated.
*/
static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) {
    size_t end = size;
    while(end > 0 && strip_filter_func(str[end - 1])) {
        --end;
    }
    *output_size = end;
}

/* lstrip followed by rstrip. No data is copied, the output refers to a sub range of |str| */
static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
    lstrip(str, size, output_str, output_size, strip_filter_func);
    rstrip(*output_str, *output_size, output_size, strip_filter_func);
}

/* Returns 1 if |tag_name| is in |void_tags| (ignoring ASCII case) or starts with '!', otherwise 0 */
static int is_void_tag(HtmlStringView *tag_name) {
    HtmlStringView *tag_iter = &void_tags[0];
    /* !DOCTYPE, !--, etc.... */
    if(tag_name->size > 0 && tag_name->data[0] == '!')
        return 1;

    while(tag_iter->data) {
        if(html_string_view_equals_case_insensitive(tag_name, tag_iter))
            return 1;
        ++tag_iter;
    }

    return 0;
}

/* Resets every field of |self| and points it at |html_source|, skipping a leading utf8 BOM if there is one */
static void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) {
    /* Utf8 BOM */
    if(len >= 3 && memeql(html_source, "\xef\xbb\xbf", 3)) {
        html_source += 3;
        len -= 3;
    }

    self->source = html_source;
    self->source_len = len;
    self->parse_callback = parse_callback;
    self->callback_userdata = userdata;

    self->offset = 0;
    self->tag_name.data = NULL;
    self->tag_name.size = 0;
    self->attribute_key.data = NULL;
    self->attribute_key.size = 0;
    self->attribute_value.data = NULL;
    self->attribute_value.size = 0;
    self->text.data = NULL;
    self->text.size = 0;
    self->text_stripped.data = NULL;
    self->text_stripped.size = 0;
    self->tag_before_void_tag.data = NULL;
    self->tag_before_void_tag.size = 0;

    self->is_tag_void = 0;
    self->inside_script_tag = 0;
    self->unclosed_tags_offset = 0;
}

/*
    Returns the byte at the current offset and moves past it, or '\0' at the end of the source.
    Note that a '\0' byte inside the source can't be told apart from the end of the source.
*/
static char html_parser_next_char(HtmlParser *self) {
    if(self->offset < self->source_len) {
        char c = self->source[self->offset];
        ++self->offset;
        return c;
    }
    return '\0';
}

/* Returns the byte at the current offset without consuming it, or '\0' at the end of the source */
static char html_parser_peek_char(HtmlParser *self) {
    if(self->offset < self->source_len)
        return self->source[self->offset];
    return '\0';
}

/* Moves one byte forward. Does nothing at the end of the source */
static void html_parser_advance_char(HtmlParser *self) {
    if(self->offset < self->source_len)
        ++self->offset;
}

/* Pushes a tag name on the stack of unclosed tags. Returns 0 on success and 1 if the stack is full */
static int html_parser_try_append_unclosed_tag(HtmlParser *self, const char *data, size_t size) {
    if(self->unclosed_tags_offset == UNCLOSED_TAGS_SIZE) {
        /* Reached the maximum number of unclosed tags, the html source is too broken */
        return 1;
    }
    self->unclosed_tags[self->unclosed_tags_offset].data = data;
    self->unclosed_tags[self->unclosed_tags_offset].size = size;
    ++self->unclosed_tags_offset;
    return 0;
}

/* Pops the top unclosed tag. The stack must not be empty */
static void html_parser_pop_unclosed_tag(HtmlParser *self) {
    assert(self->unclosed_tags_offset > 0);
    --self->unclosed_tags_offset;
}

/* Pops the top unclosed tag if there is one */
static void html_parser_try_pop_unclosed_tag(HtmlParser *self) {
    if(self->unclosed_tags_offset > 0)
        --self->unclosed_tags_offset;
}

/* Copies the top unclosed tag to |result| and returns 1, or returns 0 if the stack is empty */
static int html_parser_try_get_top_unclosed_tag(HtmlParser *self, HtmlStringView *result) {
    if(self->unclosed_tags_offset > 0) {
        *result = self->unclosed_tags[self->unclosed_tags_offset - 1];
        return 1;
    }
    return 0;
}

static void html_parser_skip_whitespace(HtmlParser *self) {
    /* peek returns '\0' at the end of the source, which is not whitespace, so this always terminates */
    while(is_whitespace(html_parser_peek_char(self))) {
        html_parser_advance_char(self);
    }
}

/* Returns 1 if |c| can be part of an unquoted attribute value (whitespace is checked separately by the caller) */
static int is_attribute_value_char(char c) {
    switch(c) {
        case '"':
        case '\'':
        case '`':
        case '<':
        case '>':
        case '&':
        case '\0':
            return 0;
        default:
            return 1;
    }
}

/* Returns 1 if |c| can be part of a tag name or an attribute name */
static int is_identifier_char(char c) {
    switch(c) {
        case ' ':
        case '\t':
        case '\n':
        case '\r': /* Treated as whitespace by is_whitespace, so it must end an identifier as well (CRLF sources) */
        case '\v':
        case '"':
        case '\'':
        case '<':
        case '>':
        case '/':
        case '=':
        case '\0':
            return 0;
        default:
            return 1;
    }
}

/* TODO: Unescape html characters in attribute value */
/*
    Reads an attribute value up to |quote_symbol| (which is consumed) or the end of the source.
    The opening quote must already be consumed. Leading and trailing newlines are removed from the value.
*/
static void html_parser_parse_attribute_value_quoted(HtmlParser *self, char quote_symbol) {
    int closing_quote_found = 0;
    self->attribute_value.data = self->source + self->offset;

    for(;;) {
        char c = html_parser_peek_char(self);
        if(c == quote_symbol) {
            closing_quote_found = 1;
            break;
        } else if(c == '\0') {
            break;
        } else {
            html_parser_advance_char(self);
        }
    }

    self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
    if(closing_quote_found)
        html_parser_advance_char(self);

    strip(self->attribute_value.data, self->attribute_value.size, &self->attribute_value.data, &self->attribute_value.size, is_newline);
}

/* Reads an unquoted attribute value, which ends at whitespace, a non attribute value character or the end of the source */
static void html_parser_parse_attribute_value(HtmlParser *self) {
    self->attribute_value.data = self->source + self->offset;
    for(;;) {
        char c = html_parser_peek_char(self);
        if(!is_attribute_value_char(c) || is_whitespace(c) || c == '\0')
            break;
        else
            html_parser_advance_char(self);
    }
    self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
}

/*
    Called right after the '>' of a script start tag. Everything up to "</script" (or the end of
    the source) is reported as HTML_PARSE_JAVASCRIPT_CODE if it's not empty, followed by
    HTML_PARSE_TAG_END for the script tag.
    The end tag is matched case insensitively, the same way the start tag is.
    Returns 0 on success, or the first non-zero value returned by the callback.
*/
static int html_parser_goto_script_end_tag(HtmlParser *self) {
    int res = 0;
    int end_tag_found = 0;
    self->text.data = self->source + self->offset;
    self->text.size = 0;

    for(;;) {
        char c = html_parser_peek_char(self);
        /* offset + 7 < source_len guarantees that the 7 bytes after '<' are inside the source */
        if(c == '<' && self->offset + 7 < self->source_len && memeql_case_insensitive(self->source + self->offset + 1, "/script", 7)) {
            end_tag_found = 1;
            break;
        } else if(c == '\0') {
            break;
        } else {
            html_parser_advance_char(self);
        }
    }

    self->text.size = (self->source + self->offset) - self->text.data;
    strip(self->text.data, self->text.size, &self->text_stripped.data, &self->text_stripped.size, is_whitespace);

    if(end_tag_found) {
        /* Skip the rest of the end tag, including '>' */
        self->offset += 7;
        for(;;) {
            char c = html_parser_peek_char(self);
            if(c == '>') {
                html_parser_advance_char(self);
                break;
            } else if(c == '\0') {
                break;
            } else {
                html_parser_advance_char(self);
            }
        }
    }

    if(self->text.size > 0) {
        res = self->parse_callback(self, HTML_PARSE_JAVASCRIPT_CODE, self->callback_userdata);
        if(res != 0)
            return res;
    }

    res = self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
    if(res != 0)
        return res;

    html_parser_try_pop_unclosed_tag(self);
    return res;
}

/*
    Moves past the next "-->". If the comment is never terminated then the rest of the source
    is consumed (instead of looping forever at the end of the source).
*/
static void html_parser_goto_comment_end(HtmlParser *self) {
    while(self->offset < self->source_len) {
        if(self->source_len - self->offset >= 3 && memeql(self->source + self->offset, "-->", 3)) {
            self->offset += 3;
            return;
        }
        ++self->offset;
    }
}

/*
    Parses a start tag. The '<' must already be consumed.
    Emits HTML_PARSE_TAG_START for the tag name, HTML_PARSE_ATTRIBUTE for every attribute and
    HTML_PARSE_TAG_END for void tags and self closing tags. Comments are skipped without any callback.
    Returns 0 on success (including reaching the end of the source), 1 if there are too many
    unclosed tags, or the first non-zero value returned by the callback.
*/
static int html_parser_parse_tag_start(HtmlParser *self) {
    int tag_name_found = 0;
    for(;;) {
        char c = html_parser_next_char(self);
        if(c == '>') {
            if(tag_name_found && self->is_tag_void) {
                int res = self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
                if(res != 0)
                    return res;
                self->tag_name = self->tag_before_void_tag;
            }
            self->is_tag_void = 0;

            if(self->inside_script_tag) {
                self->inside_script_tag = 0;
                return html_parser_goto_script_end_tag(self);
            }

            return 0;
        } else if(c == '/') {
            html_parser_skip_whitespace(self);
            if(html_parser_peek_char(self) == '>') {
                /* Self closing tag */
                html_parser_advance_char(self);
                if(tag_name_found) {
                    int res = self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
                    if(res != 0)
                        return res;

                    if(self->is_tag_void)
                        self->tag_name = self->tag_before_void_tag;
                    else
                        html_parser_try_pop_unclosed_tag(self);
                }
                self->is_tag_void = 0;
                self->inside_script_tag = 0;
                return 0;
            }
        } else if(is_identifier_char(c)) {
            HtmlStringView identifier;
            /* The first character of the identifier has already been consumed by next_char */
            const size_t identifier_start = self->offset - 1;
            identifier.data = self->source + identifier_start;
            while(is_identifier_char(html_parser_peek_char(self))) {
                html_parser_advance_char(self);
            }
            identifier.size = (self->source + self->offset) - identifier.data;

            if(tag_name_found) {
                int res = 0;
                /* attribute name */
                self->attribute_key = identifier;
                self->attribute_value.data = NULL;
                self->attribute_value.size = 0;

                html_parser_skip_whitespace(self);
                c = html_parser_peek_char(self);
                if(c == '=') {
                    html_parser_advance_char(self);
                    html_parser_skip_whitespace(self);
                    c = html_parser_peek_char(self);
                    if(c == '"' || c == '\'' || c == '`') {
                        html_parser_advance_char(self);
                        html_parser_parse_attribute_value_quoted(self, c);
                    } else if(is_attribute_value_char(c)) {
                        html_parser_parse_attribute_value(self);
                    }
                }

                res = self->parse_callback(self, HTML_PARSE_ATTRIBUTE, self->callback_userdata);
                if(res != 0)
                    return res;
            } else {
                int res = 0;
                /* tag name */
                HtmlStringView prev_tag_name = self->tag_name;
                tag_name_found = 1;

                /*
                    Comment. The comment text may follow "<!--" directly without any whitespace,
                    in which case it has been read as part of the identifier. Match on the prefix
                    and continue right after "<!--" so the end of the comment is not missed.
                */
                if(identifier.size >= 3 && memeql(identifier.data, "!--", 3)) {
                    self->tag_name.data = identifier.data;
                    self->tag_name.size = 3;
                    self->offset = identifier_start + 3;
                    html_parser_goto_comment_end(self);
                    return 0;
                }

                self->tag_name = identifier;
                self->is_tag_void = is_void_tag(&self->tag_name);
                if(self->is_tag_void) {
                    /* Void tags are not added to the unclosed tags, the previous tag name is restored when the void tag ends */
                    self->tag_before_void_tag = prev_tag_name;
                } else {
                    res = html_parser_try_append_unclosed_tag(self, self->tag_name.data, self->tag_name.size);
                    if(res != 0)
                        return res;
                    self->inside_script_tag = html_string_view_equals_case_insensitive(&self->tag_name, &script_tag);
                }

                res = self->parse_callback(self, HTML_PARSE_TAG_START, self->callback_userdata);
                if(res != 0)
                    return res;
            }
        } else if(c == '\0') {
            return 0;
        }
    }
}

/*
    Parses an end tag. The "</" must already be consumed.
    If a matching start tag is found among the unclosed tags then HTML_PARSE_TAG_END is emitted
    for every unclosed tag from the top of the stack down to and including the matching tag.
    End tags without a matching start tag and end tags for void tags are ignored.
    Returns 0 on success, or the first non-zero value returned by the callback.
*/
static int html_parser_parse_tag_end(HtmlParser *self) {
    int tag_name_found = 0;
    for(;;) {
        char c = html_parser_peek_char(self);
        if(c == '>') {
            html_parser_advance_char(self);
            return 0;
        } else if(!tag_name_found && is_identifier_char(c)) {
            HtmlStringView tag_end_name;
            size_t start_tag_index;
            int start_tag_found = 0;

            tag_end_name.data = self->source + self->offset;
            html_parser_advance_char(self);
            while(is_identifier_char(html_parser_peek_char(self))) {
                html_parser_advance_char(self);
            }
            tag_end_name.size = (self->source + self->offset) - tag_end_name.data;
            tag_name_found = 1;

            /* void tags close themselves, this is probably invalid html but we choose to ignore it silently */
            if(is_void_tag(&tag_end_name))
                continue;

            /* Search from the most recently opened tag and down */
            start_tag_index = self->unclosed_tags_offset;
            while(start_tag_index > 0) {
                --start_tag_index;
                if(html_string_view_equals_case_insensitive(&self->unclosed_tags[start_tag_index], &tag_end_name)) {
                    start_tag_found = 1;
                    break;
                }
            }

            /* If the start tag is not found then the end tag is ignored silently */
            if(start_tag_found) {
                for(; self->unclosed_tags_offset > start_tag_index; --self->unclosed_tags_offset) {
                    int res = 0;
                    self->tag_name = self->unclosed_tags[self->unclosed_tags_offset - 1];
                    res = self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
                    if(res != 0)
                        return res;
                }
            }
        } else if(c == '\0') {
            return 0;
        } else {
            html_parser_advance_char(self);
        }
    }
}

/*
    Parses |len| bytes of |html_source| and calls |parse_callback| with |userdata| for every
    tag start, attribute, text, javascript code and tag end that is found.
    The string views in the parser refer directly to |html_source|, nothing is copied.
    Tags that are still unclosed at the end of the source are closed in reverse order.
    Parsing stops at the first '\0' byte in the source.
    Returns 0 on success, 1 if there are too many unclosed tags, or the first non-zero value
    returned by |parse_callback| (in which case parsing is stopped immediately).
*/
int html_parser_parse(const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) {
    HtmlStringView top_unclosed_tag;
    HtmlParser self;
    html_parser_init(&self, html_source, len, parse_callback, userdata);

    for(;;) {
        char c = html_parser_next_char(&self);
        if(c == '<') {
            int res = 0;
            html_parser_skip_whitespace(&self);
            if(html_parser_peek_char(&self) == '/') {
                html_parser_advance_char(&self);
                res = html_parser_parse_tag_end(&self);
            } else {
                res = html_parser_parse_tag_start(&self);
            }

            if(res != 0)
                return res;
        } else if(c == '\0') {
            break;
        } else {
            int res = 0;
            /* Text. The first character has already been consumed by next_char */
            self.text.data = (self.source + self.offset) - 1;
            for(;;) {
                c = html_parser_peek_char(&self);
                if(c == '<' || c == '\0')
                    break;
                else
                    html_parser_advance_char(&self);
            }
            self.text.size = (self.source + self.offset) - self.text.data;
            strip(self.text.data, self.text.size, &self.text_stripped.data, &self.text_stripped.size, is_whitespace);

            if(self.text.size > 0) {
                res = self.parse_callback(&self, HTML_PARSE_TEXT, self.callback_userdata);
                if(res != 0)
                    return res;
            }
        }
    }

    /* Close the tags that were never closed in the source */
    while(html_parser_try_get_top_unclosed_tag(&self, &top_unclosed_tag)) {
        int res = 0;
        self.tag_name = top_unclosed_tag;
        res = self.parse_callback(&self, HTML_PARSE_TAG_END, self.callback_userdata);
        if(res != 0)
            return res;
        html_parser_pop_unclosed_tag(&self);
    }

    return 0;
}