#include "../include/HtmlParser.h"
#include <string.h>
#include <stdio.h>
#include <assert.h>

/* Tag names that are treated as void (self-closing). The list ends with a NULL entry. */
static HtmlStringView void_tags[] = {
    {"area", 4},
    {"base", 4},
    {"br", 2},
    {"col", 3},
    {"command", 7},
    {"embed", 5},
    {"hr", 2},
    {"img", 3},
    {"input", 5},
    {"keygen", 6},
    {"link", 4},
    {"meta", 4},
    {"param", 5},
    {"source", 6},
    {"track", 5},
    {"wbr", 3},
    {NULL, 0}
};

static HtmlStringView script_tag = {"script", 6};

/* Returns non-zero if both views have the same length and the same bytes */
static int string_view_equals(HtmlStringView *self, HtmlStringView *other) {
    return self->size == other->size && memcmp(self->data, other->data, self->size) == 0;
}

static int is_whitespace(int c) {
    switch(c) {
        case ' ':
        case '\n':
        case '\r':
        case '\t':
        case '\v':
            return 1;
        default:
            return 0;
    }
}

static int is_newline(int c) {
    return c == '\n' || c == '\r';
}

/* Skips leading characters accepted by |strip_filter_func|. The result points into |str| */
static void lstrip(char *str, size_t size, char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
    size_t i = 0;
    while(i < size && strip_filter_func(str[i])) {
        ++i;
    }
    *output_str = str + i;
    *output_size = size - i;
}

/* Computes the size of |str| without trailing characters accepted by |strip_filter_func| */
static void rstrip(char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) {
    while(size > 0 && strip_filter_func(str[size - 1])) {
        --size;
    }
    *output_size = size;
}

/* Strips both ends of |str|. Nothing is copied, the result is a sub-range of |str| */
static void strip(char *str, size_t size, char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
    lstrip(str, size, output_str, output_size, strip_filter_func);
    rstrip(*output_str, *output_size, output_size, strip_filter_func);
}

/* Converts ASCII uppercase letters to lowercase, in place */
static void html_string_view_to_lowercase(HtmlStringView *string_view) {
    size_t i = 0;
    for(; i < string_view->size; ++i) {
        char c = string_view->data[i];
        if(c >= 'A' && c <= 'Z')
            string_view->data[i] += 32;
    }
}

/* Returns 1 if |tag_name| is in |void_tags| or starts with '!', otherwise 0 */
static int is_void_tag(HtmlStringView *tag_name) {
    HtmlStringView *tag_iter = &void_tags[0];
    /* !DOCTYPE, !--, etc.... */
    if(tag_name->size > 0 && tag_name->data[0] == '!')
        return 1;

    while(tag_iter->data) {
        if(string_view_equals(tag_name, tag_iter))
            return 1;
        ++tag_iter;
    }
    return 0;
}

/* Puts all parsing state back to its initial value. The source and the callback are kept */
static void html_parser_reset(HtmlParser *self) {
    self->offset = 0;
    self->tag_name.data = NULL;
    self->tag_name.size = 0;
    self->attribute_key.data = NULL;
    self->attribute_key.size = 0;
    self->attribute_value.data = NULL;
    self->attribute_value.size = 0;
    self->text.data = NULL;
    self->text.size = 0;
    self->text_stripped.data = NULL;
    self->text_stripped.size = 0;
    self->tag_before_void_tag.data = NULL;
    self->tag_before_void_tag.size = 0;
    self->is_tag_void = 0;
    self->inside_script_tag = 0;
    self->unclosed_tags_offset = 0;
}

/*
    Stores the source buffer and the callback in |self|.
    |html_source| is not copied and tag/attribute names are converted to lowercase inside it while parsing,
    so the buffer has to be writable.
*/
void html_parser_init(HtmlParser *self, char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) {
    self->source = html_source;
    self->source_len = len;
    self->parse_callback = parse_callback;
    self->callback_userdata = userdata;
}

/* Nothing to release at the moment */
void html_parser_deinit(HtmlParser *self) {
    (void)self;
}

/* Returns the current character and moves past it, or returns '\0' (without moving) at the end of the source */
static char html_parser_next_char(HtmlParser *self) {
    if(self->offset < self->source_len) {
        char c = self->source[self->offset];
        ++self->offset;
        return c;
    }
    return '\0';
}

/* Returns the current character without moving, or '\0' at the end of the source */
static char html_parser_peek_char(HtmlParser *self) {
    if(self->offset < self->source_len) {
        char c = self->source[self->offset];
        return c;
    }
    return '\0';
}

/* Moves one character forward. Does nothing at the end of the source */
static void html_parser_advance_char(HtmlParser *self) {
    if(self->offset < self->source_len)
        ++self->offset;
}

/* Pushes a tag name on the unclosed tags stack. The tag is dropped (with a message on stderr) if the stack is full */
static void html_parser_try_append_unclosed_tag(HtmlParser *self, char *data, size_t size) {
    if(self->unclosed_tags_offset == UNCLOSED_TAGS_SIZE) {
        fprintf(stderr, "Reached the maximum number of unclosed tags! the html source is too broken\n");
        return;
    }
    self->unclosed_tags[self->unclosed_tags_offset].data = data;
    self->unclosed_tags[self->unclosed_tags_offset].size = size;
    ++self->unclosed_tags_offset;
}

static void html_parser_pop_unclosed_tag(HtmlParser *self) {
    assert(self->unclosed_tags_offset > 0);
    --self->unclosed_tags_offset;
}

static void html_parser_try_pop_unclosed_tag(HtmlParser *self) {
    if(self->unclosed_tags_offset > 0)
        --self->unclosed_tags_offset;
}

/* Copies the top of the unclosed tags stack to |result| and returns 1, or returns 0 if the stack is empty */
static int html_parser_try_get_top_unclosed_tag(HtmlParser *self, HtmlStringView *result) {
    if(self->unclosed_tags_offset > 0) {
        *result = self->unclosed_tags[self->unclosed_tags_offset - 1];
        return 1;
    }
    return 0;
}

static void html_parser_skip_whitespace(HtmlParser *self) {
    for(;;) {
        char c = html_parser_peek_char(self);
        if(is_whitespace(c)) {
            html_parser_advance_char(self);
        } else {
            break;
        }
    }
}

static int is_attribute_value_char(char c) {
    switch(c) {
        case '"':
        case '\'':
        case '`':
        case '<':
        case '>':
        case '&':
            return 0;
        default:
            return 1;
    }
}

/*
    Returns 1 if |c| can be part of a tag name or an attribute name.
    '\0' has to be rejected: peek returns it at the end of the source where advancing is a no-op,
    so a scan loop that accepted it would never finish.
    '\r' is rejected so a name that is followed by \r\n does not end with '\r'.
*/
static int is_identifier_char(char c) {
    switch(c) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case '\v':
        case '"':
        case '\'':
        case '<':
        case '>':
        case '/':
        case '=':
        case '\0':
            return 0;
        default:
            return 1;
    }
}

/* TODO: Unescape html characters in attribute value */
/*
    Reads an attribute value up to (not including) |quote_symbol| or the end of the source.
    The offset has to be right after the opening quote. The closing quote is consumed.
    Newlines at the start and at the end of the value are removed.
*/
static void html_parser_parse_attribute_value_quoted(HtmlParser *self, char quote_symbol) {
    self->attribute_value.data = self->source + self->offset;
    for(;;) {
        char c = html_parser_peek_char(self);
        if(c == quote_symbol) {
            self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
            html_parser_advance_char(self);
            break;
        } else if(c == '\0') {
            self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
            break;
        } else {
            html_parser_advance_char(self);
        }
    }
    strip(self->attribute_value.data, self->attribute_value.size, &self->attribute_value.data, &self->attribute_value.size, is_newline);
}

/*
    Reads an unquoted attribute value. The offset has to be at the first character of the value.
    The value ends at whitespace, at a character that is not an attribute value character or at the end of the source.
*/
static void html_parser_parse_attribute_value(HtmlParser *self) {
    self->attribute_value.data = self->source + self->offset;
    for(;;) {
        char c = html_parser_peek_char(self);
        if(!is_attribute_value_char(c) || is_whitespace(c) || c == '\0')
            break;
        else
            html_parser_advance_char(self);
    }
    self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
}

/* Moves past the closing |quote_symbol| of a javascript string. A quote after an odd number of backslashes does not end the string */
static void html_parser_goto_end_of_js_string(HtmlParser *self, char quote_symbol) {
    int escape_quote = 0;
    for(;;) {
        char c = html_parser_next_char(self);
        if(!escape_quote && c == quote_symbol) {
            return;
        } else if(c == '\\') {
            escape_quote = !escape_quote;
        } else if(c == '\0') {
            return;
        } else {
            escape_quote = 0;
        }
    }
}

/*
    Collects everything up to "</script" (or the end of the source) in |text|/|text_stripped|, moves past the '>' of
    the script end tag and calls the callback with HTML_PARSE_JAVASCRIPT_CODE.
    "</script" inside a single or double quoted javascript string is ignored.
    NOTE(review): the script tag is not removed from the unclosed tags here, it stays there until a later end tag
    or the end of the source closes it. Confirm that this is intended.
*/
static void html_parser_goto_script_end_tag(HtmlParser *self) {
    self->text.data = self->source + self->offset;
    self->text.size = 0;
    for(;;) {
        char c = html_parser_peek_char(self);
        if(c == '"' || c == '\'') {
            html_parser_advance_char(self);
            html_parser_goto_end_of_js_string(self, c);
        } else if(c == '<' && self->offset + 7 < self->source_len && memcmp(self->source + self->offset + 1, "/script", 7) == 0) {
            self->text.size = (self->source + self->offset) - self->text.data;
            strip(self->text.data, self->text.size, &self->text_stripped.data, &self->text_stripped.size, is_whitespace);
            self->offset += 7;
            for(;;) {
                c = html_parser_peek_char(self);
                if(c == '>') {
                    html_parser_advance_char(self);
                    break;
                } else if(c == '\0') {
                    break;
                } else {
                    html_parser_advance_char(self);
                }
            }
            break;
        } else if(c == '\0') {
            self->text.size = (self->source + self->offset) - self->text.data;
            strip(self->text.data, self->text.size, &self->text_stripped.data, &self->text_stripped.size, is_whitespace);
            break;
        } else {
            html_parser_advance_char(self);
        }
    }
    self->parse_callback(self, HTML_PARSE_JAVASCRIPT_CODE, self->callback_userdata);
}

/*
    Moves past the end of a comment. The offset has to be right after "<!--".
    A comment that is never closed ends at the end of the source.
*/
static void html_parser_goto_comment_end(HtmlParser *self) {
    size_t remaining = self->source_len - self->offset;

    /* "<!-->" and "<!--->" are empty comments */
    if(remaining >= 1 && self->source[self->offset] == '>') {
        self->offset += 1;
        return;
    }

    if(remaining >= 2 && memcmp(self->source + self->offset, "->", 2) == 0) {
        self->offset += 2;
        return;
    }

    while(self->offset < self->source_len) {
        if(self->source_len - self->offset >= 3 && memcmp(self->source + self->offset, "-->", 3) == 0) {
            self->offset += 3;
            return;
        }
        ++self->offset;
    }
}

/*
    Parses a start tag. The offset has to be after '<' (and any whitespace that follows it).
    Calls the callback with HTML_PARSE_TAG_START for the tag name, HTML_PARSE_ATTRIBUTE for every attribute and
    HTML_PARSE_TAG_END if the tag is a void tag or ends with "/>".
    Comments are skipped without calling the callback.
    The content of a script tag is handled here as well, see html_parser_goto_script_end_tag.
*/
static void html_parser_parse_tag_start(HtmlParser *self) {
    int tag_name_found = 0;
    for(;;) {
        char c = html_parser_next_char(self);
        if(c == '>') {
            if(tag_name_found && self->is_tag_void) {
                self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
                self->tag_name = self->tag_before_void_tag;
            }
            self->is_tag_void = 0;

            if(self->inside_script_tag) {
                self->inside_script_tag = 0;
                /* what follows is javascript code, not html */
                html_parser_goto_script_end_tag(self);
            }
            return;
        } else if(c == '/') {
            html_parser_skip_whitespace(self);
            if(html_parser_peek_char(self) == '>') {
                html_parser_advance_char(self);
                if(tag_name_found) {
                    self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
                    if(self->is_tag_void)
                        self->tag_name = self->tag_before_void_tag;
                    else
                        html_parser_try_pop_unclosed_tag(self);
                }
                self->is_tag_void = 0;
                self->inside_script_tag = 0;
                return;
            }
        } else if(is_identifier_char(c)) {
            HtmlStringView identifier;
            identifier.data = self->source + self->offset - 1;

            /*
                A comment does not need whitespace after "<!--", so check for it before scanning the name.
                Otherwise "<!--text" would be seen as a tag called "!--text".
            */
            if(!tag_name_found && c == '!' && self->source_len - self->offset >= 2 && memcmp(self->source + self->offset, "--", 2) == 0) {
                self->offset += 2;
                identifier.size = 3;
                self->tag_name = identifier;
                html_parser_goto_comment_end(self);
                return;
            }

            for(;;) {
                c = html_parser_peek_char(self);
                if(is_identifier_char(c)) {
                    html_parser_advance_char(self);
                } else {
                    break;
                }
            }
            identifier.size = (self->source + self->offset) - identifier.data;

            if(tag_name_found) {
                /* attribute name */
                self->attribute_key = identifier;
                html_string_view_to_lowercase(&self->attribute_key);
                self->attribute_value.data = NULL;
                self->attribute_value.size = 0;

                html_parser_skip_whitespace(self);
                c = html_parser_peek_char(self);
                if(c == '=') {
                    html_parser_advance_char(self);
                    html_parser_skip_whitespace(self);
                    c = html_parser_peek_char(self);
                    if(c == '"' || c == '\'' || c == '`') {
                        html_parser_advance_char(self);
                        html_parser_parse_attribute_value_quoted(self, c);
                    } else if(is_attribute_value_char(c)) {
                        /* Don't advance here, |c| is the first character of the value */
                        html_parser_parse_attribute_value(self);
                    }
                }
                self->parse_callback(self, HTML_PARSE_ATTRIBUTE, self->callback_userdata);
            } else {
                /* tag name */
                HtmlStringView prev_tag_name = self->tag_name;
                self->tag_name = identifier;
                html_string_view_to_lowercase(&self->tag_name);
                tag_name_found = 1;

                self->is_tag_void = is_void_tag(&self->tag_name);
                if(self->is_tag_void) {
                    /* Restored when the void tag ends, so the enclosing tag becomes the current tag again */
                    self->tag_before_void_tag = prev_tag_name;
                } else {
                    html_parser_try_append_unclosed_tag(self, self->tag_name.data, self->tag_name.size);
                    self->inside_script_tag = string_view_equals(&self->tag_name, &script_tag);
                }
                self->parse_callback(self, HTML_PARSE_TAG_START, self->callback_userdata);
            }
        } else if(c == '\0') {
            return;
        }
    }
}

/*
    Parses an end tag. The offset has to be after "</".
    If a matching start tag is found in the unclosed tags then the callback is called with HTML_PARSE_TAG_END for
    every unclosed tag from the top of the stack down to (and including) the matching tag.
    End tags for void tags and end tags without a matching start tag only produce a warning on stderr.
*/
static void html_parser_parse_tag_end(HtmlParser *self) {
    int tag_name_found = 0;
    for(;;) {
        char c = html_parser_peek_char(self);
        if(c == '>') {
            html_parser_advance_char(self);
            return;
        } else if(!tag_name_found && is_identifier_char(c)) {
            HtmlStringView tag_end_name;
            size_t found_start_tag_index = 0;
            int found_start_tag = 0;
            size_t i;

            tag_end_name.data = self->source + self->offset;
            html_parser_advance_char(self);
            for(;;) {
                c = html_parser_peek_char(self);
                if(is_identifier_char(c)) {
                    html_parser_advance_char(self);
                } else {
                    break;
                }
            }
            tag_end_name.size = (self->source + self->offset) - tag_end_name.data;
            /* Start tag names are stored in lowercase, so the end tag name has to be lowercase to match them */
            html_string_view_to_lowercase(&tag_end_name);
            tag_name_found = 1;

            /* void tags close themselves, this is probably invalid html so we only warn about it */
            if(is_void_tag(&tag_end_name)) {
                fprintf(stderr, "Warning: got end tag for void tag '%.*s'\n", (int)tag_end_name.size, tag_end_name.data);
                continue;
            }

            /* Search from the most recently opened tag */
            for(i = self->unclosed_tags_offset; i > 0; --i) {
                if(string_view_equals(&self->unclosed_tags[i - 1], &tag_end_name)) {
                    found_start_tag_index = i - 1;
                    found_start_tag = 1;
                    break;
                }
            }

            if(found_start_tag) {
                while(self->unclosed_tags_offset > found_start_tag_index) {
                    self->tag_name = self->unclosed_tags[self->unclosed_tags_offset - 1];
                    self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
                    --self->unclosed_tags_offset;
                }
            } else {
                fprintf(stderr, "Warning: start tag not found for end tag '%.*s'\n", (int)tag_end_name.size, tag_end_name.data);
            }
        } else if(c == '\0') {
            return;
        } else {
            html_parser_advance_char(self);
        }
    }
}

/*
    Parses the source that was given to html_parser_init from the start and reports what is found to the callback.
    Text between tags is reported with HTML_PARSE_TEXT (|text| is the raw text, |text_stripped| is the text without
    surrounding whitespace).
    Tags that are still unclosed at the end of the source are reported with HTML_PARSE_TAG_END, the most recently
    opened tag first.
*/
void html_parser_parse(HtmlParser *self) {
    HtmlStringView top_unclosed_tag;
    html_parser_reset(self);
    for(;;) {
        char c = html_parser_next_char(self);
        if(c == '<') {
            html_parser_skip_whitespace(self);
            if(html_parser_peek_char(self) == '/') {
                html_parser_advance_char(self);
                html_parser_parse_tag_end(self);
            } else {
                html_parser_parse_tag_start(self);
            }
        } else if(c == '\0') {
            break;
        } else {
            /* The first character of the text has already been consumed */
            self->text.data = (self->source + self->offset) - 1;
            for(;;) {
                c = html_parser_peek_char(self);
                if(c == '<' || c == '\0')
                    break;
                else
                    html_parser_advance_char(self);
            }
            self->text.size = (self->source + self->offset) - self->text.data;
            strip(self->text.data, self->text.size, &self->text_stripped.data, &self->text_stripped.size, is_whitespace);
            self->parse_callback(self, HTML_PARSE_TEXT, self->callback_userdata);
        }
    }

    while(html_parser_try_get_top_unclosed_tag(self, &top_unclosed_tag)) {
        self->tag_name = top_unclosed_tag;
        self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
        html_parser_pop_unclosed_tag(self);
    }
}