From 8d8e23320e48f1d8fd98c3c914696f6fe0f7161e Mon Sep 17 00:00:00 2001 From: DEC05EBA Date: Wed, 1 Jan 2020 09:13:18 +0100 Subject: Ignore comments, ignore end tags without a start tag. Fixes tags closing too soon --- src/HtmlParser.c | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/HtmlParser.c b/src/HtmlParser.c index 81104b7..7c91a77 100644 --- a/src/HtmlParser.c +++ b/src/HtmlParser.c @@ -125,7 +125,8 @@ static char html_parser_peek_char(HtmlParser *self) { } static void html_parser_advance_char(HtmlParser *self) { - ++self->offset; + if(self->offset < self->source_len) + ++self->offset; } static int is_alpha(char c) { @@ -248,7 +249,7 @@ static void html_parser_goto_script_end_tag(HtmlParser *self) { if(c == '"' || c == '\'') { html_parser_advance_char(self); html_parser_goto_end_of_js_string(self, c); - } else if(c == '<' && self->offset + 7 < self->source_len && strncmp(self->source + self->offset + 1, "/script", 7) == 0) { + } else if(c == '<' && self->offset + 7 < self->source_len && memcmp(self->source + self->offset + 1, "/script", 7) == 0) { self->text.size = (self->source + self->offset) - self->text.data; strip(self->text.data, self->text.size, &self->text.data, &self->text.size, is_whitespace); self->offset += 7; @@ -276,6 +277,16 @@ static void html_parser_goto_script_end_tag(HtmlParser *self) { self->parse_callback(self, HTML_PARSE_JAVASCRIPT_CODE, self->callback_userdata); } +static void html_parser_goto_comment_end(HtmlParser *self) { + for(;;) { + if(self->source_len - self->offset >= 3 && memcmp(self->source + self->offset, "-->", 3) == 0) { + self->offset += 3; + break; + } + html_parser_advance_char(self); + } +} + static void html_parser_parse_tag_start(HtmlParser *self) { int tag_name_found = 0; for(;;) { @@ -340,6 +351,10 @@ static void html_parser_parse_tag_start(HtmlParser *self) { /* tag name */ self->tag_name = identifier; tag_name_found = 1; + if(self->tag_name.size == 3 && memcmp(self->tag_name.data, "!--", 3) == 0) { + html_parser_goto_comment_end(self); + return; + } self->is_tag_void = is_void_tag(&self->tag_name); if(!self->is_tag_void) { html_parser_try_append_unclosed_tag(self, self->tag_name.data, self->tag_name.size); @@ -373,20 +388,29 @@ static void html_parser_parse_tag_end(HtmlParser *self) { } } tag_end_name.size = (self->source + self->offset) - tag_end_name.data; + tag_name_found = 1; /* void tags close themselves, this is probably invalid html but we choose to ignore it silently */ if(is_void_tag(&tag_end_name)) { - fprintf(stderr, "Warning: got end tag for void tag '%.*s'\n", tag_end_name.size, tag_end_name.data); + fprintf(stderr, "Warning: got end tag for void tag '%.*s'\n", (int)tag_end_name.size, tag_end_name.data); continue; } - HtmlStringView top_unclosed_tag; - while(html_parser_try_get_top_unclosed_tag(self, &top_unclosed_tag)) { - self->tag_name = top_unclosed_tag; - self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata); - html_parser_pop_unclosed_tag(self); - if(string_view_equals(&top_unclosed_tag, &tag_end_name)) + ssize_t found_start_tag_index = -1; + for(ssize_t i = self->unclosed_tags_offset - 1; i >= 0; --i) { + if(string_view_equals(&self->unclosed_tags[i], &tag_end_name)) { + found_start_tag_index = i; break; + } + } + + if(found_start_tag_index != -1) { + for(; self->unclosed_tags_offset > (size_t)found_start_tag_index; --self->unclosed_tags_offset) { + self->tag_name = self->unclosed_tags[self->unclosed_tags_offset - 1]; + self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata); + } + } else { + fprintf(stderr, "Warning: start tag not found for end tag '%.*s'\n", (int)tag_end_name.size, tag_end_name.data); } } else if(c == '\0') { return; -- cgit v1.2.3