aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorDEC05EBA <dec05eba@protonmail.com>2020-01-01 09:13:18 +0100
committerDEC05EBA <dec05eba@protonmail.com>2020-01-01 09:13:18 +0100
commit8d8e23320e48f1d8fd98c3c914696f6fe0f7161e (patch)
tree3550f131afa5b62e160076dcc7df9ae8262da8cf /src
parent0f8f6ffe2e6cf6a9210da489a030b25a1ef307a3 (diff)
Ignore comments, ignore end tags without a start tag.
Fixes tags closing too soon
Diffstat (limited to 'src')
-rw-r--r--src/HtmlParser.c42
1 files changed, 33 insertions, 9 deletions
diff --git a/src/HtmlParser.c b/src/HtmlParser.c
index 81104b7..7c91a77 100644
--- a/src/HtmlParser.c
+++ b/src/HtmlParser.c
@@ -125,7 +125,8 @@ static char html_parser_peek_char(HtmlParser *self) {
}
static void html_parser_advance_char(HtmlParser *self) {
- ++self->offset;
+ if(self->offset < self->source_len)
+ ++self->offset;
}
static int is_alpha(char c) {
@@ -248,7 +249,7 @@ static void html_parser_goto_script_end_tag(HtmlParser *self) {
if(c == '"' || c == '\'') {
html_parser_advance_char(self);
html_parser_goto_end_of_js_string(self, c);
- } else if(c == '<' && self->offset + 7 < self->source_len && strncmp(self->source + self->offset + 1, "/script", 7) == 0) {
+ } else if(c == '<' && self->offset + 7 < self->source_len && memcmp(self->source + self->offset + 1, "/script", 7) == 0) {
self->text.size = (self->source + self->offset) - self->text.data;
strip(self->text.data, self->text.size, &self->text.data, &self->text.size, is_whitespace);
self->offset += 7;
@@ -276,6 +277,16 @@ static void html_parser_goto_script_end_tag(HtmlParser *self) {
self->parse_callback(self, HTML_PARSE_JAVASCRIPT_CODE, self->callback_userdata);
}
+static void html_parser_goto_comment_end(HtmlParser *self) { /* Skip forward past the next "-->", or to end of input if there is none */
+ while(self->offset < self->source_len) { /* advance_char stops moving at source_len, so an unterminated comment must end the loop here */
+ if(self->source_len - self->offset >= 3 && memcmp(self->source + self->offset, "-->", 3) == 0) {
+ self->offset += 3; /* consume the terminator itself */
+ break;
+ }
+ html_parser_advance_char(self);
+ }
+}
+
static void html_parser_parse_tag_start(HtmlParser *self) {
int tag_name_found = 0;
for(;;) {
@@ -340,6 +351,10 @@ static void html_parser_parse_tag_start(HtmlParser *self) {
/* tag name */
self->tag_name = identifier;
tag_name_found = 1;
+ if(self->tag_name.size == 3 && memcmp(self->tag_name.data, "!--", 3) == 0) {
+ html_parser_goto_comment_end(self);
+ return;
+ }
self->is_tag_void = is_void_tag(&self->tag_name);
if(!self->is_tag_void) {
html_parser_try_append_unclosed_tag(self, self->tag_name.data, self->tag_name.size);
@@ -373,20 +388,29 @@ static void html_parser_parse_tag_end(HtmlParser *self) {
}
}
tag_end_name.size = (self->source + self->offset) - tag_end_name.data;
+ tag_name_found = 1;
/* void tags close themselves, this is probably invalid html but we choose to ignore it silently */
if(is_void_tag(&tag_end_name)) {
- fprintf(stderr, "Warning: got end tag for void tag '%.*s'\n", tag_end_name.size, tag_end_name.data);
+ fprintf(stderr, "Warning: got end tag for void tag '%.*s'\n", (int)tag_end_name.size, tag_end_name.data);
continue;
}
- HtmlStringView top_unclosed_tag;
- while(html_parser_try_get_top_unclosed_tag(self, &top_unclosed_tag)) {
- self->tag_name = top_unclosed_tag;
- self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
- html_parser_pop_unclosed_tag(self);
- if(string_view_equals(&top_unclosed_tag, &tag_end_name))
+ ssize_t found_start_tag_index = -1;
+ for(ssize_t i = self->unclosed_tags_offset - 1; i >= 0; --i) {
+ if(string_view_equals(&self->unclosed_tags[i], &tag_end_name)) {
+ found_start_tag_index = i;
break;
+ }
+ }
+
+ if(found_start_tag_index != -1) {
+ for(; self->unclosed_tags_offset > (size_t)found_start_tag_index; --self->unclosed_tags_offset) {
+ self->tag_name = self->unclosed_tags[self->unclosed_tags_offset - 1];
+ self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
+ }
+ } else {
+ fprintf(stderr, "Warning: start tag not found for end tag '%.*s'\n", (int)tag_end_name.size, tag_end_name.data);
}
} else if(c == '\0') {
return;