From 2b1a4ac7ad5743400fa875a91ee6869fcd94a9ed Mon Sep 17 00:00:00 2001 From: dec05eba Date: Fri, 2 Jul 2021 17:42:37 +0200 Subject: Return non-0 value from callback to cancel parsing (and return the value in html_parser_parse) --- include/HtmlParser.h | 10 +++++--- src/HtmlParser.c | 66 +++++++++++++++++++++++++++++++++++----------------- tests/main.c | 3 ++- 3 files changed, 54 insertions(+), 25 deletions(-) diff --git a/include/HtmlParser.h b/include/HtmlParser.h index e6f0c3b..a7cdb4f 100644 --- a/include/HtmlParser.h +++ b/include/HtmlParser.h @@ -27,7 +27,8 @@ typedef enum{ HTML_PARSE_JAVASCRIPT_CODE } HtmlParseType; -typedef void (*HtmlParseCallback)(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata); +/* Return 0 to continue */ +typedef int (*HtmlParseCallback)(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata); #define UNCLOSED_TAGS_SIZE 2048 @@ -53,8 +54,11 @@ struct HtmlParser { HtmlStringView unclosed_tags[UNCLOSED_TAGS_SIZE]; }; -/* Note: HTML_PARSE_TAG_START is guaranteed to be called for a tag before HTML_PARSE_TAG_END */ -void html_parser_parse(const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata); +/* + Returns the value returned from |parse_callback|. 0 meaning success. + Note: HTML_PARSE_TAG_START is guaranteed to be called for a tag before HTML_PARSE_TAG_END +*/ +int html_parser_parse(const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata); #ifdef __cplusplus } diff --git a/src/HtmlParser.c b/src/HtmlParser.c index 7171d1f..aadf2a8 100644 --- a/src/HtmlParser.c +++ b/src/HtmlParser.c @@ -266,7 +266,7 @@ static void html_parser_goto_end_of_js_string(HtmlParser *self, char quote_symbo } } -static void html_parser_goto_script_end_tag(HtmlParser *self) { +static int html_parser_goto_script_end_tag(HtmlParser *self) { self->text.data = self->source + self->offset; self->text.size = 0; for(;;) { @@ -298,7 +298,7 @@ static void html_parser_goto_script_end_tag(HtmlParser *self) { html_parser_advance_char(self); } } - self->parse_callback(self, HTML_PARSE_JAVASCRIPT_CODE, self->callback_userdata); + return self->parse_callback(self, HTML_PARSE_JAVASCRIPT_CODE, self->callback_userdata); } static void html_parser_goto_comment_end(HtmlParser *self) { @@ -311,13 +311,15 @@ static void html_parser_goto_comment_end(HtmlParser *self) { } } -static void html_parser_parse_tag_start(HtmlParser *self) { +static int html_parser_parse_tag_start(HtmlParser *self) { int tag_name_found = 0; for(;;) { char c = html_parser_next_char(self); if(c == '>') { if(tag_name_found && self->is_tag_void) { - self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata); + int res = self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata); + if(res != 0) + return res; self->tag_name = self->tag_before_void_tag; } self->is_tag_void = 0; @@ -325,15 +327,17 @@ static void html_parser_parse_tag_start(HtmlParser *self) { if(self->inside_script_tag) { self->inside_script_tag = 0; /* inside a javascript string */ - html_parser_goto_script_end_tag(self); + return html_parser_goto_script_end_tag(self); } - return; + return 0; } else if(c == '/') { html_parser_skip_whitespace(self); if(html_parser_peek_char(self) == '>') { html_parser_advance_char(self); if(tag_name_found) { - self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata); + int res = self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata); + if(res != 0) + return res; if(self->is_tag_void) self->tag_name = self->tag_before_void_tag; else @@ -341,7 +345,7 @@ static void html_parser_parse_tag_start(HtmlParser *self) { } self->is_tag_void = 0; self->inside_script_tag = 0; - return; + return 0; } } else if(is_identifier_char(c)) { HtmlStringView identifier; @@ -356,6 +360,7 @@ static void html_parser_parse_tag_start(HtmlParser *self) { } identifier.size = (self->source + self->offset) - identifier.data; if(tag_name_found) { + int res = 0; /* attribute name */ self->attribute_key = identifier; /*html_string_view_to_lowercase(&self->attribute_key);*/ @@ -376,8 +381,11 @@ static void html_parser_parse_tag_start(HtmlParser *self) { html_parser_parse_attribute_value(self); } } - self->parse_callback(self, HTML_PARSE_ATTRIBUTE, self->callback_userdata); + res = self->parse_callback(self, HTML_PARSE_ATTRIBUTE, self->callback_userdata); + if(res != 0) + return res; } else { + int res = 0; /* tag name */ HtmlStringView prev_tag_name = self->tag_name; self->tag_name = identifier; @@ -385,7 +393,7 @@ static void html_parser_parse_tag_start(HtmlParser *self) { tag_name_found = 1; if(self->tag_name.size == 3 && memcmp(self->tag_name.data, "!--", 3) == 0) { html_parser_goto_comment_end(self); - return; + return 0; } self->is_tag_void = is_void_tag(&self->tag_name); if(self->is_tag_void) { @@ -394,21 +402,23 @@ static void html_parser_parse_tag_start(HtmlParser *self) { html_parser_try_append_unclosed_tag(self, self->tag_name.data, self->tag_name.size); self->inside_script_tag = string_view_equals_case_insensitive(&self->tag_name, &script_tag); } - self->parse_callback(self, HTML_PARSE_TAG_START, self->callback_userdata); + res = self->parse_callback(self, HTML_PARSE_TAG_START, self->callback_userdata); + if(res != 0) + return res; } } else if(c == '\0') { - return; + return 0; } } } -static void html_parser_parse_tag_end(HtmlParser *self) { +static int html_parser_parse_tag_end(HtmlParser *self) { int tag_name_found = 0; for(;;) { char c = html_parser_peek_char(self); if(c == '>') { html_parser_advance_char(self); - return; + return 0; } else if(!tag_name_found && is_identifier_char(c)) { HtmlStringView tag_end_name; ssize_t found_start_tag_index; @@ -443,37 +453,44 @@ static void html_parser_parse_tag_end(HtmlParser *self) { if(found_start_tag_index != -1) { for(; self->unclosed_tags_offset > (size_t)found_start_tag_index; --self->unclosed_tags_offset) { + int res = 0; self->tag_name = self->unclosed_tags[self->unclosed_tags_offset - 1]; - self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata); + res = self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata); + if(res != 0) + return res; } } else { /*fprintf(stderr, "Warning: start tag not found for end tag '%.*s'\n", (int)tag_end_name.size, tag_end_name.data);*/ } } else if(c == '\0') { - return; + return 0; } else { html_parser_advance_char(self); } } } -void html_parser_parse(const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) { +int html_parser_parse(const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) { HtmlStringView top_unclosed_tag; HtmlParser self; html_parser_init(&self, html_source, len, parse_callback, userdata); for(;;) { char c = html_parser_next_char(&self); if(c == '<') { + int res = 0; html_parser_skip_whitespace(&self); if(html_parser_peek_char(&self) == '/') { html_parser_advance_char(&self); - html_parser_parse_tag_end(&self); + res = html_parser_parse_tag_end(&self); } else { - html_parser_parse_tag_start(&self); + res = html_parser_parse_tag_start(&self); } + if(res != 0) + return res; } else if(c == '\0') { break; } else { + int res = 0; self.text.data = (self.source + self.offset) - 1; for(;;) { c = html_parser_peek_char(&self); @@ -484,13 +501,20 @@ void html_parser_parse(const char *html_source, size_t len, HtmlParseCallback pa } self.text.size = (self.source + self.offset) - self.text.data; strip(self.text.data, self.text.size, &self.text_stripped.data, &self.text_stripped.size, is_whitespace); - self.parse_callback(&self, HTML_PARSE_TEXT, self.callback_userdata); + res = self.parse_callback(&self, HTML_PARSE_TEXT, self.callback_userdata); + if(res != 0) + return res; } } while(html_parser_try_get_top_unclosed_tag(&self, &top_unclosed_tag)) { + int res = 0; self.tag_name = top_unclosed_tag; - self.parse_callback(&self, HTML_PARSE_TAG_END, self.callback_userdata); + res = self.parse_callback(&self, HTML_PARSE_TAG_END, self.callback_userdata); + if(res != 0) + return res; html_parser_pop_unclosed_tag(&self); } + + return 0; } diff --git a/tests/main.c b/tests/main.c index de37c9a..b3b6d5c 100644 --- a/tests/main.c +++ b/tests/main.c @@ -20,7 +20,7 @@ char* file_get_content(const char *path, long *filesize) { return data; } -static void html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata_any) { +static int html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata_any) { switch(parse_type) { case HTML_PARSE_TAG_START: printf("tag start: %.*s\n", (int)html_parser->tag_name.size, html_parser->tag_name.data); @@ -29,6 +29,7 @@ static void html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_typ printf("tag end: %.*s\n", (int)html_parser->tag_name.size, html_parser->tag_name.data); break; } + return 0; } int main() { -- cgit v1.2.3