diff options
-rw-r--r-- | include/HtmlParser.h | 2 |
-rw-r--r-- | src/HtmlParser.c | 31 |
2 files changed, 25 insertions, 8 deletions
diff --git a/include/HtmlParser.h b/include/HtmlParser.h index 5c1f4c1..1e23d0d 100644 --- a/include/HtmlParser.h +++ b/include/HtmlParser.h @@ -56,10 +56,12 @@ struct HtmlParser { /* Returns the value returned from |parse_callback|. 0 meaning success. + Input text is expected to be in utf8 and may or may not have UTF8-BOM. Note: HTML_PARSE_TAG_START is guaranteed to be called for a tag before HTML_PARSE_TAG_END. Note: HTML_PARSE_TEXT may be called multiple times for a tag. For example if a tag has multiple text items split between child tags like this: <div>hello<h1>text</h1>world</div>. In this case, HTML_PARSE_TEXT will be called twice for the div tag. First with "hello" and then with "world". + This function does 0 dynamic memory allocations. */ int html_parser_parse(const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata); diff --git a/src/HtmlParser.c b/src/HtmlParser.c index a760f98..8a595ac 100644 --- a/src/HtmlParser.c +++ b/src/HtmlParser.c @@ -26,9 +26,9 @@ static HtmlStringView void_tags[] = { static HtmlStringView script_tag = {"script", 6}; -static char to_lower(char c) { - if(c >= 'A' && c <= 'Z') - return c + 32; +static char to_upper(char c) { + if(c >= 'a' && c <= 'z') + return c - 32; else return c; } @@ -37,7 +37,7 @@ static int string_view_equals_case_insensitive(HtmlStringView *self, HtmlStringV size_t i = 0; if(self->size != other->size) return 0; for(; i < self->size; ++i) { - if(to_lower(self->data[i]) != to_lower(other->data[i])) + if(to_upper(self->data[i]) != to_upper(other->data[i])) return 0; } return 1; @@ -103,6 +103,12 @@ static int is_void_tag(HtmlStringView *tag_name) { } static void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) { + /* Utf8 BOM */ + if(len >= 3 && memcmp(html_source, "\xef\xbb\xbf", 3) == 0) { + html_source += 3; + len -= 3; + } + self->source = html_source; self->source_len = len; self->parse_callback = 
parse_callback; @@ -268,6 +274,7 @@ static void html_parser_goto_end_of_js_string(HtmlParser *self, char quote_symbo } static int html_parser_goto_script_end_tag(HtmlParser *self) { + int res = 0; self->text.data = self->source + self->offset; self->text.size = 0; for(;;) { @@ -300,10 +307,18 @@ static int html_parser_goto_script_end_tag(HtmlParser *self) { } } - if(self->text_stripped.size > 0) - return self->parse_callback(self, HTML_PARSE_JAVASCRIPT_CODE, self->callback_userdata); - else - return 0; + if(self->text_stripped.size > 0) { + res = self->parse_callback(self, HTML_PARSE_JAVASCRIPT_CODE, self->callback_userdata); + if(res != 0) + return res; + } + + res = self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata); + if(res != 0) + return res; + + html_parser_try_pop_unclosed_tag(self); + return res; } static void html_parser_goto_comment_end(HtmlParser *self) { |