From ac0a3e0ebb9b460a31a76115cb4d494361c03e49 Mon Sep 17 00:00:00 2001
From: DEC05EBA
Date: Tue, 31 Dec 2019 08:46:05 +0100
Subject: Initial commit, copied from wwwhtml-parser. Works. Need to add unescape of html sequences

---
 .gitignore           |   5 +
 LICENSE              |  14 ++
 README.md            |   1 +
 include/HtmlParser.h |  64 +++++
 project.conf         |   5 +
 src/HtmlParser.c     | 495 +++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/main.c         |   6 +
 7 files changed, 590 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 include/HtmlParser.h
 create mode 100644 project.conf
 create mode 100644 src/HtmlParser.c
 create mode 100644 tests/main.c

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..636c6b9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+# Compiled sibs files
+sibs-build/
+compile_commands.json
+tests/sibs-build/
+tests/compile_commands.json
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b27915c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,14 @@
+Copyright (C) 2019 DEC05EBA
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b90e48b
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+A small html parser written in C. The parser fixes broken html (missing end tags). The parser doesn't perform any dynamic (heap) allocations and only outputs the parsing result to a callback function rather than a dom tree.
diff --git a/include/HtmlParser.h b/include/HtmlParser.h
new file mode 100644
index 0000000..72de123
--- /dev/null
+++ b/include/HtmlParser.h
@@ -0,0 +1,64 @@
+#ifndef HTML_PARSER_H
+#define HTML_PARSER_H
+
+/*
+    A small html parser with no dependencies and doesn't dynamically allocate any memory
+    and can parse and repair broken html (just like web browsers)
+*/
+
+#include <stddef.h>
+
+/* A view into the html source. It doesn't own the data and it's not null-terminated */
+typedef struct {
+    const char *data;
+    size_t size;
+} StringView;
+
+typedef struct HtmlParser HtmlParser;
+
+typedef enum {
+    HTML_PARSE_TAG_START,
+    HTML_PARSE_TAG_END,
+    HTML_PARSE_ATTRIBUTE,
+    HTML_PARSE_TEXT,
+    HTML_PARSE_JAVASCRIPT_CODE
+} HtmlParseType;
+
+/*
+    Called for every parsed item. The item is read from |html_parser|:
+    |tag_name| for HTML_PARSE_TAG_START and HTML_PARSE_TAG_END,
+    |attribute_key| and |attribute_value| for HTML_PARSE_ATTRIBUTE and
+    |text| for HTML_PARSE_TEXT and HTML_PARSE_JAVASCRIPT_CODE
+*/
+typedef void (*HtmlParseCallback)(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata);
+
+/* The maximum number of nested tags that can be open at the same time */
+#define UNCLOSED_TAGS_SIZE 2048
+
+struct HtmlParser {
+    const char *source;
+    size_t source_len;
+    size_t offset;
+    HtmlParseCallback parse_callback;
+    void *callback_userdata;
+
+    StringView tag_name;
+    StringView attribute_key;
+    StringView attribute_value;
+    StringView text;
+
+    int is_tag_void;
+    int inside_script_tag;
+
+    size_t unclosed_tags_offset;
+    StringView unclosed_tags[UNCLOSED_TAGS_SIZE];
+};
+
+/* |html_source| is not copied and doesn't have to be null-terminated, but it has to outlive the parser */
+void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata);
+void html_parser_deinit(HtmlParser *self);
+
+/* Parses the whole source and reports every item to the parse callback */
+void html_parser_parse(HtmlParser *self);
+
+#endif /* HTML_PARSER_H */
diff --git a/project.conf b/project.conf
new file mode 100644
index 0000000..84f91f0
--- /dev/null
+++ b/project.conf
@@ -0,0 +1,5 @@
+[package]
+name = "html-parser"
+type = "static"
+version = "0.1.0"
+platforms = ["any"]
diff --git a/src/HtmlParser.c b/src/HtmlParser.c
new file mode 100644
index 0000000..a1b62cd
--- /dev/null
+++ b/src/HtmlParser.c
@@ -0,0 +1,495 @@
+#include "../include/HtmlParser.h"
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+/* Tags that can't have any content and are never pushed on the unclosed tags stack. Terminated by a NULL entry */
+static const StringView void_tags[] = {
+    {.data = "area", .size = 4},
+    {.data = "base", .size = 4},
+    {.data = "br", .size = 2},
+    {.data = "col", .size = 3},
+    {.data = "command", .size = 7},
+    {.data = "embed", .size = 5},
+    {.data = "hr", .size = 2},
+    {.data = "img", .size = 3},
+    {.data = "input", .size = 5},
+    {.data = "keygen", .size = 6},
+    {.data = "link", .size = 4},
+    {.data = "meta", .size = 4},
+    {.data = "param", .size = 5},
+    {.data = "source", .size = 6},
+    {.data = "track", .size = 5},
+    {.data = "wbr", .size = 3},
+    {.data = NULL, .size = 0}
+};
+
+static const StringView script_tag = {.data = "script", .size = 6};
+
+static char to_lower_ascii(char c) {
+    return (c >= 'A' && c <= 'Z') ? (char)(c - 'A' + 'a') : c;
+}
+
+/* Compares |size| bytes of |a| and |b|, ignoring ascii case. Neither of them have to be null-terminated */
+static int equals_ignore_case(const char *a, const char *b, size_t size) {
+    size_t i;
+    for(i = 0; i < size; ++i) {
+        if(to_lower_ascii(a[i]) != to_lower_ascii(b[i]))
+            return 0;
+    }
+    return 1;
+}
+
+/* Html tag names are case-insensitive, so <BR> and </Div> have to match "br" and "div" */
+static int string_view_equals(const StringView *self, const StringView *other) {
+    return self->size == other->size && equals_ignore_case(self->data, other->data, self->size);
+}
+
+static int is_whitespace(int c) {
+    switch(c) {
+        case ' ':
+        case '\n':
+        case '\r':
+        case '\t':
+        case '\v':
+        case '\f':
+            return 1;
+        default:
+            return 0;
+    }
+}
+
+static int is_newline(int c) {
+    return c == '\n' || c == '\r';
+}
+
+/* Outputs the part of |str| that remains after skipping leading characters accepted by |strip_filter_func| */
+static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
+    size_t i = 0;
+    while(i < size && strip_filter_func(str[i])) {
+        ++i;
+    }
+    *output_str = str + i;
+    *output_size = size - i;
+}
+
+/* Outputs the size of |str| that remains after dropping trailing characters accepted by |strip_filter_func| */
+static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) {
+    /* Counts down with size_t instead of the non-standard (posix) ssize_t */
+    while(size > 0 && strip_filter_func(str[size - 1])) {
+        --size;
+    }
+    *output_size = size;
+}
+
+static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
+    lstrip(str, size, output_str, output_size, strip_filter_func);
+    rstrip(*output_str, *output_size, output_size, strip_filter_func);
+}
+
+static int is_void_tag(const StringView *tag_name) {
+    const StringView *tag_iter = &void_tags[0];
+    /* !DOCTYPE, !--, etc.... */
+    if(tag_name->size > 0 && tag_name->data[0] == '!')
+        return 1;
+    while(tag_iter->data) {
+        if(string_view_equals(tag_name, tag_iter))
+            return 1;
+        ++tag_iter;
+    }
+    return 0;
+}
+
+static void html_parser_reset(HtmlParser *self) {
+    self->offset = 0;
+    self->tag_name.data = NULL;
+    self->tag_name.size = 0;
+    self->attribute_key.data = NULL;
+    self->attribute_key.size = 0;
+    self->attribute_value.data = NULL;
+    self->attribute_value.size = 0;
+    self->text.data = NULL;
+    self->text.size = 0;
+    self->is_tag_void = 0;
+    self->inside_script_tag = 0;
+    self->unclosed_tags_offset = 0;
+}
+
+void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) {
+    self->source = html_source;
+    self->source_len = len;
+    self->parse_callback = parse_callback;
+    self->callback_userdata = userdata;
+    /* Don't leave the parse state uninitialized until html_parser_parse is called */
+    html_parser_reset(self);
+}
+
+void html_parser_deinit(HtmlParser *self) {
+    /* Nothing to release, the parser doesn't own any resources */
+    (void)self;
+}
+
+/* Returns the current character and moves past it, or '\0' at the end of the source */
+static char html_parser_next_char(HtmlParser *self) {
+    if(self->offset < self->source_len) {
+        char c = self->source[self->offset];
+        ++self->offset;
+        return c;
+    }
+    return '\0';
+}
+
+/* Returns the current character without moving, or '\0' at the end of the source */
+static char html_parser_peek_char(HtmlParser *self) {
+    if(self->offset < self->source_len)
+        return self->source[self->offset];
+    return '\0';
+}
+
+static void html_parser_advance_char(HtmlParser *self) {
+    /* Never move past the end, otherwise |source + offset| would point outside the source */
+    if(self->offset < self->source_len)
+        ++self->offset;
+}
+
+static int is_alpha(char c) {
+    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+}
+
+static int is_digit(char c) {
+    return c >= '0' && c <= '9';
+}
+
+static int is_identifier_char(char c) {
+    return is_alpha(c) || is_digit(c) || c == '-' || c == '_' || c == '!';
+}
+
+static void html_parser_try_append_unclosed_tag(HtmlParser *self, const char *data, size_t size) {
+    if(self->unclosed_tags_offset == UNCLOSED_TAGS_SIZE) {
+        fprintf(stderr, "Reached the maximum number of unclosed tags! the html source is too broken\n");
+        return;
+    }
+    self->unclosed_tags[self->unclosed_tags_offset].data = data;
+    self->unclosed_tags[self->unclosed_tags_offset].size = size;
+    ++self->unclosed_tags_offset;
+}
+
+static void html_parser_pop_unclosed_tag(HtmlParser *self) {
+    assert(self->unclosed_tags_offset > 0);
+    --self->unclosed_tags_offset;
+}
+
+static void html_parser_try_pop_unclosed_tag(HtmlParser *self) {
+    if(self->unclosed_tags_offset > 0)
+        --self->unclosed_tags_offset;
+}
+
+static int html_parser_try_get_top_unclosed_tag(HtmlParser *self, StringView *result) {
+    if(self->unclosed_tags_offset > 0) {
+        *result = self->unclosed_tags[self->unclosed_tags_offset - 1];
+        return 1;
+    }
+    return 0;
+}
+
+/* Returns 1 if a tag with the name |tag_name| is open (has been started but not ended yet) */
+static int html_parser_has_unclosed_tag(HtmlParser *self, const StringView *tag_name) {
+    size_t i;
+    for(i = 0; i < self->unclosed_tags_offset; ++i) {
+        if(string_view_equals(&self->unclosed_tags[i], tag_name))
+            return 1;
+    }
+    return 0;
+}
+
+static void html_parser_skip_whitespace(HtmlParser *self) {
+    while(is_whitespace(html_parser_peek_char(self))) {
+        html_parser_advance_char(self);
+    }
+}
+
+static int is_attribute_value_char(char c) {
+    switch(c) {
+        case '"':
+        case '\'':
+        case '`':
+        case '<':
+        case '>':
+        case '&':
+            return 0;
+        default:
+            return 1;
+    }
+}
+
+/*
+    Parses a quoted attribute value. The opening quote has already been skipped.
+    TODO: Unescape html characters in attribute value
+*/
+static void html_parser_parse_attribute_value_quoted(HtmlParser *self, char quote_symbol) {
+    self->attribute_value.data = self->source + self->offset;
+    for(;;) {
+        char c = html_parser_peek_char(self);
+        if(c == quote_symbol) {
+            self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
+            html_parser_advance_char(self);
+            break;
+        } else if(c == '\0') {
+            self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
+            break;
+        } else {
+            html_parser_advance_char(self);
+        }
+    }
+    strip(self->attribute_value.data, self->attribute_value.size, &self->attribute_value.data, &self->attribute_value.size, is_newline);
+}
+
+/* Parses an unquoted attribute value, starting at the current character */
+static void html_parser_parse_attribute_value(HtmlParser *self) {
+    self->attribute_value.data = self->source + self->offset;
+    for(;;) {
+        char c = html_parser_peek_char(self);
+        /* An unquoted value ends at whitespace, otherwise it would swallow the attributes after it */
+        if(c == '\0' || is_whitespace(c) || !is_attribute_value_char(c))
+            break;
+        else
+            html_parser_advance_char(self);
+    }
+    self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
+}
+
+/* Moves past the end of a javascript string. The opening quote has already been skipped */
+static void html_parser_goto_end_of_js_string(HtmlParser *self, char quote_symbol) {
+    int escape_quote = 0;
+    for(;;) {
+        char c = html_parser_next_char(self);
+        if(!escape_quote && c == quote_symbol) {
+            return;
+        } else if(c == '\\') {
+            escape_quote = !escape_quote;
+        } else if(c == '\0') {
+            return;
+        } else {
+            escape_quote = 0;
+        }
+    }
+}
+
+/*
+    Reports everything up until the script end tag as javascript code and moves past the end tag.
+    Quotes are assumed to start javascript strings, to not end the script at "</script>" inside a string.
+*/
+static void html_parser_goto_script_end_tag(HtmlParser *self) {
+    self->text.data = self->source + self->offset;
+    self->text.size = 0;
+    for(;;) {
+        char c = html_parser_peek_char(self);
+        if(c == '"' || c == '\'') {
+            html_parser_advance_char(self);
+            html_parser_goto_end_of_js_string(self, c);
+        } else if(c == '<' && self->offset + 7 < self->source_len && equals_ignore_case(self->source + self->offset + 1, "/script", 7)) {
+            self->text.size = (self->source + self->offset) - self->text.data;
+            strip(self->text.data, self->text.size, &self->text.data, &self->text.size, is_whitespace);
+            self->offset += 7;
+            for(;;) {
+                c = html_parser_peek_char(self);
+                if(c == '>') {
+                    html_parser_advance_char(self);
+                    break;
+                } else if(c == '\0') {
+                    break;
+                } else {
+                    html_parser_advance_char(self);
+                }
+            }
+            break;
+        } else if(c == '\0') {
+            self->text.size = (self->source + self->offset) - self->text.data;
+            strip(self->text.data, self->text.size, &self->text.data, &self->text.size, is_whitespace);
+            break;
+        } else {
+            html_parser_advance_char(self);
+        }
+    }
+    if(self->text.size > 0)
+        self->parse_callback(self, HTML_PARSE_JAVASCRIPT_CODE, self->callback_userdata);
+}
+
+/* Ends the script tag that html_parser_goto_script_end_tag moved past, so content after it isn't reported as being inside the script */
+static void html_parser_close_script_tag(HtmlParser *self) {
+    StringView top_unclosed_tag;
+    /* The script tag is missing from the stack if the stack was full when the tag started */
+    if(html_parser_try_get_top_unclosed_tag(self, &top_unclosed_tag) && string_view_equals(&top_unclosed_tag, &script_tag)) {
+        self->tag_name = top_unclosed_tag;
+        self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
+        html_parser_pop_unclosed_tag(self);
+    }
+}
+
+/* Parses a start tag and its attributes. The opening '<' has already been skipped */
+static void html_parser_parse_tag_start(HtmlParser *self) {
+    int tag_name_found = 0;
+    for(;;) {
+        char c = html_parser_next_char(self);
+        if(c == '>') {
+            if(self->is_tag_void)
+                self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
+            self->is_tag_void = 0;
+
+            if(self->inside_script_tag) {
+                self->inside_script_tag = 0;
+                /* The content of a script tag is javascript, not html */
+                html_parser_goto_script_end_tag(self);
+                html_parser_close_script_tag(self);
+            }
+            return;
+        } else if(c == '/') {
+            if(html_parser_peek_char(self) == '>') {
+                html_parser_advance_char(self);
+                if(tag_name_found) {
+                    self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
+                    if(!self->is_tag_void)
+                        html_parser_try_pop_unclosed_tag(self);
+                }
+                self->is_tag_void = 0;
+                self->inside_script_tag = 0;
+                return;
+            }
+        } else if(is_identifier_char(c)) {
+            StringView identifier;
+            identifier.data = self->source + self->offset - 1;
+            for(;;) {
+                c = html_parser_peek_char(self);
+                if(is_identifier_char(c)) {
+                    html_parser_advance_char(self);
+                } else {
+                    break;
+                }
+            }
+            identifier.size = (self->source + self->offset) - identifier.data;
+            if(tag_name_found) {
+                /* attribute name */
+                self->attribute_key = identifier;
+                self->attribute_value.data = NULL;
+                self->attribute_value.size = 0;
+
+                html_parser_skip_whitespace(self);
+                c = html_parser_peek_char(self);
+                if(c == '=') {
+                    html_parser_advance_char(self);
+                    html_parser_skip_whitespace(self);
+                    c = html_parser_peek_char(self);
+                    if(c == '"' || c == '\'' || c == '`') {
+                        html_parser_advance_char(self);
+                        html_parser_parse_attribute_value_quoted(self, c);
+                    } else if(is_attribute_value_char(c)) {
+                        /* Don't skip |c|, it's the first character of the unquoted value */
+                        html_parser_parse_attribute_value(self);
+                    }
+                }
+                self->parse_callback(self, HTML_PARSE_ATTRIBUTE, self->callback_userdata);
+            } else {
+                /* tag name */
+                self->tag_name = identifier;
+                tag_name_found = 1;
+                self->is_tag_void = is_void_tag(&self->tag_name);
+                if(!self->is_tag_void) {
+                    html_parser_try_append_unclosed_tag(self, self->tag_name.data, self->tag_name.size);
+                    self->inside_script_tag = string_view_equals(&self->tag_name, &script_tag);
+                }
+                self->parse_callback(self, HTML_PARSE_TAG_START, self->callback_userdata);
+            }
+        } else if(c == '\0') {
+            return;
+        }
+    }
+}
+
+/* Parses an end tag. The opening "</" has already been skipped */
+static void html_parser_parse_tag_end(HtmlParser *self) {
+    int tag_name_found = 0;
+    for(;;) {
+        char c = html_parser_peek_char(self);
+        if(c == '>') {
+            html_parser_advance_char(self);
+            return;
+        } else if(!tag_name_found && is_identifier_char(c)) {
+            StringView tag_end_name;
+            StringView top_unclosed_tag;
+            tag_end_name.data = self->source + self->offset;
+            html_parser_advance_char(self);
+            for(;;) {
+                c = html_parser_peek_char(self);
+                if(is_identifier_char(c)) {
+                    html_parser_advance_char(self);
+                } else {
+                    break;
+                }
+            }
+            tag_end_name.size = (self->source + self->offset) - tag_end_name.data;
+            /* Only the first identifier is the tag name, anything after it is skipped until '>' */
+            tag_name_found = 1;
+
+            /* void tags close themselves, this is probably invalid html but we choose to ignore it silently */
+            if(is_void_tag(&tag_end_name)) {
+                /* The precision of %.*s has to be an int, passing a size_t is undefined behavior */
+                fprintf(stderr, "Warning: got end tag for void tag '%.*s'\n", (int)tag_end_name.size, tag_end_name.data);
+                continue;
+            }
+
+            /* Ignore end tags without a start tag (like web browsers do) instead of closing every open tag */
+            if(!html_parser_has_unclosed_tag(self, &tag_end_name))
+                continue;
+
+            while(html_parser_try_get_top_unclosed_tag(self, &top_unclosed_tag)) {
+                self->tag_name = top_unclosed_tag;
+                self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
+                html_parser_pop_unclosed_tag(self);
+                if(string_view_equals(&top_unclosed_tag, &tag_end_name))
+                    break;
+            }
+        } else if(c == '\0') {
+            return;
+        } else {
+            html_parser_advance_char(self);
+        }
+    }
+}
+
+void html_parser_parse(HtmlParser *self) {
+    StringView top_unclosed_tag;
+    html_parser_reset(self);
+    for(;;) {
+        char c = html_parser_next_char(self);
+        if(c == '<') {
+            if(html_parser_peek_char(self) == '/') {
+                html_parser_advance_char(self);
+                html_parser_parse_tag_end(self);
+            } else {
+                html_parser_parse_tag_start(self);
+            }
+        } else if(c == '\0') {
+            break;
+        } else {
+            self->text.data = (self->source + self->offset) - 1;
+            for(;;) {
+                c = html_parser_peek_char(self);
+                if(c == '<' || c == '\0')
+                    break;
+                else
+                    html_parser_advance_char(self);
+            }
+            self->text.size = (self->source + self->offset) - self->text.data;
+            strip(self->text.data, self->text.size, &self->text.data, &self->text.size, is_whitespace);
+            if(self->text.size > 0)
+                self->parse_callback(self, HTML_PARSE_TEXT, self->callback_userdata);
+        }
+    }
+
+    /* Close the tags that are still open at the end of the source */
+    while(html_parser_try_get_top_unclosed_tag(self, &top_unclosed_tag)) {
+        self->tag_name = top_unclosed_tag;
+        self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
+        html_parser_pop_unclosed_tag(self);
+    }
+}
diff --git a/tests/main.c b/tests/main.c
new file mode 100644
index 0000000..ff1570b
--- /dev/null
+++ b/tests/main.c
@@ -0,0 +1,6 @@
+#include <stdio.h>
+
+int main(void) {
+    printf("hello, world!\n");
+    return 0;
+}
-- 
cgit v1.2.3