diff options
author | DEC05EBA <dec05eba@protonmail.com> | 2019-12-31 08:46:05 +0100 |
---|---|---|
committer | DEC05EBA <dec05eba@protonmail.com> | 2019-12-31 08:49:12 +0100 |
commit | ac0a3e0ebb9b460a31a76115cb4d494361c03e49 (patch) | |
tree | 50cbb9a689934792372eb998d3f2653776194829 /include |
Initial commit, copied from wwwhtml-parser. Works. Need to add unescape of html sequences
Diffstat (limited to 'include')
-rw-r--r-- | include/HtmlParser.h | 54 |
1 files changed, 54 insertions, 0 deletions
diff --git a/include/HtmlParser.h b/include/HtmlParser.h
new file mode 100644
index 0000000..72de123
--- /dev/null
+++ b/include/HtmlParser.h
@@ -0,0 +1,54 @@
+#ifndef HTML_PARSER_H
+#define HTML_PARSER_H
+
+/*
+    A small HTML parser that has no dependencies, does not dynamically allocate any memory,
+    and can parse and repair broken HTML (just like web browsers do).
+*/
+
+#include <stddef.h> /* size_t */
+/* A character range described by a pointer and a size; the pointed-to characters are read-only through it. */
+typedef struct {
+    const char *data; /* NOTE(review): presumably points into the parsed source and is not NUL-terminated -- confirm */
+    size_t size; /* NOTE(review): presumably the number of bytes available at data -- confirm */
+} StringView;
+
+typedef struct HtmlParser HtmlParser; /* forward declaration so HtmlParseCallback can take an HtmlParser pointer */
+/* Kind of item handed to HtmlParseCallback through its parse_type argument. */
+typedef enum{
+    HTML_PARSE_TAG_START, /* NOTE(review): presumably an opening tag, described by tag_name -- confirm */
+    HTML_PARSE_TAG_END, /* NOTE(review): presumably a closing tag, described by tag_name -- confirm */
+    HTML_PARSE_ATTRIBUTE, /* NOTE(review): presumably one attribute, described by attribute_key/attribute_value -- confirm */
+    HTML_PARSE_TEXT, /* NOTE(review): presumably text content, described by text -- confirm */
+    HTML_PARSE_JAVASCRIPT_CODE /* NOTE(review): presumably the body of a script tag (see inside_script_tag) -- confirm */
+} HtmlParseType;
+/* Callback type: receives the parser, the kind of item, and the userdata pointer given to html_parser_init. */
+typedef void (*HtmlParseCallback)(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata);
+/* Element count of the HtmlParser.unclosed_tags array. */
+#define UNCLOSED_TAGS_SIZE 2048
+/* Parser state. Embeds UNCLOSED_TAGS_SIZE StringView entries by value, so the struct is large; mind this when placing it on the stack. */
+struct HtmlParser {
+    const char *source; /* NOTE(review): presumably the html_source passed to html_parser_init -- confirm */
+    size_t source_len; /* NOTE(review): presumably the len passed to html_parser_init -- confirm */
+    size_t offset; /* NOTE(review): presumably the current read position within source -- confirm */
+    HtmlParseCallback parse_callback; /* NOTE(review): presumably the callback passed to html_parser_init -- confirm */
+    void *callback_userdata; /* NOTE(review): presumably forwarded as the callback's userdata argument -- confirm */
+
+    StringView tag_name; /* NOTE(review): the following four views presumably describe the item being reported -- confirm */
+    StringView attribute_key;
+    StringView attribute_value;
+    StringView text;
+
+    int is_tag_void; /* int used as a flag; NOTE(review): presumably set for void elements such as <br> -- confirm */
+    int inside_script_tag; /* int used as a flag; NOTE(review): presumably set while parsing script content -- confirm */
+
+    size_t unclosed_tags_offset; /* NOTE(review): presumably the number of unclosed_tags entries in use -- confirm */
+    StringView unclosed_tags[UNCLOSED_TAGS_SIZE]; /* fixed-capacity storage, so no heap allocation is needed for it */
+};
+/* Public API. NOTE(review): whether html_source must outlive the parser is not stated in this header -- confirm. */
+void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata);
+void html_parser_deinit(HtmlParser *self); /* counterpart of html_parser_init */
+/* NOTE(review): presumably parses the whole source and invokes parse_callback for each item found -- confirm. */
+void html_parser_parse(HtmlParser *self);
+
+#endif /* HTML_PARSER_H */
\ No newline at end of file |