#ifndef HTML_PARSER_H
#define HTML_PARSER_H
/*
A small HTML parser with no dependencies that does not dynamically allocate any memory.
It can parse and repair broken HTML (just like web browsers do).
*/
#include <stddef.h> /* size_t */
#ifdef __cplusplus
extern "C" {
#endif
/*
    A view of a string: a pointer to its first character together with its length.
    NOTE(review): presumably |data| is not required to be null-terminated and |size| is
    counted in bytes - confirm against the implementation.
*/
typedef struct {
    const char *data;
    size_t size;
} HtmlStringView;
/*
    Forward declaration, so that HtmlParseCallback can take a HtmlParser pointer
    before the struct itself is defined further down in this header.
*/
typedef struct HtmlParser HtmlParser;
/*
    The kind of event that is reported to a HtmlParseCallback through its |parse_type| argument.
*/
typedef enum {
    /* Reported for a tag before its HTML_PARSE_TAG_END */
    HTML_PARSE_TAG_START = 0,
    HTML_PARSE_TAG_END = 1,
    /* NOTE(review): presumably reported once per attribute of a tag - confirm */
    HTML_PARSE_ATTRIBUTE = 2,
    /* May be reported multiple times for one tag, when its text is split up by child tags */
    HTML_PARSE_TEXT = 3,
    /* NOTE(review): presumably the content of a script tag - confirm */
    HTML_PARSE_JAVASCRIPT_CODE = 4
} HtmlParseType;
/*
    Callback that the parser calls to report what it has parsed; |parse_type| tells which kind of event it is.
    Return 0 to continue.
    NOTE(review): a non-zero return value presumably stops the parsing and becomes the
    return value of html_parser_parse - confirm against the implementation.
*/
typedef int (*HtmlParseCallback)(
    HtmlParser *html_parser,
    HtmlParseType parse_type,
    void *userdata
);
/* The number of elements in the |unclosed_tags| array of HtmlParser */
#define UNCLOSED_TAGS_SIZE 2048
/*
    The state of the parser. A pointer to this struct is the first argument of HtmlParseCallback.
    NOTE(review): the field notes below that say "presumably" are inferred from the field
    names only; confirm them against the implementation.
*/
struct HtmlParser {
    /* Presumably the input that is being parsed, its length and the current position in it */
    const char *source;
    size_t source_len;
    size_t offset;

    /* Presumably the callback and the userdata that were given to html_parser_parse */
    HtmlParseCallback parse_callback;
    void *callback_userdata;

    /* The name of the current enclosing tag */
    HtmlStringView tag_name;

    HtmlStringView attribute_key;
    HtmlStringView attribute_value;
    HtmlStringView text;
    HtmlStringView text_stripped;
    HtmlStringView tag_before_void_tag;
    int is_tag_void;
    int inside_script_tag;

    /*
        |unclosed_tags| has room for UNCLOSED_TAGS_SIZE items.
        Presumably |unclosed_tags_offset| is the number of items currently in use.
    */
    size_t unclosed_tags_offset;
    HtmlStringView unclosed_tags[UNCLOSED_TAGS_SIZE];
};
/*
    Parses |html_source| and reports tags, attributes and text through |parse_callback|.

    Returns the value returned from |parse_callback|; 0 means success.
    The input text is expected to be in UTF-8, with or without a UTF-8 BOM.
    This function does 0 dynamic memory allocations.

    NOTE(review): |len| is presumably the size of |html_source| in bytes and |userdata| is
    presumably passed on to |parse_callback| as is - confirm against the implementation.

    Guarantees about the reported events:
    - HTML_PARSE_TAG_START is always reported for a tag before HTML_PARSE_TAG_END.
    - HTML_PARSE_TEXT may be reported multiple times for one tag, for example when a tag has
      multiple text items split between child tags like this:
          <div>hello<span>text</span>world</div>
      In this case HTML_PARSE_TEXT is reported twice for the div tag;
      first with "hello" and then with "world".
*/
int html_parser_parse(
    const char *html_source,
    size_t len,
    HtmlParseCallback parse_callback,
    void *userdata
);
/*
    Returns 1 if |self| and |other| are equal.
    NOTE(review): going by the function name the comparison ignores letter case, and the
    result is presumably 0 when the views differ - confirm against the implementation.
    NOTE(review): both parameters could likely be pointers to const; that has to be changed
    together with the definition.
*/
int html_string_view_equals_case_insensitive(
    HtmlStringView *self,
    HtmlStringView *other
);
#ifdef __cplusplus
}
#endif
#endif /* HTML_PARSER_H */