#include "../include/HtmlParser.h"
#include <string.h>
#include <stdio.h>
#include <assert.h>
/*
 * Tag names the parser treats as void, i.e. tags that are closed as soon as
 * their start tag ends and never wait for a matching end tag.
 * The table is terminated by a {NULL, 0} sentinel, which is what
 * is_void_tag() uses to detect the end of the list.
 */
#define HTML_VOID_TAG_ENTRY(name) { name, sizeof(name) - 1 }
static HtmlStringView void_tags[] = {
    HTML_VOID_TAG_ENTRY("area"),
    HTML_VOID_TAG_ENTRY("base"),
    HTML_VOID_TAG_ENTRY("br"),
    HTML_VOID_TAG_ENTRY("col"),
    HTML_VOID_TAG_ENTRY("command"),
    HTML_VOID_TAG_ENTRY("embed"),
    HTML_VOID_TAG_ENTRY("hr"),
    HTML_VOID_TAG_ENTRY("img"),
    HTML_VOID_TAG_ENTRY("input"),
    HTML_VOID_TAG_ENTRY("keygen"),
    HTML_VOID_TAG_ENTRY("link"),
    HTML_VOID_TAG_ENTRY("meta"),
    HTML_VOID_TAG_ENTRY("param"),
    HTML_VOID_TAG_ENTRY("source"),
    HTML_VOID_TAG_ENTRY("track"),
    HTML_VOID_TAG_ENTRY("wbr"),
    HTML_VOID_TAG_ENTRY("xml"),
    { NULL, 0 }
};
#undef HTML_VOID_TAG_ENTRY
/* Name compared (case-insensitively) against start tags to detect a script element. */
static HtmlStringView script_tag = { "script", sizeof("script") - 1 };
/*
 * Converts an ASCII uppercase letter to its lowercase counterpart.
 * Any other character is returned unchanged.
 */
static char to_lower(char c) {
    /* Distance between an uppercase ASCII letter and its lowercase form */
    const int ascii_case_offset = 32;
    const int is_upper = (c >= 'A' && c <= 'Z');
    return is_upper ? (char)(c + ascii_case_offset) : c;
}
/*
 * Compares two string views, ignoring ASCII letter case.
 * Returns 1 when both views have the same size and every pair of characters
 * is equal after to_lower(); returns 0 otherwise.
 */
static int string_view_equals_case_insensitive(HtmlStringView *self, HtmlStringView *other) {
    const char *lhs = self->data;
    const char *rhs = other->data;
    size_t remaining = self->size;

    if(remaining != other->size)
        return 0;

    while(remaining > 0) {
        if(to_lower(*lhs) != to_lower(*rhs))
            return 0;
        ++lhs;
        ++rhs;
        --remaining;
    }
    return 1;
}
/*
 * Returns 1 if |c| is a space, line feed, carriage return, horizontal tab
 * or vertical tab; returns 0 for every other value.
 */
static int is_whitespace(int c) {
    return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v';
}
/* Returns 1 if |c| is a line feed or a carriage return, otherwise 0. */
static int is_newline(int c) {
    switch(c) {
        case '\n':
        case '\r':
            return 1;
        default:
            return 0;
    }
}
/*
 * Skips leading characters of |str| for which |strip_filter_func| returns
 * non-zero.
 *
 * str, size:       input text and its length in bytes.
 * output_str:      receives a pointer to the first character that was kept
 *                  (points one past the end if everything was stripped).
 * output_size:     receives the number of bytes remaining from |output_str|.
 * strip_filter_func: predicate deciding which characters are stripped.
 */
static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
    const char *cursor = str;
    const char *end = str + size;

    while(cursor != end && strip_filter_func(*cursor))
        ++cursor;

    *output_str = cursor;
    *output_size = (size_t)(end - cursor);
}
/*
 * Computes the length of |str| after removing trailing characters for which
 * |strip_filter_func| returns non-zero.
 *
 * str, size:       input text and its length in bytes.
 * output_size:     receives the length that remains (0 if every character
 *                  was stripped or |size| is 0).
 * strip_filter_func: predicate deciding which characters are stripped.
 *
 * The scan uses size_t only, so it does not depend on the POSIX-specific
 * ssize_t type and does not convert the unsigned length to a signed index.
 */
static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) {
    size_t kept = size;
    /* Check kept > 0 before indexing so an empty input never reads str[-1] */
    while(kept > 0 && strip_filter_func(str[kept - 1])) {
        --kept;
    }
    *output_size = kept;
}
/*
 * Strips characters matching |strip_filter_func| from both ends of |str|.
 * |output_str| receives the start of the remaining text and |output_size|
 * its length. The leading side is stripped first, then the trailing side of
 * what is left.
 */
static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
    const char *trimmed_begin;
    size_t trimmed_size;

    lstrip(str, size, &trimmed_begin, &trimmed_size, strip_filter_func);
    rstrip(trimmed_begin, trimmed_size, &trimmed_size, strip_filter_func);

    *output_str = trimmed_begin;
    *output_size = trimmed_size;
}
/*static void html_string_view_to_lowercase(HtmlStringView *string_view) {
size_t i = 0;
for(; i < string_view->size; ++i) {
string_view->data[i] = to_lower(string_view->data[i]);
}
}*/
/*
 * Returns 1 if |tag_name| should be treated as a void tag, otherwise 0.
 * A name is void when it starts with '!' (!DOCTYPE, !--, etc.) or when it
 * matches an entry of void_tags, ignoring ASCII case.
 */
static int is_void_tag(HtmlStringView *tag_name) {
    size_t i;

    if(tag_name->size > 0 && tag_name->data[0] == '!')
        return 1;

    /* void_tags ends with an entry whose data pointer is NULL */
    for(i = 0; void_tags[i].data != NULL; ++i) {
        if(string_view_equals_case_insensitive(tag_name, &void_tags[i]))
            return 1;
    }
    return 0;
}
static void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) {
self->source = html_source;
self->source_len = len;
self->parse_callback = parse_callback;
self->callback_userdata = userdata;
self->offset = 0;
self->tag_name.data = NULL;
self->tag_name.size = 0;
self->attribute_key.data = NULL;
self->attribute_key.size = 0;
self->attribute_value.data = NULL;
self->attribute_value.size = 0;
self->text.data = NULL;
self->text.size = 0;
self->text_stripped.data = NULL;
self->text_stripped.size = 0;
self->tag_before_void_tag.data = NULL;
self->tag_before_void_tag.size = 0;
self->is_tag_void = 0;
self->inside_script_tag = 0;
self->unclosed_tags_offset = 0;
}
/*
 * Returns the character at the current offset and moves the offset past it.
 * Returns '\0' without moving when the offset is at the end of the source.
 */
static char html_parser_next_char(HtmlParser *self) {
    if(self->offset >= self->source_len)
        return '\0';
    return self->source[self->offset++];
}
/*
 * Returns the character at the current offset without consuming it,
 * or '\0' when the offset is at the end of the source.
 */
static char html_parser_peek_char(HtmlParser *self) {
    return self->offset < self->source_len ? self->source[self->offset] : '\0';
}
/* Moves the offset forward by one character; does nothing at the end of the source. */
static void html_parser_advance_char(HtmlParser *self) {
    if(self->offset >= self->source_len)
        return;
    self->offset += 1;
}
/*
 * Pushes the tag name (|data|, |size|) onto the stack of unclosed tags.
 * Returns 0 on success. When the stack already holds UNCLOSED_TAGS_SIZE
 * entries, prints a message to stderr and returns 1 without pushing.
 */
static int html_parser_try_append_unclosed_tag(HtmlParser *self, const char *data, size_t size) {
    HtmlStringView *slot;

    if(self->unclosed_tags_offset == UNCLOSED_TAGS_SIZE) {
        fprintf(stderr, "Reached the maximum number of unclosed tags! the html source is too broken\n");
        return 1;
    }

    slot = &self->unclosed_tags[self->unclosed_tags_offset];
    slot->data = data;
    slot->size = size;
    self->unclosed_tags_offset += 1;
    return 0;
}
/*
 * Removes the top entry of the unclosed-tag stack.
 * The stack must not be empty; this is checked with assert().
 */
static void html_parser_pop_unclosed_tag(HtmlParser *self) {
    assert(self->unclosed_tags_offset > 0);
    self->unclosed_tags_offset -= 1;
}
/* Removes the top entry of the unclosed-tag stack if there is one; otherwise does nothing. */
static void html_parser_try_pop_unclosed_tag(HtmlParser *self) {
    if(self->unclosed_tags_offset > 0) {
        self->unclosed_tags_offset -= 1;
    }
}
/*
 * Copies the top entry of the unclosed-tag stack into |result| without
 * removing it. Returns 1 if an entry was copied, or 0 (leaving |result|
 * untouched) when the stack is empty.
 */
static int html_parser_try_get_top_unclosed_tag(HtmlParser *self, HtmlStringView *result) {
    if(self->unclosed_tags_offset == 0)
        return 0;

    *result = self->unclosed_tags[self->unclosed_tags_offset - 1];
    return 1;
}
/*
 * Advances the offset past consecutive whitespace characters.
 * Stops at the first non-whitespace character or at the end of the source
 * (where the peeked character is '\0').
 */
static void html_parser_skip_whitespace(HtmlParser *self) {
    while(is_whitespace(html_parser_peek_char(self)))
        html_parser_advance_char(self);
}
/*
 * Returns 0 for the characters that may not appear in an unquoted attribute
 * value (double quote, single quote, backtick, '<', '>' and '&') and 1 for
 * every other character.
 */
static int is_attribute_value_char(char c) {
    const int is_forbidden =
        c == '"' || c == '\'' || c == '`' ||
        c == '<' || c == '>'  || c == '&';
    return !is_forbidden;
}
/*
 * Returns 1 if |c| may be part of a tag name or attribute name, otherwise 0.
 *
 * Whitespace, quotes, '<', '>', '/' and '=' end an identifier.
 * '\0' also ends an identifier: the peek/next helpers return '\0' at the end
 * of the source, so identifier-scanning loops that advance while this
 * function returns 1 must see 0 there or they would never terminate.
 * '\r' is rejected like the other whitespace characters so that names
 * followed by a CRLF line break do not end up containing the '\r'.
 */
static int is_identifier_char(char c) {
    switch(c) {
        case '\0':
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case '\v':
        case '"':
        case '\'':
        case '<':
        case '>':
        case '/':
        case '=':
            return 0;
        default:
            return 1;
    }
}
/* TODO: Unescape html characters in attribute value */
/*
 * Reads a quoted attribute value starting at the current offset (the opening
 * quote must already have been consumed by the caller).
 * The value extends up to, but not including, the next |quote_symbol|; the
 * closing quote is consumed. If a '\0' is peeked first (end of the source),
 * the value extends to that point. Leading and trailing newline characters
 * are then stripped from self->attribute_value.
 */
static void html_parser_parse_attribute_value_quoted(HtmlParser *self, char quote_symbol) {
    const char *value_begin = self->source + self->offset;
    char c;

    self->attribute_value.data = value_begin;

    c = html_parser_peek_char(self);
    while(c != quote_symbol && c != '\0') {
        html_parser_advance_char(self);
        c = html_parser_peek_char(self);
    }

    /* The size is measured before stepping over the closing quote */
    self->attribute_value.size = (self->source + self->offset) - value_begin;
    if(c == quote_symbol)
        html_parser_advance_char(self);

    strip(self->attribute_value.data, self->attribute_value.size, &self->attribute_value.data, &self->attribute_value.size, is_newline);
}
/*
 * Reads an unquoted attribute value starting at the current offset and
 * stores it in self->attribute_value.
 * The value ends at the first whitespace character, at the first character
 * rejected by is_attribute_value_char(), or when '\0' is peeked.
 * The terminating character is not consumed.
 */
static void html_parser_parse_attribute_value(HtmlParser *self) {
    const char *value_begin = self->source + self->offset;
    char c = html_parser_peek_char(self);

    while(c != '\0' && !is_whitespace(c) && is_attribute_value_char(c)) {
        html_parser_advance_char(self);
        c = html_parser_peek_char(self);
    }

    self->attribute_value.data = value_begin;
    self->attribute_value.size = (self->source + self->offset) - value_begin;
}
/*
 * Consumes characters up to and including the |quote_symbol| that closes a
 * javascript string literal (the opening quote must already be consumed).
 * A quote preceded by an odd number of backslashes is treated as escaped and
 * does not end the string. Also returns when '\0' is read, which is what
 * html_parser_next_char() yields at the end of the source.
 */
static void html_parser_goto_end_of_js_string(HtmlParser *self, char quote_symbol) {
    int escaped = 0;
    for(;;) {
        const char c = html_parser_next_char(self);
        if(c == '\0')
            return;
        if(c == quote_symbol && !escaped)
            return;
        /* A backslash toggles the escape state, anything else clears it */
        escaped = (c == '\\') ? !escaped : 0;
    }
}
/*
 * Returns 1 if the source at the current offset starts with "</script",
 * compared ignoring ASCII case, and at least one more character follows it.
 * Returns 0 otherwise. Does not move the offset.
 */
static int html_parser_is_at_script_end_tag(HtmlParser *self) {
    static const char end_tag_name[] = "/script";
    size_t i;

    /* '<' at offset plus the 7 characters of "/script" must lie inside the source */
    if(self->offset + 7 >= self->source_len)
        return 0;
    if(self->source[self->offset] != '<')
        return 0;

    for(i = 0; i < 7; ++i) {
        if(to_lower(self->source[self->offset + 1 + i]) != end_tag_name[i])
            return 0;
    }
    return 1;
}

/*
 * Consumes the body of a script element, starting at the current offset.
 *
 * Scans forward, skipping over single- and double-quoted javascript strings,
 * until the script end tag is found or '\0' is peeked (end of the source).
 * The end tag is matched case-insensitively, consistent with how the start
 * tag is recognized, so "</SCRIPT>" terminates the script body as well.
 * When the end tag is found it is consumed up to and including its '>'.
 *
 * self->text receives the raw script body and self->text_stripped the body
 * without surrounding whitespace. If the stripped body is not empty the
 * callback is invoked with HTML_PARSE_JAVASCRIPT_CODE and its result is
 * returned; otherwise 0 is returned.
 *
 * This function does not invoke the callback with HTML_PARSE_TAG_END and
 * does not pop the unclosed-tag stack.
 */
static int html_parser_goto_script_end_tag(HtmlParser *self) {
    self->text.data = self->source + self->offset;
    self->text.size = 0;
    for(;;) {
        char c = html_parser_peek_char(self);
        if(c == '"' || c == '\'') {
            html_parser_advance_char(self);
            html_parser_goto_end_of_js_string(self, c);
        } else if(c == '<' && html_parser_is_at_script_end_tag(self)) {
            self->text.size = (self->source + self->offset) - self->text.data;
            strip(self->text.data, self->text.size, &self->text_stripped.data, &self->text_stripped.size, is_whitespace);
            /* Step into the end tag, then look for the '>' that closes it */
            self->offset += 7;
            for(;;) {
                c = html_parser_peek_char(self);
                if(c == '>') {
                    html_parser_advance_char(self);
                    break;
                } else if(c == '\0') {
                    break;
                } else {
                    html_parser_advance_char(self);
                }
            }
            break;
        } else if(c == '\0') {
            /* No end tag: everything up to this point is the script body */
            self->text.size = (self->source + self->offset) - self->text.data;
            strip(self->text.data, self->text.size, &self->text_stripped.data, &self->text_stripped.size, is_whitespace);
            break;
        } else {
            html_parser_advance_char(self);
        }
    }
    if(self->text_stripped.size > 0)
        return self->parse_callback(self, HTML_PARSE_JAVASCRIPT_CODE, self->callback_userdata);
    else
        return 0;
}
/*
 * Moves the offset just past the next "-->" found at or after the current
 * offset. If the source contains no further "-->", the comment is treated as
 * running to the end of the input and the offset is set to source_len, so
 * the scan always terminates even for an unterminated comment.
 */
static void html_parser_goto_comment_end(HtmlParser *self) {
    /* offset never exceeds source_len, so the subtraction cannot wrap */
    while(self->source_len - self->offset >= 3) {
        if(memcmp(self->source + self->offset, "-->", 3) == 0) {
            self->offset += 3;
            return;
        }
        ++self->offset;
    }
    /* Fewer than 3 characters left: no terminator can follow */
    self->offset = self->source_len;
}
/*
 * Parses a start tag. The offset is expected to be just after the '<' (and
 * any whitespace following it).
 *
 * The first identifier is the tag name, every later identifier is an
 * attribute name optionally followed by '=' and a quoted or unquoted value.
 * The callback is invoked with:
 *   HTML_PARSE_TAG_START  once the tag name is known,
 *   HTML_PARSE_ATTRIBUTE  for every attribute (attribute_value is empty when
 *                         the attribute has no value),
 *   HTML_PARSE_TAG_END    when the tag is void or is closed with "/>".
 * Non-void tags are pushed onto the unclosed-tag stack. A tag named "!--" is
 * a comment and is skipped entirely. After the '>' of a script start tag the
 * script body is consumed by html_parser_goto_script_end_tag().
 *
 * Returns 0 on success or at the end of the input, the first non-zero value
 * returned by the callback, or the non-zero result of
 * html_parser_try_append_unclosed_tag() when the stack is full.
 */
static int html_parser_parse_tag_start(HtmlParser *self) {
    int tag_name_found = 0;
    for(;;) {
        char c = html_parser_next_char(self);
        if(c == '\0') {
            /*
             * End of input inside the tag. This must be tested before the
             * identifier branch: html_parser_next_char() cannot advance any
             * further, so treating '\0' as the start of an identifier would
             * never make progress.
             */
            return 0;
        } else if(c == '>') {
            if(tag_name_found && self->is_tag_void) {
                int res = self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
                if(res != 0)
                    return res;
                /* The void tag is done, the enclosing tag is current again */
                self->tag_name = self->tag_before_void_tag;
            }
            self->is_tag_void = 0;
            if(self->inside_script_tag) {
                self->inside_script_tag = 0;
                /* Everything up to the script end tag is javascript, not html */
                return html_parser_goto_script_end_tag(self);
            }
            return 0;
        } else if(c == '/') {
            html_parser_skip_whitespace(self);
            if(html_parser_peek_char(self) == '>') {
                /* Self-closing tag: "/>" */
                html_parser_advance_char(self);
                if(tag_name_found) {
                    int res = self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
                    if(res != 0)
                        return res;
                    if(self->is_tag_void)
                        self->tag_name = self->tag_before_void_tag;
                    else
                        html_parser_try_pop_unclosed_tag(self);
                }
                self->is_tag_void = 0;
                self->inside_script_tag = 0;
                return 0;
            }
        } else if(is_identifier_char(c)) {
            HtmlStringView identifier;
            /* c has already been consumed, so the identifier starts one character back */
            identifier.data = self->source + self->offset - 1;
            for(;;) {
                c = html_parser_peek_char(self);
                /* '\0' is peeked at the end of the input, where advancing is a no-op */
                if(c != '\0' && is_identifier_char(c)) {
                    html_parser_advance_char(self);
                } else {
                    break;
                }
            }
            identifier.size = (self->source + self->offset) - identifier.data;
            if(tag_name_found) {
                int res = 0;
                /* attribute name */
                self->attribute_key = identifier;
                /*html_string_view_to_lowercase(&self->attribute_key);*/
                self->attribute_value.data = NULL;
                self->attribute_value.size = 0;
                html_parser_skip_whitespace(self);
                c = html_parser_peek_char(self);
                if(c == '=') {
                    html_parser_advance_char(self);
                    html_parser_skip_whitespace(self);
                    c = html_parser_peek_char(self);
                    if(c == '"' || c == '\'' || c == '`') {
                        /* Consume the opening quote; the value starts after it */
                        html_parser_advance_char(self);
                        html_parser_parse_attribute_value_quoted(self, c);
                    } else if(is_attribute_value_char(c)) {
                        /*
                         * c is the first character of the unquoted value and
                         * must not be consumed here:
                         * html_parser_parse_attribute_value() records the
                         * value start at the current offset.
                         */
                        html_parser_parse_attribute_value(self);
                    }
                }
                res = self->parse_callback(self, HTML_PARSE_ATTRIBUTE, self->callback_userdata);
                if(res != 0)
                    return res;
            } else {
                int res = 0;
                /* tag name */
                HtmlStringView prev_tag_name = self->tag_name;
                self->tag_name = identifier;
                /*html_string_view_to_lowercase(&self->tag_name);*/
                tag_name_found = 1;
                if(self->tag_name.size == 3 && memcmp(self->tag_name.data, "!--", 3) == 0) {
                    html_parser_goto_comment_end(self);
                    return 0;
                }
                self->is_tag_void = is_void_tag(&self->tag_name);
                if(self->is_tag_void) {
                    /* Remembered so tag_name can be restored once the void tag ends */
                    self->tag_before_void_tag = prev_tag_name;
                } else {
                    res = html_parser_try_append_unclosed_tag(self, self->tag_name.data, self->tag_name.size);
                    if(res != 0)
                        return res;
                    self->inside_script_tag = string_view_equals_case_insensitive(&self->tag_name, &script_tag);
                }
                res = self->parse_callback(self, HTML_PARSE_TAG_START, self->callback_userdata);
                if(res != 0)
                    return res;
            }
        }
        /* Any other character (e.g. whitespace between attributes) is skipped */
    }
}
/*
 * Parses an end tag. The offset is expected to be just after "</".
 *
 * Reads the tag name, then consumes input up to and including the next '>'.
 * If the name is a void tag it is ignored. Otherwise the unclosed-tag stack
 * is searched from the top for a case-insensitive match; when one is found,
 * every tag from the top of the stack down to and including the match is
 * reported to the callback with HTML_PARSE_TAG_END (self->tag_name is set to
 * the tag being closed) and removed from the stack. An end tag with no
 * matching start tag is ignored silently.
 *
 * Returns 0 on success or when the end of the input is reached, or the first
 * non-zero value returned by the callback.
 */
static int html_parser_parse_tag_end(HtmlParser *self) {
    int tag_name_found = 0;
    for(;;) {
        char c = html_parser_peek_char(self);
        if(c == '>') {
            html_parser_advance_char(self);
            return 0;
        } else if(c == '\0') {
            /*
             * End of input. Tested before the identifier branch because
             * advancing is a no-op here, so scanning a name that "starts"
             * with '\0' would never make progress.
             */
            return 0;
        } else if(!tag_name_found && is_identifier_char(c)) {
            HtmlStringView tag_end_name;
            size_t i;
            int start_tag_found = 0;
            tag_end_name.data = self->source + self->offset;
            html_parser_advance_char(self);
            for(;;) {
                c = html_parser_peek_char(self);
                if(c != '\0' && is_identifier_char(c)) {
                    html_parser_advance_char(self);
                } else {
                    break;
                }
            }
            tag_end_name.size = (self->source + self->offset) - tag_end_name.data;
            tag_name_found = 1;
            /* void tags close themselves, this is probably invalid html but we choose to ignore it silently */
            if(is_void_tag(&tag_end_name))
                continue;

            /* Look for the innermost unclosed tag with this name; i ends up as its index */
            i = self->unclosed_tags_offset;
            while(i > 0) {
                --i;
                if(string_view_equals_case_insensitive(&self->unclosed_tags[i], &tag_end_name)) {
                    start_tag_found = 1;
                    break;
                }
            }

            if(start_tag_found) {
                /* Close every tag nested inside the match, then the match itself */
                for(; self->unclosed_tags_offset > i; --self->unclosed_tags_offset) {
                    int res = 0;
                    self->tag_name = self->unclosed_tags[self->unclosed_tags_offset - 1];
                    res = self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
                    if(res != 0)
                        return res;
                }
            }
        } else {
            html_parser_advance_char(self);
        }
    }
}
/*
 * Parses |len| bytes of html starting at |html_source| and reports what is
 * found through |parse_callback|, which receives the parser, the event type
 * and |userdata|.
 *
 * Text between tags is reported with HTML_PARSE_TEXT when it is not empty
 * after stripping surrounding whitespace (self.text holds the raw text,
 * self.text_stripped the stripped text). Tags are handled by
 * html_parser_parse_tag_start() and html_parser_parse_tag_end(). Parsing
 * stops when '\0' is read, which is what html_parser_next_char() returns at
 * the end of the source. Tags still unclosed at that point are then reported
 * with HTML_PARSE_TAG_END, innermost first.
 *
 * Returns 0 when parsing completes, or the first non-zero value produced by
 * the callback or by the tag parsers, in which case parsing stops there.
 */
int html_parser_parse(const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) {
    HtmlParser self;
    HtmlStringView innermost_tag;

    html_parser_init(&self, html_source, len, parse_callback, userdata);

    for(;;) {
        int res;
        char c = html_parser_next_char(&self);

        if(c == '\0')
            break;

        if(c != '<') {
            /* Text node: c was already consumed, so the text starts one character back */
            self.text.data = (self.source + self.offset) - 1;
            c = html_parser_peek_char(&self);
            while(c != '<' && c != '\0') {
                html_parser_advance_char(&self);
                c = html_parser_peek_char(&self);
            }
            self.text.size = (self.source + self.offset) - self.text.data;
            strip(self.text.data, self.text.size, &self.text_stripped.data, &self.text_stripped.size, is_whitespace);

            /* Whitespace-only text is not reported */
            if(self.text_stripped.size == 0)
                continue;
            res = self.parse_callback(&self, HTML_PARSE_TEXT, self.callback_userdata);
        } else {
            html_parser_skip_whitespace(&self);
            if(html_parser_peek_char(&self) == '/') {
                html_parser_advance_char(&self);
                res = html_parser_parse_tag_end(&self);
            } else {
                res = html_parser_parse_tag_start(&self);
            }
        }

        if(res != 0)
            return res;
    }

    /* Close whatever is still open, starting from the innermost tag */
    while(html_parser_try_get_top_unclosed_tag(&self, &innermost_tag)) {
        int res;
        self.tag_name = innermost_tag;
        res = self.parse_callback(&self, HTML_PARSE_TAG_END, self.callback_userdata);
        if(res != 0)
            return res;
        html_parser_pop_unclosed_tag(&self);
    }
    return 0;
}