#include "../include/quickmedia/HtmlSearch.h"
#include "../include/quickmedia/XpathParser.h"
#include
#include
#include
#include
/* Puts |self| into the empty state: no buffer, zero length and zero capacity. */
static void string_init(QuickMediaString *self) {
    self->capacity = 0;
    self->size = 0;
    self->data = NULL;
}
/* Frees the buffer held by |self| and resets it to the empty state. */
static void string_deinit(QuickMediaString *self) {
    void *buffer = self->data;
    self->data = NULL;
    self->capacity = 0;
    self->size = 0;
    free(buffer);
}
/*
 * Makes sure the buffer of |self| can hold at least |new_capacity| bytes.
 * Growth starts from 8 bytes (when the string has no capacity yet) and the
 * capacity is increased by half of itself until it is large enough.
 * Returns 0 on success, including when no growth was needed, and 1 when
 * realloc fails; on failure the fields of |self| are not modified.
 */
static int string_ensure_capacity(QuickMediaString *self, size_t new_capacity) {
    if(new_capacity <= self->capacity)
        return 0;

    size_t grown = self->capacity == 0 ? 8 : self->capacity;
    while(grown < new_capacity)
        grown += grown / 2;

    void *resized = realloc(self->data, grown);
    if(resized == NULL) {
        fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, grown);
        return 1;
    }

    self->capacity = grown;
    self->data = resized;
    return 0;
}
/*
 * Appends |size| bytes from |str| to |self| and writes a terminating NUL byte
 * after them (the NUL is not counted in self->size).
 * Appending zero bytes does nothing. Returns 0 on success, otherwise the
 * non-zero result of the failed capacity request.
 */
static int string_append(QuickMediaString *self, const char *str, size_t size) {
    if(size == 0)
        return 0;

    const size_t new_size = self->size + size;
    const int err = string_ensure_capacity(self, new_size + 1);
    if(err != 0)
        return err;

    char *bytes = (char*)self->data;
    memcpy(bytes + self->size, str, size);
    bytes[new_size] = '\0';
    self->size = new_size;
    return 0;
}
/*
 * Returns the index of the first byte in |str| (length |size|) that differs
 * from |not_char|, or |size| when every byte equals |not_char|.
 * |not_char| must not be the NUL byte (checked with assert).
 */
static size_t find_first_not_char(const char *str, size_t size, char not_char) {
    assert(not_char != '\0');
    size_t skipped = 0;
    while(skipped < size) {
        if(str[skipped] != not_char)
            break;
        ++skipped;
    }
    return skipped;
}
/* Returns the byte of |str| at |index|, or |fallback| when |index| is at or past the end of the view. */
static char string_view_char_or(const QuickMediaStringView *str, size_t index, char fallback) {
    return index < str->size ? str->data[index] : fallback;
}
/*
 * Returns 1 if |c| is a space, tab, line feed, vertical tab or carriage
 * return, otherwise 0. Form feed is not treated as whitespace here.
 */
static int is_whitespace(int c) {
    return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\r';
}
/* Returns 1 if |c| is a line feed or a carriage return, otherwise 0. */
static int is_newline(int c) {
    switch(c) {
        case '\n':
        case '\r':
            return 1;
        default:
            return 0;
    }
}
/*
 * Skips the leading bytes of |str| (length |size|) for which
 * |strip_filter_func| returns non-zero. The start of the remaining text is
 * stored in |output_str| and its length in |output_size|. No data is copied;
 * the result points into |str|.
 */
static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
    size_t skipped = 0;
    if(size != 0) {
        for(; skipped < size; ++skipped) {
            if(!strip_filter_func(str[skipped]))
                break;
        }
        str += skipped;
    }
    *output_str = str;
    *output_size = size - skipped;
}
/*
 * Computes the length of |str| (length |size|) after dropping the trailing
 * bytes for which |strip_filter_func| returns non-zero, and stores it in
 * |output_size|. The text itself is not modified.
 *
 * The scan uses size_t only, so it needs no POSIX ssize_t and does not
 * convert the unsigned size to a signed type.
 */
static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) {
    size_t end = size;
    /* |end| is one past the last kept byte; stop at the first byte the filter rejects */
    while(end > 0 && strip_filter_func(str[end - 1])) {
        --end;
    }
    *output_size = end;
}
/*
 * Removes bytes accepted by |strip_filter_func| from both ends of |str|
 * (length |size|). The start of the remaining text is stored in |output_str|
 * and its length in |output_size|. No data is copied.
 */
static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
    const char *begin = NULL;
    size_t remaining = 0;
    lstrip(str, size, &begin, &remaining, strip_filter_func);
    rstrip(begin, remaining, &remaining, strip_filter_func);
    *output_str = begin;
    *output_size = remaining;
}
/*
 * Compares |str| against the pattern |glob|. Returns 0 on match and 1 otherwise.
 * Leading and trailing whitespace is stripped from both views first.
 * When |is_glob| is zero the stripped views must be byte-for-byte equal.
 * When |is_glob| is non-zero, '*' in |glob| is handled as a wildcard and all
 * other bytes are compared literally (case-sensitive).
 */
static int str_glob_match(QuickMediaStringView str, QuickMediaStringView glob, int is_glob) {
size_t str_index = 0;
size_t glob_index = 0;
strip(str.data, str.size, &str.data, &str.size, is_whitespace);
strip(glob.data, glob.size, &glob.data, &glob.size, is_whitespace);
/* Plain comparison: same length and same bytes */
if(!is_glob) {
if(glob.size == str.size && memcmp(str.data, glob.data, str.size) == 0)
return 0;
else
return 1;
}
/* An empty string only matches an empty glob or a glob that is exactly "*" */
if(str.size == 0) {
/* TODO: What about glob = **** (more than one asterisk) */
if(glob.size == 0 || (glob.size == 1 && glob.data[0] == '*'))
return 0;
else
return 1;
}
/* Backtracking state: where the literal after the most recent '*' was found in |str|, */
/* the glob index of that literal, and the literal itself */
size_t prev_str_index = 0;
size_t prev_glob_index = 0;
char next_glob_c = '\0';
for(;;) {
/* Reading past the end of a view yields '\0', which is used as the end marker */
char glob_c = string_view_char_or(&glob, glob_index, '\0');
if(glob_c == '*') {
/* Skip the whole run of consecutive '*' */
glob_index += find_first_not_char(glob.data + glob_index, glob.size - glob_index, '*');
next_glob_c = string_view_char_or(&glob, glob_index, '\0');
/* The glob ends with '*': whatever is left of |str| matches */
if(next_glob_c == '\0')
return 0;
/* Jump to the first occurrence of the literal that follows the '*' */
const void *s_p = memchr(str.data + str_index, next_glob_c, str.size - str_index);
if(!s_p)
return 1;
str_index = (const char*)s_p - str.data;
prev_str_index = str_index;
prev_glob_index = glob_index;
} else {
char str_c = string_view_char_or(&str, str_index, '\0');
if(str_c != glob_c) {
/* Mismatch: retry from the next occurrence of the literal after the last '*' */
/* NOTE(review): if no '*' has been seen yet, next_glob_c is still '\0', so this */
/* memchr searches for a NUL byte and normally fails, giving "no match" */
str_index = prev_str_index + 1;
glob_index = prev_glob_index;
const void *s_p = memchr(str.data + str_index, next_glob_c, str.size - str_index);
if(!s_p)
return 1;
str_index = (const char*)s_p - str.data;
prev_str_index = str_index;
continue;
}
/* str_c equals glob_c here; '\0' is the fallback returned once both views are exhausted */
if(str_c == '\0')
return 0;
++str_index;
++glob_index;
}
}
/* Not reachable: the loop above only exits through return */
assert(0); /* shouldn't happen */
return 1;
}
/* Converts 'a'..'z' to upper case by subtracting 32; every other byte is returned unchanged. */
static char to_upper(char c) {
    const int is_lower = (c >= 'a' && c <= 'z');
    return is_lower ? (char)(c - 32) : c;
}
/*
 * Compares two string views while ignoring surrounding whitespace and the
 * case of the letters a-z. Returns 0 when they are equal and 1 otherwise.
 *
 * Both views are stripped BEFORE their lengths are compared. Comparing the
 * unstripped lengths would reject " a " against "a", accept "ab  " against
 * "abcd", and could index one view past its stripped end.
 */
static int string_views_equal_case_insensitive_strip(QuickMediaStringView str1, QuickMediaStringView str2) {
    strip(str1.data, str1.size, &str1.data, &str1.size, is_whitespace);
    strip(str2.data, str2.size, &str2.data, &str2.size, is_whitespace);
    if(str1.size != str2.size)
        return 1;

    for(size_t i = 0; i < str1.size; ++i) {
        if(to_upper(str1.data[i]) != to_upper(str2.data[i]))
            return 1;
    }
    return 0;
}
/*
 * Walks the attribute list of |node| and returns the first attribute whose
 * key equals |name| according to string_views_equal_case_insensitive_strip,
 * or NULL when there is no such attribute.
 */
static QuickMediaHtmlAttribute* get_attribute_by_name(QuickMediaHtmlNode *node, QuickMediaStringView name) {
    QuickMediaHtmlAttribute *candidate = node->first_attribute;
    while(candidate) {
        if(string_views_equal_case_insensitive_strip(candidate->key, name) == 0)
            break;
        candidate = candidate->next;
    }
    return candidate;
}
/*
 * Searches the sibling list that starts at |node| for tag nodes matching
 * |search_data|. For every match, either the search continues in the matched
 * node's children with |search_data->child| (when set), or |result_callback|
 * is called with the matched node and |userdata|.
 * When |search_data->recursive| is set, the same search is then repeated in the
 * children of every sibling.
 * Returns 1 as soon as |result_callback| (or a nested search) returns non-zero,
 * which stops the search; returns 0 otherwise, including when |node| is NULL.
 */
static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
if(!node)
return 0;
/* Counts the siblings in this list that passed the name (and param) test so far */
int match_index = 0;
/* We use two loops because we want to find children before grandchildren */
for(QuickMediaHtmlChildNode *child = node; child; child = child->next) {
/* Skip text nodes (is_tag == 0) and tags with an empty name */
if(!child->node.is_tag || child->node.name.size == 0)
continue;
/* Match without node name or node name matches */
if(search_data->name.size == 0 || string_views_equal_case_insensitive_strip(child->node.name, search_data->name) == 0) {
/* on_match: handle one matched node. It returns 1 from find_child_nodes itself when */
/* the nested search or the callback asks to stop. The match node's scratch string */
/* (__str) is initialized before the callback and released right after it. */
/* The macro is not #undef'd in this function. */
#define on_match() do { \
if(search_data->child) { \
if(find_child_nodes(child->node.first_child, search_data->child, result_callback, userdata) != 0) \
return 1; \
} else { \
QuickMediaMatchNode match_node; \
match_node.node = &child->node; \
string_init(&match_node.__str); \
if(result_callback(&match_node, userdata) != 0) { \
string_deinit(&match_node.__str); \
return 1; \
} \
string_deinit(&match_node.__str); \
} \
} while(0)
/* If we search without param, then it's a match */
if(!search_data->param.defined) {
/* An index of -1 accepts every match; otherwise only the match with that ordinal */
if(search_data->param.index == -1 || search_data->param.index == match_index)
on_match();
++match_index;
continue;
}
QuickMediaHtmlAttribute *child_attr = get_attribute_by_name(&child->node, search_data->param.name);
/* Couldn't find the param that we want to match against */
if(!child_attr)
continue;
assert(search_data->param.value.size > 0);
/* If the param value matches what we want to search for */
if(str_glob_match(child_attr->value, search_data->param.value, search_data->param.value_is_glob) == 0) {
/* Same index filter as in the no-param case above */
if(search_data->param.index == -1 || search_data->param.index == match_index)
on_match();
++match_index;
continue;
}
}
}
/* Second pass: descend into the children of every sibling with the same search. */
/* Each nested call starts with its own match_index of 0. */
if(search_data->recursive) {
for(QuickMediaHtmlChildNode *child = node; child; child = child->next) {
if(find_child_nodes(child->node.first_child, search_data, result_callback, userdata) != 0)
return 1;
}
}
return 0;
}
/*
 * Runs |search_data| against the children of the root node of |self| and
 * reports matches through |result_callback|.
 * Returns -1 when |search_data| or |result_callback| is NULL (this also
 * triggers an assert in debug builds) and 0 otherwise. The result of the
 * search itself is not propagated.
 */
static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    assert(search_data);
    assert(result_callback);
    if(search_data && result_callback) {
        find_child_nodes(self->root_node.first_child, search_data, result_callback, userdata);
        return 0;
    }
    return -1;
}
/*
 * Forward declarations. html_node_deinit calls html_node_child_deinit before
 * the latter is defined. Because these declarations are static, the later
 * definitions (written without a storage class) have internal linkage.
 */
static void html_node_child_init(QuickMediaHtmlChildNode *self, QuickMediaHtmlNode *parent);
static void html_node_child_deinit(QuickMediaHtmlChildNode *self);
/* Resets |self| to an empty attribute: no key, no value and no successor in the list. */
static void html_attribute_init(QuickMediaHtmlAttribute *self) {
    self->key.data = NULL;
    self->key.size = 0;
    /* This line previously ended with a comma, chaining it to the next assignment */
    self->value.data = NULL;
    self->value.size = 0;
    self->next = NULL;
}
/*
 * Frees every attribute that follows |self| in the list and resets |self| to
 * the empty state. |self| itself is not freed; that is left to the caller.
 *
 * The list is walked iteratively so that stack usage does not grow with the
 * number of attributes on a tag.
 */
static void html_attribute_deinit(QuickMediaHtmlAttribute *self) {
    QuickMediaHtmlAttribute *cursor = self->next;
    while(cursor) {
        /* Read the successor before the current element is released */
        QuickMediaHtmlAttribute *following = cursor->next;
        free(cursor);
        cursor = following;
    }
    html_attribute_init(self);
}
/*
 * Resets |self| to an empty tag node: is_tag is 1, the name is empty and the
 * node has no parent, attributes or children.
 */
static void html_node_init(QuickMediaHtmlNode *self) {
    self->parent = NULL;
    self->first_child = NULL;
    self->last_child = NULL;
    self->first_attribute = NULL;
    self->last_attribute = NULL;
    self->name.size = 0;
    self->name.data = NULL;
    self->is_tag = 1;
}
/*
 * Releases the attribute list and the child list owned by |self|, then
 * resets |self| to the state produced by html_node_init. |self| itself is
 * not freed.
 */
static void html_node_deinit(QuickMediaHtmlNode *self) {
    QuickMediaHtmlAttribute *attributes = self->first_attribute;
    if(attributes != NULL) {
        /* The deinit call releases the rest of the list; the head is freed here */
        html_attribute_deinit(attributes);
        free(attributes);
        self->first_attribute = NULL;
    }

    QuickMediaHtmlChildNode *children = self->first_child;
    if(children != NULL) {
        html_node_child_deinit(children);
        free(children);
        self->first_child = NULL;
    }

    html_node_init(self);
}
/*
 * Allocates a new attribute holding |key| and |value| and appends it to the
 * end of the attribute list of |self|. Only the data pointers and sizes are
 * copied, not the bytes they refer to.
 * Returns 0 on success and 1 when the allocation fails.
 */
static int html_node_add_attribute(QuickMediaHtmlNode *self, HtmlStringView key, HtmlStringView value) {
    QuickMediaHtmlAttribute *added = malloc(sizeof(QuickMediaHtmlAttribute));
    if(added == NULL)
        return 1;

    html_attribute_init(added);
    added->key.data = key.data;
    added->key.size = key.size;
    added->value.data = value.data;
    added->value.size = value.size;

    /* An empty list gets a new head, otherwise the new attribute is linked after the tail */
    if(!self->last_attribute)
        self->first_attribute = added;
    else
        self->last_attribute->next = added;
    self->last_attribute = added;
    return 0;
}
/*
 * Initializes |self| as an empty tag node whose parent is |parent|. When
 * |parent| is not NULL, |self| is also appended to the end of the parent's
 * child list.
 */
void html_node_child_init(QuickMediaHtmlChildNode *self, QuickMediaHtmlNode *parent) {
    html_node_init(&self->node);
    self->node.parent = parent;
    if(parent) {
        QuickMediaHtmlChildNode *tail = parent->last_child;
        if(tail)
            tail->next = self;
        else
            parent->first_child = self;
        parent->last_child = self;
    }
    self->next = NULL;
}
/*
 * Releases every sibling that follows |self| (including each sibling's own
 * attributes and children) and then releases the contents of |self|'s node.
 * |self| itself is not freed; that is left to the caller.
 *
 * Siblings are released in a loop rather than by recursing on |next|, so
 * stack usage does not grow with the number of siblings. Recursion still
 * happens per nesting level through html_node_deinit.
 */
void html_node_child_deinit(QuickMediaHtmlChildNode *self) {
    QuickMediaHtmlChildNode *sibling = self->next;
    self->next = NULL;
    while(sibling) {
        /* Detach the sibling first so that only its own subtree is released */
        QuickMediaHtmlChildNode *following = sibling->next;
        sibling->next = NULL;
        html_node_deinit(&sibling->node);
        free(sibling);
        sibling = following;
    }
    html_node_deinit(&self->node);
}
/*
 * Parser callback that builds the node tree. |userdata| points to a pointer
 * to the node that is currently being filled in; the pointer is moved to a
 * new child on a tag start and back to the parent on a tag end.
 *
 * - Tag start: a child tag node named after html_parser->tag_name is added
 *   and becomes the current node.
 * - Tag end: the parent becomes the current node, if there is one.
 * - Attribute: the key/value pair is appended to the current node.
 * - Text / javascript code: unless the stripped text is empty, a child node
 *   with is_tag == 0 is added whose name holds the unstripped text.
 *
 * Returns 0 on success and 1 when an allocation fails.
 */
static int html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) {
    QuickMediaHtmlNode **current_p = userdata;
    QuickMediaHtmlNode *current = *current_p;

    if(parse_type == HTML_PARSE_TAG_START) {
        QuickMediaHtmlChildNode *tag_node = malloc(sizeof(QuickMediaHtmlChildNode));
        if(!tag_node)
            return 1;
        html_node_child_init(tag_node, current);
        tag_node->node.name.data = html_parser->tag_name.data;
        tag_node->node.name.size = html_parser->tag_name.size;
        *current_p = &tag_node->node;
    } else if(parse_type == HTML_PARSE_TAG_END) {
        if(current->parent)
            *current_p = current->parent;
    } else if(parse_type == HTML_PARSE_ATTRIBUTE) {
        if(html_node_add_attribute(current, html_parser->attribute_key, html_parser->attribute_value) != 0)
            return 1;
    } else if(parse_type == HTML_PARSE_TEXT || parse_type == HTML_PARSE_JAVASCRIPT_CODE) {
        if(html_parser->text_stripped.size != 0) {
            QuickMediaHtmlChildNode *text_node = malloc(sizeof(QuickMediaHtmlChildNode));
            if(!text_node)
                return 1;
            html_node_child_init(text_node, current);
            text_node->node.is_tag = 0;
            text_node->node.name.data = html_parser->text.data;
            text_node->node.name.size = html_parser->text.size;
        }
    }
    return 0;
}
/*
 * Looks up the attribute called |attribute_name| (a NUL-terminated string)
 * on |self| and returns its value. When the node has no such attribute the
 * returned view has NULL data and a size of 0.
 */
QuickMediaStringView quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) {
    QuickMediaStringView wanted;
    wanted.data = attribute_name;
    wanted.size = strlen(attribute_name);

    QuickMediaStringView result;
    result.data = NULL;
    result.size = 0;

    QuickMediaHtmlAttribute *found = get_attribute_by_name(self, wanted);
    if(found)
        result = found->value;
    return result;
}
/*
 * Returns the first direct child tag of |self| whose name equals |tag_name|
 * and that has an attribute |attribute_name| with the value
 * |attribute_value|, or NULL if no child qualifies. All three strings are
 * NUL-terminated and every comparison goes through
 * string_views_equal_case_insensitive_strip.
 */
QuickMediaHtmlNode* quickmedia_html_node_find_child(QuickMediaHtmlNode *self, const char *tag_name, const char *attribute_name, const char *attribute_value) {
    QuickMediaStringView tag;
    QuickMediaStringView attr_name;
    QuickMediaStringView attr_value;

    tag.data = tag_name;
    tag.size = strlen(tag_name);
    attr_name.data = attribute_name;
    attr_name.size = strlen(attribute_name);
    attr_value.data = attribute_value;
    attr_value.size = strlen(attribute_value);

    for(QuickMediaHtmlChildNode *child = self->first_child; child; child = child->next) {
        QuickMediaHtmlNode *candidate = &child->node;
        if(candidate->is_tag && string_views_equal_case_insensitive_strip(candidate->name, tag) == 0) {
            QuickMediaHtmlAttribute *attr = get_attribute_by_name(candidate, attr_name);
            if(attr && string_views_equal_case_insensitive_strip(attr->value, attr_value) == 0)
                return candidate;
        }
    }
    return NULL;
}
/*
 * Appends the text found in |node| and all of its descendants to |str|.
 *
 * - A text node contributes its text with leading and trailing newline
 *   characters removed.
 * - A "br" tag adds a newline before its children.
 * - "h1".."h6" and "p" tags add a newline before their children unless |str|
 *   is still empty.
 * - For those three kinds of tags another newline is added afterwards if the
 *   children contributed any text.
 * Tag names are compared exactly as stored (lower-case only).
 *
 * Returns 0 on success and 1 when appending fails. A failure in any
 * descendant is propagated to the caller instead of being ignored.
 */
static int merge_inner_text(QuickMediaHtmlNode *node, QuickMediaString *str) {
    if(!node->is_tag) {
        const char *inner_text = node->name.data;
        size_t inner_text_size = node->name.size;
        strip(inner_text, inner_text_size, &inner_text, &inner_text_size, is_newline);
        if(inner_text_size > 0 && string_append(str, inner_text, inner_text_size) != 0)
            return 1;
        return 0;
    }

    const char *name = node->name.data;
    const size_t name_size = node->name.size;
    int newline = 0;

    if(name_size == 2 && memcmp(name, "br", 2) == 0) {
        if(string_append(str, "\n", 1) != 0)
            return 1;
        newline = 1;
    } else if((name_size == 2 && name[0] == 'h' && name[1] >= '1' && name[1] <= '6')
           || (name_size == 1 && name[0] == 'p')) {
        /* Block-level tag: start on a new line unless nothing has been written yet */
        if(str->size > 0 && string_append(str, "\n", 1) != 0)
            return 1;
        newline = 1;
    }

    const size_t prev_size = str->size;
    for(QuickMediaHtmlChildNode *child = node->first_child; child; child = child->next) {
        if(merge_inner_text(&child->node, str) != 0)
            return 1;
    }

    /* Only terminate the block with a newline if the children added text */
    if(newline && str->size > prev_size && str->size > 0) {
        if(string_append(str, "\n", 1) != 0)
            return 1;
    }
    return 0;
}
/*
 * Returns the text contained in the matched node, with surrounding
 * whitespace stripped.
 *
 * - If merged text is already stored in self->__str, it is reused.
 * - A node without children yields an empty view (NULL data, size 0).
 * - If the only child is a text node, a view into that text is returned
 *   without copying.
 * - Otherwise the text of all descendants is merged into self->__str and a
 *   view into that buffer is returned.
 *
 * When merging fails, the partially filled buffer is discarded and an empty
 * view is returned, so a later call starts over instead of returning
 * truncated text.
 */
QuickMediaStringView quickmedia_html_node_get_text(QuickMediaMatchNode *self) {
    QuickMediaStringView text;
    text.data = NULL;
    text.size = 0;

    if(!self->__str.data) {
        QuickMediaHtmlChildNode *first = self->node->first_child;
        if(!first)
            return text;

        /* If the only child is the text node then there is no need to create a copy of it */
        if(!first->next && !first->node.is_tag) {
            text = first->node.name;
            strip(text.data, text.size, &text.data, &text.size, is_whitespace);
            return text;
        }

        if(merge_inner_text(self->node, &self->__str) != 0) {
            /* Do not keep half-merged text around as if it were the full result */
            string_deinit(&self->__str);
            return text;
        }
    }

    text.data = self->__str.data;
    text.size = self->__str.size;
    strip(text.data, text.size, &text.data, &text.size, is_whitespace);
    return text;
}
/*
 * Parses |size| bytes of |html_source| and builds the node tree below
 * self->root_node. Returns 0 on success. When parsing fails, everything that
 * was built so far is released and 1 is returned.
 */
int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source, size_t size) {
    html_node_init(&self->root_node);

    /* The parse callback moves this pointer up and down the tree while parsing */
    QuickMediaHtmlNode *current = &self->root_node;
    if(html_parser_parse(html_source, size, html_parse_callback, &current) == 0)
        return 0;

    quickmedia_html_search_deinit(self);
    return 1;
}
/* Releases the whole node tree stored below the root node of |self|. */
void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) {
    QuickMediaHtmlNode *root = &self->root_node;
    html_node_deinit(root);
}
/*
 * Parses |xpath| into a node search and runs it against |self|, reporting
 * matches through |result_callback| with |userdata|.
 * Returns the non-zero result of quickmedia_parse_xpath when parsing fails,
 * otherwise the result of quickmedia_html_find_nodes. The parsed search is
 * released in both cases.
 */
int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    QuickMediaNodeSearch search_data;
    quickmedia_node_search_init(&search_data);

    int result = quickmedia_parse_xpath(xpath, &search_data);
    if(result == 0)
        result = quickmedia_html_find_nodes(self, &search_data, result_callback, userdata);

    quickmedia_node_search_deinit(&search_data);
    return result;
}