#include "../include/quickmedia/HtmlSearch.h"
#include "../include/quickmedia/XpathParser.h"

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Puts |self| into the empty state: no buffer, zero size, zero capacity. */
static void string_init(QuickMediaString *self) {
    self->data = NULL;
    self->size = 0;
    self->capacity = 0;
}

/* Frees the buffer owned by |self| and returns it to the empty state. */
static void string_deinit(QuickMediaString *self) {
    free(self->data);
    string_init(self);
}

/*
    Grows the buffer of |self| so that it can hold at least |new_capacity| bytes.
    The capacity starts at 8 and grows by a factor of 1.5 until it is large enough.
    Returns 0 on success and 1 if the reallocation failed (|self| is left untouched in that case).
*/
static int string_ensure_capacity(QuickMediaString *self, size_t new_capacity) {
    if(self->capacity >= new_capacity)
        return 0;

    size_t capacity = self->capacity;
    if(capacity == 0)
        capacity = 8;

    while(capacity < new_capacity) {
        const size_t growth = capacity >> 1;
        /* Growing further would wrap around size_t, so settle for exactly what was asked for */
        if(capacity > (size_t)-1 - growth) {
            capacity = new_capacity;
            break;
        }
        capacity += growth;
    }

    void *new_data = realloc(self->data, capacity);
    if(!new_data) {
        fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, capacity);
        return 1;
    }

    self->data = new_data;
    self->capacity = capacity;
    return 0;
}

/*
    Appends |size| bytes from |str| to |self| and keeps the buffer null-terminated.
    Returns 0 on success, non-zero if memory could not be allocated.
*/
static int string_append(QuickMediaString *self, const char *str, size_t size) {
    /* self->size + size + 1 must not wrap around */
    if(size >= (size_t)-1 - self->size)
        return 1;

    /* +1 for the null terminator that is written after the appended data */
    int res = string_ensure_capacity(self, self->size + size + 1);
    if(res != 0)
        return res;

    char *data = (char*)self->data;
    if(size > 0)
        memcpy(data + self->size, str, size);
    data[self->size + size] = '\0';
    self->size += size;
    return 0;
}

/*
    Returns the index of the first character in |str| that is not |not_char|,
    or |size| if every character is |not_char|.
*/
static size_t find_first_not_char(const char *str, size_t size, char not_char) {
    assert(not_char != '\0');
    size_t i = 0;
    while(i < size && str[i] == not_char) {
        ++i;
    }
    return i;
}

/*
    Matches |str| against |glob|, where '*' in |glob| matches any sequence of
    characters (including none) and every other character matches itself.
    Returns 0 on match.
*/
static int str_glob_match(const QuickMediaStringView str, const QuickMediaStringView glob) {
    size_t str_index = 0;
    size_t glob_index = 0;
    /* Position to go back to when the text after the most recent '*' fails to match */
    int has_star = 0;
    size_t star_glob_index = 0;
    size_t star_str_index = 0;

    while(str_index < str.size) {
        if(glob_index < glob.size && glob.data[glob_index] == '*') {
            /* Start by letting '*' match nothing, extend it on mismatch */
            has_star = 1;
            star_glob_index = glob_index;
            star_str_index = str_index;
            ++glob_index;
        } else if(glob_index < glob.size && glob.data[glob_index] == str.data[str_index]) {
            ++str_index;
            ++glob_index;
        } else if(has_star) {
            /* Let the most recent '*' swallow one more character and retry */
            ++star_str_index;
            str_index = star_str_index;
            glob_index = star_glob_index + 1;
        } else {
            return 1;
        }
    }

    /* |str| is consumed. Whatever remains of |glob| may only be asterisks */
    glob_index += find_first_not_char(glob.data + glob_index, glob.size - glob_index, '*');
    return glob_index == glob.size ? 0 : 1;
}

/* Returns 0 if |str1| and |str2| have the same size and the same content */
static int string_views_equal(const QuickMediaStringView str1, const QuickMediaStringView str2) {
    if(str1.size != str2.size)
        return 1;

    /* Two empty views are equal. Don't hand possibly-NULL pointers to memcmp */
    if(str1.size == 0)
        return 0;

    return memcmp(str2.data, str1.data, str1.size) == 0 ? 0 : 1;
}

/* Returns the first attribute of |node| whose key equals |name|, or NULL if there is none */
static QuickMediaHtmlAttribute* get_attribute_by_name(QuickMediaHtmlNode *node, QuickMediaStringView name) {
    QuickMediaHtmlAttribute *attr = node->first_attribute;
    while(attr) {
        if(string_views_equal(attr->key, name) == 0)
            return attr;
        attr = attr->next;
    }
    return NULL;
}

static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata);

/*
    Called for a node that satisfies |search_data|.
    If the search has a child search then the search continues among the children of |child| using it,
    otherwise |child| is reported to |result_callback|.
    Returns 1 if the search should stop, 0 otherwise.
*/
static int on_node_match(QuickMediaHtmlChildNode *child, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    if(search_data->child)
        return find_child_nodes(child->node.first_child, search_data->child, result_callback, userdata) != 0 ? 1 : 0;

    QuickMediaMatchNode match_node;
    match_node.node = &child->node;
    string_init(&match_node.__str);
    const int callback_result = result_callback(&match_node, userdata);
    /* The callback may have filled __str through quickmedia_html_node_get_text */
    string_deinit(&match_node.__str);
    return callback_result != 0 ? 1 : 0;
}

/*
    Searches |node| and its following siblings (and, if the search is recursive, their descendants)
    for tags that match |search_data|.
    Returns 1 if |result_callback| returned non-zero (which stops the search), 0 otherwise.
*/
static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    if(!node)
        return 0;

    /* We use two loops because we want to find children before grandchildren */
    for(QuickMediaHtmlChildNode *child = node; child; child = child->next) {
        /* A text node doesn't have a name */
        if(!child->node.is_tag || child->node.name.size == 0)
            continue;

        /* An empty search name matches any tag, otherwise the tag name has to be equal */
        if(search_data->name.size != 0 && string_views_equal(child->node.name, search_data->name) != 0)
            continue;

        /* If we search without param, then the name match is enough */
        if(search_data->param.defined) {
            QuickMediaHtmlAttribute *child_attr = get_attribute_by_name(&child->node, search_data->param.name);
            /* Couldn't find the param that we want to match against */
            if(!child_attr)
                continue;

            assert(search_data->param.value.size > 0);
            /* The param value doesn't match what we want to search for */
            if(str_glob_match(child_attr->value, search_data->param.value) != 0)
                continue;
        }

        if(on_node_match(child, search_data, result_callback, userdata) != 0)
            return 1;
    }

    if(search_data->recursive) {
        for(QuickMediaHtmlChildNode *child = node; child; child = child->next) {
            if(find_child_nodes(child->node.first_child, search_data, result_callback, userdata) != 0)
                return 1;
        }
    }

    return 0;
}

/*
    Runs |search_data| against the children of the root node of |self|.
    Returns -1 if |search_data| or |result_callback| is NULL, 0 otherwise.
*/
static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    assert(search_data);
    assert(result_callback);
    if(!search_data || !result_callback)
        return -1;

    /* A non-zero result only means that the callback stopped the search, which is not reported as an error */
    (void)find_child_nodes(self->root_node.first_child, search_data, result_callback, userdata);
    return 0;
}

static void html_node_child_init(QuickMediaHtmlChildNode *self, QuickMediaHtmlNode *parent);
static void html_node_child_deinit(QuickMediaHtmlChildNode *self);

/* Clears key, value and the link to the next attribute */
static void html_attribute_init(QuickMediaHtmlAttribute *self) {
    self->key.data = NULL;
    self->key.size = 0;
    self->value.data = NULL;
    self->value.size = 0;
    self->next = NULL;
}

/*
    Frees every attribute that follows |self| in the list and resets |self|.
    |self| itself is not freed, that is up to the caller.
*/
static void html_attribute_deinit(QuickMediaHtmlAttribute *self) {
    /* Iterative on purpose, the stack usage should not depend on the number of attributes */
    QuickMediaHtmlAttribute *attr = self->next;
    while(attr) {
        QuickMediaHtmlAttribute *next_attr = attr->next;
        free(attr);
        attr = next_attr;
    }
    html_attribute_init(self);
}

/* Initializes |self| as a tag without name, attributes, children and parent */
static void html_node_init(QuickMediaHtmlNode *self) {
    self->is_tag = 1;
    self->name.data = NULL;
    self->name.size = 0;
    self->first_attribute = NULL;
    self->last_attribute = NULL;
    self->first_child = NULL;
    self->last_child = NULL;
    self->parent = NULL;
}

/* Frees all attributes and children of |self| and resets it. |self| itself is not freed */
static void html_node_deinit(QuickMediaHtmlNode *self) {
    if(self->first_attribute) {
        html_attribute_deinit(self->first_attribute);
        free(self->first_attribute);
        self->first_attribute = NULL;
    }

    if(self->first_child) {
        html_node_child_deinit(self->first_child);
        free(self->first_child);
        self->first_child = NULL;
    }

    html_node_init(self);
}

/*
    Appends a new attribute to |self|. |key| and |value| are not copied, only their data pointer and size are stored.
    Returns 0 on success, 1 if memory could not be allocated.
*/
static int html_node_add_attribute(QuickMediaHtmlNode *self, HtmlStringView key, HtmlStringView value) {
    QuickMediaHtmlAttribute *attribute = malloc(sizeof(QuickMediaHtmlAttribute));
    if(!attribute)
        return 1;

    html_attribute_init(attribute);
    attribute->key.data = key.data;
    attribute->key.size = key.size;
    attribute->value.data = value.data;
    attribute->value.size = value.size;

    if(self->last_attribute)
        self->last_attribute->next = attribute;
    else
        self->first_attribute = attribute;
    self->last_attribute = attribute;
    return 0;
}

/* Initializes |self| and, if |parent| is not NULL, appends it to the children of |parent| */
static void html_node_child_init(QuickMediaHtmlChildNode *self, QuickMediaHtmlNode *parent) {
    html_node_init(&self->node);
    self->node.parent = parent;
    self->next = NULL;

    if(!parent)
        return;

    if(parent->last_child)
        parent->last_child->next = self;
    else
        parent->first_child = self;
    parent->last_child = self;
}

/*
    Frees every sibling that follows |self| (including their subtrees) and deinitializes the node of |self|.
    |self| itself is not freed, that is up to the caller.
*/
static void html_node_child_deinit(QuickMediaHtmlChildNode *self) {
    /*
        The sibling list is walked iteratively so the stack usage only depends on
        how deep the tree is, not on how many siblings a node has.
    */
    QuickMediaHtmlChildNode *sibling = self->next;
    while(sibling) {
        QuickMediaHtmlChildNode *next_sibling = sibling->next;
        html_node_deinit(&sibling->node);
        free(sibling);
        sibling = next_sibling;
    }
    self->next = NULL;

    html_node_deinit(&self->node);
}

/*
    Builds the node tree from the events of the html parser.
    |userdata| points to the pointer to the node that is currently being parsed. That pointer is moved
    to the new node when a tag starts and back to the parent when a tag ends.
    Returns 0 on success, 1 if memory could not be allocated.
*/
static int html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) {
    QuickMediaHtmlNode **html_node_p = userdata;
    QuickMediaHtmlNode *html_node = *html_node_p;

    switch(parse_type) {
        case HTML_PARSE_TAG_START: {
            QuickMediaHtmlChildNode *child_node = malloc(sizeof(QuickMediaHtmlChildNode));
            if(!child_node)
                return 1;

            html_node_child_init(child_node, html_node);
            child_node->node.name.data = html_parser->tag_name.data;
            child_node->node.name.size = html_parser->tag_name.size;
            /* Attributes, text and tags that follow belong to the new node */
            *html_node_p = &child_node->node;
            break;
        }
        case HTML_PARSE_TAG_END: {
            /* The root node has no parent, stay there */
            if(html_node->parent)
                *html_node_p = html_node->parent;
            break;
        }
        case HTML_PARSE_ATTRIBUTE: {
            if(html_node_add_attribute(html_node, html_parser->attribute_key, html_parser->attribute_value) != 0)
                return 1;
            break;
        }
        case HTML_PARSE_TEXT:
            /* fallthrough */
        case HTML_PARSE_JAVASCRIPT_CODE: {
            QuickMediaHtmlChildNode *child_node = malloc(sizeof(QuickMediaHtmlChildNode));
            if(!child_node)
                return 1;

            html_node_child_init(child_node, html_node);
            /* A text node stores its text in the name field */
            child_node->node.is_tag = 0;
            child_node->node.name.data = html_parser->text.data;
            child_node->node.name.size = html_parser->text.size;
            break;
        }
        default:
            break;
    }
    return 0;
}

/*
    Returns the value of the attribute called |attribute_name| of the matched node,
    or an empty view (NULL data, size 0) if the node doesn't have that attribute.
    |attribute_name| has to be a null-terminated string.
*/
QuickMediaStringView quickmedia_html_node_get_attribute_value(QuickMediaMatchNode *self, const char *attribute_name) {
    QuickMediaStringView attr_name;
    attr_name.data = attribute_name;
    attr_name.size = strlen(attribute_name);

    QuickMediaHtmlAttribute *attr = get_attribute_by_name(self->node, attr_name);
    if(attr)
        return attr->value;

    QuickMediaStringView no_value;
    no_value.data = NULL;
    no_value.size = 0;
    return no_value;
}

/* Returns 1 if |c| is a space, newline, carriage return, tab or vertical tab */
static int is_whitespace(int c) {
    return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v';
}

/* Returns 1 if |c| is a newline or carriage return */
static int is_newline(int c) {
    return c == '\n' || c == '\r';
}

/* Skips the leading characters of |str| that |strip_filter_func| returns non-zero for */
static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
    size_t i = 0;
    while(i < size && strip_filter_func(str[i])) {
        ++i;
    }
    *output_str = str + i;
    *output_size = size - i;
}

/* Drops the trailing characters of |str| that |strip_filter_func| returns non-zero for */
static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) {
    while(size > 0 && strip_filter_func(str[size - 1])) {
        --size;
    }
    *output_size = size;
}

/* lstrip followed by rstrip. The output is a view into |str|, nothing is copied */
static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
    lstrip(str, size, output_str, output_size, strip_filter_func);
    rstrip(*output_str, *output_size, output_size, strip_filter_func);
}

/*
    Appends the text of |node| and all of its descendants to |str|.
    A newline is inserted for <br>, and before and after the content of <h1> to <h6> and <p>.
    Returns 0 on success, 1 if memory could not be allocated.
*/
static int merge_inner_text(QuickMediaHtmlNode *node, QuickMediaString *str) {
    if(!node->is_tag) {
        const char *inner_text = node->name.data;
        size_t inner_text_size = node->name.size;
        strip(inner_text, inner_text_size, &inner_text, &inner_text_size, is_newline);
        /*
            Text that only consists of newlines is skipped.
            NOTE(review): the text that is appended is the untrimmed text, the stripped view is only used
            to decide if anything should be appended. Confirm that this is intended.
        */
        if(inner_text_size > 0) {
            if(string_append(str, node->name.data, node->name.size) != 0)
                return 1;
        }
        return 0;
    }

    const int is_br = node->name.size == 2 && memcmp(node->name.data, "br", 2) == 0;
    const int is_heading = node->name.size == 2 && node->name.data[0] == 'h' && (node->name.data[1] >= '1' && node->name.data[1] <= '6');
    const int is_paragraph = node->name.size == 1 && node->name.data[0] == 'p';

    if(is_br) {
        if(string_append(str, "\n", 1) != 0)
            return 1;
    } else if(is_heading || is_paragraph) {
        /* No leading newline if this is the first thing in the text */
        if(str->size > 0) {
            if(string_append(str, "\n", 1) != 0)
                return 1;
        }
    }

    const int newline = is_br || is_heading || is_paragraph;
    const size_t prev_size = str->size;

    for(QuickMediaHtmlChildNode *child = node->first_child; child; child = child->next) {
        if(merge_inner_text(&child->node, str) != 0)
            return 1;
    }

    /* Only end with a newline if the children added any text */
    if(newline && str->size > prev_size && str->size > 0) {
        if(string_append(str, "\n", 1) != 0)
            return 1;
    }

    return 0;
}

/*
    Returns the text of the matched node and its descendants with surrounding whitespace removed.
    The result is a view, either into the text of the only child if that child is a text node,
    or into a merged copy that is stored in |self| and reused by later calls.
    Returns an empty view (NULL data, size 0) if the node has no children or if memory could not be allocated.
*/
QuickMediaStringView quickmedia_html_node_get_text(QuickMediaMatchNode *self) {
    QuickMediaStringView text;
    text.data = NULL;
    text.size = 0;

    if(!self->__str.data) {
        QuickMediaHtmlChildNode *first_child = self->node->first_child;
        if(!first_child)
            return text;

        /* If the only child is the text node then there is no need to create a copy of it */
        if(!first_child->next && !first_child->node.is_tag) {
            text = first_child->node.name;
            strip(text.data, text.size, &text.data, &text.size, is_whitespace);
            return text;
        }

        if(merge_inner_text(self->node, &self->__str) != 0) {
            /* Don't keep a half-merged text around, a later call would return it as if it was complete */
            string_deinit(&self->__str);
            return text;
        }
    }

    text.data = self->__str.data;
    text.size = self->__str.size;
    strip(text.data, text.size, &text.data, &text.size, is_whitespace);
    return text;
}

/*
    Parses |html_source| into a node tree that is stored in |self|.
    The nodes keep the data pointers they get from the parser instead of copying the text.
    NOTE(review): this presumably means that |html_source| has to outlive |self|, confirm against html_parser_parse.
    Returns 0 on success, 1 on failure (in which case |self| has already been deinitialized).
*/
int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source, size_t size) {
    /* Utf8 BOM */
    if(size >= 3 && memcmp(html_source, "\xef\xbb\xbf", 3) == 0) {
        html_source += 3;
        size -= 3;
    }

    /* The node that is currently being parsed, html_parse_callback moves it up and down the tree */
    QuickMediaHtmlNode *html_node = &self->root_node;
    html_node_init(html_node);

    if(html_parser_parse(html_source, size, html_parse_callback, &html_node) != 0) {
        quickmedia_html_search_deinit(self);
        return 1;
    }

    return 0;
}

/* Frees the whole node tree of |self| */
void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) {
    html_node_deinit(&self->root_node);
}

/*
    Parses |xpath| and calls |result_callback| for every node that matches it.
    Returns 0 on success, otherwise the non-zero result of parsing the xpath or of the search.
*/
int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    QuickMediaNodeSearch search_data;
    quickmedia_node_search_init(&search_data);

    int result = quickmedia_parse_xpath(xpath, &search_data);
    if(result == 0)
        result = quickmedia_html_find_nodes(self, &search_data, result_callback, userdata);

    quickmedia_node_search_deinit(&search_data);
    return result;
}