#include "../include/quickmedia/HtmlSearch.h"
#include "../include/quickmedia/XpathParser.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

/* Puts |self| into the empty state: no buffer, zero size and zero capacity */
static void string_init(QuickMediaString *self) {
    self->data = NULL;
    self->size = 0;
    self->capacity = 0;
}

/* Frees the buffer owned by |self| and puts it back into the empty state */
static void string_deinit(QuickMediaString *self) {
    free(self->data);
    self->data = NULL;
    self->size = 0;
    self->capacity = 0;
}

/*
    Grows the buffer of |self| so that it can hold at least |new_capacity| bytes.
    The capacity starts at 8 and grows by 50% per step.
    Returns 0 on success and 1 if the reallocation failed, in which case |self| is left unchanged.
*/
static int string_ensure_capacity(QuickMediaString *self, size_t new_capacity) {
    if(self->capacity >= new_capacity)
        return 0;

    size_t capacity = self->capacity;
    if(capacity == 0)
        capacity = 8;

    while(capacity < new_capacity) {
        const size_t growth = capacity >> 1;
        /* Growing by 50% would wrap around size_t, so request exactly what is needed instead */
        if(capacity > (size_t)-1 - growth) {
            capacity = new_capacity;
            break;
        }
        capacity += growth;
    }

    void *new_data = realloc(self->data, capacity);
    if(!new_data) {
        fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, capacity);
        return 1;
    }

    self->data = new_data;
    self->capacity = capacity;
    return 0;
}

/*
    Appends |size| bytes from |str| to |self| and null-terminates the result.
    Appending 0 bytes is a no-op. Returns 0 on success and non-zero on failure.
*/
static int string_append(QuickMediaString *self, const char *str, size_t size) {
    if(size == 0)
        return 0;

    /* self->size + size + 1 (for the null terminator) must not wrap around */
    if(size >= (size_t)-1 - self->size)
        return 1;

    int res = string_ensure_capacity(self, self->size + size + 1);
    if(res != 0)
        return res;

    memcpy((char*)self->data + self->size, str, size);
    ((char*)self->data)[self->size + size] = '\0';
    self->size += size;
    return 0;
}

/* Returns the index of the first character in |str| that is not |not_char|, or |size| if all of them are */
static size_t find_first_not_char(const char *str, size_t size, char not_char) {
    assert(not_char != '\0');
    size_t i = 0;
    for(; i < size && str[i] == not_char; ++i) {}
    return i;
}

/* Returns the character at |index| in |str|, or |fallback| if |index| is out of bounds */
static char string_view_char_or(const QuickMediaStringView *str, size_t index, char fallback) {
    if(index < str->size)
        return str->data[index];
    else
        return fallback;
}

/*
    Compares |str| against |glob|.
    If |is_glob| is 0 then the two have to be byte-for-byte equal.
    Otherwise each run of '*' in |glob| matches any sequence of characters (including none).
    Returns 0 on match.
*/
static int str_glob_match(const QuickMediaStringView str, const QuickMediaStringView glob, int is_glob) {
    size_t str_index = 0;
    size_t glob_index = 0;

    if(!is_glob) {
        /* The size check comes first so memcmp is never called for empty (possibly NULL) views */
        if(glob.size == str.size && (str.size == 0 || memcmp(str.data, glob.data, str.size) == 0))
            return 0;
        else
            return 1;
    }

    if(str.size == 0) {
        /* An empty string is matched by an empty glob and by a glob that consists of nothing but asterisks */
        if(find_first_not_char(glob.data, glob.size, '*') == glob.size)
            return 0;
        else
            return 1;
    }

    /* Where matching resumes from after a mismatch: the position right after the last asterisk run */
    size_t prev_str_index = 0;
    size_t prev_glob_index = 0;
    /* The first glob character after the last asterisk run. '\0' until an asterisk has been seen */
    char next_glob_c = '\0';

    for(;;) {
        char glob_c = string_view_char_or(&glob, glob_index, '\0');
        if(glob_c == '*') {
            glob_index += find_first_not_char(glob.data + glob_index, glob.size - glob_index, '*');
            next_glob_c = string_view_char_or(&glob, glob_index, '\0');
            /* The glob ends with an asterisk, which matches the rest of the string */
            if(next_glob_c == '\0')
                return 0;

            const void *s_p = memchr(str.data + str_index, next_glob_c, str.size - str_index);
            if(!s_p)
                return 1;

            str_index = (const char*)s_p - str.data;
            prev_str_index = str_index;
            prev_glob_index = glob_index;
        } else {
            char str_c = string_view_char_or(&str, str_index, '\0');
            if(str_c != glob_c) {
                /* No asterisk has been seen yet, so there is nothing to retry from */
                if(next_glob_c == '\0')
                    return 1;

                /* Retry the part after the last asterisk run, starting one character further into the string */
                str_index = prev_str_index + 1;
                glob_index = prev_glob_index;
                const void *s_p = memchr(str.data + str_index, next_glob_c, str.size - str_index);
                if(!s_p)
                    return 1;

                str_index = (const char*)s_p - str.data;
                prev_str_index = str_index;
                continue;
            }

            /* Both the string and the glob have been consumed */
            if(str_c == '\0')
                return 0;

            ++str_index;
            ++glob_index;
        }
    }

    assert(0); /* shouldn't happen */
    return 1;
}

/* Converts an ASCII lowercase letter to uppercase. Every other character is returned unchanged */
static char to_upper(char c) {
    if(c >= 'a' && c <= 'z')
        return c - 32;
    else
        return c;
}

/* Returns 0 if |str1| and |str2| are equal when ignoring ASCII case, otherwise 1 */
static int string_views_equal_case_insensitive(const QuickMediaStringView str1, const QuickMediaStringView str2) {
    if(str2.size != str1.size)
        return 1;

    for(size_t i = 0; i < str1.size; ++i) {
        char c1 = str1.data[i];
        char c2 = str2.data[i];
        if(to_upper(c1) != to_upper(c2))
            return 1;
    }

    return 0;
}

/* Returns the first attribute of |node| whose key equals |name| (ignoring ASCII case), or NULL if there is none */
static QuickMediaHtmlAttribute* get_attribute_by_name(QuickMediaHtmlNode *node, QuickMediaStringView name) {
    for(QuickMediaHtmlAttribute *attr = node->first_attribute; attr; attr = attr->next) {
        if(string_views_equal_case_insensitive(attr->key, name) == 0)
            return attr;
    }
    return NULL;
}

static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata);

/*
    Called for a node that matches |search_data|.
    If the search has a child search then that search continues among the children of |child|,
    otherwise |child| is a result and is passed to |result_callback|.
    Returns non-zero if the search should stop.
*/
static int handle_matched_node(QuickMediaHtmlChildNode *child, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    if(search_data->child)
        return find_child_nodes(child->node.first_child, search_data->child, result_callback, userdata) != 0;

    QuickMediaMatchNode match_node;
    match_node.node = &child->node;
    string_init(&match_node.__str);
    const int stop = result_callback(&match_node, userdata) != 0;
    /* The string is only valid for the duration of the callback */
    string_deinit(&match_node.__str);
    return stop;
}

/*
    Searches |node| and its following siblings for tags that match |search_data|.
    If |search_data->recursive| is set then the descendants of those nodes are searched as well.
    Returns 1 if the search was stopped (the result callback returned non-zero), otherwise 0.
*/
static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    if(!node)
        return 0;

    /* Counts the matching siblings seen so far, to compare against the index in the search param */
    int match_index = 0;
    /* We use two loops because we want to find children before grandchildren */
    for(QuickMediaHtmlChildNode *child = node; child; child = child->next) {
        /* A text node doesn't have a name */
        if(!child->node.is_tag || child->node.name.size == 0)
            continue;

        /* Match without node name or node name matches */
        if(search_data->name.size != 0 && string_views_equal_case_insensitive(child->node.name, search_data->name) != 0)
            continue;

        if(search_data->param.defined) {
            QuickMediaHtmlAttribute *child_attr = get_attribute_by_name(&child->node, search_data->param.name);
            /* Couldn't find the param that we want to match against */
            if(!child_attr)
                continue;

            assert(search_data->param.value.size > 0);
            /* The param value doesn't match what we want to search for */
            if(str_glob_match(child_attr->value, search_data->param.value, search_data->param.value_is_glob) != 0)
                continue;
        }
        /* If we search without param, then it's a match */

        /* An index of -1 selects every match, otherwise only the match at that index is selected */
        if(search_data->param.index == -1 || search_data->param.index == match_index) {
            if(handle_matched_node(child, search_data, result_callback, userdata) != 0)
                return 1;
        }
        ++match_index;
    }

    if(search_data->recursive) {
        for(QuickMediaHtmlChildNode *child = node; child; child = child->next) {
            if(find_child_nodes(child->node.first_child, search_data, result_callback, userdata) != 0)
                return 1;
        }
    }

    return 0;
}

/*
    Searches the whole document in |self| for nodes that match |search_data| and calls |result_callback| for each result.
    Returns -1 if |search_data| or |result_callback| is NULL, otherwise 0
    (a search that was stopped by the callback also returns 0).
*/
static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    assert(search_data);
    assert(result_callback);
    if(!search_data || !result_callback)
        return -1;

    find_child_nodes(self->root_node.first_child, search_data, result_callback, userdata);
    return 0;
}
static void html_node_child_init(QuickMediaHtmlChildNode *self, QuickMediaHtmlNode *parent); static void html_node_child_deinit(QuickMediaHtmlChildNode *self); static void html_attribute_init(QuickMediaHtmlAttribute *self) { self->key.data = NULL; self->key.size = 0; self->value.data = NULL, self->value.size = 0; self->next = NULL; } static void html_attribute_deinit(QuickMediaHtmlAttribute *self) { if(self->next) { html_attribute_deinit(self->next); free(self->next); self->next = NULL; } html_attribute_init(self); } static void html_node_init(QuickMediaHtmlNode *self) { self->is_tag = 1; self->name.data = NULL; self->name.size = 0; self->first_attribute = NULL; self->last_attribute = NULL; self->first_child = NULL; self->last_child = NULL; self->parent = NULL; } static void html_node_deinit(QuickMediaHtmlNode *self) { if(self->first_attribute) { html_attribute_deinit(self->first_attribute); free(self->first_attribute); self->first_attribute = NULL; } if(self->first_child) { html_node_child_deinit(self->first_child); free(self->first_child); self->first_child = NULL; } html_node_init(self); } static int html_node_add_attribute(QuickMediaHtmlNode *self, HtmlStringView key, HtmlStringView value) { QuickMediaHtmlAttribute *attribute = malloc(sizeof(QuickMediaHtmlAttribute)); if(!attribute) return 1; html_attribute_init(attribute); attribute->key.data = key.data; attribute->key.size = key.size; attribute->value.data = value.data; attribute->value.size = value.size; if(self->last_attribute) { self->last_attribute->next = attribute; self->last_attribute = attribute; } else { self->first_attribute = attribute; self->last_attribute = attribute; } return 0; } void html_node_child_init(QuickMediaHtmlChildNode *self, QuickMediaHtmlNode *parent) { html_node_init(&self->node); self->node.parent = parent; if(parent) { if(parent->last_child) { parent->last_child->next = self; parent->last_child = self; } else { parent->first_child = self; parent->last_child = self; } } self->next 
= NULL; } void html_node_child_deinit(QuickMediaHtmlChildNode *self) { if(self->next) { html_node_child_deinit(self->next); free(self->next); self->next = NULL; } html_node_deinit(&self->node); } static int is_whitespace(int c) { switch(c) { case ' ': case '\n': case '\r': case '\t': case '\v': return 1; default: return 0; } } static int is_newline(int c) { return c == '\n' || c == '\r'; } static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) { size_t i = 0; while(i < size && strip_filter_func(str[i])) { ++i; } *output_str = str + i; *output_size = size - i; } static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) { ssize_t i = size - 1; while(i >= 0 && strip_filter_func(str[i])) { --i; } *output_size = i + 1; } static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) { lstrip(str, size, output_str, output_size, strip_filter_func); rstrip(*output_str, *output_size, output_size, strip_filter_func); } static int html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) { QuickMediaHtmlNode **html_node_p = userdata; QuickMediaHtmlNode *html_node = *html_node_p; switch(parse_type) { case HTML_PARSE_TAG_START: { QuickMediaHtmlChildNode *child_node = malloc(sizeof(QuickMediaHtmlChildNode)); if(!child_node) return 1; html_node_child_init(child_node, html_node); child_node->node.name.data = html_parser->tag_name.data; child_node->node.name.size = html_parser->tag_name.size; *html_node_p = &child_node->node; break; } case HTML_PARSE_TAG_END: { if(html_node->parent) *html_node_p = html_node->parent; break; } case HTML_PARSE_ATTRIBUTE: { HtmlStringView attr_key = html_parser->attribute_key; HtmlStringView attr_value = html_parser->attribute_value; strip(attr_key.data, attr_key.size, &attr_key.data, &attr_key.size, is_whitespace); strip(attr_value.data, 
attr_value.size, &attr_value.data, &attr_value.size, is_whitespace); if(html_node_add_attribute(html_node, attr_key, attr_value) != 0) return 1; break; } case HTML_PARSE_TEXT: /* fallthrough */ case HTML_PARSE_JAVASCRIPT_CODE: { if(html_parser->text_stripped.size == 0) return 0; QuickMediaHtmlChildNode *child_node = malloc(sizeof(QuickMediaHtmlChildNode)); if(!child_node) return 1; html_node_child_init(child_node, html_node); child_node->node.is_tag = 0; child_node->node.name.data = html_parser->text.data; child_node->node.name.size = html_parser->text.size; break; } } return 0; } QuickMediaStringView quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) { QuickMediaStringView attr_name; attr_name.data = attribute_name; attr_name.size = strlen(attribute_name); QuickMediaHtmlAttribute *attr = get_attribute_by_name(self, attr_name); if(attr) { QuickMediaStringView attr_value = attr->value; strip(attr_value.data, attr_value.size, &attr_value.data, &attr_value.size, is_whitespace); return attr_value; } else { QuickMediaStringView attr_value; attr_value.data = NULL; attr_value.size = 0; return attr_value; } } static int merge_inner_text(QuickMediaHtmlNode *node, QuickMediaString *str) { if(node->is_tag) { int newline = 0; if(node->name.size == 2 && memcmp(node->name.data, "br", 2) == 0) { if(string_append(str, "\n", 1) != 0) return 1; newline = 1; } else if(node->name.size == 2 && node->name.data[0] == 'h' && (node->name.data[1] >= '1' && node->name.data[1] <= '6')) { if(str->size > 0) { if(string_append(str, "\n", 1) != 0) return 1; } newline = 1; } else if(node->name.size == 1 && node->name.data[0] == 'p') { if(str->size > 0) { if(string_append(str, "\n", 1) != 0) return 1; } newline = 1; } size_t prev_size = str->size; for(QuickMediaHtmlChildNode *child = node->first_child; child; child = child->next) { merge_inner_text(&child->node, str); } if(newline && str->size > prev_size && str->size > 0) { if(string_append(str, "\n", 1) 
!= 0) return 1; } } else { const char *inner_text = node->name.data; size_t inner_text_size = node->name.size; strip(inner_text, inner_text_size, &inner_text, &inner_text_size, is_newline); if(inner_text_size > 0) { if(string_append(str, inner_text, inner_text_size) != 0) return 1; } } return 0; } QuickMediaStringView quickmedia_html_node_get_text(QuickMediaMatchNode *self) { if(self->__str.data) { QuickMediaStringView text; text.data = self->__str.data; text.size = self->__str.size; strip(text.data, text.size, &text.data, &text.size, is_whitespace); return text; } if(!self->node->first_child) { QuickMediaStringView text; text.data = NULL; text.size = 0; return text; } /* If the only child is the text node then there is no need to create a copy of it */ if(!self->node->first_child->next && !self->node->first_child->node.is_tag) { QuickMediaStringView text = self->node->first_child->node.name; strip(text.data, text.size, &text.data, &text.size, is_whitespace); return text; } if(merge_inner_text(self->node, &self->__str) != 0) { QuickMediaStringView text; text.data = NULL; text.size = 0; return text; } QuickMediaStringView text; text.data = self->__str.data; text.size = self->__str.size; strip(text.data, text.size, &text.data, &text.size, is_whitespace); return text; } int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source, size_t size) { QuickMediaHtmlNode *html_node = &self->root_node; html_node_init(html_node); if(html_parser_parse(html_source, size, html_parse_callback, &html_node) != 0) { quickmedia_html_search_deinit(self); return 1; } return 0; } void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) { html_node_deinit(&self->root_node); } int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { QuickMediaNodeSearch search_data; quickmedia_node_search_init(&search_data); int result = quickmedia_parse_xpath(xpath, &search_data); 
if(result != 0) goto cleanup; result = quickmedia_html_find_nodes(self, &search_data, result_callback, userdata); cleanup: quickmedia_node_search_deinit(&search_data); return result; }