#include "../include/quickmedia/HtmlSearch.h"
#include "../include/quickmedia/XpathParser.h"

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
    NOTE(review): the header names of the two system includes were missing in the reviewed source.
    The tidy* functions and TidyBuffer used below require the HTML Tidy headers; confirm that
    these are the two headers that were originally included.
*/
#include <tidy.h>
#include <tidybuffio.h>

/* Puts |self| in the empty state: no buffer, zero size and zero capacity */
static void string_init(QuickMediaString *self) {
    self->data = NULL;
    self->size = 0;
    self->capacity = 0;
}

/* Frees the buffer owned by |self| and puts it back in the empty state */
static void string_deinit(QuickMediaString *self) {
    free(self->data);
    string_init(self);
}

/*
    Appends |size| bytes from |str| to |self| and keeps the buffer null terminated.
    The capacity starts at 8 bytes and grows by a factor of 1.5 until the data fits.
    Returns 0 on success. Returns 1 if the new size would overflow or if the allocation fails,
    in which case |self| is left unchanged.
*/
static int string_append(QuickMediaString *self, const char *str, size_t size) {
    /* One byte more than |self->size| + |size| is needed for the null terminator */
    if(size >= SIZE_MAX - self->size) {
        fprintf(stderr, "Failed to append %zu bytes to a string of size %zu: size overflow\n", size, self->size);
        return 1;
    }
    const size_t new_size = self->size + size;

    size_t new_capacity = self->capacity == 0 ? 8 : self->capacity;
    while(new_size + 1 > new_capacity) {
        const size_t growth = new_capacity >> 1;
        if(new_capacity > SIZE_MAX - growth) {
            /* Growing by 1.5 would overflow, fall back to the exact size needed */
            new_capacity = new_size + 1;
            break;
        }
        new_capacity += growth;
    }

    char *data = self->data;
    if(!data || new_capacity != self->capacity) {
        void *new_data = realloc(self->data, new_capacity);
        if(!new_data) {
            fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, new_capacity);
            return 1;
        }
        data = new_data;
    }

    /* |str| is allowed to be NULL when there is nothing to copy */
    if(size > 0)
        memcpy(data + self->size, str, size);
    data[new_size] = '\0';

    self->data = data;
    self->size = new_size;
    self->capacity = new_capacity;
    return 0;
}

/* Skips leading '\n' characters. |output_str| points into |str|, nothing is copied */
static void lstrip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
    size_t i = 0;
    while(i < size && str[i] == '\n') {
        ++i;
    }
    *output_str = str + i;
    *output_size = size - i;
}

/* Sets |output_size| to the size of |str| without its trailing '\n' characters */
static void rstrip_newline(const char *str, size_t size, size_t *output_size) {
    while(size > 0 && str[size - 1] == '\n') {
        --size;
    }
    *output_size = size;
}

/* Strips '\n' characters from both ends of |str|. |output_str| points into |str| */
static void strip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
    lstrip_newline(str, size, output_str, output_size);
    rstrip_newline(*output_str, *output_size, output_size);
}

/*
    Returns pointer to char that is not |not_char|, even if the first matching character is the null terminator.
    |not_char| can't be '\0'
*/
static const char* find_first_not_char(const char *str, char not_char) {
    assert(not_char != '\0');
    while(*str == not_char) {
        ++str;
    }
    return str;
}

/*
    Matches |str| against |glob|, where '*' in |glob| matches any sequence of characters (including none).
    All other characters have to match exactly.
    Returns 0 on match
*/
static int str_glob_match(const char *str, const char *glob) {
    /* Where to resume in |glob| and |str| when a literal after the most recent '*' fails to match */
    const char *retry_glob = NULL;
    const char *retry_str = NULL;

    for(;;) {
        if(*glob == '*') {
            glob = find_first_not_char(glob + 1, '*');
            /* A trailing '*' matches the rest of |str| */
            if(*glob == '\0')
                return 0;
            retry_glob = glob;
            retry_str = str;
            continue;
        }

        if(*str == *glob) {
            if(*str == '\0')
                return 0;
            ++str;
            ++glob;
            continue;
        }

        /* Mismatch: without a previous '*', or with no characters left for it to consume, the match fails */
        if(!retry_glob || *retry_str == '\0')
            return 1;

        /* Let the previous '*' consume one more character and try again from there */
        ++retry_str;
        str = retry_str;
        glob = retry_glob;
    }
}

/*
    Appends the text of all text nodes below |node| to |str|, depth first.
    Leading and trailing newlines of each text node are removed and a "br" node adds a newline.
    Returns 0 on success, non-zero if appending to |str| failed.
*/
static int add_inner_text_recursive(const TidyDoc doc, const TidyNode node, QuickMediaString *str) {
    for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
        const char *node_name = tidyNodeGetName(child);
        if(node_name && strcmp(node_name, "br") == 0) {
            if(string_append(str, "\n", 1) != 0)
                return 1;
        }

        int res = 0;
        if(tidyNodeGetType(child) == TidyNode_Text) {
            TidyBuffer tidy_buffer;
            tidyBufInit(&tidy_buffer);
            if(tidyNodeGetText(doc, child, &tidy_buffer)) {
                const char *inner_text = (const char*)tidy_buffer.bp;
                size_t inner_text_size = tidy_buffer.size;
                strip_newline(inner_text, inner_text_size, &inner_text, &inner_text_size);
                res = string_append(str, inner_text, inner_text_size);
            }
            /* The buffer has to be freed before returning, even when the append failed */
            tidyBufFree(&tidy_buffer);
        } else {
            res = add_inner_text_recursive(doc, child, str);
        }

        if(res != 0)
            return res;
    }
    return 0;
}

/* Returns the first attribute of |node| that is called |name|, or NULL if there is none */
static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
    assert(name);
    for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) {
        const char *attr_name = tidyAttrName(attr);
        if(attr_name && strcmp(name, attr_name) == 0)
            return attr;
    }
    return NULL;
}

static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata);

/* Returns 1 if |node| satisfies the name and param filters of |search_data|, otherwise 0 */
static int node_matches_search(TidyNode node, const QuickMediaNodeSearch *search_data) {
    const char *node_name = tidyNodeGetName(node);
    /* A text node doesn't have a name */
    if(!node_name)
        return 0;

    /* Match without node name or node name matches */
    if(search_data->name && strcmp(search_data->name, node_name) != 0)
        return 0;

    /* If we search without param, then it's a match */
    if(!search_data->param.defined)
        return 1;

    TidyAttr attr = get_attribute_by_name(node, search_data->param.name);
    /* Couldn't find the param that we want to match against */
    if(!attr)
        return 0;

    const char *attr_value = tidyAttrValue(attr);
    assert(search_data->param.value);
    /* If the param value matches what we want to search for */
    return attr_value && str_glob_match(attr_value, search_data->param.value) == 0;
}

/*
    Handles a node that matched |search_data|. If there is a child search then the search continues
    below |matched| with that child search, otherwise |matched| is a result and is given to |result_callback|.
    The text of the result node is freed as soon as the callback returns.
*/
static void handle_matching_node(TidyDoc tdoc, TidyNode matched, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    if(search_data->child) {
        find_child_nodes(tdoc, matched, search_data->child, result_callback, userdata);
        return;
    }

    QuickMediaHtmlNode result;
    result.doc = tdoc;
    result.node = matched;
    string_init(&result.text);
    result_callback(&result, userdata);
    string_deinit(&result.text);
}

/*
    Searches the children of |node| for nodes that match |search_data|.
    If |search_data->recursive| is set then the same search is also done in every child.
*/
static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    /* We use two loops because we want to find children before grandchildren */
    for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
        if(node_matches_search(child, search_data))
            handle_matching_node(tdoc, child, search_data, result_callback, userdata);
    }

    if(search_data->recursive) {
        for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
            find_child_nodes(tdoc, child, search_data, result_callback, userdata);
        }
    }
}

/* Returns the value of the attribute called |attribute_name|, or NULL if the node doesn't have that attribute */
const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) {
    TidyAttr attr = get_attribute_by_name((TidyNode)self->node, attribute_name);
    if(!attr)
        return NULL;
    return tidyAttrValue(attr);
}

/*
    Returns the combined text of all text nodes below this node.
    The text is built on the first call and stored in |self->text|; later calls return the stored text.
    The result is NULL if no text was stored, for example when the node has no text nodes below it.
*/
const char* quickmedia_html_node_get_text(QuickMediaHtmlNode *self) {
    if(self->text.data)
        return self->text.data;

    /* If collecting the text failed then a single space is appended to what was collected so far */
    if(add_inner_text_recursive((TidyDoc)self->doc, (TidyNode)self->node, &self->text) != 0)
        string_append(&self->text, " ", 1);

    return self->text.data;
}

/*
    Calls |result_callback| for every node in the document of |self| that matches |search_data|.
    Returns 0 on success and -1 if an argument is missing or if |self| has no parsed document.
*/
static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    assert(search_data);
    assert(result_callback);
    if(!search_data || !result_callback)
        return -1;

    /* The document is NULL if init failed or after deinit */
    if(!self->doc)
        return -1;

    TidyNode root_node = tidyGetRoot(self->doc);
    find_child_nodes(self->doc, root_node, search_data, result_callback, userdata);
    return 0;
}

/*
    Creates a tidy document and parses |html_source| into it.
    Returns 0 on success. Returns -1 if the document can't be created or if parsing fails,
    in which case |self->doc| is NULL and it's still safe to call quickmedia_html_search_deinit.
*/
int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source) {
    self->doc = tidyCreate();
    if(!self->doc)
        return -1;

    tidyOptSetBool(self->doc, TidyShowWarnings, no);
    tidyOptSetInt(self->doc, TidyUseCustomTags, 1);
    tidyOptSetInt(self->doc, TidyWrapLen, 0);
    /* tidyOptSetBool(self->doc, TidyForceOutput, yes); */

    if(tidyParseString(self->doc, html_source) < 0) {
        tidyRelease(self->doc);
        self->doc = NULL;
        return -1;
    }
    return 0;
}

/* Releases the tidy document, if there is one. Can be called more than once */
void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) {
    if(self->doc) {
        tidyRelease(self->doc);
        self->doc = NULL;
    }
}

/*
    Parses |xpath| and calls |result_callback| for every node that matches it.
    Returns 0 on success. If parsing |xpath| fails then the result of quickmedia_parse_xpath is returned,
    otherwise the result of the node search is returned.
*/
int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    QuickMediaNodeSearch search_data;
    quickmedia_node_search_init(&search_data);

    int result = quickmedia_parse_xpath(xpath, &search_data);
    if(result != 0)
        goto cleanup;

    result = quickmedia_html_find_nodes(self, &search_data, result_callback, userdata);

    cleanup:
    quickmedia_node_search_deinit(&search_data);
    return result;
}