#include "../include/quickmedia/HtmlSearch.h"
#include "../include/quickmedia/XpathParser.h"

/*
    NOTE(review): the system includes that followed the project headers were not
    readable in the reviewed copy of this file. The headers below are the standard
    ones this file needs for assert, malloc/free and strcmp. The libtidy
    declarations are assumed to be reachable through HtmlSearch.h - TODO confirm.
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>

static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata);

/*
    Returns the first attribute of |node| whose name is exactly |name|,
    or NULL if the node has no such attribute. |name| must not be NULL.
*/
static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
    assert(name);
    for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) {
        const char *attr_name = tidyAttrName(attr);
        if(attr_name && strcmp(name, attr_name) == 0)
            return attr;
    }
    return NULL;
}

/*
    Returns non-zero if |candidate| satisfies the name and attribute
    constraints of this search step, otherwise 0.
*/
static int node_matches(TidyNode candidate, const QuickMediaNodeSearch *search_data) {
    const char *candidate_name = tidyNodeGetName(candidate);
    /* A text node doesn't have a name */
    if(!candidate_name)
        return 0;

    /* A search step without a node name matches any named node */
    if(search_data->name && strcmp(search_data->name, candidate_name) != 0)
        return 0;

    /* If we search without param, then it's a match */
    if(!search_data->param.defined)
        return 1;

    TidyAttr candidate_attr = get_attribute_by_name(candidate, search_data->param.name);
    /* Couldn't find the param that we want to match against */
    if(!candidate_attr)
        return 0;

    const char *attr_value = tidyAttrValue(candidate_attr);
    assert(search_data->param.value);
    /* Match only if the param value is what we want to search for */
    return attr_value && strcmp(search_data->param.value, attr_value) == 0;
}

/*
    Handles a node that matched the current search step.
    If the search has another step, the search continues below |matched| with that step.
    Otherwise |matched| is reported to |result_callback|, and any text buffer
    that was created on the node during the callback is released afterwards.
*/
static void report_match(TidyDoc tdoc, TidyNode matched, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    if(search_data->child) {
        find_child_nodes(tdoc, matched, search_data->child, result_callback, userdata);
        return;
    }

    QuickMediaHtmlNode result;
    result.doc = tdoc;
    result.node = matched;
    result.text = NULL;
    result_callback(&result, userdata);
    /* The text buffer is only allocated if the callback asked for the node text */
    if(result.text) {
        tidyBufFree(result.text);
        free(result.text);
    }
}

/*
    Searches the children of |node| for nodes matching |search_data|.
    If the search step is recursive, the same step is then applied to
    the subtree of every child.
*/
static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    /* We use two loops because we want to find children before grandchildren */
    for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
        if(node_matches(child, search_data))
            report_match(tdoc, child, search_data, result_callback, userdata);
    }

    if(search_data->recursive) {
        for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
            find_child_nodes(tdoc, child, search_data, result_callback, userdata);
        }
    }
}

/*
    Returns the value of the attribute called |attribute_name| on the node,
    or NULL if the node doesn't have that attribute.
*/
const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) {
    TidyAttr attr = get_attribute_by_name((TidyNode)self->node, attribute_name);
    if(!attr)
        return NULL;
    return tidyAttrValue(attr);
}

/*
    Returns the text of the node's first child if that child is a text node.
    The text is extracted on the first call and stored in |self->text|; later calls return the stored text.
    Returns NULL if the first child is missing or isn't a text node,
    if memory allocation fails or if the text can't be extracted.
    The buffer stored in |self->text| has to be released with tidyBufFree followed by free
    (the search code in this file does that once the result callback has returned).
*/
const char* quickmedia_html_node_get_text(QuickMediaHtmlNode *self) {
    if(self->text)
        return (const char*)((TidyBuffer*)self->text)->bp;

    TidyNode child_node = tidyGetChild(self->node);
    if(!child_node || tidyNodeGetType(child_node) != TidyNode_Text)
        return NULL;

    TidyBuffer *buffer = malloc(sizeof *buffer);
    if(!buffer)
        return NULL;

    tidyBufInit(buffer);
    if(!tidyNodeGetText(self->doc, child_node, buffer)) {
        /* Don't store a buffer that doesn't hold the node text */
        tidyBufFree(buffer);
        free(buffer);
        return NULL;
    }

    self->text = buffer;
    return (const char*)buffer->bp;
}

/*
    Runs |search_data| against the parsed document, starting at the document root.
    Returns 0 if the search was performed (even if nothing matched) and
    -1 if an argument is missing or there is no parsed document to search.
*/
static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    assert(search_data);
    assert(result_callback);
    if(!search_data || !result_callback)
        return -1;

    /* The document is NULL if init failed or if deinit has been called */
    if(!self->doc)
        return -1;

    TidyNode root_node = tidyGetRoot(self->doc);
    if(!root_node)
        return -1;

    find_child_nodes(self->doc, root_node, search_data, result_callback, userdata);
    return 0;
}

/*
    Parses |html_source| into a new tidy document owned by |self|.
    Returns 0 on success. Returns -1 if the document can't be created or if parsing fails;
    |self->doc| is NULL in that case, so calling quickmedia_html_search_deinit is still safe.
*/
int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source) {
    self->doc = tidyCreate();
    if(!self->doc)
        return -1;

    tidyOptSetBool(self->doc, TidyShowWarnings, no);
    tidyOptSetBool(self->doc, TidyUseCustomTags, yes);
    /* tidyOptSetBool(self->doc, TidyForceOutput, yes); */
    if(tidyParseString(self->doc, html_source) < 0) {
        tidyRelease(self->doc);
        self->doc = NULL;
        return -1;
    }
    return 0;
}

/* Releases the parsed document, if there is one. Can be called more than once. */
void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) {
    if(self->doc) {
        tidyRelease(self->doc);
        self->doc = NULL;
    }
}

/*
    Parses |xpath| into a node search and calls |result_callback| for every matching node in the document.
    Returns 0 on success, the non-zero result of quickmedia_parse_xpath if the xpath can't be parsed,
    or -1 if the search can't be run.
*/
int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    QuickMediaNodeSearch search_data;
    quickmedia_node_search_init(&search_data);

    int result = quickmedia_parse_xpath(xpath, &search_data);
    if(result == 0)
        result = quickmedia_html_find_nodes(self, &search_data, result_callback, userdata);

    /* The search data is released no matter if parsing or searching failed */
    quickmedia_node_search_deinit(&search_data);
    return result;
}