#include "../include/quickmedia/HtmlSearch.h"
#include "../include/quickmedia/XpathParser.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
/*
 * NOTE(review): the two angle-bracket include targets were unreadable in the
 * reviewed copy of this file. They are reconstructed here from the Tidy APIs
 * this file calls (TidyDoc/TidyNode/TidyAttr and TidyBuffer) - confirm
 * against the build.
 */
#include <tidy.h>
#include <tidybuffio.h>

/*
 * Returns the first attribute of |node| whose name equals |name|,
 * or NULL when no attribute has that name (or |name| is NULL).
 */
static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
    assert(name);
    /* The assert is compiled out with NDEBUG; never hand NULL to strcmp */
    if(!name)
        return NULL;

    for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) {
        const char *attr_name = tidyAttrName(attr);
        if(attr_name && strcmp(name, attr_name) == 0)
            return attr;
    }
    return NULL;
}

static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata);

/*
 * Handles a node that satisfied |search_data|.
 * If the search has a child step, the search continues below |matched| with
 * that step. Otherwise |matched| is a final result: it is handed to
 * |result_callback| and any text buffer that was attached to the result
 * during the callback is released afterwards.
 */
static void report_match(TidyDoc tdoc, TidyNode matched, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    if(search_data->child) {
        find_child_nodes(tdoc, matched, search_data->child, result_callback, userdata);
        return;
    }

    QuickMediaHtmlNode result;
    result.doc = tdoc;
    result.node = matched;
    result.text = NULL;
    result_callback(&result, userdata);

    /* quickmedia_html_node_get_text allocates this lazily; the result does not outlive the callback */
    if(result.text) {
        tidyBufFree(result.text);
        free(result.text);
    }
}

/*
 * Searches the children of |node| for nodes matching |search_data| and
 * reports every match through report_match.
 *
 * A child matches when its name equals search_data->name (or no name is
 * requested) and, if search_data->param_defined is set, it has an attribute
 * named search_data->param.name whose value equals search_data->param.value.
 * When search_data->recursive is set, the same search is then repeated
 * inside every child.
 */
static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    /* We use two loops because we want to find children before grandchildren */
    for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
        const char *child_node_name = tidyNodeGetName(child);
        /* A text node doesn't have a name */
        if(!child_node_name)
            continue;

        /* A node name was requested and this child has a different one */
        if(search_data->name && strcmp(search_data->name, child_node_name) != 0)
            continue;

        /* If we search without param, then it's a match */
        if(!search_data->param_defined) {
            report_match(tdoc, child, search_data, result_callback, userdata);
            continue;
        }

        TidyAttr child_attr = get_attribute_by_name(child, search_data->param.name);
        /* Couldn't find the param that we want to match against */
        if(!child_attr)
            continue;

        const char *attr_value = tidyAttrValue(child_attr);
        assert(search_data->param.value);
        /* The extra NULL test keeps strcmp safe when the assert is compiled out */
        if(attr_value && search_data->param.value && strcmp(search_data->param.value, attr_value) == 0)
            report_match(tdoc, child, search_data, result_callback, userdata);
    }

    if(search_data->recursive) {
        for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
            find_child_nodes(tdoc, child, search_data, result_callback, userdata);
        }
    }
}

/*
 * Returns the value of the attribute named |attribute_name| on the node,
 * or NULL when the node has no attribute with that name.
 */
const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) {
    TidyAttr attr = get_attribute_by_name((TidyNode)self->node, attribute_name);
    if(!attr)
        return NULL;
    return tidyAttrValue(attr);
}

/*
 * Returns a view of the text of the node's first child, if that child is a
 * text node. The text is fetched once and cached in self->text; later calls
 * return a view of the cached buffer.
 *
 * Returns an empty view (data == NULL, size == 0) when the first child is
 * missing or is not a text node, or when the cache buffer cannot be allocated.
 */
const QuickMediaStringView quickmedia_html_node_get_text(QuickMediaHtmlNode *self) {
    QuickMediaStringView string_view;
    string_view.data = NULL;
    string_view.size = 0;

    if(!self->text) {
        TidyNode child_node = tidyGetChild(self->node);
        if(!child_node || tidyNodeGetType(child_node) != TidyNode_Text)
            return string_view;

        TidyBuffer *text = malloc(sizeof *text);
        /* Out of memory: report "no text" instead of dereferencing NULL */
        if(!text)
            return string_view;

        tidyBufInit(text);
        tidyNodeGetText(self->doc, child_node, text);
        /* Publish the buffer only once it is fully initialized */
        self->text = text;
    }

    const TidyBuffer *cached = self->text;
    string_view.data = (const char*)cached->bp;
    string_view.size = cached->size;
    return string_view;
}

/*
 * Parses |html_source| with Tidy and reports every node matching
 * |search_data| through |result_callback|.
 *
 * Returns 0 on success, -1 when an argument is NULL or the Tidy document
 * cannot be created, or the negative value returned by tidyParseString when
 * parsing fails.
 */
static int quickmedia_html_find_nodes(const char *html_source, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    assert(html_source);
    assert(search_data);
    assert(result_callback);
    if(!html_source || !search_data || !result_callback)
        return -1;

    TidyDoc tdoc = tidyCreate();
    if(!tdoc)
        return -1;

    tidyOptSetBool(tdoc, TidyShowWarnings, no);

    int rc = tidyParseString(tdoc, html_source);
    if(rc >= 0) {
        find_child_nodes(tdoc, tidyGetRoot(tdoc), search_data, result_callback, userdata);
        /* Non-negative parse results are all reported to the caller as success */
        rc = 0;
    }

    tidyRelease(tdoc);
    return rc;
}

/*
 * Parses |xpath| with quickmedia_parse_xpath and runs the resulting search
 * on |html_source|, calling |result_callback| with |userdata| for each
 * matching node.
 *
 * Returns the non-zero result of quickmedia_parse_xpath when the expression
 * cannot be parsed, otherwise the result of quickmedia_html_find_nodes.
 */
int quickmedia_html_find_nodes_xpath(const char *html_source, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    QuickMediaNodeSearch search_data;
    int xpath_result = quickmedia_parse_xpath(xpath, &search_data);
    if(xpath_result != 0)
        return xpath_result;

    /*
     * NOTE(review): nothing here releases search_data. If quickmedia_parse_xpath
     * allocates memory for it (e.g. for child steps), that memory is leaked -
     * verify against XpathParser.
     */
    return quickmedia_html_find_nodes(html_source, &search_data, result_callback, userdata);
}