diff options
Diffstat (limited to 'src/HtmlSearch.c')
-rw-r--r-- | src/HtmlSearch.c | 130 |
1 files changed, 130 insertions, 0 deletions
diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c new file mode 100644 index 0000000..e59dc1e --- /dev/null +++ b/src/HtmlSearch.c @@ -0,0 +1,130 @@ +#include "../include/quickmedia/HtmlSearch.h" +#include "../include/quickmedia/XpathParser.h" + +#include <tidy.h> +#include <tidybuffio.h> + +static TidyAttr get_attribute_by_name(TidyNode node, const char *name) { + assert(name); + for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) { + const char *attr_name = tidyAttrName(attr); + if(attr_name && strcmp(name, attr_name) == 0) + return attr; + } + return NULL; +} + +static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { + /* We use two loops because we want to find children before grandchildren */ + for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { + const char *child_node_name = tidyNodeGetName(child); + /* A text node doesn't have a name */ + if(!child_node_name) + continue; + + /* Match without node name or node name matches */ + if(!search_data->name || strcmp(search_data->name, child_node_name) == 0) { + #define on_match() do { \ + if(search_data->child) \ + find_child_nodes(tdoc, child, search_data->child, result_callback, userdata); \ + else { \ + QuickMediaHtmlNode node; \ + node.doc = tdoc; \ + node.node = child; \ + node.text = NULL; \ + result_callback(&node, userdata); \ + if(node.text){ \ + tidyBufFree(node.text); \ + free(node.text); \ + } \ + } \ + } while(0) + + /* If we search without param, then it's a match */ + if(!search_data->param_defined) { + on_match(); + continue; + } + + TidyAttr child_attr = get_attribute_by_name(child, search_data->param.name); + /* Couldn't find the param that we want to match against */ + if(!child_attr) + continue; + + const char *attr_value = tidyAttrValue(child_attr); + assert(search_data->param.value); + /* If the param value matches what we want to search for */ + if(attr_value && strcmp(search_data->param.value, attr_value) == 0) { + on_match(); + continue; + } + } + } + + if(search_data->recursive) { + for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { + find_child_nodes(tdoc, child, search_data, result_callback, userdata); + } + } +} + +const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) { + TidyAttr attr = get_attribute_by_name((TidyNode)self->node, attribute_name); + if(!attr) + return NULL; + return tidyAttrValue(attr); +} + +const QuickMediaStringView quickmedia_html_node_get_text(QuickMediaHtmlNode *self) { + QuickMediaStringView string_view; + string_view.data = NULL; + string_view.size = 0; + + if(self->text) { + string_view.data = (const char*)((TidyBuffer*)self->text)->bp; + string_view.size = ((TidyBuffer*)self->text)->size; + return string_view; + } + + TidyNode child_node = tidyGetChild(self->node); + if(tidyNodeGetType(child_node) != TidyNode_Text) + return string_view; + + self->text = malloc(sizeof(TidyBuffer)); + tidyBufInit(self->text); + tidyNodeGetText(self->doc, child_node, self->text); + + string_view.data = (const char*)((TidyBuffer*)self->text)->bp; + string_view.size = ((TidyBuffer*)self->text)->size; + return string_view; +} + +static int quickmedia_html_find_nodes(const char *html_source, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { + assert(html_source); + assert(search_data); + assert(result_callback); + if(!html_source || !search_data || !result_callback) + return -1; + + TidyDoc tdoc = tidyCreate(); + tidyOptSetBool(tdoc, TidyShowWarnings, no); + /* tidyOptSetBool(tdoc, TidyForceOutput, yes); */ + int rc = tidyParseString( tdoc, html_source); + if(rc < 0) { + tidyRelease(tdoc); + return rc; + } + + TidyNode root_node = tidyGetRoot(tdoc); + find_child_nodes(tdoc, root_node, search_data, result_callback, userdata); + tidyRelease(tdoc); + return 0; +} + +int quickmedia_html_find_nodes_xpath(const char *html_source, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { + QuickMediaNodeSearch search_data; + int xpath_result = quickmedia_parse_xpath(xpath, &search_data); + if(xpath_result != 0) + return xpath_result; + return quickmedia_html_find_nodes(html_source, &search_data, result_callback, userdata); +} |