#include "../include/quickmedia/HtmlSearch.h"
#include "../include/quickmedia/XpathParser.h"
#include
#include
/*
 * Look up an attribute of |node| by name.
 * The comparison is an exact strcmp match against each attribute name.
 * Returns the first matching attribute, or a null TidyAttr if none matches.
 * |name| must not be NULL.
 */
static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
    assert(name);

    TidyAttr current = tidyAttrFirst(node);
    while(current) {
        const char *current_name = tidyAttrName(current);
        /* Attributes for which tidy reports no name can never match */
        if(current_name != NULL && strcmp(name, current_name) == 0)
            break;
        current = tidyAttrNext(current);
    }

    /* Either the matching attribute, or NULL once the list is exhausted */
    return current;
}
static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata);

/*
 * Handle a node that satisfied |search_data|.
 * If the search has a child search, continue with that child search below |matched|.
 * Otherwise |matched| is a final result: wrap it in a QuickMediaHtmlNode and pass it
 * to |result_callback|. The wrapper only lives for the duration of the callback.
 */
static void handle_matching_node(TidyDoc tdoc, TidyNode matched, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    if(search_data->child) {
        find_child_nodes(tdoc, matched, search_data->child, result_callback, userdata);
        return;
    }

    QuickMediaHtmlNode result;
    result.doc = tdoc;
    result.node = matched;
    result.text = NULL;
    result_callback(&result, userdata);

    /* |text| is only set if the callback asked for the node text (see quickmedia_html_node_get_text) */
    if(result.text) {
        tidyBufFree(result.text);
        free(result.text);
    }
}

/*
 * Search the children of |node| for nodes matching |search_data| and report them
 * through |result_callback| (called with |userdata|).
 *
 * A child matches when:
 *   - it has a name (text nodes don't), and
 *   - search_data->name is NULL or equals the child's name, and
 *   - no param is defined, or the child has an attribute named param.name whose
 *     value equals param.value.
 *
 * If search_data->recursive is set, the same search is then repeated inside every child.
 */
static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    /* We use two loops because we want to find children before grandchildren */
    for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
        const char *child_node_name = tidyNodeGetName(child);
        /* A text node doesn't have a name */
        if(!child_node_name)
            continue;

        /* A search without a node name matches any named node */
        if(search_data->name && strcmp(search_data->name, child_node_name) != 0)
            continue;

        /* If we search without param, then the name match alone is enough */
        if(search_data->param.defined) {
            TidyAttr child_attr = get_attribute_by_name(child, search_data->param.name);
            /* Couldn't find the param that we want to match against */
            if(!child_attr)
                continue;

            const char *attr_value = tidyAttrValue(child_attr);
            assert(search_data->param.value);
            /* The explicit NULL check keeps strcmp defined when assert is compiled out */
            if(!attr_value || !search_data->param.value || strcmp(search_data->param.value, attr_value) != 0)
                continue;
        }

        handle_matching_node(tdoc, child, search_data, result_callback, userdata);
    }

    if(search_data->recursive) {
        for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
            find_child_nodes(tdoc, child, search_data, result_callback, userdata);
        }
    }
}
/*
 * Return the value of the attribute called |attribute_name| on the node wrapped by |self|.
 * Returns NULL if the node has no attribute with that name; otherwise the result is
 * whatever tidyAttrValue reports for the attribute.
 */
const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) {
    TidyNode tidy_node = (TidyNode)self->node;
    TidyAttr found = get_attribute_by_name(tidy_node, attribute_name);
    return found ? tidyAttrValue(found) : NULL;
}
/*
 * Return the text of the first child of |self|, if that child is a text node.
 *
 * The text is fetched once with tidyNodeGetText into a heap-allocated TidyBuffer that
 * is stored in self->text; later calls return a view of that same buffer. The buffer
 * is not released here: the owner of |self| must call tidyBufFree and then free on it.
 *
 * Returns an empty view (data == NULL, size == 0) when the node has no children, when
 * the first child is not a text node, or when the buffer could not be allocated.
 */
const QuickMediaStringView quickmedia_html_node_get_text(QuickMediaHtmlNode *self) {
    QuickMediaStringView string_view;
    string_view.data = NULL;
    string_view.size = 0;

    if(!self->text) {
        TidyNode child_node = tidyGetChild(self->node);
        /* Check for a missing child explicitly instead of relying on what tidyNodeGetType returns for NULL */
        if(!child_node || tidyNodeGetType(child_node) != TidyNode_Text)
            return string_view;

        TidyBuffer *buffer = malloc(sizeof *buffer);
        /* Out of memory: report "no text" rather than handing NULL to tidyBufInit */
        if(!buffer)
            return string_view;

        tidyBufInit(buffer);
        tidyNodeGetText(self->doc, child_node, buffer);
        self->text = buffer;
    }

    const TidyBuffer *cached = self->text;
    string_view.data = (const char*)cached->bp;
    string_view.size = cached->size;
    return string_view;
}
/*
 * Parse |html_source| with tidy and run |search_data| against the resulting document,
 * starting at the root node. Every final match is reported through |result_callback|,
 * which is called with |userdata|.
 *
 * Returns 0 when the document was parsed and searched, -1 if any of |html_source|,
 * |search_data| or |result_callback| is NULL (only reachable when assert is compiled
 * out), or the negative value returned by tidyParseString when parsing fails.
 */
static int quickmedia_html_find_nodes(const char *html_source, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    assert(html_source);
    assert(search_data);
    assert(result_callback);
    if(!(html_source && search_data && result_callback))
        return -1;

    TidyDoc tdoc = tidyCreate();
    tidyOptSetBool(tdoc, TidyShowWarnings, no);
    /* tidyOptSetBool(tdoc, TidyForceOutput, yes); */

    int parse_status = tidyParseString(tdoc, html_source);
    int result = parse_status;
    if(parse_status >= 0) {
        /* Non-negative parse results are all treated as success and collapse to 0 */
        find_child_nodes(tdoc, tidyGetRoot(tdoc), search_data, result_callback, userdata);
        result = 0;
    }

    /* Single release point for both the success and the failure path */
    tidyRelease(tdoc);
    return result;
}
/*
 * Parse |xpath| into a node search and run it against |html_source|, reporting every
 * match through |result_callback| (called with |userdata|).
 *
 * Returns the non-zero result of quickmedia_parse_xpath if the xpath could not be
 * parsed, otherwise the result of quickmedia_html_find_nodes. The node search is
 * deinitialized on every path.
 */
int quickmedia_html_find_nodes_xpath(const char *html_source, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    QuickMediaNodeSearch search_data;
    quickmedia_node_search_init(&search_data);

    int result = quickmedia_parse_xpath(xpath, &search_data);
    /* Only search the document when the xpath was parsed successfully */
    if(result == 0)
        result = quickmedia_html_find_nodes(html_source, &search_data, result_callback, userdata);

    quickmedia_node_search_deinit(&search_data);
    return result;
}