#include "../include/quickmedia/HtmlSearch.h"
#include "../include/quickmedia/XpathParser.h"
#include
#include
/* Puts |self| into the empty state: no buffer, zero length and zero capacity. */
static void string_init(QuickMediaString *self) {
    self->capacity = 0;
    self->size = 0;
    self->data = NULL;
}
/* Frees the buffer held by |self| and resets every field to the empty state. */
static void string_deinit(QuickMediaString *self) {
    free(self->data);
    self->capacity = 0;
    self->size = 0;
    self->data = NULL;
}
/*
 * Appends |size| bytes from |str| to |self| and keeps the buffer null-terminated.
 * The capacity starts at 8 bytes and grows by a factor of 1.5 until the new
 * content plus the terminator fits. The buffer is only reallocated when the
 * capacity actually has to change.
 * Returns 0 on success. Returns 1 if the resulting size can't be represented in
 * a size_t or if the allocation fails; |self| is left unmodified in both cases.
 */
static int string_append(QuickMediaString *self, const char *str, size_t size) {
    const size_t max_size = (size_t)-1;

    /* self->size + size + 1 (the terminator) must not wrap around */
    if(size >= max_size - self->size) {
        fprintf(stderr, "Failed to append %zu bytes to a string of size %zu: size overflow\n", size, self->size);
        return 1;
    }

    const size_t new_size = self->size + size;
    const size_t required_capacity = new_size + 1;

    size_t new_capacity = self->capacity;
    if(new_capacity == 0) {
        new_capacity = 8;
    }
    while(new_capacity < required_capacity) {
        const size_t growth = new_capacity >> 1;
        /* Fall back to the exact size if growing by 1.5x can't make progress or would wrap */
        if(growth == 0 || growth > max_size - new_capacity) {
            new_capacity = required_capacity;
            break;
        }
        new_capacity += growth;
    }

    char *data = self->data;
    if(!data || new_capacity != self->capacity) {
        void *new_data = realloc(self->data, new_capacity);
        if(!new_data) {
            fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, new_capacity);
            return 1;
        }
        data = new_data;
    }

    /* memcpy must not be called with a NULL source, even for zero bytes */
    if(size > 0) {
        memcpy(data + self->size, str, size);
    }
    data[new_size] = '\0';

    self->data = data;
    self->size = new_size;
    self->capacity = new_capacity;
    return 0;
}
/*
 * Skips every leading '\n' in the |size| bytes starting at |str|.
 * |output_str| receives a pointer to the first byte that is not a leading newline
 * and |output_size| receives the number of bytes remaining from that position.
 */
static void lstrip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
    size_t remaining = size;
    while(remaining > 0 && *str == '\n') {
        ++str;
        --remaining;
    }
    *output_str = str;
    *output_size = remaining;
}
/*
 * Computes the length of the |size| bytes at |str| once every trailing '\n' is dropped.
 * The result is stored in |output_size|; |str| itself is not modified.
 * Only size_t is used for indexing, so the function doesn't depend on the
 * non-standard ssize_t type or on converting a size_t to a signed type.
 */
static void rstrip_newline(const char *str, size_t size, size_t *output_size) {
    size_t end = size;
    while(end > 0 && str[end - 1] == '\n') {
        --end;
    }
    *output_size = end;
}
/*
 * Removes leading and trailing '\n' from the |size| bytes at |str|.
 * |output_str| receives the start of the trimmed range and |output_size| its length.
 */
static void strip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
    const char *trimmed = NULL;
    size_t trimmed_size = 0;

    lstrip_newline(str, size, &trimmed, &trimmed_size);
    rstrip_newline(trimmed, trimmed_size, &trimmed_size);

    *output_str = trimmed;
    *output_size = trimmed_size;
}
/*
 * Walks the children of |node| and appends their text to |str|.
 * A child named "br" contributes a single newline. A text node contributes its
 * text with leading and trailing newlines removed. Every other child is
 * descended into recursively.
 * Returns 0 on success, or the non-zero result of string_append as soon as an
 * append fails (the text collected so far stays in |str|).
 */
static int add_inner_text_recursive(const TidyDoc doc, const TidyNode node, QuickMediaString *str) {
    for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
        const char *node_name = tidyNodeGetName(child);
        if(node_name && strcmp(node_name, "br") == 0) {
            int res = string_append(str, "\n", 1);
            if(res != 0)
                return res;
        }

        if(tidyNodeGetType(child) != TidyNode_Text) {
            int res = add_inner_text_recursive(doc, child, str);
            if(res != 0)
                return res;
            continue;
        }

        TidyBuffer tidy_buffer;
        tidyBufInit(&tidy_buffer);
        int res = 0;
        if(tidyNodeGetText(doc, child, &tidy_buffer)) {
            /* Never hand a NULL pointer to the string helpers, even for an empty buffer */
            const char *inner_text = tidy_buffer.bp ? (const char*)tidy_buffer.bp : "";
            size_t inner_text_size = tidy_buffer.bp ? tidy_buffer.size : 0;
            strip_newline(inner_text, inner_text_size, &inner_text, &inner_text_size);
            res = string_append(str, inner_text, inner_text_size);
        }
        /* The buffer has to be released before an append failure is reported */
        tidyBufFree(&tidy_buffer);
        if(res != 0)
            return res;
    }
    return 0;
}
/*
 * Searches the attributes of |node| for one whose name equals |name|.
 * Returns the first matching attribute, or NULL if there is none.
 * |name| must not be NULL (asserted).
 */
static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
    assert(name);
    TidyAttr current = tidyAttrFirst(node);
    while(current) {
        const char *current_name = tidyAttrName(current);
        if(current_name && strcmp(name, current_name) == 0)
            return current;
        current = tidyAttrNext(current);
    }
    return NULL;
}
static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata);

/*
 * Handles a node that satisfied |search_data|.
 * If the search has a child search, the child search is run below |matched_node|.
 * Otherwise |matched_node| is reported through |result_callback| as a
 * QuickMediaHtmlNode whose text string only lives for the duration of the callback.
 */
static void handle_node_match(TidyDoc tdoc, TidyNode matched_node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    if(search_data->child) {
        find_child_nodes(tdoc, matched_node, search_data->child, result_callback, userdata);
        return;
    }

    QuickMediaHtmlNode result;
    result.doc = tdoc;
    result.node = matched_node;
    string_init(&result.text);
    result_callback(&result, userdata);
    string_deinit(&result.text);
}

/*
 * Matches the direct children of |node| against |search_data| and, if the search is
 * recursive, repeats the same search below every child.
 * A child matches when it has a name, the name equals search_data->name (or no name
 * was given) and, if a param is defined, it has an attribute with that name whose
 * value equals search_data->param.value.
 */
static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    /* We use two loops because we want to find children before grandchildren */
    for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
        const char *child_node_name = tidyNodeGetName(child);
        /* A text node doesn't have a name */
        if(!child_node_name)
            continue;

        /* Only continue if we match without node name or if the node name matches */
        if(search_data->name && strcmp(search_data->name, child_node_name) != 0)
            continue;

        /* If we search without param, then it's a match */
        if(!search_data->param.defined) {
            handle_node_match(tdoc, child, search_data, result_callback, userdata);
            continue;
        }

        TidyAttr child_attr = get_attribute_by_name(child, search_data->param.name);
        /* Couldn't find the param that we want to match against */
        if(!child_attr)
            continue;

        const char *attr_value = tidyAttrValue(child_attr);
        assert(search_data->param.value);
        /*
         * If the param value matches what we want to search for.
         * The extra NULL check keeps strcmp well-defined when asserts are compiled out.
         */
        if(attr_value && search_data->param.value && strcmp(search_data->param.value, attr_value) == 0)
            handle_node_match(tdoc, child, search_data, result_callback, userdata);
    }

    if(search_data->recursive) {
        for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
            find_child_nodes(tdoc, child, search_data, result_callback, userdata);
        }
    }
}
/*
 * Looks up the attribute called |attribute_name| on the node wrapped by |self|.
 * Returns the value reported by tidyAttrValue for that attribute, or NULL if the
 * node has no attribute with that name.
 */
const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) {
    TidyNode tidy_node = (TidyNode)self->node;
    TidyAttr found = get_attribute_by_name(tidy_node, attribute_name);
    return found ? tidyAttrValue(found) : NULL;
}
/*
 * Returns the inner text of the node wrapped by |self|.
 * The text is collected into self->text on the first call and reused afterwards
 * for as long as self->text.data is set. If collecting the text reports a
 * failure, a single space is appended instead.
 * The returned pointer is self->text.data, which is NULL if nothing was appended.
 */
const char* quickmedia_html_node_get_text(QuickMediaHtmlNode *self) {
    if(!self->text.data) {
        int collect_result = add_inner_text_recursive((TidyDoc)self->doc, (TidyNode)self->node, &self->text);
        if(collect_result != 0)
            string_append(&self->text, " ", 1);
    }
    return self->text.data;
}
/*
 * Runs |search_data| against the document held by |self|, starting at the root node,
 * and reports every match through |result_callback| with |userdata| passed along.
 * Returns 0 once the search has run. Returns -1 if |search_data| or
 * |result_callback| is NULL, or if |self| holds no document.
 */
static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    assert(search_data);
    assert(result_callback);
    if(!search_data || !result_callback)
        return -1;

    /* doc is NULL after a failed parse in quickmedia_html_search_init and after deinit */
    if(!self->doc)
        return -1;

    TidyNode root_node = tidyGetRoot(self->doc);
    find_child_nodes(self->doc, root_node, search_data, result_callback, userdata);
    return 0;
}
/*
 * Creates a tidy document for |self| and parses |html_source| into it.
 * Warnings are turned off, custom tags are enabled and the wrap length is set to 0.
 * Returns 0 on success. Returns -1 if the document can't be created or if
 * tidyParseString reports an error (negative result); self->doc is NULL in both
 * failure cases, so quickmedia_html_search_deinit remains safe to call.
 */
int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source) {
    self->doc = tidyCreate();
    if(!self->doc)
        return -1;

    tidyOptSetBool(self->doc, TidyShowWarnings, no);
    tidyOptSetBool(self->doc, TidyUseCustomTags, yes);
    tidyOptSetInt(self->doc, TidyWrapLen, 0);
    /* tidyOptSetBool(self->doc, TidyForceOutput, yes); */
    if(tidyParseString(self->doc, html_source) < 0) {
        tidyRelease(self->doc);
        self->doc = NULL;
        /* Report the failure instead of claiming success without a document */
        return -1;
    }
    return 0;
}
/*
 * Releases the tidy document held by |self|, if there is one, and clears the
 * pointer so a repeated call does nothing.
 */
void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) {
    if(!self->doc)
        return;

    tidyRelease(self->doc);
    self->doc = NULL;
}
/*
 * Parses |xpath| into a node search and runs it against the document in |self|,
 * reporting every match through |result_callback| with |userdata| passed along.
 * Returns the non-zero result of quickmedia_parse_xpath if parsing fails, otherwise
 * the result of the search. The parsed search is released in both cases.
 */
int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
    QuickMediaNodeSearch search_data;
    quickmedia_node_search_init(&search_data);

    int result = quickmedia_parse_xpath(xpath, &search_data);
    if(result == 0)
        result = quickmedia_html_find_nodes(self, &search_data, result_callback, userdata);

    quickmedia_node_search_deinit(&search_data);
    return result;
}