aboutsummaryrefslogtreecommitdiff
path: root/src/HtmlSearch.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/HtmlSearch.c')
-rw-r--r-- src/HtmlSearch.c 512
1 files changed, 379 insertions, 133 deletions
diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c
index c49ee46..23c4736 100644
--- a/src/HtmlSearch.c
+++ b/src/HtmlSearch.c
@@ -1,8 +1,10 @@
#include "../include/quickmedia/HtmlSearch.h"
#include "../include/quickmedia/XpathParser.h"
-#include <tidy.h>
-#include <tidybuffio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
static void string_init(QuickMediaString *self) {
self->data = NULL;
@@ -17,75 +19,83 @@ static void string_deinit(QuickMediaString *self) {
self->capacity = 0;
}
-static int string_append(QuickMediaString *self, const char *str, size_t size) {
- size_t new_capacity = self->capacity;
- if(new_capacity == 0) {
- new_capacity = 8;
- }
+static int string_ensure_capacity(QuickMediaString *self, size_t new_capacity) { /* Makes room for at least |new_capacity| bytes. Returns 0 on success, 1 if realloc fails */
+ if(self->capacity >= new_capacity) /* Already large enough, nothing to do */
+ return 0;
+
+ size_t capacity = self->capacity;
+ if(capacity == 0)
+ capacity = 8; /* Starting capacity for a string that has no buffer yet */
- size_t new_size = self->size + size;
- while(new_size + 1 > new_capacity) {
- new_capacity += (new_capacity >> 1);
- }
+ while(capacity < new_capacity) {
+ capacity += (capacity >> 1); /* Grow by 50% per step until the request fits */
+ }
- void *new_data = realloc(self->data, new_capacity);
+ void *new_data = realloc(self->data, capacity);
if(!new_data) {
- fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, new_capacity);
+ fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, capacity);
return 1; /* |self| has not been modified at this point */
}
-
- memcpy((char*)new_data + self->size, str, size);
- ((char*)new_data)[self->size + size] = '\0';
- self->data = (char*)new_data;
- self->size = new_size;
- self->capacity = new_capacity;
+
+ self->data = new_data;
+ self->capacity = capacity;
return 0;
}
-static void lstrip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
- size_t i = 0;
- while(i < size && str[i] == '\n') {
- ++i;
- }
- *output_str = str + i;
- *output_size = size - i;
-}
+/*
+ * Appends |size| bytes from |str| to |self| and keeps the data null-terminated.
+ * Returns 0 on success, or the non-zero result of string_ensure_capacity if the buffer could not grow.
+ */
+static int string_append(QuickMediaString *self, const char *str, size_t size) {
+ /* +1 because the null terminator written below needs a byte of its own */
+ int res = string_ensure_capacity(self, self->size + size + 1);
+ if(res != 0)
+ return res;
-static void rstrip_newline(const char *str, size_t size, size_t *output_size) {
- ssize_t i = size - 1;
- while(i >= 0 && str[i] == '\n') {
- --i;
- }
- *output_size = i + 1;
+ memcpy((char*)self->data + self->size, str, size);
+ ((char*)self->data)[self->size + size] = '\0';
+ self->size += size;
+ return 0;
}
-static void strip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
- lstrip_newline(str, size, output_str, output_size);
- rstrip_newline(*output_str, *output_size, output_size);
+/*
+ * Returns the index of the first character in |str| that is not |not_char|,
+ * or |size| if all |size| characters are |not_char|. |not_char| can't be '\0'.
+ */
+static size_t find_first_not_char(const char *str, size_t size, char not_char) {
+    size_t index = 0;
+    assert(not_char != '\0');
+    while(index < size && str[index] == not_char) {
+        ++index;
+    }
+    return index;
+}
-/* Returns pointer to char that is not |not_char|, even if the first matching character is the null terminator. |not_char| can't be '\0' */
-static const char* find_first_not_char(const char *str, char not_char) {
- assert(not_char != '\0');
- while(*str == not_char) { ++str; }
- return str;
+/* Returns the character at |index| in |str|, or |fallback| if |index| is outside the view */
+static char string_view_char_or(const QuickMediaStringView *str, size_t index, char fallback) {
+    return index < str->size ? str->data[index] : fallback;
+}
/* Returns 0 on match */
-static int str_glob_match(const char *str, const char *glob) {
+/*
+ * Compares |str| against |glob| character by character. A run of '*' in |glob| skips forward in |str|
+ * to the first occurrence of the glob character that follows the run (there is no backtracking),
+ * and a trailing '*' matches the rest of |str|.
+ */
+static int str_glob_match(const QuickMediaStringView str, const QuickMediaStringView glob) {
+ size_t str_index = 0;
+ size_t glob_index = 0;
+
+ if(str.size == 0) {
+ /* TODO: What about glob = **** (more than one asterisk) */
+ if(glob.size == 0 || (glob.size == 1 && glob.data[0] == '*'))
+ return 0;
+ else
+ return 1;
+ }
+
for(;;) {
- char glob_c = *glob;
+ char glob_c = string_view_char_or(&glob, glob_index, '\0');
if(glob_c == '*') {
- glob = find_first_not_char(glob + 1, '*');
- char next_glob_c = *glob;
+ glob_index += find_first_not_char(glob.data + glob_index, glob.size - glob_index, '*');
+ char next_glob_c = string_view_char_or(&glob, glob_index, '\0');
if(next_glob_c == '\0')
return 0;
- str = strchr(str, next_glob_c);
- if(!str)
+ const void *s_p = memchr(str.data + str_index, next_glob_c, str.size - str_index);
+ if(!s_p)
return 1;
+
+ /* |s_p| points into |str.data|, so the index must be relative to the start of the string and not to where the search began */
+ str_index = (size_t)((const char*)s_p - str.data);
} else {
- char str_c = *str;
+ char str_c = string_view_char_or(&str, str_index, '\0');
if(str_c != glob_c)
return 1;
@@ -93,73 +103,54 @@ static int str_glob_match(const char *str, const char *glob) {
return 0;
}
- ++str;
- ++glob;
+ ++str_index;
+ ++glob_index;
}
assert(0); /* shouldn't happen */
return 1;
}
-static int add_inner_text_recursive(const TidyDoc doc, const TidyNode node, QuickMediaString *str) {
- for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
- const char *node_name = tidyNodeGetName(child);
- if(node_name && strcmp(node_name, "br") == 0) {
- string_append(str, "\n", 1);
- } else if(tidyNodeGetType(child) == TidyNode_Start && node_name && strcmp(node_name, "p") == 0) {
- if(str->size > 0)
- string_append(str, "\n", 1);
- }
-
- if(tidyNodeGetType(child) == TidyNode_Text) {
- TidyBuffer tidy_buffer;
- tidyBufInit(&tidy_buffer);
- if(tidyNodeGetText(doc, child, &tidy_buffer)) {
- const char *inner_text = (const char*)tidy_buffer.bp;
- size_t inner_text_size = tidy_buffer.size;
- strip_newline(inner_text, inner_text_size, &inner_text, &inner_text_size);
- string_append(str, inner_text, inner_text_size);
- }
- tidyBufFree(&tidy_buffer);
- } else {
- int res = add_inner_text_recursive(doc, child, str);
- if(res != 0)
- return res;
- }
- }
- return 0;
+/* Returns 0 if |str1| and |str2| have the same size and contain the same bytes, otherwise 1 */
+static int string_views_equal(const QuickMediaStringView str1, const QuickMediaStringView str2) {
+    if(str1.size != str2.size)
+        return 1;
+
+    /* Empty views may have a NULL data pointer, which must not be passed to memcmp */
+    if(str1.size == 0)
+        return 0;
+
+    return memcmp(str2.data, str1.data, str1.size) == 0 ? 0 : 1;
+}
-static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
- assert(name);
- for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) {
- const char *attr_name = tidyAttrName(attr);
- if(attr_name && strcmp(name, attr_name) == 0)
+static QuickMediaHtmlAttribute* get_attribute_by_name(QuickMediaHtmlNode *node, QuickMediaStringView name) { /* Returns the first attribute of |node| whose key equals |name|, or NULL if there is none */
+ for(QuickMediaHtmlAttribute *attr = node->first_attribute; attr; attr = attr->next) {
+ if(string_views_equal(attr->key, name) == 0) /* string_views_equal returns 0 when the views are equal */
return attr;
}
return NULL;
}
-static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
+static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { /* |node| is the first node of a sibling list. Returns 1 as soon as |result_callback| returns non-zero (which stops the search), otherwise 0 */
+ if(!node)
+ return 0;
+
/* We use two loops because we want to find children before grandchildren */
- for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
- const char *child_node_name = tidyNodeGetName(child);
+ for(QuickMediaHtmlChildNode *child = node; child; child = child->next) {
/* A text node doesn't have a name */
- if(!child_node_name)
+ if(!child->node.is_tag || child->node.name.size == 0)
continue;
/* Match without node name or node name matches */
- if(!search_data->name || strcmp(search_data->name, child_node_name) == 0) {
+ if(search_data->name.size == 0 || string_views_equal(child->node.name, search_data->name) == 0) {
#define on_match() do { \
- if(search_data->child) \
- find_child_nodes(tdoc, child, search_data->child, result_callback, userdata); \
- else { \
- QuickMediaHtmlNode node; \
- node.doc = tdoc; \
- node.node = child; \
- string_init(&node.text); \
- result_callback(&node, userdata); \
- string_deinit(&node.text); \
+ if(search_data->child) { \
+ if(find_child_nodes(child->node.first_child, search_data->child, result_callback, userdata) != 0) \
+ return 1; \
+ } else { \
+ QuickMediaMatchNode match_node; \
+ match_node.node = &child->node; \
+ string_init(&match_node.__str); \
+ if(result_callback(&match_node, userdata) != 0) { \
+ string_deinit(&match_node.__str); \
+ return 1; \
+ } \
+ string_deinit(&match_node.__str); \
} \
} while(0)
@@ -169,15 +160,14 @@ static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSe
continue;
}
- TidyAttr child_attr = get_attribute_by_name(child, search_data->param.name);
+ QuickMediaHtmlAttribute *child_attr = get_attribute_by_name(&child->node, search_data->param.name);
/* Couldn't find the param that we want to match against */
if(!child_attr)
continue;
- const char *attr_value = tidyAttrValue(child_attr);
- assert(search_data->param.value);
+ assert(search_data->param.value.size > 0);
/* If the param value matches what we want to search for */
- if(attr_value && str_glob_match(attr_value, search_data->param.value) == 0) {
+ if(str_glob_match(child_attr->value, search_data->param.value) == 0) {
on_match();
continue;
}
@@ -185,27 +175,13 @@ static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSe
}
if(search_data->recursive) {
- for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
- find_child_nodes(tdoc, child, search_data, result_callback, userdata);
+ for(QuickMediaHtmlChildNode *child = node; child; child = child->next) {
+ if(find_child_nodes(child->node.first_child, search_data, result_callback, userdata) != 0) /* Same search, one level deeper, run only after this whole level was checked */
+ return 1;
}
}
-}
-
-const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) {
- TidyAttr attr = get_attribute_by_name((TidyNode)self->node, attribute_name);
- if(!attr)
- return NULL;
- return tidyAttrValue(attr);
-}
-const char* quickmedia_html_node_get_text(QuickMediaHtmlNode *self) {
- if(self->text.data)
- return self->text.data;
-
- if(add_inner_text_recursive((TidyDoc)self->doc, (TidyNode)self->node, &self->text) != 0)
- string_append(&self->text, " ", 1);
-
- return self->text.data;
+ return 0;
}
static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { /* Searches the children of the root node; NOTE(review): the lines hidden by the hunk boundary below are not visible here */
@@ -214,29 +190,299 @@ static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNode
if(!search_data || !result_callback)
return -1; /* Nothing to search for, or nowhere to report matches */
- TidyNode root_node = tidyGetRoot(self->doc);
- find_child_nodes(self->doc, root_node, search_data, result_callback, userdata);
+ find_child_nodes(self->root_node.first_child, search_data, result_callback, userdata); /* The result (1 when the callback stopped the search) is not propagated; 0 is returned either way */
return 0;
}
-int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source) {
- self->doc = tidyCreate();
- tidyOptSetBool(self->doc, TidyShowWarnings, no);
- tidyOptSetInt(self->doc, TidyUseCustomTags, 1);
- tidyOptSetInt(self->doc, TidyWrapLen, 0);
- /* tidyOptSetBool(self->doc, TidyForceOutput, yes); */
- if(tidyParseString(self->doc, html_source) < 0) {
- tidyRelease(self->doc);
- self->doc = NULL;
+static void html_node_child_init(QuickMediaHtmlChildNode *self, QuickMediaHtmlNode *parent);
+static void html_node_child_deinit(QuickMediaHtmlChildNode *self);
+
+/* Resets |self| to an empty attribute: no key, no value and no next attribute. Does not free anything */
+static void html_attribute_init(QuickMediaHtmlAttribute *self) {
+    self->key.data = NULL;
+    self->key.size = 0;
+    self->value.data = NULL;
+    self->value.size = 0;
+    self->next = NULL;
+}
+
+/* Frees every attribute that follows |self| in the list and resets |self|. |self| itself is not freed */
+static void html_attribute_deinit(QuickMediaHtmlAttribute *self) {
+    QuickMediaHtmlAttribute *it = self->next;
+    while(it) {
+        QuickMediaHtmlAttribute *following = it->next;
+        free(it);
+        it = following;
+    }
+    html_attribute_init(self);
+}
+
+/* Resets |self| to an unnamed tag node without parent, attributes or children. Does not free anything */
+static void html_node_init(QuickMediaHtmlNode *self) {
+    self->parent = NULL;
+    self->first_child = NULL;
+    self->last_child = NULL;
+    self->first_attribute = NULL;
+    self->last_attribute = NULL;
+    self->name.data = NULL;
+    self->name.size = 0;
+    self->is_tag = 1;
+}
+
+/* Frees the attribute list and the child list of |self| and resets |self|. |self| itself is not freed */
+static void html_node_deinit(QuickMediaHtmlNode *self) {
+    QuickMediaHtmlAttribute *attributes = self->first_attribute;
+    QuickMediaHtmlChildNode *children = self->first_child;
+
+    if(attributes) {
+        html_attribute_deinit(attributes);
+        free(attributes);
+    }
+
+    if(children) {
+        html_node_child_deinit(children);
+        free(children);
+    }
+
+    /* Clears all pointers, including the two lists freed above */
+    html_node_init(self);
+}
+
+/*
+ * Allocates a new attribute that refers to |key| and |value| (the bytes are not copied)
+ * and appends it to the attribute list of |self|. Returns 0 on success, 1 if malloc fails.
+ */
+static int html_node_add_attribute(QuickMediaHtmlNode *self, HtmlStringView key, HtmlStringView value) {
+    QuickMediaHtmlAttribute *new_attr = malloc(sizeof(QuickMediaHtmlAttribute));
+    if(new_attr == NULL)
+        return 1;
+
+    html_attribute_init(new_attr);
+    new_attr->key.data = key.data;
+    new_attr->key.size = key.size;
+    new_attr->value.data = value.data;
+    new_attr->value.size = value.size;
+
+    /* Link after the current tail, or start the list if it is empty */
+    if(self->last_attribute)
+        self->last_attribute->next = new_attr;
+    else
+        self->first_attribute = new_attr;
+    self->last_attribute = new_attr;
+
+    return 0;
+}
-void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) {
- if(self->doc) {
- tidyRelease(self->doc);
- self->doc = NULL;
+/* Initializes |self| as an empty node and, if |parent| is not NULL, appends it to the child list of |parent| */
+void html_node_child_init(QuickMediaHtmlChildNode *self, QuickMediaHtmlNode *parent) {
+    html_node_init(&self->node);
+    self->node.parent = parent;
+
+    if(parent != NULL) {
+        QuickMediaHtmlChildNode *tail = parent->last_child;
+        if(tail)
+            tail->next = self;
+        else
+            parent->first_child = self;
+        parent->last_child = self;
+    }
+
+    self->next = NULL;
+}
+
+/*
+ * Releases |self| and every sibling that follows it: the siblings are deinitialized and freed,
+ * |self| is deinitialized but not freed (the caller owns it).
+ * Siblings are walked in a loop so that a node with very many children can't exhaust the stack;
+ * recursion only happens per nesting level, through html_node_deinit.
+ */
+void html_node_child_deinit(QuickMediaHtmlChildNode *self) {
+    QuickMediaHtmlChildNode *sibling = self->next;
+    self->next = NULL;
+
+    while(sibling) {
+        QuickMediaHtmlChildNode *following = sibling->next;
+        html_node_deinit(&sibling->node);
+        free(sibling);
+        sibling = following;
+    }
+
+    html_node_deinit(&self->node);
+}
+
+/*
+ * Parser callback that builds the node tree. |userdata| points to a pointer to the node that is currently open;
+ * the pointer is moved to the new node when a tag starts and back to the parent when a tag ends.
+ * Text and javascript code become child nodes with is_tag = 0 that keep their content in |name|.
+ * Returns 0 on success, 1 if an allocation fails.
+ */
+static int html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) {
+    QuickMediaHtmlNode **current_node_p = userdata;
+    QuickMediaHtmlNode *current_node = *current_node_p;
+    QuickMediaHtmlChildNode *new_child;
+
+    switch(parse_type) {
+        case HTML_PARSE_TAG_START:
+            new_child = malloc(sizeof(QuickMediaHtmlChildNode));
+            if(new_child == NULL)
+                return 1;
+            html_node_child_init(new_child, current_node);
+            new_child->node.name.data = html_parser->tag_name.data;
+            new_child->node.name.size = html_parser->tag_name.size;
+            /* Everything that follows belongs to the new tag until it ends */
+            *current_node_p = &new_child->node;
+            return 0;
+        case HTML_PARSE_TAG_END:
+            /* The root node has no parent and stays the current node */
+            if(current_node->parent)
+                *current_node_p = current_node->parent;
+            return 0;
+        case HTML_PARSE_ATTRIBUTE:
+            if(html_node_add_attribute(current_node, html_parser->attribute_key, html_parser->attribute_value) != 0)
+                return 1;
+            return 0;
+        case HTML_PARSE_TEXT:
+            /* fallthrough */
+        case HTML_PARSE_JAVASCRIPT_CODE:
+            new_child = malloc(sizeof(QuickMediaHtmlChildNode));
+            if(new_child == NULL)
+                return 1;
+            html_node_child_init(new_child, current_node);
+            new_child->node.is_tag = 0;
+            new_child->node.name.data = html_parser->text.data;
+            new_child->node.name.size = html_parser->text.size;
+            return 0;
+    }
+
+    return 0;
+}
+
+/*
+ * Returns the value of the attribute called |attribute_name| (a null-terminated string) on the matched node,
+ * or an empty view (data = NULL, size = 0) if the node has no such attribute.
+ */
+QuickMediaStringView quickmedia_html_node_get_attribute_value(QuickMediaMatchNode *self, const char *attribute_name) {
+    QuickMediaStringView key;
+    QuickMediaStringView result;
+    QuickMediaHtmlAttribute *found;
+
+    key.data = attribute_name;
+    key.size = strlen(attribute_name);
+
+    result.data = NULL;
+    result.size = 0;
+
+    found = get_attribute_by_name(self->node, key);
+    if(found)
+        result = found->value;
+
+    return result;
+}
+
+/* Returns 1 if |c| is a space, newline, carriage return, horizontal tab or vertical tab, otherwise 0 */
+static int is_whitespace(int c) {
+    return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v';
+}
+
+/* Returns 1 if |c| is a line feed or a carriage return, otherwise 0 */
+static int is_newline(int c) {
+    switch(c) {
+        case '\n':
+        case '\r':
+            return 1;
+        default:
+            return 0;
+    }
+}
+
+/*
+ * Skips the leading characters of |str| for which |strip_filter_func| returns non-zero.
+ * |output_str| receives the position of the first remaining character and |output_size| the remaining length.
+ */
+static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
+    size_t skipped;
+    for(skipped = 0; skipped < size; ++skipped) {
+        if(!strip_filter_func(str[skipped]))
+            break;
+    }
+    *output_str = str + skipped;
+    *output_size = size - skipped;
+}
+
+/*
+ * Drops the trailing characters of |str| for which |strip_filter_func| returns non-zero
+ * and stores the remaining length in |output_size|.
+ * Counts down with an unsigned index so that no POSIX-only ssize_t is needed.
+ */
+static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) {
+    size_t end = size;
+    while(end > 0 && strip_filter_func(str[end - 1])) {
+        --end;
+    }
+    *output_size = end;
+}
+
+/*
+ * Removes the characters accepted by |strip_filter_func| from both ends of |str|.
+ * |output_str| and |output_size| describe the remaining part; they may refer to the same variables that were passed as |str| and |size|.
+ */
+static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
+    const char *stripped = NULL;
+    size_t stripped_size = 0;
+
+    lstrip(str, size, &stripped, &stripped_size, strip_filter_func);
+    rstrip(stripped, stripped_size, &stripped_size, strip_filter_func);
+
+    *output_str = stripped;
+    *output_size = stripped_size;
+}
+
+/*
+ * Appends the text found below |node| to |str|.
+ * A br tag adds a newline; h1-h6 and p tags start on a new line (unless |str| is still empty)
+ * and are followed by a newline if they added any text.
+ * Text nodes are appended with their leading and trailing '\n' and '\r' removed.
+ * Returns 0 on success and 1 if |str| could not grow.
+ */
+static int merge_inner_text(QuickMediaHtmlNode *node, QuickMediaString *str) {
+    if(!node->is_tag) {
+        /* For non-tag nodes |name| holds the text itself */
+        const char *inner_text = node->name.data;
+        size_t inner_text_size = node->name.size;
+        strip(inner_text, inner_text_size, &inner_text, &inner_text_size, is_newline);
+        if(inner_text_size > 0) {
+            /* Append the stripped text, not the original view */
+            if(string_append(str, inner_text, inner_text_size) != 0)
+                return 1;
+        }
+        return 0;
+    }
+
+    const int is_br = node->name.size == 2 && memcmp(node->name.data, "br", 2) == 0;
+    const int is_heading = node->name.size == 2 && node->name.data[0] == 'h' && (node->name.data[1] >= '1' && node->name.data[1] <= '6');
+    const int is_paragraph = node->name.size == 1 && node->name.data[0] == 'p';
+    const int newline = is_br || is_heading || is_paragraph;
+
+    /* br always breaks the line; headings and paragraphs only when there is text before them */
+    if(is_br || (newline && str->size > 0)) {
+        if(string_append(str, "\n", 1) != 0)
+            return 1;
+    }
+
+    const size_t prev_size = str->size;
+    for(QuickMediaHtmlChildNode *child = node->first_child; child; child = child->next) {
+        /* Stop at the first failure instead of returning partially merged text as a success */
+        if(merge_inner_text(&child->node, str) != 0)
+            return 1;
+    }
+
+    /* Only terminate the line if the children actually added text */
+    if(newline && str->size > prev_size) {
+        if(string_append(str, "\n", 1) != 0)
+            return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Returns the text inside the matched node with surrounding whitespace removed.
+ * If the node has a single text child, the returned view refers to that child's text directly.
+ * Otherwise the text of all descendants is merged into |self->__str| (reused on later calls) and the view refers to that buffer.
+ * Returns an empty view (data = NULL, size = 0) if the node has no children or if merging fails.
+ */
+QuickMediaStringView quickmedia_html_node_get_text(QuickMediaMatchNode *self) {
+    QuickMediaStringView text;
+    text.data = NULL;
+    text.size = 0;
+
+    if(!self->__str.data) {
+        QuickMediaHtmlChildNode *first_child = self->node->first_child;
+        if(!first_child)
+            return text;
+
+        /* If the only child is the text node then there is no need to create a copy of it */
+        if(!first_child->next && !first_child->node.is_tag) {
+            text = first_child->node.name;
+            strip(text.data, text.size, &text.data, &text.size, is_whitespace);
+            return text;
+        }
+
+        if(merge_inner_text(self->node, &self->__str) != 0)
+            return text;
+    }
+
+    /* Either merged just now or by an earlier call */
+    text.data = self->__str.data;
+    text.size = self->__str.size;
+    strip(text.data, text.size, &text.data, &text.size, is_whitespace);
+    return text;
+}
+
+/*
+ * Parses the |size| bytes at |html_source| into the node tree below |self->root_node|.
+ * A leading UTF-8 byte order mark is skipped.
+ * Returns 0 on success. If parsing fails, the partially built tree is released and 1 is returned.
+ */
+int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source, size_t size) {
+    /* Moved by html_parse_callback while tags open and close */
+    QuickMediaHtmlNode *current_node = &self->root_node;
+
+    /* Utf8 BOM */
+    if(size >= 3 && memcmp(html_source, "\xef\xbb\xbf", 3) == 0) {
+        html_source += 3;
+        size -= 3;
+    }
+
+    html_node_init(current_node);
+    if(html_parser_parse(html_source, size, html_parse_callback, &current_node) == 0)
+        return 0;
+
+    quickmedia_html_search_deinit(self);
+    return 1;
+}
+
+/* Releases the node tree below the root node of |self| and resets the root node */
+void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) {
+    QuickMediaHtmlNode *root = &self->root_node;
+    html_node_deinit(root);
+}
int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {