aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordec05eba <dec05eba@protonmail.com>2021-07-02 23:17:06 +0200
committerdec05eba <dec05eba@protonmail.com>2021-07-02 23:17:06 +0200
commit5f283adc8d1a29f420d466e85b216e9d6f4a9822 (patch)
tree1006965593e1f7c2571f0f74a64c3555fc03d506
parent48ceb8591e1e3c7695d091f02a6a43edb5f77936 (diff)
Remove dependency on html tidy
-rw-r--r--README.md3
m---------depends/html-parser0
-rw-r--r--include/quickmedia/HtmlSearch.h53
-rw-r--r--include/quickmedia/NodeSearch.h18
-rw-r--r--include/quickmedia/XpathTokenizer.h2
-rw-r--r--src/HtmlSearch.c512
-rw-r--r--src/NodeSearch.c18
-rw-r--r--src/XpathParser.c15
-rw-r--r--src/XpathTokenizer.c11
-rw-r--r--tests/main.c46
10 files changed, 480 insertions, 198 deletions
diff --git a/README.md b/README.md
index 7279c0d..0516066 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,4 @@
Html search using non-standard xpath, written in C. See tests/main.c
+
+# Note
+This library does not decode HTML character references (such as `&amp;` or `&#39;`) in text and attribute values; they are returned exactly as they appear in the source. \ No newline at end of file
diff --git a/depends/html-parser b/depends/html-parser
-Subproject 917f810d7f196fef5959bc3096ce7360df961fc
+Subproject fd8f0358ceb43c423a4180e23fcd5b9f6201d82
diff --git a/include/quickmedia/HtmlSearch.h b/include/quickmedia/HtmlSearch.h
index bedde03..63f2175 100644
--- a/include/quickmedia/HtmlSearch.h
+++ b/include/quickmedia/HtmlSearch.h
@@ -2,47 +2,74 @@
#define QUICKMEDIA_HTML_SEARCH_H
#include "NodeSearch.h"
+#include <HtmlParser.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
+typedef struct QuickMediaHtmlAttribute QuickMediaHtmlAttribute;
+typedef struct QuickMediaHtmlNode QuickMediaHtmlNode;
+typedef struct QuickMediaHtmlChildNode QuickMediaHtmlChildNode;
+typedef struct QuickMediaTextNode QuickMediaTextNode;
+
typedef struct {
char *data;
size_t size;
size_t capacity;
} QuickMediaString;
+struct QuickMediaHtmlAttribute { /* One key/value attribute of a tag; the attributes of a node form a singly linked list */
+ QuickMediaStringView key;
+ QuickMediaStringView value;
+ QuickMediaHtmlAttribute *next; /* Next attribute of the same node, NULL for the last one */
+};
+
+struct QuickMediaHtmlNode {
+ int is_tag; /* 0 = text, 1 = tag */
+ QuickMediaStringView name; /* Tag name if the node is a tag, the text itself if the node is a text node */
+ QuickMediaHtmlAttribute *first_attribute; /* Head of the attribute list, NULL if the node has no attributes */
+ QuickMediaHtmlAttribute *last_attribute; /* Tail of the attribute list; new attributes are appended after it */
+ QuickMediaHtmlChildNode *first_child; /* Head of the child list, NULL if the node has no children */
+ QuickMediaHtmlChildNode *last_child; /* Tail of the child list; new children are appended after it */
+ QuickMediaHtmlNode *parent; /* NULL for the root node */
+};
+
+struct QuickMediaHtmlChildNode { /* A node together with the link to its next sibling in the parent's child list */
+ QuickMediaHtmlNode node;
+ QuickMediaHtmlChildNode *next; /* Next sibling, NULL for the last child */
+};
+
typedef struct {
- const void *doc;
- const void *node;
- QuickMediaString text;
-} QuickMediaHtmlNode;
+ QuickMediaHtmlNode *node;
+ QuickMediaString __str;
+} QuickMediaMatchNode;
typedef struct {
- const void *doc;
+ QuickMediaHtmlNode root_node;
} QuickMediaHtmlSearch;
/*
- Returns NULL if attribute doesn't exist or if it doesn't have any value.
+ Returns an empty string view if attribute doesn't exist or if it doesn't have any value.
The result is only valid within the callback function scope.
*/
-const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name);
+QuickMediaStringView quickmedia_html_node_get_attribute_value(QuickMediaMatchNode *self, const char *attribute_name);
/*
- Returns NULL if the node doesn't have any text.
+ Returns an empty string view if the node doesn't have any text or if there was an error creating the text.
The result is only valid within the callback function scope.
*/
-const char* quickmedia_html_node_get_text(QuickMediaHtmlNode *self);
+QuickMediaStringView quickmedia_html_node_get_text(QuickMediaMatchNode *self);
-/* @node is only valid within the callback function scope */
-typedef void (*QuickMediaHtmlSearchResultCallback)(QuickMediaHtmlNode *node, void *userdata);
+/* @node is only valid within the callback function scope. Return 0 to continue */
+typedef int (*QuickMediaHtmlSearchResultCallback)(QuickMediaMatchNode *node, void *userdata);
-int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source);
+/* |html_source| should be UTF-8 encoded and may start with a UTF-8 BOM. |size| is the size of |html_source| in bytes */
+int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source, size_t size);
void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self);
-/* Non-standard xpath. Doesn't use '@' symbol for accessing properties */
+/* Non-standard xpath. Doesn't use '@' symbol for accessing properties. Returns non-0 value if there is a syntax error in the xpath */
int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata);
#ifdef __cplusplus
diff --git a/include/quickmedia/NodeSearch.h b/include/quickmedia/NodeSearch.h
index adaac44..9e7fd0c 100644
--- a/include/quickmedia/NodeSearch.h
+++ b/include/quickmedia/NodeSearch.h
@@ -1,30 +1,32 @@
#ifndef QUICKMEDIA_NODE_SEARCH_H
#define QUICKMEDIA_NODE_SEARCH_H
+#include <stddef.h>
+
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
- char *name;
- char *value;
+ const char *data;
+ size_t size;
+} QuickMediaStringView;
+
+typedef struct {
+ QuickMediaStringView name;
+ QuickMediaStringView value;
int defined;
} QuickMediaNodeSearchParam;
typedef struct QuickMediaNodeSearch QuickMediaNodeSearch;
struct QuickMediaNodeSearch {
- char *name; /* optional */
+ QuickMediaStringView name; /* optional */
int recursive;
QuickMediaNodeSearchParam param; /* optional */
QuickMediaNodeSearch *child; /* optional */
};
-typedef struct {
- const char *data;
- unsigned long long size;
-} QuickMediaStringView;
-
void quickmedia_node_search_param_init(QuickMediaNodeSearchParam *self);
void quickmedia_node_search_init(QuickMediaNodeSearch *self);
void quickmedia_node_search_deinit(QuickMediaNodeSearch *self);
diff --git a/include/quickmedia/XpathTokenizer.h b/include/quickmedia/XpathTokenizer.h
index cada673..62f6d75 100644
--- a/include/quickmedia/XpathTokenizer.h
+++ b/include/quickmedia/XpathTokenizer.h
@@ -30,8 +30,6 @@ typedef enum {
void quickmedia_xpath_tokenizer_init(QuickMediaXpathTokenizer *self, const char *xpath);
QuickMediaXpathToken quickmedia_xpath_tokenizer_next(QuickMediaXpathTokenizer *self);
int quickmedia_xpath_tokenizer_next_if(QuickMediaXpathTokenizer *self, QuickMediaXpathToken token);
-char* quickmedia_xpath_tokenizer_copy_identifier(QuickMediaXpathTokenizer *self);
-char* quickmedia_xpath_tokenizer_copy_string(QuickMediaXpathTokenizer *self);
#ifdef __cplusplus
}
diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c
index c49ee46..23c4736 100644
--- a/src/HtmlSearch.c
+++ b/src/HtmlSearch.c
@@ -1,8 +1,10 @@
#include "../include/quickmedia/HtmlSearch.h"
#include "../include/quickmedia/XpathParser.h"
-#include <tidy.h>
-#include <tidybuffio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
static void string_init(QuickMediaString *self) {
self->data = NULL;
@@ -17,75 +19,83 @@ static void string_deinit(QuickMediaString *self) {
self->capacity = 0;
}
-static int string_append(QuickMediaString *self, const char *str, size_t size) {
- size_t new_capacity = self->capacity;
- if(new_capacity == 0) {
- new_capacity = 8;
- }
+static int string_ensure_capacity(QuickMediaString *self, size_t new_capacity) {
+ if(self->capacity >= new_capacity)
+ return 0;
+
+ size_t capacity = self->capacity;
+ if(capacity == 0)
+ capacity = 8;
- size_t new_size = self->size + size;
- while(new_size + 1 > new_capacity) {
- new_capacity += (new_capacity >> 1);
- }
+ while(capacity < new_capacity) {
+ capacity += (capacity >> 1);
+ }
- void *new_data = realloc(self->data, new_capacity);
+ void *new_data = realloc(self->data, capacity);
if(!new_data) {
- fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, new_capacity);
+ fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, capacity);
return 1;
}
-
- memcpy((char*)new_data + self->size, str, size);
- ((char*)new_data)[self->size + size] = '\0';
- self->data = (char*)new_data;
- self->size = new_size;
- self->capacity = new_capacity;
+
+ self->data = new_data;
+ self->capacity = capacity;
return 0;
}
-static void lstrip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
- size_t i = 0;
- while(i < size && str[i] == '\n') {
- ++i;
- }
- *output_str = str + i;
- *output_size = size - i;
-}
+/*
+ Appends |size| bytes of |str| to |self| and keeps the data null-terminated.
+ Returns 0 on success and non-0 if growing the buffer fails.
+*/
+static int string_append(QuickMediaString *self, const char *str, size_t size) {
+ /* Reserve one extra byte for the null terminator that is written below */
+ int res = string_ensure_capacity(self, self->size + size + 1);
+ if(res != 0)
+ return res;
-static void rstrip_newline(const char *str, size_t size, size_t *output_size) {
- ssize_t i = size - 1;
- while(i >= 0 && str[i] == '\n') {
- --i;
- }
- *output_size = i + 1;
+ memcpy((char*)self->data + self->size, str, size);
+ ((char*)self->data)[self->size + size] = '\0';
+ self->size += size;
+ return 0;
}
-static void strip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
- lstrip_newline(str, size, output_str, output_size);
- rstrip_newline(*output_str, *output_size, output_size);
+static size_t find_first_not_char(const char *str, size_t size, char not_char) {
+ assert(not_char != '\0');
+ size_t i = 0;
+ for(; i < size && str[i] == not_char; ++i) {}
+ return i;
}
-/* Returns pointer to char that is not |not_char|, even if the first matching character is the null terminator. |not_char| can't be '\0' */
-static const char* find_first_not_char(const char *str, char not_char) {
- assert(not_char != '\0');
- while(*str == not_char) { ++str; }
- return str;
+static char string_view_char_or(const QuickMediaStringView *str, size_t index, char fallback) {
+ if(index < str->size)
+ return str->data[index];
+ else
+ return fallback;
}
/* Returns 0 on match */
-static int str_glob_match(const char *str, const char *glob) {
+static int str_glob_match(const QuickMediaStringView str, const QuickMediaStringView glob) {
+ size_t str_index = 0;
+ size_t glob_index = 0;
+
+ if(str.size == 0) {
+ /* TODO: What about glob = **** (more than one asterix) */
+ if(glob.size == 0 || (glob.size == 1 && glob.data[0] == '*'))
+ return 0;
+ else
+ return 1;
+ }
+
for(;;) {
- char glob_c = *glob;
+ char glob_c = string_view_char_or(&glob, glob_index, '\0');
if(glob_c == '*') {
- glob = find_first_not_char(glob + 1, '*');
- char next_glob_c = *glob;
+ glob_index += find_first_not_char(glob.data + glob_index, glob.size - glob_index, '*');
+ char next_glob_c = string_view_char_or(&glob, glob_index, '\0');
if(next_glob_c == '\0')
return 0;
- str = strchr(str, next_glob_c);
- if(!str)
+ const void *s_p = memchr(str.data + str_index, next_glob_c, str.size - str_index);
+ if(!s_p)
return 1;
+
+ const size_t new_str_index = (const char*)s_p - (str.data + str_index);
+ str_index = new_str_index;
} else {
- char str_c = *str;
+ char str_c = string_view_char_or(&str, str_index, '\0');
if(str_c != glob_c)
return 1;
@@ -93,73 +103,54 @@ static int str_glob_match(const char *str, const char *glob) {
return 0;
}
- ++str;
- ++glob;
+ ++str_index;
+ ++glob_index;
}
assert(0); /* shouldn't happen */
return 1;
}
-static int add_inner_text_recursive(const TidyDoc doc, const TidyNode node, QuickMediaString *str) {
- for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
- const char *node_name = tidyNodeGetName(child);
- if(node_name && strcmp(node_name, "br") == 0) {
- string_append(str, "\n", 1);
- } else if(tidyNodeGetType(child) == TidyNode_Start && node_name && strcmp(node_name, "p") == 0) {
- if(str->size > 0)
- string_append(str, "\n", 1);
- }
-
- if(tidyNodeGetType(child) == TidyNode_Text) {
- TidyBuffer tidy_buffer;
- tidyBufInit(&tidy_buffer);
- if(tidyNodeGetText(doc, child, &tidy_buffer)) {
- const char *inner_text = (const char*)tidy_buffer.bp;
- size_t inner_text_size = tidy_buffer.size;
- strip_newline(inner_text, inner_text_size, &inner_text, &inner_text_size);
- string_append(str, inner_text, inner_text_size);
- }
- tidyBufFree(&tidy_buffer);
- } else {
- int res = add_inner_text_recursive(doc, child, str);
- if(res != 0)
- return res;
- }
- }
- return 0;
+/* Returns 0 if both views have the same size and the same bytes, otherwise 1. Note that 0 means equal, like memcmp */
+static int string_views_equal(const QuickMediaStringView str1, const QuickMediaStringView str2) {
+    if(str1.size != str2.size)
+        return 1;
+
+    /* Empty views may have a NULL data pointer, which must not be passed to memcmp */
+    if(str1.size == 0)
+        return 0;
+
+    return memcmp(str2.data, str1.data, str1.size) == 0 ? 0 : 1;
+}
-static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
- assert(name);
- for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) {
- const char *attr_name = tidyAttrName(attr);
- if(attr_name && strcmp(name, attr_name) == 0)
+static QuickMediaHtmlAttribute* get_attribute_by_name(QuickMediaHtmlNode *node, QuickMediaStringView name) {
+ for(QuickMediaHtmlAttribute *attr = node->first_attribute; attr; attr = attr->next) {
+ if(string_views_equal(attr->key, name) == 0)
return attr;
}
return NULL;
}
-static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
+static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
+ if(!node)
+ return 0;
+
/* We use two loops because we want to find children before grandchildren */
- for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
- const char *child_node_name = tidyNodeGetName(child);
+ for(QuickMediaHtmlChildNode *child = node; child; child = child->next) {
/* A text node doesn't have a name */
- if(!child_node_name)
+ if(!child->node.is_tag || child->node.name.size == 0)
continue;
/* Match without node name or node name matches */
- if(!search_data->name || strcmp(search_data->name, child_node_name) == 0) {
+ if(search_data->name.size == 0 || string_views_equal(child->node.name, search_data->name) == 0) {
#define on_match() do { \
- if(search_data->child) \
- find_child_nodes(tdoc, child, search_data->child, result_callback, userdata); \
- else { \
- QuickMediaHtmlNode node; \
- node.doc = tdoc; \
- node.node = child; \
- string_init(&node.text); \
- result_callback(&node, userdata); \
- string_deinit(&node.text); \
+ if(search_data->child) { \
+ if(find_child_nodes(child->node.first_child, search_data->child, result_callback, userdata) != 0) \
+ return 1; \
+ } else { \
+ QuickMediaMatchNode match_node; \
+ match_node.node = &child->node; \
+ string_init(&match_node.__str); \
+ if(result_callback(&match_node, userdata) != 0) { \
+ string_deinit(&match_node.__str); \
+ return 1; \
+ } \
+ string_deinit(&match_node.__str); \
} \
} while(0)
@@ -169,15 +160,14 @@ static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSe
continue;
}
- TidyAttr child_attr = get_attribute_by_name(child, search_data->param.name);
+ QuickMediaHtmlAttribute *child_attr = get_attribute_by_name(&child->node, search_data->param.name);
/* Couldn't find the param that we want to match against */
if(!child_attr)
continue;
- const char *attr_value = tidyAttrValue(child_attr);
- assert(search_data->param.value);
+ assert(search_data->param.value.size > 0);
/* If the param value matches what we want to search for */
- if(attr_value && str_glob_match(attr_value, search_data->param.value) == 0) {
+ if(str_glob_match(child_attr->value, search_data->param.value) == 0) {
on_match();
continue;
}
@@ -185,27 +175,13 @@ static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSe
}
if(search_data->recursive) {
- for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
- find_child_nodes(tdoc, child, search_data, result_callback, userdata);
+ for(QuickMediaHtmlChildNode *child = node; child; child = child->next) {
+ if(find_child_nodes(child->node.first_child, search_data, result_callback, userdata) != 0)
+ return 1;
}
}
-}
-
-const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) {
- TidyAttr attr = get_attribute_by_name((TidyNode)self->node, attribute_name);
- if(!attr)
- return NULL;
- return tidyAttrValue(attr);
-}
-const char* quickmedia_html_node_get_text(QuickMediaHtmlNode *self) {
- if(self->text.data)
- return self->text.data;
-
- if(add_inner_text_recursive((TidyDoc)self->doc, (TidyNode)self->node, &self->text) != 0)
- string_append(&self->text, " ", 1);
-
- return self->text.data;
+ return 0;
}
static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
@@ -214,29 +190,299 @@ static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNode
if(!search_data || !result_callback)
return -1;
- TidyNode root_node = tidyGetRoot(self->doc);
- find_child_nodes(self->doc, root_node, search_data, result_callback, userdata);
+ find_child_nodes(self->root_node.first_child, search_data, result_callback, userdata);
return 0;
}
-int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source) {
- self->doc = tidyCreate();
- tidyOptSetBool(self->doc, TidyShowWarnings, no);
- tidyOptSetInt(self->doc, TidyUseCustomTags, 1);
- tidyOptSetInt(self->doc, TidyWrapLen, 0);
- /* tidyOptSetBool(self->doc, TidyForceOutput, yes); */
- if(tidyParseString(self->doc, html_source) < 0) {
- tidyRelease(self->doc);
- self->doc = NULL;
+static void html_node_child_init(QuickMediaHtmlChildNode *self, QuickMediaHtmlNode *parent);
+static void html_node_child_deinit(QuickMediaHtmlChildNode *self);
+
+/* Resets |self| to an attribute with an empty key, an empty value and no successor. Nothing is freed */
+static void html_attribute_init(QuickMediaHtmlAttribute *self) {
+    self->key.data = NULL;
+    self->key.size = 0;
+    self->value.data = NULL;
+    self->value.size = 0;
+    self->next = NULL;
+}
+
+/*
+ Frees every attribute that is linked after |self| and resets |self| to the empty state.
+ |self| itself is not freed, that is left to the caller.
+*/
+static void html_attribute_deinit(QuickMediaHtmlAttribute *self) {
+    /* Walk the list iteratively so the stack usage doesn't grow with the number of attributes */
+    QuickMediaHtmlAttribute *attribute = self->next;
+    while(attribute) {
+        QuickMediaHtmlAttribute *next_attribute = attribute->next;
+        free(attribute);
+        attribute = next_attribute;
+    }
+    html_attribute_init(self);
+}
+
+/* Resets |self| to a tag node without name, attributes, children or parent. Nothing is freed */
+static void html_node_init(QuickMediaHtmlNode *self) {
+    /* Every member that is not listed is zero-initialized: empty name and NULL pointers */
+    const QuickMediaHtmlNode empty_tag = { .is_tag = 1 };
+    *self = empty_tag;
+}
+
+/* Frees the attribute list and the child list of |self| and then resets |self| with html_node_init. |self| itself is not freed */
+static void html_node_deinit(QuickMediaHtmlNode *self) {
+    QuickMediaHtmlAttribute *attributes = self->first_attribute;
+    QuickMediaHtmlChildNode *children = self->first_child;
+
+    if(attributes) {
+        /* Frees the attributes after the first one, the first one is freed here */
+        html_attribute_deinit(attributes);
+        free(attributes);
+    }
+
+    if(children) {
+        /* Frees the siblings after the first child and everything below them, the first child is freed here */
+        html_node_child_deinit(children);
+        free(children);
+    }
+
+    html_node_init(self);
+}
+
+static int html_node_add_attribute(QuickMediaHtmlNode *self, HtmlStringView key, HtmlStringView value) { /* Appends an attribute to |self|. Returns 0 on success, 1 if allocation fails */
+ QuickMediaHtmlAttribute *attribute = malloc(sizeof(QuickMediaHtmlAttribute));
+ if(!attribute)
+ return 1;
+
+ html_attribute_init(attribute);
+ attribute->key.data = key.data; /* Only the pointer and size are stored, the bytes are not copied */
+ attribute->key.size = key.size;
+ attribute->value.data = value.data;
+ attribute->value.size = value.size;
+
+ if(self->last_attribute) { /* Append after the current tail */
+ self->last_attribute->next = attribute;
+ self->last_attribute = attribute;
+ } else { /* First attribute of this node */
+ self->first_attribute = attribute;
+ self->last_attribute = attribute;
}
+
return 0;
}
-void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) {
- if(self->doc) {
- tidyRelease(self->doc);
- self->doc = NULL;
+/* Initializes |self| as an empty tag node and, if |parent| is not NULL, appends it to the end of |parent|'s child list */
+void html_node_child_init(QuickMediaHtmlChildNode *self, QuickMediaHtmlNode *parent) {
+    html_node_init(&self->node);
+    self->node.parent = parent;
+    self->next = NULL;
+    if(!parent)
+        return;
+
+    /* The new node becomes the tail; it is also the head if the parent had no children */
+    if(parent->last_child)
+        parent->last_child->next = self;
+    else
+        parent->first_child = self;
+    parent->last_child = self;
+}
+
+/*
+ Frees every sibling that is linked after |self| (including their attributes and children)
+ and then releases the contents of |self|'s own node. |self| itself is not freed.
+*/
+void html_node_child_deinit(QuickMediaHtmlChildNode *self) {
+ /* Walk the sibling list iteratively so the stack usage doesn't grow with the number of siblings */
+ QuickMediaHtmlChildNode *sibling = self->next;
+ self->next = NULL;
+ while(sibling) {
+ QuickMediaHtmlChildNode *next_sibling = sibling->next;
+ html_node_deinit(&sibling->node);
+ free(sibling);
+ sibling = next_sibling;
}
+ html_node_deinit(&self->node);
+}
+
+/* Allocates a child node and appends it to |parent|'s child list. Returns NULL if allocation fails */
+static QuickMediaHtmlChildNode* html_node_append_new_child(QuickMediaHtmlNode *parent) {
+    QuickMediaHtmlChildNode *child_node = malloc(sizeof(QuickMediaHtmlChildNode));
+    if(child_node)
+        html_node_child_init(child_node, parent);
+    return child_node;
+}
+
+/*
+ Parser callback that builds the node tree. |userdata| points to the pointer of the node
+ that is currently being filled in; the pointer is moved down on a start tag and up on an end tag.
+ Returns 0 to continue and 1 if memory allocation fails.
+*/
+static int html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) {
+    QuickMediaHtmlNode **current_node_p = userdata;
+    QuickMediaHtmlNode *current_node = *current_node_p;
+
+    if(parse_type == HTML_PARSE_TAG_START) {
+        QuickMediaHtmlChildNode *tag_node = html_node_append_new_child(current_node);
+        if(!tag_node)
+            return 1;
+        tag_node->node.name.data = html_parser->tag_name.data;
+        tag_node->node.name.size = html_parser->tag_name.size;
+        /* What is parsed next is added to the new tag */
+        *current_node_p = &tag_node->node;
+    } else if(parse_type == HTML_PARSE_TAG_END) {
+        /* A node without parent (the root node) stays the current node */
+        if(current_node->parent)
+            *current_node_p = current_node->parent;
+    } else if(parse_type == HTML_PARSE_ATTRIBUTE) {
+        if(html_node_add_attribute(current_node, html_parser->attribute_key, html_parser->attribute_value) != 0)
+            return 1;
+    } else if(parse_type == HTML_PARSE_TEXT || parse_type == HTML_PARSE_JAVASCRIPT_CODE) {
+        /* Text and javascript code are both stored as a text node, with the text in |name| */
+        QuickMediaHtmlChildNode *text_node = html_node_append_new_child(current_node);
+        if(!text_node)
+            return 1;
+        text_node->node.is_tag = 0;
+        text_node->node.name.data = html_parser->text.data;
+        text_node->node.name.size = html_parser->text.size;
+    }
+
+    return 0;
+}
+
+/*
+ Looks up the attribute named |attribute_name| (a null-terminated string) on the matched node.
+ Returns its value, or an empty view with NULL data if the node has no such attribute.
+*/
+QuickMediaStringView quickmedia_html_node_get_attribute_value(QuickMediaMatchNode *self, const char *attribute_name) {
+    const QuickMediaStringView wanted_name = { attribute_name, strlen(attribute_name) };
+    const QuickMediaStringView no_value = { NULL, 0 };
+
+    const QuickMediaHtmlAttribute *attribute = get_attribute_by_name(self->node, wanted_name);
+    return attribute ? attribute->value : no_value;
+}
+
+/* Returns 1 if |c| is a space, newline, carriage return, tab or vertical tab, otherwise 0 */
+static int is_whitespace(int c) {
+    return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v';
+}
+
+/* Returns 1 if |c| is a newline or a carriage return, otherwise 0 */
+static int is_newline(int c) {
+    switch(c) {
+        case '\n':
+        case '\r':
+            return 1;
+        default:
+            return 0;
+    }
+}
+
+/*
+ Skips the leading characters of |str| for which |strip_filter_func| returns non-0.
+ |output_str| and |output_size| receive the remaining part of the string.
+*/
+static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
+    size_t skipped = 0;
+    for(; skipped < size; ++skipped) {
+        if(!strip_filter_func(str[skipped]))
+            break;
+    }
+    *output_str = str + skipped;
+    *output_size = size - skipped;
+}
+
+/*
+ Drops the trailing characters of |str| for which |strip_filter_func| returns non-0.
+ |output_size| receives the size of the string without those characters.
+*/
+static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) {
+    /* Count down with size_t; this avoids the non-standard ssize_t and the conversion of |size| to a signed type */
+    while(size > 0 && strip_filter_func(str[size - 1])) {
+        --size;
+    }
+    *output_size = size;
+}
+
+/* Removes the characters accepted by |strip_filter_func| from both ends of |str|. The result is a sub-range of |str|, nothing is copied */
+static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
+    const char *stripped = NULL;
+    size_t stripped_size = 0;
+    lstrip(str, size, &stripped, &stripped_size, strip_filter_func);
+    rstrip(stripped, stripped_size, &stripped_size, strip_filter_func);
+    *output_str = stripped;
+    *output_size = stripped_size;
+}
+
+/* Returns 1 if |name| is "p" or one of "h1" to "h6", otherwise 0 */
+static int is_paragraph_or_heading(QuickMediaStringView name) {
+    if(name.size == 1)
+        return name.data[0] == 'p';
+    if(name.size == 2)
+        return name.data[0] == 'h' && name.data[1] >= '1' && name.data[1] <= '6';
+    return 0;
+}
+
+/*
+ Appends the text of |node| and all of its descendants to |str|.
+ "br" adds a newline; "p" and "h1" to "h6" are separated from the surrounding text with newlines.
+ Returns 0 on success and 1 if appending to |str| fails.
+*/
+static int merge_inner_text(QuickMediaHtmlNode *node, QuickMediaString *str) {
+    if(!node->is_tag) {
+        const char *inner_text = node->name.data;
+        size_t inner_text_size = node->name.size;
+        strip(inner_text, inner_text_size, &inner_text, &inner_text_size, is_newline);
+        /* Text that only consists of newlines is skipped */
+        if(inner_text_size == 0)
+            return 0;
+
+        /* NOTE(review): the unstripped text is appended even though a stripped version was computed; confirm that this is intended */
+        if(string_append(str, node->name.data, node->name.size) != 0)
+            return 1;
+        return 0;
+    }
+
+    int newline = 0;
+    if(node->name.size == 2 && memcmp(node->name.data, "br", 2) == 0) {
+        if(string_append(str, "\n", 1) != 0)
+            return 1;
+        newline = 1;
+    } else if(is_paragraph_or_heading(node->name)) {
+        /* No leading newline if this is the first text */
+        if(str->size > 0) {
+            if(string_append(str, "\n", 1) != 0)
+                return 1;
+        }
+        newline = 1;
+    }
+
+    const size_t prev_size = str->size;
+    for(QuickMediaHtmlChildNode *child = node->first_child; child; child = child->next) {
+        /* Stop on the first failure instead of returning success with partial text */
+        if(merge_inner_text(&child->node, str) != 0)
+            return 1;
+    }
+
+    /* Only end the line if the children added any text */
+    if(newline && str->size > prev_size) {
+        if(string_append(str, "\n", 1) != 0)
+            return 1;
+    }
+    return 0;
+}
+
+/* Returns the range |data|, |size| as a view with whitespace removed from both ends */
+static QuickMediaStringView whitespace_stripped_view(const char *data, size_t size) {
+    QuickMediaStringView view;
+    strip(data, size, &view.data, &view.size, is_whitespace);
+    return view;
+}
+
+/*
+ Returns the text of the matched node and its descendants with whitespace removed from both ends.
+ Returns an empty view with NULL data if the node has no children or if merging the text fails.
+*/
+QuickMediaStringView quickmedia_html_node_get_text(QuickMediaMatchNode *self) {
+    const QuickMediaStringView empty = { NULL, 0 };
+
+    /* Text that was merged by an earlier call is reused */
+    if(self->__str.data)
+        return whitespace_stripped_view(self->__str.data, self->__str.size);
+
+    const QuickMediaHtmlChildNode *first_child = self->node->first_child;
+    if(!first_child)
+        return empty;
+
+    /* If the only child is a text node then its text is referenced directly, no copy is created */
+    if(!first_child->next && !first_child->node.is_tag)
+        return whitespace_stripped_view(first_child->node.name.data, first_child->node.name.size);
+
+    if(merge_inner_text(self->node, &self->__str) != 0)
+        return empty;
+
+    return whitespace_stripped_view(self->__str.data, self->__str.size);
+}
+
+/*
+ Parses |size| bytes of |html_source| into the node tree of |self|. A leading utf8 BOM is skipped.
+ Returns 0 on success. Returns 1 if parsing fails, in which case |self| has already been deinitialized.
+*/
+int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source, size_t size) {
+    static const char utf8_bom[] = "\xef\xbb\xbf";
+    const size_t utf8_bom_size = sizeof(utf8_bom) - 1;
+    if(size >= utf8_bom_size && memcmp(html_source, utf8_bom, utf8_bom_size) == 0) {
+        html_source += utf8_bom_size;
+        size -= utf8_bom_size;
+    }
+
+    /* The parse callback moves this pointer around the tree while it adds nodes */
+    QuickMediaHtmlNode *current_node = &self->root_node;
+    html_node_init(current_node);
+    if(html_parser_parse(html_source, size, html_parse_callback, &current_node) == 0)
+        return 0;
+
+    quickmedia_html_search_deinit(self);
+    return 1;
+}
+
+void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) { /* Frees all nodes and attributes below the root node and resets the root node */
+ html_node_deinit(&self->root_node);
}
int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
diff --git a/src/NodeSearch.c b/src/NodeSearch.c
index 0a36215..bddb26c 100644
--- a/src/NodeSearch.c
+++ b/src/NodeSearch.c
@@ -2,28 +2,28 @@
#include <stdlib.h>
void quickmedia_node_search_param_init(QuickMediaNodeSearchParam *self) {
- self->name = NULL;
- self->value = NULL;
+ self->name.data = NULL;
+ self->name.size = 0;
+ self->value.data = NULL;
+ self->value.size = 0;
self->defined = 0;
}
static void quickmedia_node_search_param_deinit(QuickMediaNodeSearchParam *self) {
- free(self->name);
- free(self->value);
- self->name = NULL;
- self->value = NULL;
+ quickmedia_node_search_param_init(self);
}
void quickmedia_node_search_init(QuickMediaNodeSearch *self) {
- self->name = NULL;
+ self->name.data = NULL;
+ self->name.size = 0;
self->recursive = 0;
quickmedia_node_search_param_init(&self->param);
self->child = NULL;
}
void quickmedia_node_search_deinit(QuickMediaNodeSearch *self) {
- free(self->name);
- self->name = NULL;
+ self->name.data = NULL;
+ self->name.size = 0;
quickmedia_node_search_param_deinit(&self->param);
if(self->child) {
diff --git a/src/XpathParser.c b/src/XpathParser.c
index 4326e85..0dbe270 100644
--- a/src/XpathParser.c
+++ b/src/XpathParser.c
@@ -19,7 +19,7 @@ static int xpath_parse_param(QuickMediaXpathParser *self, QuickMediaNodeSearchPa
if(token != QUICKMEDIA_XPATH_TOKEN_IDENTIFIER)
return -1;
- result->name = quickmedia_xpath_tokenizer_copy_identifier(&self->tokenizer);
+ result->name = self->tokenizer.identifier;
token = quickmedia_xpath_tokenizer_next(&self->tokenizer);
if(token != QUICKMEDIA_XPATH_TOKEN_EQUAL)
@@ -29,7 +29,7 @@ static int xpath_parse_param(QuickMediaXpathParser *self, QuickMediaNodeSearchPa
if(token != QUICKMEDIA_XPATH_TOKEN_STRING)
return -3;
- result->value = quickmedia_xpath_tokenizer_copy_string(&self->tokenizer);
+ result->value = self->tokenizer.string;
token = quickmedia_xpath_tokenizer_next(&self->tokenizer);
if(token != QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET)
@@ -50,23 +50,22 @@ static int xpath_parse_node(QuickMediaXpathParser *self, QuickMediaNodeSearch *r
if(token != QUICKMEDIA_XPATH_TOKEN_IDENTIFIER)
return -1;
- result->name = quickmedia_xpath_tokenizer_copy_identifier(&self->tokenizer);
+ result->name = self->tokenizer.identifier;
int param_result = xpath_parse_param(self, &result->param);
- if(param_result < 0) {
- quickmedia_node_search_deinit(result);
+ if(param_result < 0)
return param_result;
- }
result->child = malloc(sizeof(QuickMediaNodeSearch));
+ if(!result->child)
+ return -1;
+
int node_result = xpath_parse_node(self, result->child);
if(node_result > 0) {
node_result = 0;
/* Didn't have child, remove child */
free(result->child);
result->child = NULL;
- } else if(node_result < 0) {
- quickmedia_node_search_deinit(result);
}
return node_result;
diff --git a/src/XpathTokenizer.c b/src/XpathTokenizer.c
index 32bede9..ae17939 100644
--- a/src/XpathTokenizer.c
+++ b/src/XpathTokenizer.c
@@ -91,14 +91,3 @@ int quickmedia_xpath_tokenizer_next_if(QuickMediaXpathTokenizer *self, QuickMedi
self->code = restore_point;
return -1;
}
-
-char* quickmedia_xpath_tokenizer_copy_identifier(QuickMediaXpathTokenizer *self) {
- char *result = malloc(self->identifier.size + 1);
- result[self->identifier.size] = '\0';
- memcpy(result, self->identifier.data, self->identifier.size);
- return result;
-}
-
-char* quickmedia_xpath_tokenizer_copy_string(QuickMediaXpathTokenizer *self) {
- return quickmedia_xpath_tokenizer_copy_identifier(self);
-}
diff --git a/tests/main.c b/tests/main.c
index 7888ea1..2a08ec7 100644
--- a/tests/main.c
+++ b/tests/main.c
@@ -1,34 +1,51 @@
-#include <stdio.h>
#include "../include/quickmedia/HtmlSearch.h"
-#include <assert.h>
+#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
-static char* get_file_content(const char *filepath) {
+static char* get_file_content(const char *filepath, size_t *filesize) {
FILE *file = fopen(filepath, "rb");
assert(file);
fseek(file, 0, SEEK_END);
- size_t filesize = ftell(file);
+ *filesize = ftell(file);
fseek(file, 0, SEEK_SET);
- char *buffer = malloc(filesize + 1);
- buffer[filesize] = '\0';
- fread(buffer, 1, filesize, file);
+ char *buffer = malloc((*filesize) + 1);
+ buffer[*filesize] = '\0';
+ fread(buffer, 1, *filesize, file);
return buffer;
}
-static void result_callback(QuickMediaHtmlNode *node, void *userdata) {
- const char *href = quickmedia_html_node_get_attribute_value(node, "href");
- const char *text = quickmedia_html_node_get_text(node);
- printf("a href: %s, node value: %s\n", href, text);
+/* Prints the href attribute and the text of a matched node. Returns 0 so the search continues */
+static int result_callback(QuickMediaMatchNode *node, void *userdata) {
+    (void)userdata;
+    QuickMediaStringView href = quickmedia_html_node_get_attribute_value(node, "href");
+    QuickMediaStringView text = quickmedia_html_node_get_text(node);
+    /* An empty view can have NULL data, which must not be passed to %s even when the precision is 0 */
+    printf("a href: %.*s, node value: %.*s\n", (int)href.size, href.data ? href.data : "", (int)text.size, text.data ? text.data : "");
+    return 0;
+}
+
+/* Prints the text of a matched node. Returns 0 so the search continues */
+static int result_callback_nested_text(QuickMediaMatchNode *node, void *userdata) {
+    (void)userdata;
+    QuickMediaStringView text = quickmedia_html_node_get_text(node);
+    /* An empty view can have NULL data, which must not be passed to %s even when the precision is 0 */
+    printf("text: %.*s\n", (int)text.size, text.data ? text.data : "");
+    return 0;
+}
+
+static void test_nested_nodes_get_text() { /* Prints the merged text of a div that contains text, a heading and more text */
+ const char *html_source = "<div class=\"item\">hello<h1>text</h1>world</div>";
+ QuickMediaHtmlSearch html_search;
+ quickmedia_html_search_init(&html_search, html_source, strlen(html_source)); /* The result is not checked here */
+ quickmedia_html_find_nodes_xpath(&html_search, "//div[class='item']", result_callback_nested_text, NULL);
+ quickmedia_html_search_deinit(&html_search);
}
-int main(int argc, char **argv) {
- char *file_content = get_file_content("test_files/test.html");
+int main() {
+ size_t filesize = 0;
+ char *file_content = get_file_content("test_files/test.html", &filesize);
QuickMediaHtmlSearch html_search;
- int result = quickmedia_html_search_init(&html_search, file_content);
+ int result = quickmedia_html_search_init(&html_search, file_content, filesize);
if(result != 0)
goto cleanup;
result = quickmedia_html_find_nodes_xpath(&html_search, "//h3[class=\"story_name\"]//a", result_callback, NULL);
@@ -38,5 +55,6 @@ int main(int argc, char **argv) {
cleanup:
quickmedia_html_search_deinit(&html_search);
free(file_content);
+ test_nested_nodes_get_text();
return result;
}