aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author dec05eba <dec05eba@protonmail.com> 2021-04-28 22:48:58 +0200
committer dec05eba <dec05eba@protonmail.com> 2021-04-28 22:48:58 +0200
commit 0578bfd08637d3e113d28507ea73fa9a649f2f21 (patch)
tree 1a4165fe295146b2ba42f95aeb7a8ba910b30489
parent 4e5de201c070352837d22f3700b3ea47d9ed3043 (diff)
Combine all inner text
-rw-r--r-- include/quickmedia/HtmlSearch.h 9
-rw-r--r-- src/HtmlSearch.c 103
2 files changed, 96 insertions, 16 deletions
diff --git a/include/quickmedia/HtmlSearch.h b/include/quickmedia/HtmlSearch.h
index b3b2eaa..bedde03 100644
--- a/include/quickmedia/HtmlSearch.h
+++ b/include/quickmedia/HtmlSearch.h
@@ -2,15 +2,22 @@
#define QUICKMEDIA_HTML_SEARCH_H
#include "NodeSearch.h"
+#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
+ char *data;
+ size_t size;
+ size_t capacity;
+} QuickMediaString;
+
+typedef struct {
const void *doc;
const void *node;
- void *text;
+ QuickMediaString text;
} QuickMediaHtmlNode;
typedef struct {
diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c
index b5aef91..b7801b9 100644
--- a/src/HtmlSearch.c
+++ b/src/HtmlSearch.c
@@ -4,6 +4,87 @@
#include <tidy.h>
#include <tidybuffio.h>
+static void string_init(QuickMediaString *self) {
+ self->data = NULL;
+ self->size = 0;
+ self->capacity = 0;
+}
+
+static void string_deinit(QuickMediaString *self) {
+ free(self->data);
+ self->data = NULL;
+ self->size = 0;
+ self->capacity = 0;
+}
+
+static int string_append(QuickMediaString *self, const char *str, size_t size) {
+ size_t new_capacity = self->capacity;
+ if(new_capacity == 0) {
+ new_capacity = 8;
+ }
+
+ size_t new_size = self->size + size;
+ while(new_size + 1 > new_capacity) {
+ new_capacity += (new_capacity >> 1);
+ }
+
+ void *new_data = realloc(self->data, new_capacity);
+ if(!new_data) {
+ fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, new_capacity);
+ return 1;
+ }
+
+ memcpy((char*)new_data + self->size, str, size);
+ ((char*)new_data)[self->size + size] = '\0';
+ self->data = (char*)new_data;
+ self->size = new_size;
+ self->capacity = new_capacity;
+ return 0;
+}
+
/*
 * Skips leading '\n' characters in the |size|-byte range starting at |str|.
 * Stores a pointer to the first remaining byte in |output_str| and the number
 * of remaining bytes in |output_size|. Nothing is copied or modified.
 */
static void lstrip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
    size_t skipped;
    for(skipped = 0; skipped < size; ++skipped) {
        if(str[skipped] != '\n')
            break;
    }
    *output_str = str + skipped;
    *output_size = size - skipped;
}
+
/*
 * Computes the length of the |size|-byte range starting at |str| once
 * trailing '\n' characters are dropped, and stores it in |output_size|.
 * The input is not modified.
 *
 * Uses an unsigned one-past-the-end index, so it needs no signed size type
 * and handles |size| == 0 without wrapping below zero.
 */
static void rstrip_newline(const char *str, size_t size, size_t *output_size) {
    size_t end = size;
    while(end > 0 && str[end - 1] == '\n') {
        --end;
    }
    *output_size = end;
}
+
/*
 * Strips '\n' characters from both ends of the |size|-byte range starting at
 * |str|: leading newlines first, then trailing ones. The resulting start
 * pointer and length are stored in |output_str| and |output_size|.
 */
static void strip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
    const char *start = NULL;
    size_t remaining = 0;
    lstrip_newline(str, size, &start, &remaining);

    size_t trimmed = 0;
    rstrip_newline(start, remaining, &trimmed);

    *output_str = start;
    *output_size = trimmed;
}
+
+static int add_inner_text_recursive(const TidyDoc doc, const TidyNode node, QuickMediaString *str) {
+ for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
+ if(tidyNodeGetType(child) == TidyNode_Text) {
+ TidyBuffer tidy_buffer;
+ tidyBufInit(&tidy_buffer);
+ if(tidyNodeGetText(doc, child, &tidy_buffer)) {
+ const char *inner_text = (const char*)tidy_buffer.bp;
+ size_t inner_text_size = tidy_buffer.size;
+ strip_newline(inner_text, inner_text_size, &inner_text, &inner_text_size);
+ string_append(str, inner_text, inner_text_size);
+ }
+ tidyBufFree(&tidy_buffer);
+ } else {
+ int res = add_inner_text_recursive(doc, child, str);
+ if(res != 0)
+ return res;
+ }
+ }
+ return 0;
+}
+
static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
assert(name);
for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) {
@@ -31,12 +112,9 @@ static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSe
QuickMediaHtmlNode node; \
node.doc = tdoc; \
node.node = child; \
- node.text = NULL; \
+ string_init(&node.text); \
result_callback(&node, userdata); \
- if(node.text){ \
- tidyBufFree(node.text); \
- free(node.text); \
- } \
+ string_deinit(&node.text); \
} \
} while(0)
@@ -76,18 +154,13 @@ const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, c
}
const char* quickmedia_html_node_get_text(QuickMediaHtmlNode *self) {
- if(self->text)
- return (const char*)((TidyBuffer*)self->text)->bp;
-
- TidyNode child_node = tidyGetChild(self->node);
- if(tidyNodeGetType(child_node) != TidyNode_Text)
- return NULL;
+ if(self->text.data)
+ return self->text.data;
- self->text = malloc(sizeof(TidyBuffer));
- tidyBufInit(self->text);
- tidyNodeGetText(self->doc, child_node, self->text);
+ if(add_inner_text_recursive((TidyDoc)self->doc, (TidyNode)self->node, &self->text) != 0)
+ string_append(&self->text, " ", 1);
- return (const char*)((TidyBuffer*)self->text)->bp;
+ return self->text.data;
}
static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {