From 0578bfd08637d3e113d28507ea73fa9a649f2f21 Mon Sep 17 00:00:00 2001
From: dec05eba <dec05eba@protonmail.com>
Date: Wed, 28 Apr 2021 22:48:58 +0200
Subject: Combine all inner text

---
 include/quickmedia/HtmlSearch.h |   9 +++-
 src/HtmlSearch.c                | 103 ++++++++++++++++++++++++++++++++++------
 2 files changed, 96 insertions(+), 16 deletions(-)

diff --git a/include/quickmedia/HtmlSearch.h b/include/quickmedia/HtmlSearch.h
index b3b2eaa..bedde03 100644
--- a/include/quickmedia/HtmlSearch.h
+++ b/include/quickmedia/HtmlSearch.h
@@ -2,15 +2,22 @@
 #define QUICKMEDIA_HTML_SEARCH_H
 
 #include "NodeSearch.h"
+#include <stddef.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+typedef struct { /* growable string filled by the string_* helpers in src/HtmlSearch.c */
+    char *data;      /* heap buffer, NULL until the first append; NUL-terminated afterwards */
+    size_t size;     /* number of bytes stored, not counting the terminating NUL */
+    size_t capacity; /* number of bytes allocated for data */
+} QuickMediaString;
+
 typedef struct {
     const void *doc;
     const void *node;
-    void *text;
+    QuickMediaString text; /* inner text, built by the first quickmedia_html_node_get_text call and reused after */
 } QuickMediaHtmlNode;
 
 typedef struct {
diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c
index b5aef91..b7801b9 100644
--- a/src/HtmlSearch.c
+++ b/src/HtmlSearch.c
@@ -4,6 +4,87 @@
 #include <tidy.h>
 #include <tidybuffio.h>
 
+/* Put `self` into the empty state: no buffer, zero size, zero capacity. */
+/* Nothing is allocated here; string_append() acquires storage on first use. */
+static void string_init(QuickMediaString *self) {
+    *self = (QuickMediaString){ .data = NULL, .size = 0, .capacity = 0 };
+}
+
+/* Free the buffer owned by `self` and return it to the empty state. */
+static void string_deinit(QuickMediaString *self) {
+    char *buffer = self->data;
+    *self = (QuickMediaString){ .data = NULL, .size = 0, .capacity = 0 };
+    free(buffer); /* free(NULL) is a no-op, so a never-filled string is fine */
+}
+
+/* Append `size` bytes of `str`, keeping data NUL-terminated. Returns 0 on success, 1 on failure (`self` unchanged). */
+static int string_append(QuickMediaString *self, const char *str, size_t size) {
+    const size_t max_size = (size_t)-1 / 2; /* cap keeps the sum and the 1.5x growth from wrapping */
+    if(self->size >= max_size || size >= max_size - self->size) {
+        fprintf(stderr, "Failed to append %zu bytes to string of size: %zu\n", size, self->size);
+        return 1;
+    }
+    const size_t new_size = self->size + size;
+    size_t new_capacity = self->capacity != 0 ? self->capacity : 8;
+    while(new_size + 1 > new_capacity) /* +1 leaves room for the terminating NUL */
+        new_capacity += (new_capacity >> 1);
+    char *new_data = realloc(self->data, new_capacity);
+    if(!new_data) {
+        fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, new_capacity);
+        return 1;
+    }
+    if(size > 0) /* never hand memcpy a NULL source, even for zero bytes */
+        memcpy(new_data + self->size, str, size);
+    new_data[new_size] = '\0';
+    self->data = new_data;
+    self->size = new_size;
+    self->capacity = new_capacity;
+    return 0;
+}
+
+/* Skip leading '\n' bytes: *output_str / *output_size describe the remaining view into `str` (no copy). */
+static void lstrip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
+    const char *const end = str + size;
+    const char *cursor = str;
+    for(; cursor != end && *cursor == '\n'; ++cursor) {}
+    *output_str = cursor;
+    *output_size = (size_t)(end - cursor);
+}
+
+/* Drop trailing '\n' bytes: *output_size receives the length of `str` without them (0 if it is all newlines). */
+static void rstrip_newline(const char *str, size_t size, size_t *output_size) {
+    size_t end = size; /* one past the last kept byte; plain size_t avoids the POSIX-only ssize_t */
+    while(end > 0 && str[end - 1] == '\n')
+        --end;
+    *output_size = end;
+}
+
+static void strip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
+    lstrip_newline(str, size, output_str, output_size); /* first skip leading '\n'; outputs become a view into str */
+    rstrip_newline(*output_str, *output_size, output_size); /* then shorten that view past any trailing '\n' */
+}
+
+/* Depth-first append of all text below `node` to `str` ('\n' trimmed per piece); non-zero if an append fails. */
+static int add_inner_text_recursive(const TidyDoc doc, const TidyNode node, QuickMediaString *str) {
+    int res = 0;
+    for(TidyNode child = tidyGetChild(node); child && res == 0; child = tidyGetNext(child)) {
+        if(tidyNodeGetType(child) != TidyNode_Text) {
+            res = add_inner_text_recursive(doc, child, str);
+            continue;
+        }
+        TidyBuffer tidy_buffer;
+        tidyBufInit(&tidy_buffer);
+        if(tidyNodeGetText(doc, child, &tidy_buffer)) {
+            const char *inner_text = (const char*)tidy_buffer.bp;
+            size_t inner_text_size = tidy_buffer.size;
+            strip_newline(inner_text, inner_text_size, &inner_text, &inner_text_size);
+            res = string_append(str, inner_text, inner_text_size); /* a failed append now stops the walk */
+        }
+        tidyBufFree(&tidy_buffer); /* always released before the loop condition can end the walk */
+    }
+    return res;
+}
+
 static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
     assert(name);
     for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) {
@@ -31,12 +112,9 @@ static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSe
                     QuickMediaHtmlNode node; \
                     node.doc = tdoc; \
                     node.node = child; \
-                    node.text = NULL; \
+                    string_init(&node.text); \
                     result_callback(&node, userdata); \
-                    if(node.text){ \
-                        tidyBufFree(node.text); \
-                        free(node.text); \
-                    } \
+                    string_deinit(&node.text); \
                 } \
             } while(0)
 
@@ -76,18 +154,13 @@ const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, c
 }
 
 const char* quickmedia_html_node_get_text(QuickMediaHtmlNode *self) {
-    if(self->text)
-        return (const char*)((TidyBuffer*)self->text)->bp;
-
-    TidyNode child_node = tidyGetChild(self->node);
-    if(tidyNodeGetType(child_node) != TidyNode_Text)
-        return NULL;
+    if(self->text.data) /* text was already collected by an earlier call: reuse it */
+        return self->text.data;
 
-    self->text = malloc(sizeof(TidyBuffer));
-    tidyBufInit(self->text);
-    tidyNodeGetText(self->doc, child_node, self->text);
+    if(add_inner_text_recursive((TidyDoc)self->doc, (TidyNode)self->node, &self->text) != 0)
+        string_append(&self->text, " ", 1); /* NOTE(review): a space is appended on failure; intent not evident here - confirm */
     
-    return (const char*)((TidyBuffer*)self->text)->bp;
+    return self->text.data; /* still NULL if no string_append call has succeeded */
 }
 
 static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
-- 
cgit v1.2.3-70-g09d2