From 0578bfd08637d3e113d28507ea73fa9a649f2f21 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Wed, 28 Apr 2021 22:48:58 +0200 Subject: Combine all inner text --- include/quickmedia/HtmlSearch.h | 9 +++- src/HtmlSearch.c | 103 ++++++++++++++++++++++++++++++++++------ 2 files changed, 96 insertions(+), 16 deletions(-) diff --git a/include/quickmedia/HtmlSearch.h b/include/quickmedia/HtmlSearch.h index b3b2eaa..bedde03 100644 --- a/include/quickmedia/HtmlSearch.h +++ b/include/quickmedia/HtmlSearch.h @@ -2,15 +2,22 @@ #define QUICKMEDIA_HTML_SEARCH_H #include "NodeSearch.h" +#include /* NOTE(review): no header name follows this #include in the text as received; the size_t members below need a header that declares size_t (presumably <stddef.h>) - confirm against the original commit. */ #ifdef __cplusplus extern "C" { #endif +/* Character buffer that holds a node's combined inner text. data is the heap buffer (NULL while nothing has been appended), size is the number of bytes stored not counting the terminating NUL, capacity is the number of bytes allocated for data. */ typedef struct { + char *data; + size_t size; + size_t capacity; +} QuickMediaString; + typedef struct { const void *doc; const void *node; - void *text; + QuickMediaString text; } QuickMediaHtmlNode; typedef struct { diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c index b5aef91..b7801b9 100644 --- a/src/HtmlSearch.c +++ b/src/HtmlSearch.c @@ -4,6 +4,87 @@ #include #include +/* Puts the string in the empty state (NULL data, zero size and capacity). Nothing is freed, so this is only for storage that does not yet own a buffer. */ static void string_init(QuickMediaString *self) { + self->data = NULL; + self->size = 0; + self->capacity = 0; +} + +/* Frees the buffer and returns the string to the empty state. free(NULL) is a no-op, so this is safe on a string that was initialized but never appended to. */ static void string_deinit(QuickMediaString *self) { + free(self->data); + self->data = NULL; + self->size = 0; + self->capacity = 0; +} + +/* Appends size bytes of str to self and NUL-terminates the result. The capacity starts at 8 and grows by half of itself until the new contents plus the terminator fit. Returns 0 on success, or 1 when realloc fails, in which case self is left unchanged and a message is written to stderr. NOTE(review): self->size + size and the capacity growth are not checked for overflow, and realloc is called on every append even when the current capacity already suffices. */ static int string_append(QuickMediaString *self, const char *str, size_t size) { + size_t new_capacity = self->capacity; + if(new_capacity == 0) { + new_capacity = 8; + } + + size_t new_size = self->size + size; + while(new_size + 1 > new_capacity) { + new_capacity += (new_capacity >> 1); + } + + void *new_data = realloc(self->data, new_capacity); + if(!new_data) { + fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, new_capacity); + return 1; + } + + memcpy((char*)new_data + self->size, str, size); + ((char*)new_data)[self->size + size] = '\0'; + self->data = (char*)new_data; + self->size = new_size; + self->capacity = new_capacity; + return 0; +} + +/* Skips leading '\n' bytes: *output_str is set to the first byte of str that is not a newline (or to str + size if there is none) and *output_size to the number of bytes remaining from there. Nothing is copied. */ static void lstrip_newline(const char *str, size_t 
size, const char **output_str, size_t *output_size) { + size_t i = 0; + while(i < size && str[i] == '\n') { + ++i; + } + *output_str = str + i; + *output_size = size - i; +} + +/* Stores in *output_size the length of str once trailing '\n' bytes are dropped. NOTE(review): ssize_t is a POSIX type rather than ISO C, and for size == 0 the loop depends on size - 1 converting to -1 when assigned to ssize_t - confirm this is acceptable on the targeted platforms. */ static void rstrip_newline(const char *str, size_t size, size_t *output_size) { + ssize_t i = size - 1; + while(i >= 0 && str[i] == '\n') { + --i; + } + *output_size = i + 1; +} + +/* Strips '\n' bytes from both ends of str. The result is a pointer/length pair into the caller's buffer; only newlines are removed, other whitespace is kept. */ static void strip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) { + lstrip_newline(str, size, output_str, output_size); + rstrip_newline(*output_str, *output_size, output_size); +} + +/* Visits the children of node in order: the text of each text node is fetched into a temporary TidyBuffer, stripped of leading and trailing newlines and appended to str; every other child is recursed into. Pieces are concatenated with no separator. Returns 0, or the first non-zero result of a recursive call. NOTE(review): the result of string_append is discarded, so an allocation failure is not propagated and, as written, this function always returns 0. */ static int add_inner_text_recursive(const TidyDoc doc, const TidyNode node, QuickMediaString *str) { + for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { + if(tidyNodeGetType(child) == TidyNode_Text) { + TidyBuffer tidy_buffer; + tidyBufInit(&tidy_buffer); + if(tidyNodeGetText(doc, child, &tidy_buffer)) { + const char *inner_text = (const char*)tidy_buffer.bp; + size_t inner_text_size = tidy_buffer.size; + strip_newline(inner_text, inner_text_size, &inner_text, &inner_text_size); + string_append(str, inner_text, inner_text_size); + } + tidyBufFree(&tidy_buffer); + } else { + int res = add_inner_text_recursive(doc, child, str); + if(res != 0) + return res; + } + } + return 0; +} + static TidyAttr get_attribute_by_name(TidyNode node, const char *name) { assert(name); for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) { @@ -31,12 +112,9 @@ static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSe QuickMediaHtmlNode node; \ node.doc = tdoc; \ node.node = child; \ - node.text = NULL; \ + string_init(&node.text); \ result_callback(&node, userdata); \ - if(node.text){ \ - tidyBufFree(node.text); \ - free(node.text); \ - } \ + string_deinit(&node.text); \ } \ } while(0) @@ -76,18 +154,13 @@ const char* quickmedia_html_node_get_text(QuickMediaHtmlNode 
*self) { - if(self->text) - return (const char*)((TidyBuffer*)self->text)->bp; - - TidyNode child_node = tidyGetChild(self->node); - if(tidyNodeGetType(child_node) != TidyNode_Text) - return NULL; + /* Reuse the text built by an earlier call on this node. The buffer belongs to the node; in the macro hunk above it is released by string_deinit as soon as result_callback returns. */ if(self->text.data) + return self->text.data; - self->text = malloc(sizeof(TidyBuffer)); - tidyBufInit(self->text); - tidyNodeGetText(self->doc, child_node, self->text); + /* Otherwise build the text from every text node below this node. If nothing was appended, text.data is still NULL and NULL is returned. NOTE(review): the single space is appended only when add_inner_text_recursive reports failure, and that helper currently always returns 0, so this branch looks unreachable - confirm the intended condition. */ if(add_inner_text_recursive((TidyDoc)self->doc, (TidyNode)self->node, &self->text) != 0) + string_append(&self->text, " ", 1); - return (const char*)((TidyBuffer*)self->text)->bp; + return self->text.data; } static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { -- cgit v1.2.3