From 5f283adc8d1a29f420d466e85b216e9d6f4a9822 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Fri, 2 Jul 2021 23:17:06 +0200 Subject: Remove dependency on html tidy --- README.md | 3 + depends/html-parser | 2 +- include/quickmedia/HtmlSearch.h | 53 +++- include/quickmedia/NodeSearch.h | 18 +- include/quickmedia/XpathTokenizer.h | 2 - src/HtmlSearch.c | 512 ++++++++++++++++++++++++++---------- src/NodeSearch.c | 18 +- src/XpathParser.c | 15 +- src/XpathTokenizer.c | 11 - tests/main.c | 46 +++- 10 files changed, 481 insertions(+), 199 deletions(-) diff --git a/README.md b/README.md index 7279c0d..0516066 100644 --- a/README.md +++ b/README.md @@ -1 +1,4 @@ Html search using non-standard xpath, written in C. See tests/main.c + +# Note +This library does not decode html sequences in text and attribute values \ No newline at end of file diff --git a/depends/html-parser b/depends/html-parser index 917f810..fd8f035 160000 --- a/depends/html-parser +++ b/depends/html-parser @@ -1 +1 @@ -Subproject commit 917f810d7f196fef5959bc3096ce7360df961fc0 +Subproject commit fd8f0358ceb43c423a4180e23fcd5b9f6201d829 diff --git a/include/quickmedia/HtmlSearch.h b/include/quickmedia/HtmlSearch.h index bedde03..63f2175 100644 --- a/include/quickmedia/HtmlSearch.h +++ b/include/quickmedia/HtmlSearch.h @@ -2,47 +2,74 @@ #define QUICKMEDIA_HTML_SEARCH_H #include "NodeSearch.h" +#include #include #ifdef __cplusplus extern "C" { #endif +typedef struct QuickMediaHtmlAttribute QuickMediaHtmlAttribute; +typedef struct QuickMediaHtmlNode QuickMediaHtmlNode; +typedef struct QuickMediaHtmlChildNode QuickMediaHtmlChildNode; +typedef struct QuickMediaTextNode QuickMediaTextNode; + typedef struct { char *data; size_t size; size_t capacity; } QuickMediaString; +struct QuickMediaHtmlAttribute { + QuickMediaStringView key; + QuickMediaStringView value; + QuickMediaHtmlAttribute *next; +}; + +struct QuickMediaHtmlNode { + int is_tag; /* 0 = text, 1 = tag */ + QuickMediaStringView name; /* name if the node is a tag, text if the node is a text */ + QuickMediaHtmlAttribute *first_attribute; + QuickMediaHtmlAttribute *last_attribute; + QuickMediaHtmlChildNode *first_child; + QuickMediaHtmlChildNode *last_child; + QuickMediaHtmlNode *parent; +}; + +struct QuickMediaHtmlChildNode { + QuickMediaHtmlNode node; + QuickMediaHtmlChildNode *next; +}; + typedef struct { - const void *doc; - const void *node; - QuickMediaString text; -} QuickMediaHtmlNode; + QuickMediaHtmlNode *node; + QuickMediaString __str; +} QuickMediaMatchNode; typedef struct { - const void *doc; + QuickMediaHtmlNode root_node; } QuickMediaHtmlSearch; /* - Returns NULL if attribute doesn't exist or if it doesn't have any value. + Returns an empty string view if attribute doesn't exist or if it doesn't have any value. The result is only valid within the callback function scope. */ -const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name); +QuickMediaStringView quickmedia_html_node_get_attribute_value(QuickMediaMatchNode *self, const char *attribute_name); /* - Returns NULL if the node doesn't have any text. + Returns an empty string if the node doesn't have any text or if there was an error creating the text. The result is only valid within the callback function scope. */ -const char* quickmedia_html_node_get_text(QuickMediaHtmlNode *self); +QuickMediaStringView quickmedia_html_node_get_text(QuickMediaMatchNode *self); -/* @node is only valid within the callback function scope */ -typedef void (*QuickMediaHtmlSearchResultCallback)(QuickMediaHtmlNode *node, void *userdata); +/* @node is only valid within the callback function scope. Return 0 to continue */ +typedef int (*QuickMediaHtmlSearchResultCallback)(QuickMediaMatchNode *node, void *userdata); -int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source); +/* |html_source| should be in utf8 format and may contain utf8 BOM */ +int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source, size_t size); void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self); -/* Non-standard xpath. Doesn't use '@' symbol for accessing properties */ +/* Non-standard xpath. Doesn't use '@' symbol for accessing properties. Returns non-0 value if there is a syntax error in the xpath */ int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata); #ifdef __cplusplus diff --git a/include/quickmedia/NodeSearch.h b/include/quickmedia/NodeSearch.h index adaac44..9e7fd0c 100644 --- a/include/quickmedia/NodeSearch.h +++ b/include/quickmedia/NodeSearch.h @@ -1,30 +1,32 @@ #ifndef QUICKMEDIA_NODE_SEARCH_H #define QUICKMEDIA_NODE_SEARCH_H +#include + #ifdef __cplusplus extern "C" { #endif typedef struct { - char *name; - char *value; + const char *data; + size_t size; +} QuickMediaStringView; + +typedef struct { + QuickMediaStringView name; + QuickMediaStringView value; int defined; } QuickMediaNodeSearchParam; typedef struct QuickMediaNodeSearch QuickMediaNodeSearch; struct QuickMediaNodeSearch { - char *name; /* optional */ + QuickMediaStringView name; /* optional */ int recursive; QuickMediaNodeSearchParam param; /* optional */ QuickMediaNodeSearch *child; /* optional */ }; -typedef struct { - const char *data; - unsigned long long size; -} QuickMediaStringView; - void quickmedia_node_search_param_init(QuickMediaNodeSearchParam *self); void quickmedia_node_search_init(QuickMediaNodeSearch *self); void quickmedia_node_search_deinit(QuickMediaNodeSearch *self); diff --git a/include/quickmedia/XpathTokenizer.h b/include/quickmedia/XpathTokenizer.h index cada673..62f6d75 100644 --- a/include/quickmedia/XpathTokenizer.h +++ b/include/quickmedia/XpathTokenizer.h @@ -30,8 +30,6 @@ typedef enum { void quickmedia_xpath_tokenizer_init(QuickMediaXpathTokenizer *self, const char *xpath); QuickMediaXpathToken quickmedia_xpath_tokenizer_next(QuickMediaXpathTokenizer *self); int quickmedia_xpath_tokenizer_next_if(QuickMediaXpathTokenizer *self, QuickMediaXpathToken token); -char* quickmedia_xpath_tokenizer_copy_identifier(QuickMediaXpathTokenizer *self); -char* quickmedia_xpath_tokenizer_copy_string(QuickMediaXpathTokenizer *self); #ifdef __cplusplus } diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c index c49ee46..23c4736 100644 --- a/src/HtmlSearch.c +++ b/src/HtmlSearch.c @@ -1,8 +1,10 @@ #include "../include/quickmedia/HtmlSearch.h" #include "../include/quickmedia/XpathParser.h" -#include -#include +#include +#include +#include +#include static void string_init(QuickMediaString *self) { self->data = NULL; @@ -17,75 +19,83 @@ static void string_deinit(QuickMediaString *self) { self->capacity = 0; } -static int string_append(QuickMediaString *self, const char *str, size_t size) { - size_t new_capacity = self->capacity; - if(new_capacity == 0) { - new_capacity = 8; - } +static int string_ensure_capacity(QuickMediaString *self, size_t new_capacity) { + if(self->capacity >= new_capacity) + return 0; + + size_t capacity = self->capacity; + if(capacity == 0) + capacity = 8; - size_t new_size = self->size + size; - while(new_size + 1 > new_capacity) { - new_capacity += (new_capacity >> 1); - } + while(capacity < new_capacity) { + capacity += (capacity >> 1); + } - void *new_data = realloc(self->data, new_capacity); + void *new_data = realloc(self->data, capacity); if(!new_data) { - fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, new_capacity); + fprintf(stderr, "Failed to realloc %p to size: %zu\n", (void*)self->data, capacity); return 1; } - - memcpy((char*)new_data + self->size, str, size); - ((char*)new_data)[self->size + size] = '\0'; - self->data = (char*)new_data; - self->size = new_size; - self->capacity = new_capacity; + + self->data = new_data; + self->capacity = capacity; return 0; } -static void lstrip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) { - size_t i = 0; - while(i < size && str[i] == '\n') { - ++i; - } - *output_str = str + i; - *output_size = size - i; -} +static int string_append(QuickMediaString *self, const char *str, size_t size) { + int res = string_ensure_capacity(self, self->size + size); + if(res != 0) + return res; -static void rstrip_newline(const char *str, size_t size, size_t *output_size) { - ssize_t i = size - 1; - while(i >= 0 && str[i] == '\n') { - --i; - } - *output_size = i + 1; + memcpy((char*)self->data + self->size, str, size); + ((char*)self->data)[self->size + size] = '\0'; + self->size += size; + return 0; } -static void strip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) { - lstrip_newline(str, size, output_str, output_size); - rstrip_newline(*output_str, *output_size, output_size); +static size_t find_first_not_char(const char *str, size_t size, char not_char) { + assert(not_char != '\0'); + size_t i = 0; + for(; i < size && str[i] == not_char; ++i) {} + return i; } -/* Returns pointer to char that is not |not_char|, even if the first matching character is the null terminator. |not_char| can't be '\0' */ -static const char* find_first_not_char(const char *str, char not_char) { - assert(not_char != '\0'); - while(*str == not_char) { ++str; } - return str; +static char string_view_char_or(const QuickMediaStringView *str, size_t index, char fallback) { + if(index < str->size) + return str->data[index]; + else + return fallback; } /* Returns 0 on match */ -static int str_glob_match(const char *str, const char *glob) { +static int str_glob_match(const QuickMediaStringView str, const QuickMediaStringView glob) { + size_t str_index = 0; + size_t glob_index = 0; + + if(str.size == 0) { + /* TODO: What about glob = **** (more than one asterix) */ + if(glob.size == 0 || (glob.size == 1 && glob.data[0] == '*')) + return 0; + else + return 1; + } + for(;;) { - char glob_c = *glob; + char glob_c = string_view_char_or(&glob, glob_index, '\0'); if(glob_c == '*') { - glob = find_first_not_char(glob + 1, '*'); - char next_glob_c = *glob; + glob_index += find_first_not_char(glob.data + glob_index, glob.size - glob_index, '*'); + char next_glob_c = string_view_char_or(&glob, glob_index, '\0'); if(next_glob_c == '\0') return 0; - str = strchr(str, next_glob_c); - if(!str) + const void *s_p = memchr(str.data + str_index, next_glob_c, str.size - str_index); + if(!s_p) return 1; + + const size_t new_str_index = (const char*)s_p - (str.data + str_index); + str_index = new_str_index; } else { - char str_c = *str; + char str_c = string_view_char_or(&str, str_index, '\0'); if(str_c != glob_c) return 1; @@ -93,73 +103,54 @@ static int str_glob_match(const char *str, const char *glob) { return 0; } - ++str; - ++glob; + ++str_index; + ++glob_index; } assert(0); /* shouldn't happen */ return 1; } -static int add_inner_text_recursive(const TidyDoc doc, const TidyNode node, QuickMediaString *str) { - for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { - const char *node_name = tidyNodeGetName(child); - if(node_name && strcmp(node_name, "br") == 0) { - string_append(str, "\n", 1); - } else if(tidyNodeGetType(child) == TidyNode_Start && node_name && strcmp(node_name, "p") == 0) { - if(str->size > 0) - string_append(str, "\n", 1); - } - - if(tidyNodeGetType(child) == TidyNode_Text) { - TidyBuffer tidy_buffer; - tidyBufInit(&tidy_buffer); - if(tidyNodeGetText(doc, child, &tidy_buffer)) { - const char *inner_text = (const char*)tidy_buffer.bp; - size_t inner_text_size = tidy_buffer.size; - strip_newline(inner_text, inner_text_size, &inner_text, &inner_text_size); - string_append(str, inner_text, inner_text_size); - } - tidyBufFree(&tidy_buffer); - } else { - int res = add_inner_text_recursive(doc, child, str); - if(res != 0) - return res; - } - } - return 0; +static int string_views_equal(const QuickMediaStringView str1, const QuickMediaStringView str2) { + if(str2.size == str1.size && memcmp(str2.data, str1.data, str1.size) == 0) + return 0; + else + return 1; } -static TidyAttr get_attribute_by_name(TidyNode node, const char *name) { - assert(name); - for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) { - const char *attr_name = tidyAttrName(attr); - if(attr_name && strcmp(name, attr_name) == 0) +static QuickMediaHtmlAttribute* get_attribute_by_name(QuickMediaHtmlNode *node, QuickMediaStringView name) { + for(QuickMediaHtmlAttribute *attr = node->first_attribute; attr; attr = attr->next) { + if(string_views_equal(attr->key, name) == 0) return attr; } return NULL; } -static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { +static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { + if(!node) + return 0; + /* We use two loops because we want to find children before grandchildren */ - for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { - const char *child_node_name = tidyNodeGetName(child); + for(QuickMediaHtmlChildNode *child = node; child; child = child->next) { /* A text node doesn't have a name */ - if(!child_node_name) + if(!child->node.is_tag || child->node.name.size == 0) continue; /* Match without node name or node name matches */ - if(!search_data->name || strcmp(search_data->name, child_node_name) == 0) { + if(search_data->name.size == 0 || string_views_equal(child->node.name, search_data->name) == 0) { #define on_match() do { \ - if(search_data->child) \ - find_child_nodes(tdoc, child, search_data->child, result_callback, userdata); \ - else { \ - QuickMediaHtmlNode node; \ - node.doc = tdoc; \ - node.node = child; \ - string_init(&node.text); \ - result_callback(&node, userdata); \ - string_deinit(&node.text); \ + if(search_data->child) { \ + if(find_child_nodes(child->node.first_child, search_data->child, result_callback, userdata) != 0) \ + return 1; \ + } else { \ + QuickMediaMatchNode match_node; \ + match_node.node = &child->node; \ + string_init(&match_node.__str); \ + if(result_callback(&match_node, userdata) != 0) { \ + string_deinit(&match_node.__str); \ + return 1; \ + } \ + string_deinit(&match_node.__str); \ } \ } while(0) @@ -169,15 +160,14 @@ static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSe continue; } - TidyAttr child_attr = get_attribute_by_name(child, search_data->param.name); + QuickMediaHtmlAttribute *child_attr = get_attribute_by_name(&child->node, search_data->param.name); /* Couldn't find the param that we want to match against */ if(!child_attr) continue; - const char *attr_value = tidyAttrValue(child_attr); - assert(search_data->param.value); + assert(search_data->param.value.size > 0); /* If the param value matches what we want to search for */ - if(attr_value && str_glob_match(attr_value, search_data->param.value) == 0) { + if(str_glob_match(child_attr->value, search_data->param.value) == 0) { on_match(); continue; } @@ -185,27 +175,13 @@ static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSe } if(search_data->recursive) { - for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { - find_child_nodes(tdoc, child, search_data, result_callback, userdata); + for(QuickMediaHtmlChildNode *child = node; child; child = child->next) { + if(find_child_nodes(child->node.first_child, search_data, result_callback, userdata) != 0) + return 1; } } -} - -const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) { - TidyAttr attr = get_attribute_by_name((TidyNode)self->node, attribute_name); - if(!attr) - return NULL; - return tidyAttrValue(attr); -} -const char* quickmedia_html_node_get_text(QuickMediaHtmlNode *self) { - if(self->text.data) - return self->text.data; - - if(add_inner_text_recursive((TidyDoc)self->doc, (TidyNode)self->node, &self->text) != 0) - string_append(&self->text, " ", 1); - - return self->text.data; + return 0; } static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { @@ -214,29 +190,299 @@ static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNode if(!search_data || !result_callback) return -1; - TidyNode root_node = tidyGetRoot(self->doc); - find_child_nodes(self->doc, root_node, search_data, result_callback, userdata); + find_child_nodes(self->root_node.first_child, search_data, result_callback, userdata); return 0; } -int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source) { - self->doc = tidyCreate(); - tidyOptSetBool(self->doc, TidyShowWarnings, no); - tidyOptSetInt(self->doc, TidyUseCustomTags, 1); - tidyOptSetInt(self->doc, TidyWrapLen, 0); - /* tidyOptSetBool(self->doc, TidyForceOutput, yes); */ - if(tidyParseString(self->doc, html_source) < 0) { - tidyRelease(self->doc); - self->doc = NULL; +static void html_node_child_init(QuickMediaHtmlChildNode *self, QuickMediaHtmlNode *parent); +static void html_node_child_deinit(QuickMediaHtmlChildNode *self); + +static void html_attribute_init(QuickMediaHtmlAttribute *self) { + self->key.data = NULL; + self->key.size = 0; + self->value.data = NULL, + self->value.size = 0; + self->next = NULL; +} + +static void html_attribute_deinit(QuickMediaHtmlAttribute *self) { + if(self->next) { + html_attribute_deinit(self->next); + free(self->next); + self->next = NULL; + } + html_attribute_init(self); +} + +static void html_node_init(QuickMediaHtmlNode *self) { + self->is_tag = 1; + self->name.data = NULL; + self->name.size = 0; + self->first_attribute = NULL; + self->last_attribute = NULL; + self->first_child = NULL; + self->last_child = NULL; + self->parent = NULL; +} + +static void html_node_deinit(QuickMediaHtmlNode *self) { + if(self->first_attribute) { + html_attribute_deinit(self->first_attribute); + free(self->first_attribute); + self->first_attribute = NULL; + } + + if(self->first_child) { + html_node_child_deinit(self->first_child); + free(self->first_child); + self->first_child = NULL; + } + + html_node_init(self); +} + +static int html_node_add_attribute(QuickMediaHtmlNode *self, HtmlStringView key, HtmlStringView value) { + QuickMediaHtmlAttribute *attribute = malloc(sizeof(QuickMediaHtmlAttribute)); + if(!attribute) + return 1; + + html_attribute_init(attribute); + attribute->key.data = key.data; + attribute->key.size = key.size; + attribute->value.data = value.data; + attribute->value.size = value.size; + + if(self->last_attribute) { + self->last_attribute->next = attribute; + self->last_attribute = attribute; + } else { + self->first_attribute = attribute; + self->last_attribute = attribute; } + return 0; } -void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) { - if(self->doc) { - tidyRelease(self->doc); - self->doc = NULL; +void html_node_child_init(QuickMediaHtmlChildNode *self, QuickMediaHtmlNode *parent) { + html_node_init(&self->node); + self->node.parent = parent; + if(parent) { + if(parent->last_child) { + parent->last_child->next = self; + parent->last_child = self; + } else { + parent->first_child = self; + parent->last_child = self; + } + } + self->next = NULL; +} + +void html_node_child_deinit(QuickMediaHtmlChildNode *self) { + if(self->next) { + html_node_child_deinit(self->next); + free(self->next); + self->next = NULL; } + html_node_deinit(&self->node); +} + +static int html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) { + QuickMediaHtmlNode **html_node_p = userdata; + QuickMediaHtmlNode *html_node = *html_node_p; + + switch(parse_type) { + case HTML_PARSE_TAG_START: { + QuickMediaHtmlChildNode *child_node = malloc(sizeof(QuickMediaHtmlChildNode)); + if(!child_node) + return 1; + html_node_child_init(child_node, html_node); + child_node->node.name.data = html_parser->tag_name.data; + child_node->node.name.size = html_parser->tag_name.size; + *html_node_p = &child_node->node; + break; + } + case HTML_PARSE_TAG_END: { + if(html_node->parent) + *html_node_p = html_node->parent; + break; + } + case HTML_PARSE_ATTRIBUTE: { + if(html_node_add_attribute(html_node, html_parser->attribute_key, html_parser->attribute_value) != 0) + return 1; + break; + } + case HTML_PARSE_TEXT: + /* fallthrough */ + case HTML_PARSE_JAVASCRIPT_CODE: { + QuickMediaHtmlChildNode *child_node = malloc(sizeof(QuickMediaHtmlChildNode)); + if(!child_node) + return 1; + html_node_child_init(child_node, html_node); + child_node->node.is_tag = 0; + child_node->node.name.data = html_parser->text.data; + child_node->node.name.size = html_parser->text.size; + break; + } + } + + return 0; +} + +QuickMediaStringView quickmedia_html_node_get_attribute_value(QuickMediaMatchNode *self, const char *attribute_name) { + QuickMediaStringView attr_name; + attr_name.data = attribute_name; + attr_name.size = strlen(attribute_name); + + QuickMediaHtmlAttribute *attr = get_attribute_by_name(self->node, attr_name); + if(attr) { + return attr->value; + } else { + QuickMediaStringView attr_value; + attr_value.data = NULL; + attr_value.size = 0; + return attr_value; + } +} + +static int is_whitespace(int c) { + switch(c) { + case ' ': + case '\n': + case '\r': + case '\t': + case '\v': + return 1; + default: + return 0; + } +} + +static int is_newline(int c) { + return c == '\n' || c == '\r'; +} + +static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) { + size_t i = 0; + while(i < size && strip_filter_func(str[i])) { + ++i; + } + *output_str = str + i; + *output_size = size - i; +} + +static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) { + ssize_t i = size - 1; + while(i >= 0 && strip_filter_func(str[i])) { + --i; + } + *output_size = i + 1; +} + +static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) { + lstrip(str, size, output_str, output_size, strip_filter_func); + rstrip(*output_str, *output_size, output_size, strip_filter_func); +} + +static int merge_inner_text(QuickMediaHtmlNode *node, QuickMediaString *str) { + if(node->is_tag) { + int newline = 0; + if(node->name.size == 2 && memcmp(node->name.data, "br", 2) == 0) { + if(string_append(str, "\n", 1) != 0) + return 1; + newline = 1; + } else if(node->name.size == 2 && node->name.data[0] == 'h' && (node->name.data[1] >= '1' && node->name.data[1] <= '6')) { + if(str->size > 0) { + if(string_append(str, "\n", 1) != 0) + return 1; + } + newline = 1; + } else if(node->name.size == 1 && node->name.data[0] == 'p') { + if(str->size > 0) { + if(string_append(str, "\n", 1) != 0) + return 1; + } + newline = 1; + } + + size_t prev_size = str->size; + for(QuickMediaHtmlChildNode *child = node->first_child; child; child = child->next) { + merge_inner_text(&child->node, str); + } + + if(newline && str->size > prev_size && str->size > 0) { + if(string_append(str, "\n", 1) != 0) + return 1; + } + } else { + const char *inner_text = node->name.data; + size_t inner_text_size = node->name.size; + strip(inner_text, inner_text_size, &inner_text, &inner_text_size, is_newline); + if(inner_text_size > 0) { + if(string_append(str, node->name.data, node->name.size) != 0) + return 1; + } + } + return 0; +} + +QuickMediaStringView quickmedia_html_node_get_text(QuickMediaMatchNode *self) { + if(self->__str.data) { + QuickMediaStringView text; + text.data = self->__str.data; + text.size = self->__str.size; + strip(text.data, text.size, &text.data, &text.size, is_whitespace); + return text; + } + + if(!self->node->first_child) { + QuickMediaStringView text; + text.data = NULL; + text.size = 0; + return text; + } + + /* If the only child is the text node then there is no need to create a copy of it */ + /* TODO: Strip newline and whitespace */ + if(!self->node->first_child->next && !self->node->first_child->node.is_tag) { + QuickMediaStringView text = self->node->first_child->node.name; + strip(text.data, text.size, &text.data, &text.size, is_whitespace); + return text; + } + + if(merge_inner_text(self->node, &self->__str) != 0) { + QuickMediaStringView text; + text.data = NULL; + text.size = 0; + return text; + } + + QuickMediaStringView text; + text.data = self->__str.data; + text.size = self->__str.size; + strip(text.data, text.size, &text.data, &text.size, is_whitespace); + return text; +} + +int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source, size_t size) { + /* Utf8 BOM */ + if(size >= 3 && memcmp(html_source, "\xef\xbb\xbf", 3) == 0) { + html_source += 3; + size -= 3; + } + + QuickMediaHtmlNode *html_node = &self->root_node; + html_node_init(html_node); + if(html_parser_parse(html_source, size, html_parse_callback, &html_node) != 0) { + quickmedia_html_search_deinit(self); + return 1; + } + + return 0; +} + +void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) { + html_node_deinit(&self->root_node); } int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { diff --git a/src/NodeSearch.c b/src/NodeSearch.c index 0a36215..bddb26c 100644 --- a/src/NodeSearch.c +++ b/src/NodeSearch.c @@ -2,28 +2,28 @@ #include void quickmedia_node_search_param_init(QuickMediaNodeSearchParam *self) { - self->name = NULL; - self->value = NULL; + self->name.data = NULL; + self->name.size = 0; + self->value.data = NULL; + self->value.size = 0; self->defined = 0; } static void quickmedia_node_search_param_deinit(QuickMediaNodeSearchParam *self) { - free(self->name); - free(self->value); - self->name = NULL; - self->value = NULL; + quickmedia_node_search_param_init(self); } void quickmedia_node_search_init(QuickMediaNodeSearch *self) { - self->name = NULL; + self->name.data = NULL; + self->name.size = 0; self->recursive = 0; quickmedia_node_search_param_init(&self->param); self->child = NULL; } void quickmedia_node_search_deinit(QuickMediaNodeSearch *self) { - free(self->name); - self->name = NULL; + self->name.data = NULL; + self->name.size = 0; quickmedia_node_search_param_deinit(&self->param); if(self->child) { diff --git a/src/XpathParser.c b/src/XpathParser.c index 4326e85..0dbe270 100644 --- a/src/XpathParser.c +++ b/src/XpathParser.c @@ -19,7 +19,7 @@ static int xpath_parse_param(QuickMediaXpathParser *self, QuickMediaNodeSearchPa if(token != QUICKMEDIA_XPATH_TOKEN_IDENTIFIER) return -1; - result->name = quickmedia_xpath_tokenizer_copy_identifier(&self->tokenizer); + result->name = self->tokenizer.identifier; token = quickmedia_xpath_tokenizer_next(&self->tokenizer); if(token != QUICKMEDIA_XPATH_TOKEN_EQUAL) @@ -29,7 +29,7 @@ static int xpath_parse_param(QuickMediaXpathParser *self, QuickMediaNodeSearchPa if(token != QUICKMEDIA_XPATH_TOKEN_STRING) return -3; - result->value = quickmedia_xpath_tokenizer_copy_string(&self->tokenizer); + result->value = self->tokenizer.string; token = quickmedia_xpath_tokenizer_next(&self->tokenizer); if(token != QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET) @@ -50,23 +50,22 @@ static int xpath_parse_node(QuickMediaXpathParser *self, QuickMediaNodeSearch *r if(token != QUICKMEDIA_XPATH_TOKEN_IDENTIFIER) return -1; - result->name = quickmedia_xpath_tokenizer_copy_identifier(&self->tokenizer); + result->name = self->tokenizer.identifier; int param_result = xpath_parse_param(self, &result->param); - if(param_result < 0) { - quickmedia_node_search_deinit(result); + if(param_result < 0) return param_result; - } result->child = malloc(sizeof(QuickMediaNodeSearch)); + if(!result->child) + return -1; + int node_result = xpath_parse_node(self, result->child); if(node_result > 0) { node_result = 0; /* Didn't have child, remove child */ free(result->child); result->child = NULL; - } else if(node_result < 0) { - quickmedia_node_search_deinit(result); } return node_result; diff --git a/src/XpathTokenizer.c b/src/XpathTokenizer.c index 32bede9..ae17939 100644 --- a/src/XpathTokenizer.c +++ b/src/XpathTokenizer.c @@ -91,14 +91,3 @@ int quickmedia_xpath_tokenizer_next_if(QuickMediaXpathTokenizer *self, QuickMedi self->code = restore_point; return -1; } - -char* quickmedia_xpath_tokenizer_copy_identifier(QuickMediaXpathTokenizer *self) { - char *result = malloc(self->identifier.size + 1); - result[self->identifier.size] = '\0'; - memcpy(result, self->identifier.data, self->identifier.size); - return result; -} - -char* quickmedia_xpath_tokenizer_copy_string(QuickMediaXpathTokenizer *self) { - return quickmedia_xpath_tokenizer_copy_identifier(self); -} diff --git a/tests/main.c b/tests/main.c index 7888ea1..2a08ec7 100644 --- a/tests/main.c +++ b/tests/main.c @@ -1,34 +1,51 @@ -#include #include "../include/quickmedia/HtmlSearch.h" -#include +#include #include +#include +#include -static char* get_file_content(const char *filepath) { +static char* get_file_content(const char *filepath, size_t *filesize) { FILE *file = fopen(filepath, "rb"); assert(file); fseek(file, 0, SEEK_END); - size_t filesize = ftell(file); + *filesize = ftell(file); fseek(file, 0, SEEK_SET); - char *buffer = malloc(filesize + 1); - buffer[filesize] = '\0'; - fread(buffer, 1, filesize, file); + char *buffer = malloc((*filesize) + 1); + buffer[*filesize] = '\0'; + fread(buffer, 1, *filesize, file); return buffer; } -static void result_callback(QuickMediaHtmlNode *node, void *userdata) { - const char *href = quickmedia_html_node_get_attribute_value(node, "href"); - const char *text = quickmedia_html_node_get_text(node); - printf("a href: %s, node value: %s\n", href, text); +static int result_callback(QuickMediaMatchNode *node, void *userdata) { + QuickMediaStringView href = quickmedia_html_node_get_attribute_value(node, "href"); + QuickMediaStringView text = quickmedia_html_node_get_text(node); + printf("a href: %.*s, node value: %.*s\n", (int)href.size, href.data, (int)text.size, text.data); + return 0; +} + +static int result_callback_nested_text(QuickMediaMatchNode *node, void *userdata) { + QuickMediaStringView text = quickmedia_html_node_get_text(node); + printf("text: %.*s\n", (int)text.size, text.data); + return 0; +} + +static void test_nested_nodes_get_text() { + const char *html_source = "
hello

text

world
"; + QuickMediaHtmlSearch html_search; + quickmedia_html_search_init(&html_search, html_source, strlen(html_source)); + quickmedia_html_find_nodes_xpath(&html_search, "//div[class='item']", result_callback_nested_text, NULL); + quickmedia_html_search_deinit(&html_search); } -int main(int argc, char **argv) { - char *file_content = get_file_content("test_files/test.html"); +int main() { + size_t filesize = 0; + char *file_content = get_file_content("test_files/test.html", &filesize); QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, file_content); + int result = quickmedia_html_search_init(&html_search, file_content, filesize); if(result != 0) goto cleanup; result = quickmedia_html_find_nodes_xpath(&html_search, "//h3[class=\"story_name\"]//a", result_callback, NULL); @@ -38,5 +55,6 @@ int main(int argc, char **argv) { cleanup: quickmedia_html_search_deinit(&html_search); free(file_content); + test_nested_nodes_get_text(); return result; } -- cgit v1.2.3