From c5f4811d0ba74715c8e128da133248cc399a6a6a Mon Sep 17 00:00:00 2001
From: Aleksi Lindeman
Date: Sat, 25 May 2019 03:58:20 +0200
Subject: Allow reusing doc for multiple xpath searches

---
Review notes (not part of the commit message). This patch was amended during
review, so it is no longer identical to commit c5f4811d and the blob ids on the
"index" lines refer to the unamended version:
 * quickmedia_html_search_init returned 0 even when the html could not be
   parsed. It now returns the tidyParseString error code like the code did
   before this patch, and it checks the result of tidyCreate.
 * quickmedia_html_find_nodes passed a NULL doc to tidyGetRoot after a failed
   init. It now returns -1 when @self or @self->doc is NULL.
 * The test overwrote the result of the first search with the result of the
   second one. It now stops at the first failure.
 * The new functions are documented in the header.

 include/quickmedia/HtmlSearch.h | 15 ++++++++++++++-
 src/HtmlSearch.c                | 47 +++++++++++++++++++++++++++++++++++++++--------
 tests/main.c                    | 14 +++++++++++++-
 3 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/include/quickmedia/HtmlSearch.h b/include/quickmedia/HtmlSearch.h
index e3bea33..568e101 100644
--- a/include/quickmedia/HtmlSearch.h
+++ b/include/quickmedia/HtmlSearch.h
@@ -13,6 +13,11 @@ typedef struct {
     void *text;
 } QuickMediaHtmlNode;
 
+/* Holds a parsed html document. @doc is NULL after a failed init and after deinit */
+typedef struct {
+    const void *doc;
+} QuickMediaHtmlSearch;
+
 /* Returns NULL if attribute doesn't exist or if it doesn't have any value */
 const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name);
 
@@ -22,7 +27,15 @@ const QuickMediaStringView quickmedia_html_node_get_text(QuickMediaHtmlNode *sel
 
 /* @node is only valid within the callback function scope */
 typedef void (*QuickMediaHtmlSearchResultCallback)(QuickMediaHtmlNode *node, void *userdata);
-int quickmedia_html_find_nodes_xpath(const char *html_source, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata);
+/*
+    Parses @html_source. Returns 0 on success and a non-zero value on failure.
+    quickmedia_html_search_deinit should be called when @self is no longer needed.
+*/
+int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source);
+void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self);
+
+/* Can be called several times with the same @self, the html is only parsed by quickmedia_html_search_init */
+int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata);
 
 #ifdef __cplusplus
 }
diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c
index c3608dd..7868f32 100644
--- a/src/HtmlSearch.c
+++ b/src/HtmlSearch.c
@@ -99,35 +99,66 @@ const QuickMediaStringView quickmedia_html_node_get_text(QuickMediaHtmlNode *sel
     return string_view;
 }
 
-static int quickmedia_html_find_nodes(const char *html_source, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
-    assert(html_source);
+/* Searches the document that was loaded into @self by quickmedia_html_search_init */
+static int quickmedia_html_find_nodes(QuickMediaHtmlSearch *self, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
+    assert(self);
     assert(search_data);
     assert(result_callback);
-    if(!html_source || !search_data || !result_callback)
+    /* @self->doc is NULL if init failed or if deinit has already been called */
+    if(!self || !self->doc || !search_data || !result_callback)
         return -1;
+
+    TidyNode root_node = tidyGetRoot(self->doc);
+    find_child_nodes(self->doc, root_node, search_data, result_callback, userdata);
+    return 0;
+}
 
+/*
+    Parses @html_source and stores the document in @self.
+    Returns 0 on success. Returns a non-zero value on failure, in which case
+    @self->doc is NULL and searching with @self fails.
+*/
+int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source) {
+    assert(self);
+    assert(html_source);
+    if(!self)
+        return -1;
+
+    self->doc = NULL;
+    if(!html_source)
+        return -1;
+
     TidyDoc tdoc = tidyCreate();
+    if(!tdoc)
+        return -1;
+
     tidyOptSetBool(tdoc, TidyShowWarnings, no);
     /* tidyOptSetBool(tdoc, TidyForceOutput, yes); */
     int rc = tidyParseString( tdoc, html_source);
     if(rc < 0) {
         tidyRelease(tdoc);
         return rc;
     }
 
-    TidyNode root_node = tidyGetRoot(tdoc);
-    find_child_nodes(tdoc, root_node, search_data, result_callback, userdata);
-    tidyRelease(tdoc);
+    self->doc = tdoc;
     return 0;
 }
 
-int quickmedia_html_find_nodes_xpath(const char *html_source, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
+/* Releases the document. Safe to call after a failed quickmedia_html_search_init */
+void quickmedia_html_search_deinit(QuickMediaHtmlSearch *self) {
+    if(self && self->doc) {
+        tidyRelease(self->doc);
+        self->doc = NULL;
+    }
+}
+
+int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
     QuickMediaNodeSearch search_data;
     quickmedia_node_search_init(&search_data);
     int result = quickmedia_parse_xpath(xpath, &search_data);
     if(result != 0)
         goto cleanup;
-    result = quickmedia_html_find_nodes(html_source, &search_data, result_callback, userdata);
+    result = quickmedia_html_find_nodes(self, &search_data, result_callback, userdata);
     cleanup:
     quickmedia_node_search_deinit(&search_data);
     return result;
diff --git a/tests/main.c b/tests/main.c
index 5b697a9..4d16ad6 100644
--- a/tests/main.c
+++ b/tests/main.c
@@ -26,7 +26,19 @@ static void result_callback(QuickMediaHtmlNode *node, void *userdata) {
 
 int main(int argc, char **argv) {
     char *file_content = get_file_content("test_files/test.html");
-    int result = quickmedia_html_find_nodes_xpath(file_content, "//h3[class=\"story_name\"]//a", result_callback, NULL);
+    QuickMediaHtmlSearch html_search;
+
+    int result = quickmedia_html_search_init(&html_search, file_content);
+    if(result != 0)
+        goto cleanup;
+    result = quickmedia_html_find_nodes_xpath(&html_search, "//h3[class=\"story_name\"]//a", result_callback, NULL);
+    if(result != 0)
+        goto cleanup;
+    /* Test that the object can be reused without reloading html doc */
+    result = quickmedia_html_find_nodes_xpath(&html_search, "//h3[class=\"story_name\"]//a", result_callback, NULL);
+
+    cleanup:
+    quickmedia_html_search_deinit(&html_search);
     free(file_content);
     return result;
 }
-- 
cgit v1.2.3