From c17412cce925ce226d3835a2e59b4d9f31b5b3ed Mon Sep 17 00:00:00 2001 From: dec05eba Date: Sat, 25 May 2019 02:17:15 +0200 Subject: Initial commit --- src/HtmlSearch.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/NodeSearch.c | 34 ++++++++++++++ src/XpathParser.c | 88 ++++++++++++++++++++++++++++++++++ src/XpathTokenizer.c | 104 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 356 insertions(+) create mode 100644 src/HtmlSearch.c create mode 100644 src/NodeSearch.c create mode 100644 src/XpathParser.c create mode 100644 src/XpathTokenizer.c (limited to 'src') diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c new file mode 100644 index 0000000..e59dc1e --- /dev/null +++ b/src/HtmlSearch.c @@ -0,0 +1,130 @@ +#include "../include/quickmedia/HtmlSearch.h" +#include "../include/quickmedia/XpathParser.h" + +#include +#include + +static TidyAttr get_attribute_by_name(TidyNode node, const char *name) { + assert(name); + for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) { + const char *attr_name = tidyAttrName(attr); + if(attr_name && strcmp(name, attr_name) == 0) + return attr; + } + return NULL; +} + +static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { + /* We use two loops because we want to find children before grandchildren */ + for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { + const char *child_node_name = tidyNodeGetName(child); + /* A text node doesn't have a name */ + if(!child_node_name) + continue; + + /* Match without node name or node name matches */ + if(!search_data->name || strcmp(search_data->name, child_node_name) == 0) { + #define on_match() do { \ + if(search_data->child) \ + find_child_nodes(tdoc, child, search_data->child, result_callback, userdata); \ + else { \ + QuickMediaHtmlNode node; \ + node.doc = tdoc; \ + node.node = child; \ + node.text = NULL; \ + result_callback(&node, userdata); \ + if(node.text){ \ + tidyBufFree(node.text); \ + free(node.text); \ + } \ + } \ + } while(0) + + /* If we search without param, then it's a match */ + if(!search_data->param_defined) { + on_match(); + continue; + } + + TidyAttr child_attr = get_attribute_by_name(child, search_data->param.name); + /* Couldn't find the param that we want to match against */ + if(!child_attr) + continue; + + const char *attr_value = tidyAttrValue(child_attr); + assert(search_data->param.value); + /* If the param value matches what we want to search for */ + if(attr_value && strcmp(search_data->param.value, attr_value) == 0) { + on_match(); + continue; + } + } + } + + if(search_data->recursive) { + for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { + find_child_nodes(tdoc, child, search_data, result_callback, userdata); + } + } +} + +const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) { + TidyAttr attr = get_attribute_by_name((TidyNode)self->node, attribute_name); + if(!attr) + return NULL; + return tidyAttrValue(attr); +} + +const QuickMediaStringView quickmedia_html_node_get_text(QuickMediaHtmlNode *self) { + QuickMediaStringView string_view; + string_view.data = NULL; + string_view.size = 0; + + if(self->text) { + string_view.data = (const char*)((TidyBuffer*)self->text)->bp; + string_view.size = ((TidyBuffer*)self->text)->size; + return string_view; + } + + TidyNode child_node = tidyGetChild(self->node); + if(tidyNodeGetType(child_node) != TidyNode_Text) + return string_view; + + self->text = malloc(sizeof(TidyBuffer)); + tidyBufInit(self->text); + tidyNodeGetText(self->doc, child_node, self->text); + + string_view.data = (const char*)((TidyBuffer*)self->text)->bp; + string_view.size = ((TidyBuffer*)self->text)->size; + return string_view; +} + +static int quickmedia_html_find_nodes(const char *html_source, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { + assert(html_source); + assert(search_data); + assert(result_callback); + if(!html_source || !search_data || !result_callback) + return -1; + + TidyDoc tdoc = tidyCreate(); + tidyOptSetBool(tdoc, TidyShowWarnings, no); + /* tidyOptSetBool(tdoc, TidyForceOutput, yes); */ + int rc = tidyParseString( tdoc, html_source); + if(rc < 0) { + tidyRelease(tdoc); + return rc; + } + + TidyNode root_node = tidyGetRoot(tdoc); + find_child_nodes(tdoc, root_node, search_data, result_callback, userdata); + tidyRelease(tdoc); + return 0; +} + +int quickmedia_html_find_nodes_xpath(const char *html_source, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { + QuickMediaNodeSearch search_data; + int xpath_result = quickmedia_parse_xpath(xpath, &search_data); + if(xpath_result != 0) + return xpath_result; + return quickmedia_html_find_nodes(html_source, &search_data, result_callback, userdata); +} diff --git a/src/NodeSearch.c b/src/NodeSearch.c new file mode 100644 index 0000000..198b8cd --- /dev/null +++ b/src/NodeSearch.c @@ -0,0 +1,34 @@ +#include "../include/quickmedia/NodeSearch.h" +#include + +void quickmedia_node_search_param_init(QuickMediaNodeSearchParam *self) { + self->name = NULL; + self->value = NULL; +} + +static void quickmedia_node_search_param_deinit(QuickMediaNodeSearchParam *self) { + free(self->name); + free(self->value); + self->name = NULL; + self->value = NULL; +} + +void quickmedia_node_search_init(QuickMediaNodeSearch *self) { + self->name = NULL; + self->recursive = 0; + quickmedia_node_search_param_init(&self->param); + self->param_defined = 0; + self->child = NULL; +} + +void quickmedia_node_search_deinit(QuickMediaNodeSearch *self) { + free(self->name); + self->name = NULL; + quickmedia_node_search_param_deinit(&self->param); + + if(self->child) { + quickmedia_node_search_deinit(self->child); + free(self->child); + self->child = NULL; + } +} diff --git a/src/XpathParser.c b/src/XpathParser.c new file mode 100644 index 0000000..24e1d6e --- /dev/null +++ b/src/XpathParser.c @@ -0,0 +1,88 @@ +#include "../include/quickmedia/XpathParser.h" +#include "../include/quickmedia/XpathTokenizer.h" +#include + +typedef struct { + QuickMediaXpathTokenizer tokenizer; +} QuickMediaXpathParser; + +static void quickmedia_xpath_parser_init(QuickMediaXpathParser *self, const char *xpath) { + quickmedia_xpath_tokenizer_init(&self->tokenizer, xpath); +} + +/* ('[' IDENTIFIER '=' '"' STRING '"' ']')? */ +static int xpath_parse_param(QuickMediaXpathParser *self, QuickMediaNodeSearchParam *result) { + if(quickmedia_xpath_tokenizer_next_if(&self->tokenizer, QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET) != 0) + return 1; + + QuickMediaXpathToken token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_IDENTIFIER) + return -1; + + result->name = quickmedia_xpath_tokenizer_copy_identifier(&self->tokenizer); + + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_EQUAL) + return -2; + + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_STRING) + return -3; + + result->value = quickmedia_xpath_tokenizer_copy_string(&self->tokenizer); + + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET) + return -4; + + return 0; +} + +static int xpath_parse_node(QuickMediaXpathParser *self, QuickMediaNodeSearch *result) { + quickmedia_node_search_init(result); + QuickMediaXpathToken token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + /* // or / */ + if(token == QUICKMEDIA_XPATH_TOKEN_CHILD || token == QUICKMEDIA_XPATH_TOKEN_CHILD_RECURSIVE) { + result->recursive = (token == QUICKMEDIA_XPATH_TOKEN_CHILD_RECURSIVE); + + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_IDENTIFIER) + return -1; + + result->name = quickmedia_xpath_tokenizer_copy_identifier(&self->tokenizer); + + int param_result = xpath_parse_param(self, &result->param); + if(param_result < 0) { + quickmedia_node_search_deinit(result); + return param_result; + } else if(param_result == 0) { + result->param_defined = 1; + } + + result->child = malloc(sizeof(QuickMediaNodeSearch)); + int node_result = xpath_parse_node(self, result->child); + if(node_result > 0) { + node_result = 0; + /* Didn't have child, remove child */ + free(result->child); + result->child = NULL; + } else if(node_result < 0) { + quickmedia_node_search_deinit(result); + } + + return node_result; + } else if(token == QUICKMEDIA_XPATH_TOKEN_END_OF_FILE) { + return 1; + } else { + return -2; + } +} + +int quickmedia_parse_xpath(const char *xpath, QuickMediaNodeSearch *result) { + QuickMediaXpathParser parser; + quickmedia_xpath_parser_init(&parser, xpath); + int parse_result = xpath_parse_node(&parser, result); + if(parse_result > 0) + parse_result = -1; + return parse_result; +} diff --git a/src/XpathTokenizer.c b/src/XpathTokenizer.c new file mode 100644 index 0000000..32bede9 --- /dev/null +++ b/src/XpathTokenizer.c @@ -0,0 +1,104 @@ +#include "../include/quickmedia/XpathTokenizer.h" +#include +#include + +void quickmedia_xpath_tokenizer_init(QuickMediaXpathTokenizer *self, const char *xpath) { + self->code = xpath; + self->identifier.data = NULL; + self->identifier.size = 0; +} + +static int is_alpha(char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +static int is_num(char c) { + return c >= '0' && c <= '9'; +} + +static int is_alphanum(char c) { + return is_alpha(c) || is_num(c); +} + +static const char* find_end_of_string(const char *str, char escape_symbol) { + int escape = 0; + while(*str != '\0') { + char c = *str; + if(c == '\\') { + escape = !escape; + } else if(c == escape_symbol) { + if(!escape) + return str; + } else { + escape = 0; + } + ++str; + } + return str; +} + +QuickMediaXpathToken quickmedia_xpath_tokenizer_next(QuickMediaXpathTokenizer *self) { + char c = *self->code; + if(c == '/') { + ++self->code; + c = *self->code; + if(c == '/') { + ++self->code; + return QUICKMEDIA_XPATH_TOKEN_CHILD_RECURSIVE; + } + return QUICKMEDIA_XPATH_TOKEN_CHILD; + } else if(is_alpha(c)) { + self->identifier.data = self->code; + ++self->code; + while(is_alphanum(*self->code) || *self->code == '_' || *self->code == '-') { + ++self->code; + } + self->identifier.size = self->code - self->identifier.data; + return QUICKMEDIA_XPATH_TOKEN_IDENTIFIER; + } else if(c == '[') { + ++self->code; + return QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET; + } else if(c == ']') { + ++self->code; + return QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET; + } else if(c == '=') { + ++self->code; + return QUICKMEDIA_XPATH_TOKEN_EQUAL; + } else if(c == '"' || c == '\'') { + char escape_symbol = c; + ++self->code; + self->string.data = self->code; + self->code = find_end_of_string(self->string.data, escape_symbol); + if(*self->code == '\0') { + /* Reached end of xpath before end of string */ + return QUICKMEDIA_XPATH_TOKEN_INVALID; + } + self->string.size = self->code - self->string.data; + ++self->code; + return QUICKMEDIA_XPATH_TOKEN_STRING; + } else if(c == '\0') { + return QUICKMEDIA_XPATH_TOKEN_END_OF_FILE; + } else { + /* Invalid symbol @c */ + return QUICKMEDIA_XPATH_TOKEN_INVALID; + } +} + +int quickmedia_xpath_tokenizer_next_if(QuickMediaXpathTokenizer *self, QuickMediaXpathToken token) { + const char *restore_point = self->code; + if(quickmedia_xpath_tokenizer_next(self) == token) + return 0; + self->code = restore_point; + return -1; +} + +char* quickmedia_xpath_tokenizer_copy_identifier(QuickMediaXpathTokenizer *self) { + char *result = malloc(self->identifier.size + 1); + result[self->identifier.size] = '\0'; + memcpy(result, self->identifier.data, self->identifier.size); + return result; +} + +char* quickmedia_xpath_tokenizer_copy_string(QuickMediaXpathTokenizer *self) { + return quickmedia_xpath_tokenizer_copy_identifier(self); +} -- cgit v1.2.3