about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author dec05eba <dec05eba@protonmail.com> 2019-05-25 02:17:15 +0200
committer dec05eba <dec05eba@protonmail.com> 2020-07-06 06:54:59 +0200
commit c17412cce925ce226d3835a2e59b4d9f31b5b3ed (patch)
tree bdef2c8cbbda218f7cb75fbc39e6c786d7dfeaf9 /src
Initial commit
Diffstat (limited to 'src')
-rw-r--r-- src/HtmlSearch.c 130
-rw-r--r-- src/NodeSearch.c 34
-rw-r--r-- src/XpathParser.c 88
-rw-r--r-- src/XpathTokenizer.c 104
4 files changed, 356 insertions, 0 deletions
diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c
new file mode 100644
index 0000000..e59dc1e
--- /dev/null
+++ b/src/HtmlSearch.c
@@ -0,0 +1,130 @@
+#include "../include/quickmedia/HtmlSearch.h"
+#include "../include/quickmedia/XpathParser.h"
+
+#include <tidy.h>
+#include <tidybuffio.h>
+
+/*
+ * Look up an attribute of @node by exact (strcmp) name comparison.
+ * Returns the first attribute whose name equals @name, or a null attribute
+ * handle when none matches. @name must not be NULL.
+ */
+static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
+    assert(name);
+    TidyAttr current = tidyAttrFirst(node);
+    while(current) {
+        const char *current_name = tidyAttrName(current);
+        /* An attribute whose name is reported as NULL can never match */
+        if(current_name != NULL && strcmp(current_name, name) == 0)
+            break;
+        current = tidyAttrNext(current);
+    }
+    /* Either the matching attribute or the null handle that ended the walk */
+    return current;
+}
+
+/*
+ * Hand one matched node to @result_callback.
+ * The result's text starts out as NULL; if it is non-NULL once the callback
+ * returns (quickmedia_html_node_get_text allocates it lazily), the tidy buffer
+ * and its heap storage are released here.
+ */
+static void report_match(TidyDoc tdoc, TidyNode match, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
+    QuickMediaHtmlNode result;
+    result.doc = tdoc;
+    result.node = match;
+    result.text = NULL;
+    result_callback(&result, userdata);
+    if(result.text) {
+        tidyBufFree(result.text);
+        free(result.text);
+    }
+}
+
+/*
+ * Returns non-zero when @candidate satisfies the name and (optional) param
+ * constraint of a single search step.
+ */
+static int node_matches_search(TidyNode candidate, const QuickMediaNodeSearch *search_data) {
+    const char *candidate_name = tidyNodeGetName(candidate);
+    /* A text node doesn't have a name */
+    if(!candidate_name)
+        return 0;
+
+    /* A search step without a node name accepts every named node */
+    if(search_data->name && strcmp(search_data->name, candidate_name) != 0)
+        return 0;
+
+    /* If we search without param, then it's a match */
+    if(!search_data->param_defined)
+        return 1;
+
+    TidyAttr attr = get_attribute_by_name(candidate, search_data->param.name);
+    /* Couldn't find the param that we want to match against */
+    if(!attr)
+        return 0;
+
+    const char *attr_value = tidyAttrValue(attr);
+    assert(search_data->param.value);
+    return attr_value && strcmp(search_data->param.value, attr_value) == 0;
+}
+
+/*
+ * Walk the children of @node and apply the search step @search_data.
+ * A matching child either continues the search with the next step
+ * (search_data->child) or, when this is the last step, is reported through
+ * @result_callback. When the step is recursive, the same step is then applied
+ * to every child's subtree.
+ */
+static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
+    /* We use two loops because we want to find children before grandchildren */
+    for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
+        if(!node_matches_search(child, search_data))
+            continue;
+
+        if(search_data->child)
+            find_child_nodes(tdoc, child, search_data->child, result_callback, userdata);
+        else
+            report_match(tdoc, child, result_callback, userdata);
+    }
+
+    if(search_data->recursive) {
+        for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
+            find_child_nodes(tdoc, child, search_data, result_callback, userdata);
+        }
+    }
+}
+
+/*
+ * Return the value of the attribute called @attribute_name on the node wrapped
+ * by @self, or NULL when the node has no attribute with that name.
+ */
+const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) {
+    TidyAttr found = get_attribute_by_name((TidyNode)self->node, attribute_name);
+    return found ? tidyAttrValue(found) : NULL;
+}
+
+/*
+ * Return a view of the text of the first child of the node wrapped by @self.
+ *
+ * The text is fetched from tidy on the first call and cached in self->text;
+ * later calls return a view of the cached buffer. The view is empty
+ * (data == NULL, size == 0) when the first child is missing or is not a text
+ * node, or when the cache buffer cannot be allocated.
+ * The returned data points into self->text, which this function never frees.
+ */
+const QuickMediaStringView quickmedia_html_node_get_text(QuickMediaHtmlNode *self) {
+    QuickMediaStringView string_view;
+    string_view.data = NULL;
+    string_view.size = 0;
+
+    if(!self->text) {
+        TidyNode child_node = tidyGetChild(self->node);
+        if(!child_node || tidyNodeGetType(child_node) != TidyNode_Text)
+            return string_view;
+
+        TidyBuffer *buffer = malloc(sizeof *buffer);
+        /* Out of memory: report "no text" instead of dereferencing NULL */
+        if(!buffer)
+            return string_view;
+
+        tidyBufInit(buffer);
+        tidyNodeGetText(self->doc, child_node, buffer);
+        /* Publish the cache only once the buffer is fully set up */
+        self->text = buffer;
+    }
+
+    string_view.data = (const char*)((TidyBuffer*)self->text)->bp;
+    string_view.size = ((TidyBuffer*)self->text)->size;
+    return string_view;
+}
+
+/*
+ * Parse @html_source with tidy and run the search described by @search_data
+ * starting from the document root; every match is passed to @result_callback
+ * together with @userdata.
+ * Returns 0 after a completed search, -1 when a required argument is NULL, or
+ * the negative status returned by tidyParseString when parsing fails.
+ */
+static int quickmedia_html_find_nodes(const char *html_source, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
+    assert(html_source);
+    assert(search_data);
+    assert(result_callback);
+    /* assert() compiles to nothing under NDEBUG, so validate again */
+    if(html_source == NULL || search_data == NULL || result_callback == NULL)
+        return -1;
+
+    TidyDoc tdoc = tidyCreate();
+    tidyOptSetBool(tdoc, TidyShowWarnings, no);
+
+    int status = tidyParseString(tdoc, html_source);
+    /* Only a negative status aborts the search; anything else is reported as 0 */
+    if(status >= 0) {
+        find_child_nodes(tdoc, tidyGetRoot(tdoc), search_data, result_callback, userdata);
+        status = 0;
+    }
+
+    tidyRelease(tdoc);
+    return status;
+}
+
+/*
+ * Search @html_source for the nodes selected by @xpath and pass each match to
+ * @result_callback together with @userdata.
+ * Returns 0 on success, the non-zero result of quickmedia_parse_xpath when the
+ * xpath cannot be parsed, or the result of the html search otherwise.
+ */
+int quickmedia_html_find_nodes_xpath(const char *html_source, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) {
+    QuickMediaNodeSearch search_data;
+    int result = quickmedia_parse_xpath(xpath, &search_data);
+    if(result != 0)
+        return result;
+
+    result = quickmedia_html_find_nodes(html_source, &search_data, result_callback, userdata);
+    /* A successful parse owns heap data (names, param strings, child steps); release it */
+    quickmedia_node_search_deinit(&search_data);
+    return result;
+}
diff --git a/src/NodeSearch.c b/src/NodeSearch.c
new file mode 100644
index 0000000..198b8cd
--- /dev/null
+++ b/src/NodeSearch.c
@@ -0,0 +1,34 @@
+#include "../include/quickmedia/NodeSearch.h"
+#include <stdlib.h>
+
+/* Put @self into the empty state: neither a name nor a value is set. */
+void quickmedia_node_search_param_init(QuickMediaNodeSearchParam *self) {
+    self->value = NULL;
+    self->name = NULL;
+}
+
+/*
+ * Free the name and value strings of @self and leave both pointers NULL, so
+ * running this again on the same object is harmless.
+ */
+static void quickmedia_node_search_param_deinit(QuickMediaNodeSearchParam *self) {
+    free(self->value);
+    free(self->name);
+    /* Back to the empty state (both pointers NULL) */
+    quickmedia_node_search_param_init(self);
+}
+
+/*
+ * Put @self into the empty state: no name, not recursive, no param and no
+ * child step.
+ */
+void quickmedia_node_search_init(QuickMediaNodeSearch *self) {
+    self->child = NULL;
+    self->name = NULL;
+    self->param_defined = 0;
+    self->recursive = 0;
+    quickmedia_node_search_param_init(&self->param);
+}
+
+/*
+ * Release everything owned by @self: its name, its param strings and the whole
+ * chain of child steps (each child is deinitialized and then freed).
+ * The freed pointers are reset to NULL; the recursive and param_defined flags
+ * are left untouched.
+ */
+void quickmedia_node_search_deinit(QuickMediaNodeSearch *self) {
+    QuickMediaNodeSearch *next_step = self->child;
+
+    free(self->name);
+    self->name = NULL;
+    quickmedia_node_search_param_deinit(&self->param);
+
+    if(!next_step)
+        return;
+
+    quickmedia_node_search_deinit(next_step);
+    free(next_step);
+    self->child = NULL;
+}
diff --git a/src/XpathParser.c b/src/XpathParser.c
new file mode 100644
index 0000000..24e1d6e
--- /dev/null
+++ b/src/XpathParser.c
@@ -0,0 +1,88 @@
+#include "../include/quickmedia/XpathParser.h"
+#include "../include/quickmedia/XpathTokenizer.h"
+#include <stdlib.h>
+
+typedef struct { /* State of one xpath parse */
+ QuickMediaXpathTokenizer tokenizer; /* Supplies the tokens read from the xpath text */
+} QuickMediaXpathParser;
+
+/* Prepare @self for parsing @xpath by initializing its tokenizer with that text. */
+static void quickmedia_xpath_parser_init(QuickMediaXpathParser *self, const char *xpath) {
+    QuickMediaXpathTokenizer *tokenizer = &self->tokenizer;
+    quickmedia_xpath_tokenizer_init(tokenizer, xpath);
+}
+
+/*
+ * Grammar: ('[' IDENTIFIER '=' STRING ']')?
+ *
+ * Returns 1 when no '[' follows (there is no param and nothing is consumed),
+ * 0 when a complete param was stored in @result, and a negative value for a
+ * malformed param: -1 missing identifier, -2 missing '=', -3 missing string,
+ * -4 missing ']'.
+ * On a negative return @result may already hold an allocated name and/or
+ * value; nothing is freed here.
+ */
+static int xpath_parse_param(QuickMediaXpathParser *self, QuickMediaNodeSearchParam *result) {
+    QuickMediaXpathTokenizer *tokenizer = &self->tokenizer;
+
+    /* The whole construct is optional */
+    if(quickmedia_xpath_tokenizer_next_if(tokenizer, QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET) != 0)
+        return 1;
+
+    if(quickmedia_xpath_tokenizer_next(tokenizer) != QUICKMEDIA_XPATH_TOKEN_IDENTIFIER)
+        return -1;
+    /* Copy the token text right away, before the next token is read */
+    result->name = quickmedia_xpath_tokenizer_copy_identifier(tokenizer);
+
+    if(quickmedia_xpath_tokenizer_next(tokenizer) != QUICKMEDIA_XPATH_TOKEN_EQUAL)
+        return -2;
+
+    if(quickmedia_xpath_tokenizer_next(tokenizer) != QUICKMEDIA_XPATH_TOKEN_STRING)
+        return -3;
+    result->value = quickmedia_xpath_tokenizer_copy_string(tokenizer);
+
+    if(quickmedia_xpath_tokenizer_next(tokenizer) != QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET)
+        return -4;
+
+    return 0;
+}
+
+/*
+ * Parse one step of the xpath, ('/' | '//') IDENTIFIER param?, followed by the
+ * remaining steps, which are parsed recursively into result->child.
+ *
+ * @result is always initialized first. Returns 0 when a step was parsed,
+ * 1 when the input is already at its end (no step; @result stays empty) and a
+ * negative value on error: -1 missing node name, -2 unexpected token,
+ * -3 out of memory, or the negative code of a malformed param or child step.
+ * After a failure in the param or in a later step, @result has been
+ * deinitialized and owns no memory.
+ */
+static int xpath_parse_node(QuickMediaXpathParser *self, QuickMediaNodeSearch *result) {
+    quickmedia_node_search_init(result);
+    QuickMediaXpathToken token = quickmedia_xpath_tokenizer_next(&self->tokenizer);
+    if(token == QUICKMEDIA_XPATH_TOKEN_END_OF_FILE)
+        return 1;
+
+    /* // or / */
+    if(token != QUICKMEDIA_XPATH_TOKEN_CHILD && token != QUICKMEDIA_XPATH_TOKEN_CHILD_RECURSIVE)
+        return -2;
+
+    result->recursive = (token == QUICKMEDIA_XPATH_TOKEN_CHILD_RECURSIVE);
+
+    token = quickmedia_xpath_tokenizer_next(&self->tokenizer);
+    if(token != QUICKMEDIA_XPATH_TOKEN_IDENTIFIER)
+        return -1;
+
+    result->name = quickmedia_xpath_tokenizer_copy_identifier(&self->tokenizer);
+
+    int param_result = xpath_parse_param(self, &result->param);
+    if(param_result < 0) {
+        quickmedia_node_search_deinit(result);
+        return param_result;
+    }
+    if(param_result == 0)
+        result->param_defined = 1;
+
+    result->child = malloc(sizeof *result->child);
+    if(!result->child) {
+        /* The recursive call below writes through the pointer, so stop here */
+        quickmedia_node_search_deinit(result);
+        return -3;
+    }
+
+    int node_result = xpath_parse_node(self, result->child);
+    if(node_result > 0) {
+        /* Didn't have child, remove child */
+        free(result->child);
+        result->child = NULL;
+        return 0;
+    }
+
+    if(node_result < 0)
+        quickmedia_node_search_deinit(result);
+
+    return node_result;
+}
+
+/*
+ * Parse @xpath into the chain of search steps stored in @result.
+ * Returns 0 on success and a negative value on failure. An xpath that contains
+ * no step at all (the parser immediately reports end of input) is a failure
+ * and yields -1.
+ */
+int quickmedia_parse_xpath(const char *xpath, QuickMediaNodeSearch *result) {
+    QuickMediaXpathParser parser;
+    quickmedia_xpath_parser_init(&parser, xpath);
+
+    int status = xpath_parse_node(&parser, result);
+    /* A positive status means "nothing to parse"; callers only see 0 or < 0 */
+    return status > 0 ? -1 : status;
+}
diff --git a/src/XpathTokenizer.c b/src/XpathTokenizer.c
new file mode 100644
index 0000000..32bede9
--- /dev/null
+++ b/src/XpathTokenizer.c
@@ -0,0 +1,104 @@
+#include "../include/quickmedia/XpathTokenizer.h"
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Start tokenizing at the beginning of @xpath. The text is referenced, not
+ * copied. The identifier view starts out empty (NULL data, size 0).
+ */
+void quickmedia_xpath_tokenizer_init(QuickMediaXpathTokenizer *self, const char *xpath) {
+    self->identifier.size = 0;
+    self->identifier.data = NULL;
+    self->code = xpath;
+}
+
+/* Returns 1 when @c is an ASCII letter (a-z or A-Z), otherwise 0. */
+static int is_alpha(char c) {
+    if(c >= 'A' && c <= 'Z')
+        return 1;
+    return c >= 'a' && c <= 'z';
+}
+
+/* Returns 1 when @c is an ASCII decimal digit (0-9), otherwise 0. */
+static int is_num(char c) {
+    return !(c < '0' || c > '9');
+}
+
+/* Returns 1 when @c is an ASCII letter or decimal digit, otherwise 0. */
+static int is_alphanum(char c) {
+    if(is_num(c))
+        return 1;
+    return is_alpha(c) != 0;
+}
+
+/*
+ * Find the quote that terminates a string literal.
+ *
+ * @str points at the first character after the opening quote and
+ * @escape_symbol is the quote character that opened the string. A backslash
+ * makes the character after it literal, so an escaped quote or an escaped
+ * backslash never ends the string.
+ * Returns a pointer to the closing quote, or to the terminating '\0' when the
+ * string is never closed.
+ */
+static const char* find_end_of_string(const char *str, char escape_symbol) {
+    int escape = 0;
+    for(; *str != '\0'; ++str) {
+        char c = *str;
+        if(escape) {
+            /* The escape covers exactly this one character, whatever it is */
+            escape = 0;
+            continue;
+        }
+
+        if(c == '\\')
+            escape = 1;
+        else if(c == escape_symbol)
+            return str;
+    }
+    return str;
+}
+
+/*
+ * Read the next token at the current position and advance past it.
+ *
+ * For an identifier token (a letter followed by letters, digits, '_' or '-')
+ * self->identifier is set to the token text. For a string token (delimited by
+ * a pair of '"' or a pair of '\'') self->string is set to the text between the
+ * quotes. No whitespace is skipped. At the end of the input
+ * QUICKMEDIA_XPATH_TOKEN_END_OF_FILE is returned and the position stays put.
+ * QUICKMEDIA_XPATH_TOKEN_INVALID is returned for an unknown character
+ * (position unchanged) and for an unterminated string (position moved to the
+ * end of the input).
+ */
+QuickMediaXpathToken quickmedia_xpath_tokenizer_next(QuickMediaXpathTokenizer *self) {
+    const char first = *self->code;
+
+    if(is_alpha(first)) {
+        const char *start = self->code;
+        const char *end = start + 1;
+        while(is_alphanum(*end) || *end == '_' || *end == '-')
+            ++end;
+        self->identifier.data = start;
+        self->identifier.size = end - start;
+        self->code = end;
+        return QUICKMEDIA_XPATH_TOKEN_IDENTIFIER;
+    }
+
+    switch(first) {
+        case '/':
+            ++self->code;
+            /* A second slash turns the step into a recursive one */
+            if(*self->code != '/')
+                return QUICKMEDIA_XPATH_TOKEN_CHILD;
+            ++self->code;
+            return QUICKMEDIA_XPATH_TOKEN_CHILD_RECURSIVE;
+        case '[':
+            ++self->code;
+            return QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET;
+        case ']':
+            ++self->code;
+            return QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET;
+        case '=':
+            ++self->code;
+            return QUICKMEDIA_XPATH_TOKEN_EQUAL;
+        case '"':
+        case '\'': {
+            /* The string must be closed by the same quote that opened it */
+            const char *content = self->code + 1;
+            const char *closing = find_end_of_string(content, first);
+            self->string.data = content;
+            self->code = closing;
+            if(*closing == '\0') {
+                /* Reached end of xpath before end of string */
+                return QUICKMEDIA_XPATH_TOKEN_INVALID;
+            }
+            self->string.size = closing - content;
+            self->code = closing + 1;
+            return QUICKMEDIA_XPATH_TOKEN_STRING;
+        }
+        case '\0':
+            return QUICKMEDIA_XPATH_TOKEN_END_OF_FILE;
+        default:
+            /* Invalid symbol @first */
+            return QUICKMEDIA_XPATH_TOKEN_INVALID;
+    }
+}
+
+/*
+ * Consume the next token only when it is @token.
+ * Returns 0 when the token matched and was consumed. Otherwise returns -1 and
+ * rewinds the read position to where it was before the call; only the
+ * position is restored, not the identifier/string views.
+ */
+int quickmedia_xpath_tokenizer_next_if(QuickMediaXpathTokenizer *self, QuickMediaXpathToken token) {
+    const char *saved_position = self->code;
+    if(quickmedia_xpath_tokenizer_next(self) != token) {
+        self->code = saved_position;
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Return a freshly allocated, NUL-terminated copy of the text referenced by
+ * self->identifier.
+ * The caller owns the result and releases it with free(). Returns NULL when
+ * the allocation fails.
+ */
+char* quickmedia_xpath_tokenizer_copy_identifier(QuickMediaXpathTokenizer *self) {
+    char *result = malloc(self->identifier.size + 1);
+    if(!result)
+        return NULL;
+
+    /* The view may still be empty with NULL data, which memcpy must not see */
+    if(self->identifier.size > 0)
+        memcpy(result, self->identifier.data, self->identifier.size);
+    result[self->identifier.size] = '\0';
+    return result;
+}
+
+/*
+ * Return a freshly allocated, NUL-terminated copy of the text referenced by
+ * self->string (the text between the quotes of a string token).
+ * The caller owns the result and releases it with free(). Returns NULL when
+ * the allocation fails.
+ *
+ * NOTE(review): this reads self->string directly instead of going through the
+ * identifier copy, so it does not depend on the two views sharing storage;
+ * confirm the field layout in XpathTokenizer.h.
+ */
+char* quickmedia_xpath_tokenizer_copy_string(QuickMediaXpathTokenizer *self) {
+    char *result = malloc(self->string.size + 1);
+    if(!result)
+        return NULL;
+
+    /* An empty string token needs no copy, only the terminator */
+    if(self->string.size > 0)
+        memcpy(result, self->string.data, self->string.size);
+    result[self->string.size] = '\0';
+    return result;
+}