From c17412cce925ce226d3835a2e59b4d9f31b5b3ed Mon Sep 17 00:00:00 2001 From: dec05eba Date: Sat, 25 May 2019 02:17:15 +0200 Subject: Initial commit --- .gitignore | 6 + LICENSE | 13 + README.md | 1 + include/quickmedia/HtmlSearch.h | 31 +++ include/quickmedia/NodeSearch.h | 29 +++ include/quickmedia/XpathParser.h | 8 + include/quickmedia/XpathTokenizer.h | 32 +++ project.conf | 12 + src/HtmlSearch.c | 130 ++++++++++ src/NodeSearch.c | 34 +++ src/XpathParser.c | 88 +++++++ src/XpathTokenizer.c | 104 ++++++++ test_files/test.html | 478 ++++++++++++++++++++++++++++++++++++ tests/main.c | 33 +++ 14 files changed, 999 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 include/quickmedia/HtmlSearch.h create mode 100644 include/quickmedia/NodeSearch.h create mode 100644 include/quickmedia/XpathParser.h create mode 100644 include/quickmedia/XpathTokenizer.h create mode 100644 project.conf create mode 100644 src/HtmlSearch.c create mode 100644 src/NodeSearch.c create mode 100644 src/XpathParser.c create mode 100644 src/XpathTokenizer.c create mode 100644 test_files/test.html create mode 100644 tests/main.c diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0dee329 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# Compiled sibs files +sibs-build/ +compile_commands.json +tests/sibs-build/ +tests/compile_commands.json +.vscode/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..89abc21 --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +Copyright 2019 dec05eba + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a6e5584 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +Html search using xpath, written in C diff --git a/include/quickmedia/HtmlSearch.h b/include/quickmedia/HtmlSearch.h new file mode 100644 index 0000000..e3bea33 --- /dev/null +++ b/include/quickmedia/HtmlSearch.h @@ -0,0 +1,31 @@ +#ifndef QUICKMEDIA_HTML_SEARCH_H +#define QUICKMEDIA_HTML_SEARCH_H + +#include "NodeSearch.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + const void *doc; + const void *node; + void *text; +} QuickMediaHtmlNode; + +/* Returns NULL if attribute doesn't exist or if it doesn't have any value */ +const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name); + +/* Returns StringView where data is NULL and size is 0 if node doesn't have any text */ +const QuickMediaStringView quickmedia_html_node_get_text(QuickMediaHtmlNode *self); + +/* @node is only valid within the callback function scope */ +typedef void (*QuickMediaHtmlSearchResultCallback)(QuickMediaHtmlNode *node, void *userdata); + +int quickmedia_html_find_nodes_xpath(const char *html_source, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/quickmedia/NodeSearch.h b/include/quickmedia/NodeSearch.h new file mode 100644 index 0000000..b512296 --- /dev/null +++ b/include/quickmedia/NodeSearch.h @@ -0,0 +1,29 @@ +#ifndef QUICKMEDIA_NODE_SEARCH_H +#define QUICKMEDIA_NODE_SEARCH_H + +typedef struct { + char 
*name; + char *value; +} QuickMediaNodeSearchParam; + +typedef struct QuickMediaNodeSearch QuickMediaNodeSearch; + +struct QuickMediaNodeSearch { + char *name; /* optional */ + int recursive; + QuickMediaNodeSearchParam param; /* optional */ + int param_defined; + + QuickMediaNodeSearch *child; /* optional */ +}; + +typedef struct { + const char *data; + unsigned long long size; +} QuickMediaStringView; + +void quickmedia_node_search_param_init(QuickMediaNodeSearchParam *self); +void quickmedia_node_search_init(QuickMediaNodeSearch *self); +void quickmedia_node_search_deinit(QuickMediaNodeSearch *self); + +#endif diff --git a/include/quickmedia/XpathParser.h b/include/quickmedia/XpathParser.h new file mode 100644 index 0000000..2dfc81e --- /dev/null +++ b/include/quickmedia/XpathParser.h @@ -0,0 +1,8 @@ +#ifndef QUICKMEDIA_XPATH_PARSER_H +#define QUICKMEDIA_XPATH_PARSER_H + +#include "NodeSearch.h" + +int quickmedia_parse_xpath(const char *xpath, QuickMediaNodeSearch *result); + +#endif diff --git a/include/quickmedia/XpathTokenizer.h b/include/quickmedia/XpathTokenizer.h new file mode 100644 index 0000000..8827cff --- /dev/null +++ b/include/quickmedia/XpathTokenizer.h @@ -0,0 +1,32 @@ +#ifndef QUICKMEDIA_XPATH_TOKENIZER_H +#define QUICKMEDIA_XPATH_TOKENIZER_H + +#include "NodeSearch.h" + +typedef struct { + const char *code; + union { + QuickMediaStringView string; + QuickMediaStringView identifier; + }; +} QuickMediaXpathTokenizer; + +typedef enum { + QUICKMEDIA_XPATH_TOKEN_INVALID, + QUICKMEDIA_XPATH_TOKEN_END_OF_FILE, + QUICKMEDIA_XPATH_TOKEN_CHILD, + QUICKMEDIA_XPATH_TOKEN_CHILD_RECURSIVE, + QUICKMEDIA_XPATH_TOKEN_IDENTIFIER, + QUICKMEDIA_XPATH_TOKEN_STRING, + QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET, + QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET, + QUICKMEDIA_XPATH_TOKEN_EQUAL +} QuickMediaXpathToken; + +void quickmedia_xpath_tokenizer_init(QuickMediaXpathTokenizer *self, const char *xpath); +QuickMediaXpathToken 
quickmedia_xpath_tokenizer_next(QuickMediaXpathTokenizer *self); +int quickmedia_xpath_tokenizer_next_if(QuickMediaXpathTokenizer *self, QuickMediaXpathToken token); +char* quickmedia_xpath_tokenizer_copy_identifier(QuickMediaXpathTokenizer *self); +char* quickmedia_xpath_tokenizer_copy_string(QuickMediaXpathTokenizer *self); + +#endif diff --git a/project.conf b/project.conf new file mode 100644 index 0000000..6f63e20 --- /dev/null +++ b/project.conf @@ -0,0 +1,12 @@ +[package] +name = "html-search" +type = "static" +version = "0.1.0" +platforms = ["any"] + +[config] +expose_include_dirs = ["include"] +ignore_dirs = ["test_files"] + +[dependencies] +tidy = "5" \ No newline at end of file diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c new file mode 100644 index 0000000..e59dc1e --- /dev/null +++ b/src/HtmlSearch.c @@ -0,0 +1,130 @@ +#include "../include/quickmedia/HtmlSearch.h" +#include "../include/quickmedia/XpathParser.h" + +#include +#include + +static TidyAttr get_attribute_by_name(TidyNode node, const char *name) { + assert(name); + for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) { + const char *attr_name = tidyAttrName(attr); + if(attr_name && strcmp(name, attr_name) == 0) + return attr; + } + return NULL; +} + +static void find_child_nodes(TidyDoc tdoc, TidyNode node, const QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { + /* We use two loops because we want to find children before grandchildren */ + for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { + const char *child_node_name = tidyNodeGetName(child); + /* A text node doesn't have a name */ + if(!child_node_name) + continue; + + /* Match without node name or node name matches */ + if(!search_data->name || strcmp(search_data->name, child_node_name) == 0) { + #define on_match() do { \ + if(search_data->child) \ + find_child_nodes(tdoc, child, search_data->child, result_callback, userdata); \ + 
else { \ + QuickMediaHtmlNode node; \ + node.doc = tdoc; \ + node.node = child; \ + node.text = NULL; \ + result_callback(&node, userdata); \ + if(node.text){ \ + tidyBufFree(node.text); \ + free(node.text); \ + } \ + } \ + } while(0) + + /* If we search without param, then it's a match */ + if(!search_data->param_defined) { + on_match(); + continue; + } + + TidyAttr child_attr = get_attribute_by_name(child, search_data->param.name); + /* Couldn't find the param that we want to match against */ + if(!child_attr) + continue; + + const char *attr_value = tidyAttrValue(child_attr); + assert(search_data->param.value); + /* If the param value matches what we want to search for */ + if(attr_value && strcmp(search_data->param.value, attr_value) == 0) { + on_match(); + continue; + } + } + } + + if(search_data->recursive) { + for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { + find_child_nodes(tdoc, child, search_data, result_callback, userdata); + } + } +} + +const char* quickmedia_html_node_get_attribute_value(QuickMediaHtmlNode *self, const char *attribute_name) { + TidyAttr attr = get_attribute_by_name((TidyNode)self->node, attribute_name); + if(!attr) + return NULL; + return tidyAttrValue(attr); +} + +const QuickMediaStringView quickmedia_html_node_get_text(QuickMediaHtmlNode *self) { + QuickMediaStringView string_view; + string_view.data = NULL; + string_view.size = 0; + + if(self->text) { + string_view.data = (const char*)((TidyBuffer*)self->text)->bp; + string_view.size = ((TidyBuffer*)self->text)->size; + return string_view; + } + + TidyNode child_node = tidyGetChild(self->node); + if(tidyNodeGetType(child_node) != TidyNode_Text) + return string_view; + + self->text = malloc(sizeof(TidyBuffer)); + tidyBufInit(self->text); + tidyNodeGetText(self->doc, child_node, self->text); + + string_view.data = (const char*)((TidyBuffer*)self->text)->bp; + string_view.size = ((TidyBuffer*)self->text)->size; + return string_view; +} + +static int 
quickmedia_html_find_nodes(const char *html_source, QuickMediaNodeSearch *search_data, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { + assert(html_source); + assert(search_data); + assert(result_callback); + if(!html_source || !search_data || !result_callback) + return -1; + + TidyDoc tdoc = tidyCreate(); + tidyOptSetBool(tdoc, TidyShowWarnings, no); + /* tidyOptSetBool(tdoc, TidyForceOutput, yes); */ + int rc = tidyParseString( tdoc, html_source); + if(rc < 0) { + tidyRelease(tdoc); + return rc; + } + + TidyNode root_node = tidyGetRoot(tdoc); + find_child_nodes(tdoc, root_node, search_data, result_callback, userdata); + tidyRelease(tdoc); + return 0; +} + +int quickmedia_html_find_nodes_xpath(const char *html_source, const char *xpath, QuickMediaHtmlSearchResultCallback result_callback, void *userdata) { + QuickMediaNodeSearch search_data; + int xpath_result = quickmedia_parse_xpath(xpath, &search_data); + if(xpath_result != 0) + return xpath_result; + return quickmedia_html_find_nodes(html_source, &search_data, result_callback, userdata); +} diff --git a/src/NodeSearch.c b/src/NodeSearch.c new file mode 100644 index 0000000..198b8cd --- /dev/null +++ b/src/NodeSearch.c @@ -0,0 +1,34 @@ +#include "../include/quickmedia/NodeSearch.h" +#include + +void quickmedia_node_search_param_init(QuickMediaNodeSearchParam *self) { + self->name = NULL; + self->value = NULL; +} + +static void quickmedia_node_search_param_deinit(QuickMediaNodeSearchParam *self) { + free(self->name); + free(self->value); + self->name = NULL; + self->value = NULL; +} + +void quickmedia_node_search_init(QuickMediaNodeSearch *self) { + self->name = NULL; + self->recursive = 0; + quickmedia_node_search_param_init(&self->param); + self->param_defined = 0; + self->child = NULL; +} + +void quickmedia_node_search_deinit(QuickMediaNodeSearch *self) { + free(self->name); + self->name = NULL; + quickmedia_node_search_param_deinit(&self->param); + + if(self->child) { + 
quickmedia_node_search_deinit(self->child); + free(self->child); + self->child = NULL; + } +} diff --git a/src/XpathParser.c b/src/XpathParser.c new file mode 100644 index 0000000..24e1d6e --- /dev/null +++ b/src/XpathParser.c @@ -0,0 +1,88 @@ +#include "../include/quickmedia/XpathParser.h" +#include "../include/quickmedia/XpathTokenizer.h" +#include + +typedef struct { + QuickMediaXpathTokenizer tokenizer; +} QuickMediaXpathParser; + +static void quickmedia_xpath_parser_init(QuickMediaXpathParser *self, const char *xpath) { + quickmedia_xpath_tokenizer_init(&self->tokenizer, xpath); +} + +/* ('[' IDENTIFIER '=' '"' STRING '"' ']')? */ +static int xpath_parse_param(QuickMediaXpathParser *self, QuickMediaNodeSearchParam *result) { + if(quickmedia_xpath_tokenizer_next_if(&self->tokenizer, QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET) != 0) + return 1; + + QuickMediaXpathToken token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_IDENTIFIER) + return -1; + + result->name = quickmedia_xpath_tokenizer_copy_identifier(&self->tokenizer); + + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_EQUAL) + return -2; + + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_STRING) + return -3; + + result->value = quickmedia_xpath_tokenizer_copy_string(&self->tokenizer); + + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET) + return -4; + + return 0; +} + +static int xpath_parse_node(QuickMediaXpathParser *self, QuickMediaNodeSearch *result) { + quickmedia_node_search_init(result); + QuickMediaXpathToken token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + /* // or / */ + if(token == QUICKMEDIA_XPATH_TOKEN_CHILD || token == QUICKMEDIA_XPATH_TOKEN_CHILD_RECURSIVE) { + result->recursive = (token == QUICKMEDIA_XPATH_TOKEN_CHILD_RECURSIVE); + + token = 
/* Returns non-zero if @c is an ASCII letter. */
static int is_alpha(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

/* Returns non-zero if @c is an ASCII decimal digit. */
static int is_num(char c) {
    return c >= '0' && c <= '9';
}

/* Returns non-zero if @c is an ASCII letter or decimal digit. */
static int is_alphanum(char c) {
    return is_alpha(c) || is_num(c);
}

/*
 * Returns a pointer to the first occurrence of @escape_symbol (the quote
 * character that opened the string) in @str that is not preceded by an
 * unescaped backslash. If there is none, returns a pointer to the
 * terminating '\0' of @str.
 *
 * A backslash escapes exactly the one character that follows it, so in
 * \"" the first quote is skipped and the second one ends the string, and
 * in \\" the quote ends the string because the backslash itself was
 * escaped.
 */
static const char* find_end_of_string(const char *str, char escape_symbol) {
    int escaped = 0;
    for(; *str != '\0'; ++str) {
        if(escaped) {
            /* This character was escaped; it can't end the string */
            escaped = 0;
            continue;
        }

        if(*str == '\\')
            escaped = 1;
        else if(*str == escape_symbol)
            return str;
    }
    return str;
}
escape_symbol) { + if(!escape) + return str; + } else { + escape = 0; + } + ++str; + } + return str; +} + +QuickMediaXpathToken quickmedia_xpath_tokenizer_next(QuickMediaXpathTokenizer *self) { + char c = *self->code; + if(c == '/') { + ++self->code; + c = *self->code; + if(c == '/') { + ++self->code; + return QUICKMEDIA_XPATH_TOKEN_CHILD_RECURSIVE; + } + return QUICKMEDIA_XPATH_TOKEN_CHILD; + } else if(is_alpha(c)) { + self->identifier.data = self->code; + ++self->code; + while(is_alphanum(*self->code) || *self->code == '_' || *self->code == '-') { + ++self->code; + } + self->identifier.size = self->code - self->identifier.data; + return QUICKMEDIA_XPATH_TOKEN_IDENTIFIER; + } else if(c == '[') { + ++self->code; + return QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET; + } else if(c == ']') { + ++self->code; + return QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET; + } else if(c == '=') { + ++self->code; + return QUICKMEDIA_XPATH_TOKEN_EQUAL; + } else if(c == '"' || c == '\'') { + char escape_symbol = c; + ++self->code; + self->string.data = self->code; + self->code = find_end_of_string(self->string.data, escape_symbol); + if(*self->code == '\0') { + /* Reached end of xpath before end of string */ + return QUICKMEDIA_XPATH_TOKEN_INVALID; + } + self->string.size = self->code - self->string.data; + ++self->code; + return QUICKMEDIA_XPATH_TOKEN_STRING; + } else if(c == '\0') { + return QUICKMEDIA_XPATH_TOKEN_END_OF_FILE; + } else { + /* Invalid symbol @c */ + return QUICKMEDIA_XPATH_TOKEN_INVALID; + } +} + +int quickmedia_xpath_tokenizer_next_if(QuickMediaXpathTokenizer *self, QuickMediaXpathToken token) { + const char *restore_point = self->code; + if(quickmedia_xpath_tokenizer_next(self) == token) + return 0; + self->code = restore_point; + return -1; +} + +char* quickmedia_xpath_tokenizer_copy_identifier(QuickMediaXpathTokenizer *self) { + char *result = malloc(self->identifier.size + 1); + result[self->identifier.size] = '\0'; + memcpy(result, self->identifier.data, 
self->identifier.size); + return result; +} + +char* quickmedia_xpath_tokenizer_copy_string(QuickMediaXpathTokenizer *self) { + return quickmedia_xpath_tokenizer_copy_identifier(self); +} diff --git a/test_files/test.html b/test_files/test.html new file mode 100644 index 0000000..a33081d --- /dev/null +++ b/test_files/test.html @@ -0,0 +1,478 @@ + + + + + + + +Naruto Manga - Browse & Search Manga At MangaNelo + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+
+
+
+ +
+
+MENU + +
+
+
+ +
+
+
+
+ +
+
+ +
+

Keyword: naruto

+
+
+ +Naruto + +
+

+Naruto + +

+ + +Chapter 700.5 : Uzumaki Naruto + + +Vol.72 Chapter 700.1 : Book Of Thunder + +Author(s) : Kishimoto Masashi +Updated : Jan-20-2016 11:54 +View : 13,306,950 +
+
+
+ + Boruto: Naruto Next Generations + +
+

+Boruto: Naruto Next Generations + +

+ + +Chapter 34: Training!! + + +Chapter 33: Breaking The Limit + +Author(s) : Kodachi Ukyo, Ikemoto Mikio +Updated : Apr-20-2019 06:52 +View : 3,453,805 +
+
+
+ +Naruto Gaiden: The Seventh Hokage + +
+

+Naruto Gaiden: The Seventh Hokage + +

+ + +Ch.10.1 : Projected Into These Eyes (Full Color Version) + + +Ch.10 : Projected Into These Eyes + +Author(s) : Kishimoto Masashi +Updated : Jan-20-2016 11:44 +View : 230,877 +
+
+
+ +Naruto - Full Color + +
+

+Naruto - Full Color + +

+ + +Vol.5 Chapter 46: The Password Is... + + +Vol.5 Chapter 45: The Second Exam + + Author(s) : Masashi Kishimoto +Updated : May-01-2019 16:19 +View : 86,639 +
+
+
+ +Naruto: Chibi Sasuke's Sharingan Legend + +
+

+Naruto: Chibi Sasuke's Sharingan Legend + +

+ + +Volume 3 Final Chapter: The Uchiha Clan!! + + +Chapter 23: Karin's Battle!! + +Author(s) : Taira Kenji +Updated : Oct-18-2018 14:15 +View : 66,363 +
+
+
+ +Seikimatsu Darling + +
+

+Seikimatsu Darling + +

+ + +Vol.2 Chapter 9 + + +Vol.2 Chapter 8 + +Author(s) : Naruto Maki +Updated : Jan-20-2016 13:56 +View : 50,352 +
+
+
+ +420 Renpai Girl + +
+

+420 Renpai Girl + +

+ + +Vol.1 Ch.1 : Teaser + +Author(s) : Kiriyama Naruto +Updated : Jan-21-2016 13:12 +View : 17,103 +
+
+
+
+
+
+
+
+ +
+
+
+ +
+ + + \ No newline at end of file diff --git a/tests/main.c b/tests/main.c new file mode 100644 index 0000000..eb1abc7 --- /dev/null +++ b/tests/main.c @@ -0,0 +1,33 @@ +#include +#include "../include/quickmedia/HtmlSearch.h" +#include +#include + +static char* get_file_content(const char *filepath) { + FILE *file = fopen(filepath, "rb"); + assert(file); + + fseek(file, 0, SEEK_END); + size_t filesize = ftell(file); + fseek(file, 0, SEEK_SET); + + char *buffer = malloc(filesize + 1); + buffer[filesize] = '\0'; + fread(buffer, 1, filesize, file); + + return buffer; +} + +static void result_callback(QuickMediaHtmlNode *node, void *userdata) { + const char *href = quickmedia_html_node_get_attribute_value(node, "href"); + QuickMediaStringView text = quickmedia_html_node_get_text(node); + printf("a href: %s, node value: %.*s\n", href, text.size, text.data); +} + +int main(int argc, char **argv) +{ + char *file_content = get_file_content("test_files/test.html"); + int result = quickmedia_html_find_nodes_xpath(file_content, "//h3[class=\"story_name\"]//a", result_callback, NULL); + free(file_content); + return result; +} -- cgit v1.2.3