From 95c189f7445e6deca85130b7b8fa25dc76fabe12 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Mon, 16 Aug 2021 13:41:20 +0200 Subject: Add indexing and example --- README.md | 8 +++++- include/quickmedia/HtmlSearch.h | 2 +- include/quickmedia/NodeSearch.h | 1 + include/quickmedia/XpathTokenizer.h | 2 ++ src/HtmlSearch.c | 17 ++++++------- src/NodeSearch.c | 1 + src/XpathParser.c | 51 +++++++++++++++++++++++++++---------- src/XpathTokenizer.c | 13 ++++++++++ 8 files changed, 71 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index b7f6ea6..06e29b7 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,10 @@ -Html search using non-standard xpath, written in C. See tests/main.c +Html search using non-standard xpath, written in C. See tests/main.c. + +# Syntax examples +``` +//div[id='user']//p[class='sneed*'] +//div[id='user']/div[class='name'][0] +``` # Note This library does not decode html sequences in text and attribute values. diff --git a/include/quickmedia/HtmlSearch.h b/include/quickmedia/HtmlSearch.h index af9bd7e..c0bd17c 100644 --- a/include/quickmedia/HtmlSearch.h +++ b/include/quickmedia/HtmlSearch.h @@ -12,7 +12,6 @@ extern "C" { typedef struct QuickMediaHtmlAttribute QuickMediaHtmlAttribute; typedef struct QuickMediaHtmlNode QuickMediaHtmlNode; typedef struct QuickMediaHtmlChildNode QuickMediaHtmlChildNode; -typedef struct QuickMediaTextNode QuickMediaTextNode; typedef struct { char *data; @@ -54,6 +53,7 @@ typedef struct { Returns an empty string view if attribute doesn't exist or if it doesn't have any value. The result is only valid within the callback function scope. The result is stripped of whitespace on the left and right side. + Case insensitive search. 
*/ QuickMediaStringView quickmedia_html_node_get_attribute_value(QuickMediaMatchNode *self, const char *attribute_name); diff --git a/include/quickmedia/NodeSearch.h b/include/quickmedia/NodeSearch.h index 0e3b3f1..5f50488 100644 --- a/include/quickmedia/NodeSearch.h +++ b/include/quickmedia/NodeSearch.h @@ -17,6 +17,7 @@ typedef struct { QuickMediaStringView value; int defined; int value_is_glob; + int index; /* -1 if no indexing */ } QuickMediaNodeSearchParam; typedef struct QuickMediaNodeSearch QuickMediaNodeSearch; diff --git a/include/quickmedia/XpathTokenizer.h b/include/quickmedia/XpathTokenizer.h index 62f6d75..b4ceecb 100644 --- a/include/quickmedia/XpathTokenizer.h +++ b/include/quickmedia/XpathTokenizer.h @@ -12,6 +12,7 @@ typedef struct { union { QuickMediaStringView string; QuickMediaStringView identifier; + int number; }; } QuickMediaXpathTokenizer; @@ -21,6 +22,7 @@ typedef enum { QUICKMEDIA_XPATH_TOKEN_CHILD, QUICKMEDIA_XPATH_TOKEN_CHILD_RECURSIVE, QUICKMEDIA_XPATH_TOKEN_IDENTIFIER, + QUICKMEDIA_XPATH_TOKEN_NUMBER, QUICKMEDIA_XPATH_TOKEN_STRING, QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET, QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET, diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c index edb2a1c..45d8aa4 100644 --- a/src/HtmlSearch.c +++ b/src/HtmlSearch.c @@ -168,6 +168,8 @@ static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeS if(!node) return 0; + int match_index = 0; + /* We use two loops because we want to find children before grandchildren */ for(QuickMediaHtmlChildNode *child = node; child; child = child->next) { /* A text node doesn't have a name */ @@ -194,7 +196,9 @@ static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeS /* If we search without param, then it's a match */ if(!search_data->param.defined) { - on_match(); + if(search_data->param.index == -1 || search_data->param.index == match_index) + on_match(); + ++match_index; continue; } @@ -206,7 +210,9 @@ static int 
find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeS assert(search_data->param.value.size > 0); /* If the param value matches what we want to search for */ if(str_glob_match(child_attr->value, search_data->param.value, search_data->param.value_is_glob) == 0) { - on_match(); + if(search_data->param.index == -1 || search_data->param.index == match_index) + on_match(); + ++match_index; continue; } } @@ -508,19 +514,12 @@ QuickMediaStringView quickmedia_html_node_get_text(QuickMediaMatchNode *self) { } int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source, size_t size) { - /* Utf8 BOM */ - if(size >= 3 && memcmp(html_source, "\xef\xbb\xbf", 3) == 0) { - html_source += 3; - size -= 3; - } - QuickMediaHtmlNode *html_node = &self->root_node; html_node_init(html_node); if(html_parser_parse(html_source, size, html_parse_callback, &html_node) != 0) { quickmedia_html_search_deinit(self); return 1; } - return 0; } diff --git a/src/NodeSearch.c b/src/NodeSearch.c index 376c801..633cf65 100644 --- a/src/NodeSearch.c +++ b/src/NodeSearch.c @@ -8,6 +8,7 @@ void quickmedia_node_search_param_init(QuickMediaNodeSearchParam *self) { self->value.size = 0; self->defined = 0; self->value_is_glob = 0; + self->index = -1; } static void quickmedia_node_search_param_deinit(QuickMediaNodeSearchParam *self) { diff --git a/src/XpathParser.c b/src/XpathParser.c index f3248eb..b79fe11 100644 --- a/src/XpathParser.c +++ b/src/XpathParser.c @@ -18,26 +18,29 @@ static void quickmedia_xpath_parser_init(QuickMediaXpathParser *self, const char quickmedia_xpath_tokenizer_init(&self->tokenizer, xpath); } -/* ('[' IDENTIFIER '=' '"' STRING '"' ']')? */ +/* (('[' IDENTIFIER '=' '"' STRING '"' ']') | ('[' NUMBER ']'))? 
*/ static int xpath_parse_param(QuickMediaXpathParser *self, QuickMediaNodeSearchParam *result) { if(quickmedia_xpath_tokenizer_next_if(&self->tokenizer, QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET) != 0) return 1; QuickMediaXpathToken token = quickmedia_xpath_tokenizer_next(&self->tokenizer); - if(token != QUICKMEDIA_XPATH_TOKEN_IDENTIFIER) - return -1; - - result->name = self->tokenizer.identifier; + if(token == QUICKMEDIA_XPATH_TOKEN_IDENTIFIER) { + result->name = self->tokenizer.identifier; - token = quickmedia_xpath_tokenizer_next(&self->tokenizer); - if(token != QUICKMEDIA_XPATH_TOKEN_EQUAL) - return -2; + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_EQUAL) + return -2; - token = quickmedia_xpath_tokenizer_next(&self->tokenizer); - if(token != QUICKMEDIA_XPATH_TOKEN_STRING) - return -3; - - result->value = self->tokenizer.string; + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_STRING) + return -3; + + result->value = self->tokenizer.string; + } else if(token == QUICKMEDIA_XPATH_TOKEN_NUMBER) { + result->index = self->tokenizer.number; + } else { + return -1; + } token = quickmedia_xpath_tokenizer_next(&self->tokenizer); if(token != QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET) @@ -48,6 +51,24 @@ static int xpath_parse_param(QuickMediaXpathParser *self, QuickMediaNodeSearchPa return 0; } +/* ('[' NUMBER ']')? 
*/ +static int xpath_parse_index(QuickMediaXpathParser *self, QuickMediaNodeSearchParam *result) { + if(quickmedia_xpath_tokenizer_next_if(&self->tokenizer, QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET) != 0) + return 1; + + QuickMediaXpathToken token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_NUMBER) + return -1; + + result->index = self->tokenizer.number; + + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET) + return -4; + + return 0; +} + static int xpath_parse_node(QuickMediaXpathParser *self, QuickMediaNodeSearch *result) { quickmedia_node_search_init(result); QuickMediaXpathToken token = quickmedia_xpath_tokenizer_next(&self->tokenizer); @@ -65,6 +86,10 @@ static int xpath_parse_node(QuickMediaXpathParser *self, QuickMediaNodeSearch *r if(param_result < 0) return param_result; + int index_result = xpath_parse_index(self, &result->param); + if(index_result < 0) + return index_result; + result->child = malloc(sizeof(QuickMediaNodeSearch)); if(!result->child) return -1; diff --git a/src/XpathTokenizer.c b/src/XpathTokenizer.c index ae17939..f81f2d7 100644 --- a/src/XpathTokenizer.c +++ b/src/XpathTokenizer.c @@ -16,6 +16,10 @@ static int is_num(char c) { return c >= '0' && c <= '9'; } +static int c_to_num(char c) { + return c - '0'; +} + static int is_alphanum(char c) { return is_alpha(c) || is_num(c); } @@ -55,6 +59,15 @@ QuickMediaXpathToken quickmedia_xpath_tokenizer_next(QuickMediaXpathTokenizer *s } self->identifier.size = self->code - self->identifier.data; return QUICKMEDIA_XPATH_TOKEN_IDENTIFIER; + } else if(is_num(c)) { + int number = c_to_num(c); + ++self->code; + while(is_num(*self->code)) { + number = number * 10 + c_to_num(*self->code); + ++self->code; + } + self->number = number; + return QUICKMEDIA_XPATH_TOKEN_NUMBER; } else if(c == '[') { ++self->code; return QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET; -- cgit v1.2.3