From 95c189f7445e6deca85130b7b8fa25dc76fabe12 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Mon, 16 Aug 2021 13:41:20 +0200 Subject: Add indexing and example --- src/HtmlSearch.c | 17 ++++++++--------- src/NodeSearch.c | 1 + src/XpathParser.c | 51 ++++++++++++++++++++++++++++++++++++++------------- src/XpathTokenizer.c | 13 +++++++++++++ 4 files changed, 60 insertions(+), 22 deletions(-) (limited to 'src') diff --git a/src/HtmlSearch.c b/src/HtmlSearch.c index edb2a1c..45d8aa4 100644 --- a/src/HtmlSearch.c +++ b/src/HtmlSearch.c @@ -168,6 +168,8 @@ static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeS if(!node) return 0; + int match_index = 0; + /* We use two loops because we want to find children before grandchildren */ for(QuickMediaHtmlChildNode *child = node; child; child = child->next) { /* A text node doesn't have a name */ @@ -194,7 +196,9 @@ static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeS /* If we search without param, then it's a match */ if(!search_data->param.defined) { - on_match(); + if(search_data->param.index == -1 || search_data->param.index == match_index) + on_match(); + ++match_index; continue; } @@ -206,7 +210,9 @@ static int find_child_nodes(QuickMediaHtmlChildNode *node, const QuickMediaNodeS assert(search_data->param.value.size > 0); /* If the param value matches what we want to search for */ if(str_glob_match(child_attr->value, search_data->param.value, search_data->param.value_is_glob) == 0) { - on_match(); + if(search_data->param.index == -1 || search_data->param.index == match_index) + on_match(); + ++match_index; continue; } } @@ -508,19 +514,12 @@ QuickMediaStringView quickmedia_html_node_get_text(QuickMediaMatchNode *self) { } int quickmedia_html_search_init(QuickMediaHtmlSearch *self, const char *html_source, size_t size) { - /* Utf8 BOM */ - if(size >= 3 && memcmp(html_source, "\xef\xbb\xbf", 3) == 0) { - html_source += 3; - size -= 3; - } - QuickMediaHtmlNode *html_node = 
&self->root_node; html_node_init(html_node); if(html_parser_parse(html_source, size, html_parse_callback, &html_node) != 0) { quickmedia_html_search_deinit(self); return 1; } - return 0; } diff --git a/src/NodeSearch.c b/src/NodeSearch.c index 376c801..633cf65 100644 --- a/src/NodeSearch.c +++ b/src/NodeSearch.c @@ -8,6 +8,7 @@ void quickmedia_node_search_param_init(QuickMediaNodeSearchParam *self) { self->value.size = 0; self->defined = 0; self->value_is_glob = 0; + self->index = -1; } static void quickmedia_node_search_param_deinit(QuickMediaNodeSearchParam *self) { diff --git a/src/XpathParser.c b/src/XpathParser.c index f3248eb..b79fe11 100644 --- a/src/XpathParser.c +++ b/src/XpathParser.c @@ -18,26 +18,29 @@ static void quickmedia_xpath_parser_init(QuickMediaXpathParser *self, const char quickmedia_xpath_tokenizer_init(&self->tokenizer, xpath); } -/* ('[' IDENTIFIER '=' '"' STRING '"' ']')? */ +/* (('[' IDENTIFIER '=' '"' STRING '"' ']') | ('[' NUMBER ']'))? */ static int xpath_parse_param(QuickMediaXpathParser *self, QuickMediaNodeSearchParam *result) { if(quickmedia_xpath_tokenizer_next_if(&self->tokenizer, QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET) != 0) return 1; QuickMediaXpathToken token = quickmedia_xpath_tokenizer_next(&self->tokenizer); - if(token != QUICKMEDIA_XPATH_TOKEN_IDENTIFIER) - return -1; - - result->name = self->tokenizer.identifier; + if(token == QUICKMEDIA_XPATH_TOKEN_IDENTIFIER) { + result->name = self->tokenizer.identifier; - token = quickmedia_xpath_tokenizer_next(&self->tokenizer); - if(token != QUICKMEDIA_XPATH_TOKEN_EQUAL) - return -2; + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_EQUAL) + return -2; - token = quickmedia_xpath_tokenizer_next(&self->tokenizer); - if(token != QUICKMEDIA_XPATH_TOKEN_STRING) - return -3; - - result->value = self->tokenizer.string; + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_STRING) + return -3; + + 
result->value = self->tokenizer.string; + } else if(token == QUICKMEDIA_XPATH_TOKEN_NUMBER) { + result->index = self->tokenizer.number; + } else { + return -1; + } token = quickmedia_xpath_tokenizer_next(&self->tokenizer); if(token != QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET) @@ -48,6 +51,24 @@ static int xpath_parse_param(QuickMediaXpathParser *self, QuickMediaNodeSearchPa return 0; } +/* ('[' NUMBER ']')? */ +static int xpath_parse_index(QuickMediaXpathParser *self, QuickMediaNodeSearchParam *result) { + if(quickmedia_xpath_tokenizer_next_if(&self->tokenizer, QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET) != 0) + return 1; + + QuickMediaXpathToken token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_NUMBER) + return -1; + + result->index = self->tokenizer.number; + + token = quickmedia_xpath_tokenizer_next(&self->tokenizer); + if(token != QUICKMEDIA_XPATH_TOKEN_CLOSING_BRACKET) + return -4; + + return 0; +} + static int xpath_parse_node(QuickMediaXpathParser *self, QuickMediaNodeSearch *result) { quickmedia_node_search_init(result); QuickMediaXpathToken token = quickmedia_xpath_tokenizer_next(&self->tokenizer); @@ -65,6 +86,10 @@ static int xpath_parse_node(QuickMediaXpathParser *self, QuickMediaNodeSearch *r if(param_result < 0) return param_result; + int index_result = xpath_parse_index(self, &result->param); + if(index_result < 0) + return index_result; + result->child = malloc(sizeof(QuickMediaNodeSearch)); if(!result->child) return -1; diff --git a/src/XpathTokenizer.c b/src/XpathTokenizer.c index ae17939..f81f2d7 100644 --- a/src/XpathTokenizer.c +++ b/src/XpathTokenizer.c @@ -16,6 +16,10 @@ static int is_num(char c) { return c >= '0' && c <= '9'; } +static int c_to_num(char c) { + return c - '0'; +} + static int is_alphanum(char c) { return is_alpha(c) || is_num(c); } @@ -55,6 +59,15 @@ QuickMediaXpathToken quickmedia_xpath_tokenizer_next(QuickMediaXpathTokenizer *s } self->identifier.size = self->code - 
self->identifier.data; return QUICKMEDIA_XPATH_TOKEN_IDENTIFIER; + } else if(is_num(c)) { + int number = c_to_num(c); + ++self->code; + while(is_num(*self->code)) { + number = number * 10 + c_to_num(*self->code); + ++self->code; + } + self->number = number; + return QUICKMEDIA_XPATH_TOKEN_NUMBER; } else if(c == '[') { ++self->code; return QUICKMEDIA_XPATH_TOKEN_OPEN_BRACKET; -- cgit v1.2.3