From 338694c827320467dc5bff124c25ff82603e51a6 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Mon, 16 Nov 2020 23:47:21 +0100 Subject: Exclude last parenthesis in url extraction if unbalanced, fix tests --- src/NetUtils.cpp | 51 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 19 deletions(-) (limited to 'src') diff --git a/src/NetUtils.cpp b/src/NetUtils.cpp index f8b118b..5ca6d3e 100644 --- a/src/NetUtils.cpp +++ b/src/NetUtils.cpp @@ -70,6 +70,10 @@ namespace QuickMedia { return result.str(); } + static bool is_url_start_char(char c) { + return is_alpha(c) || is_digit(c); + } + static bool is_url_character(char c) { switch(c) { case '%': @@ -103,9 +107,10 @@ namespace QuickMedia { } } - // Implementation follows URI standard: https://tools.ietf.org/html/rfc3986#section-2.2 - // TODO: Maybe check if the TLD only contains valid characters (is_alpha)? + // Implementation follows URI standard in general: https://tools.ietf.org/html/rfc3986#section-2.2. + // Also checks for balanced parentheses to allow text such as: (see: example.com/) that excludes the last parenthesis. void extract_urls(const std::string &str, std::vector &urls) { + int parentheses_depth = 0; size_t url_start = std::string::npos; size_t url_dot_index = std::string::npos; // str.size() is fine, we want to include the NULL character so we can extract url at the end of the string @@ -113,25 +118,33 @@ namespace QuickMedia { char c = str[i]; if(c == '.' && url_start != std::string::npos && url_dot_index == std::string::npos) url_dot_index = i; - if(is_url_character(c)) { - if(url_start == std::string::npos) - url_start = i; - } else { - if(url_start != std::string::npos) { - // Its only an url if there is a dot and the dot is not the last character in the url, for example "example.com" is an url but "example." is not. - if(url_dot_index != std::string::npos && url_dot_index != i - 1) { - size_t url_length = i - url_start; - char prev_char = str[i - 1]; - // We want to remove the last . or , because the string could contain for example "click on this like: example.com. There you can..." - // and we want those links to work, I guess? - if(prev_char == '.' || prev_char == ',') - --url_length; + + if(url_start != std::string::npos) { + if(c == '(') + ++parentheses_depth; + else if(c == ')') + --parentheses_depth; + } + + if(url_start == std::string::npos && is_url_start_char(c)) { + url_start = i; + } else if(url_start != std::string::npos && !is_url_character(c)) { + // Its only an url if there is a dot and the dot is not the last character in the url, for example "example.com" is an url but "example." is not. + if(url_dot_index != std::string::npos && url_dot_index != i - 1) { + size_t url_length = i - url_start; + char prev_char = str[i - 1]; + // We want to remove the last . or , because the string could contain for example "click on this like: example.com. There you can..." + // and we want those links to work, I guess? + if(prev_char == '.' || prev_char == ',') + --url_length; + if(prev_char == ')' && parentheses_depth != 0) + --url_length; + if(url_length > 0) urls.push_back(str.substr(url_start, url_length)); - } - url_start = std::string::npos; - url_dot_index = std::string::npos; } + url_start = std::string::npos; + url_dot_index = std::string::npos; } - } + } } } \ No newline at end of file -- cgit v1.2.3