aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authordec05eba <dec05eba@protonmail.com>2020-11-16 23:47:21 +0100
committerdec05eba <dec05eba@protonmail.com>2020-11-16 23:47:24 +0100
commit338694c827320467dc5bff124c25ff82603e51a6 (patch)
treee83502c7166ddfb8f760e147a90c71d8f49cf63f /src
parent459f11326feb68947905e267960b736ba0dff8a2 (diff)
Exclude last parenthesis in url extraction if unbalanced, fix tests
Diffstat (limited to 'src')
-rw-r--r--src/NetUtils.cpp51
1 files changed, 32 insertions, 19 deletions
diff --git a/src/NetUtils.cpp b/src/NetUtils.cpp
index f8b118b..5ca6d3e 100644
--- a/src/NetUtils.cpp
+++ b/src/NetUtils.cpp
@@ -70,6 +70,10 @@ namespace QuickMedia {
return result.str();
}
+ static bool is_url_start_char(char c) { // True when c is accepted by is_alpha or is_digit; extract_urls only begins a url candidate on such a character.
+ return is_alpha(c) || is_digit(c);
+ }
+
static bool is_url_character(char c) {
switch(c) {
case '%':
@@ -103,9 +107,10 @@ namespace QuickMedia {
}
}
- // Implementation follows URI standard: https://tools.ietf.org/html/rfc3986#section-2.2
- // TODO: Maybe check if the TLD only contains valid characters (is_alpha)?
+ // Implementation follows URI standard in general: https://tools.ietf.org/html/rfc3986#section-2.2.
+ // Also tracks parenthesis depth inside a url so that an unbalanced trailing ')' is left out of the result, for example in text such as: (see: example.com/)
void extract_urls(const std::string &str, std::vector<std::string> &urls) {
+ int parentheses_depth = 0;
size_t url_start = std::string::npos;
size_t url_dot_index = std::string::npos;
// str.size() is fine, we want to include the NULL character so we can extract url at the end of the string
@@ -113,25 +118,33 @@ namespace QuickMedia {
char c = str[i];
if(c == '.' && url_start != std::string::npos && url_dot_index == std::string::npos)
url_dot_index = i;
- if(is_url_character(c)) {
- if(url_start == std::string::npos)
- url_start = i;
- } else {
- if(url_start != std::string::npos) {
- // Its only an url if there is a dot and the dot is not the last character in the url, for example "example.com" is an url but "example." is not.
- if(url_dot_index != std::string::npos && url_dot_index != i - 1) {
- size_t url_length = i - url_start;
- char prev_char = str[i - 1];
- // We want to remove the last . or , because the string could contain for example "click on this like: example.com. There you can..."
- // and we want those links to work, I guess?
- if(prev_char == '.' || prev_char == ',')
- --url_length;
+
+ if(url_start != std::string::npos) {
+ if(c == '(')
+ ++parentheses_depth;
+ else if(c == ')')
+ --parentheses_depth;
+ }
+
+ if(url_start == std::string::npos && is_url_start_char(c)) {
+ url_start = i;
+ } else if(url_start != std::string::npos && !is_url_character(c)) {
+ // It's only a url if there is a dot and the dot is not the last character in the url; for example, "example.com" is a url but "example." is not.
+ if(url_dot_index != std::string::npos && url_dot_index != i - 1) {
+ size_t url_length = i - url_start;
+ char prev_char = str[i - 1];
+ // We want to remove the last . or , because the string could contain, for example, "click on this link: example.com. There you can..."
+ // and we want those links to work, I guess?
+ if(prev_char == '.' || prev_char == ',')
+ --url_length;
+ if(prev_char == ')' && parentheses_depth != 0)
+ --url_length;
+ if(url_length > 0)
urls.push_back(str.substr(url_start, url_length));
- }
- url_start = std::string::npos;
- url_dot_index = std::string::npos;
}
+ url_start = std::string::npos;
+ url_dot_index = std::string::npos;
}
- }
+ }
}
} \ No newline at end of file