Exclude last parenthesis in url extraction if unbalanced, fix tests

author: dec05eba <dec05eba@protonmail.com> 2020-11-16 23:47:21 +0100
committer: dec05eba <dec05eba@protonmail.com> 2020-11-16 23:47:24 +0100
commit: 338694c827320467dc5bff124c25ff82603e51a6 (patch)
tree: e83502c7166ddfb8f760e147a90c71d8f49cf63f /src
parent: 459f11326feb68947905e267960b736ba0dff8a2 (diff)
1 files changed, 32 insertions, 19 deletions
diff --git a/src/NetUtils.cpp b/src/NetUtils.cpp
index f8b118b..5ca6d3e 100644
--- a/src/NetUtils.cpp
+++ b/src/NetUtils.cpp
@@ -70,6 +70,10 @@ namespace QuickMedia {
         return result.str();
     }
 
+    static bool is_url_start_char(char c) { // True when c is accepted by is_alpha or is_digit; extract_urls only begins a url candidate on such a character.
+        return is_alpha(c) || is_digit(c);
+    }
+
     static bool is_url_character(char c) {
         switch(c) {
             case '%':
@@ -103,9 +107,10 @@ namespace QuickMedia {
         }
     }
 
-    // Implementation follows URI standard: https://tools.ietf.org/html/rfc3986#section-2.2
-    // TODO: Maybe check if the TLD only contains valid characters (is_alpha)?
+    // Implementation follows URI standard in general: https://tools.ietf.org/html/rfc3986#section-2.2.
+    // Also tracks parenthesis depth so that in text such as "(see: example.com/)" the trailing unbalanced ')' is left out of the extracted url.
     void extract_urls(const std::string &str, std::vector<std::string> &urls) {
+        int parentheses_depth = 0;
         size_t url_start = std::string::npos;
         size_t url_dot_index = std::string::npos;
         // str.size() is fine, we want to include the NULL character so we can extract url at the end of the string
@@ -113,25 +118,33 @@ namespace QuickMedia {
             char c = str[i];
             if(c == '.' && url_start != std::string::npos && url_dot_index == std::string::npos)
                 url_dot_index = i;
-            if(is_url_character(c)) {
-                if(url_start == std::string::npos)
-                    url_start = i;
-            } else {
-                if(url_start != std::string::npos) {
-                    // Its only an url if there is a dot and the dot is not the last character in the url, for example "example.com" is an url but "example." is not.
-                    if(url_dot_index != std::string::npos && url_dot_index != i - 1) {
-                        size_t url_length = i - url_start;
-                        char prev_char = str[i - 1];
-                        // We want to remove the last . or , because the string could contain for example "click on this like: example.com. There you can..."
-                        // and we want those links to work, I guess?
-                        if(prev_char == '.' || prev_char == ',')
-                            --url_length;
+
+            if(url_start != std::string::npos) {
+                if(c == '(')
+                    ++parentheses_depth;
+                else if(c == ')')
+                    --parentheses_depth;
+            }
+
+            if(url_start == std::string::npos && is_url_start_char(c)) {
+                url_start = i;
+            } else if(url_start != std::string::npos && !is_url_character(c)) {
+                // It's only a url if there is a dot and the dot is not the last character in the url, for example "example.com" is a url but "example." is not.
+                if(url_dot_index != std::string::npos && url_dot_index != i - 1) {
+                    size_t url_length = i - url_start;
+                    char prev_char = str[i - 1];
+                    // We want to remove the last . or , because the string could contain for example "click on this link: example.com. There you can..."
+                    // and we want those links to work, I guess?
+                    if(prev_char == '.' || prev_char == ',')
+                        --url_length;
+                    if(prev_char == ')' && parentheses_depth != 0)
+                        --url_length;
+                    if(url_length > 0)
                         urls.push_back(str.substr(url_start, url_length));
-                    }
-                    url_start = std::string::npos;
-                    url_dot_index = std::string::npos;
                 }
+                url_start = std::string::npos;
+                url_dot_index = std::string::npos;
             }
-        }
+    }
     }
 }
 \ No newline at end of file
author	dec05eba <dec05eba@protonmail.com>	2020-11-16 23:47:21 +0100
committer	dec05eba <dec05eba@protonmail.com>	2020-11-16 23:47:24 +0100
commit	338694c827320467dc5bff124c25ff82603e51a6 (patch)
tree	e83502c7166ddfb8f760e147a90c71d8f49cf63f /src
parent	459f11326feb68947905e267960b736ba0dff8a2 (diff)