From 6040726f92784978dd91eec4c540e92c4ca54236 Mon Sep 17 00:00:00 2001
From: dec05eba <dec05eba@protonmail.com>
Date: Fri, 9 Apr 2021 21:35:39 +0200
Subject: Add .onion to list of valid TLDs, properly parse URL protocol part

---
 src/NetUtils.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 6 deletions(-)

(limited to 'src/NetUtils.cpp')

diff --git a/src/NetUtils.cpp b/src/NetUtils.cpp
index d5795c2..8bb5a0e 100644
--- a/src/NetUtils.cpp
+++ b/src/NetUtils.cpp
@@ -902,6 +902,7 @@ namespace QuickMedia {
         "ong",
         "onl",
         "online",
+        "onion",
         "onyourside",
         "ooo",
         "open",
@@ -1583,7 +1584,11 @@ namespace QuickMedia {
     }
 
     static bool is_url_start_char(char c) {
-        return is_alpha(c) || is_digit(c);
+        return is_alpha(c) || is_digit(c) || c == '-' || c == '.' || c == '_' || c == '~';
+    }
+
+    static bool is_url_domain_char(char c) {
+        return is_url_start_char(c);
     }
 
     // Implementation follows URI standard in general: https://tools.ietf.org/html/rfc3986#section-2.2.
@@ -1593,10 +1598,12 @@ namespace QuickMedia {
 
         int parentheses_depth = 0;
         bool is_valid_url = false;
+        bool is_domain_part = true;
+        bool contains_dot = false;
         size_t url_start = std::string::npos;
 
         // str.size() is fine, we want to include the NULL character so we can extract url at the end of the string
-        for(size_t i = 0; i < (size_t)str.size() + 1; ++i) {
+        for(size_t i = 0; i < (size_t)str.size() + 1;) {
             char c = str[i];
 
             if(url_start != std::string::npos) {
@@ -1606,12 +1613,16 @@ namespace QuickMedia {
                     --parentheses_depth;
             }
 
-            if(url_start != std::string::npos && !is_valid_url && (is_whitespace(c) || c == '/' || c == ',' || c == ':' || c == ')' || c == '\0' || (c == '.' && i == str.size()))) {
+            if(url_start != std::string::npos && c == '.') {
+                contains_dot = true;
+            }
+
+            if(url_start != std::string::npos && !is_valid_url && contains_dot && (is_whitespace(c) || c == '/' || c == ',' || c == ':' || c == ')' || c == '\0' || (c == '.' && i == str.size()))) {
                 size_t tld_end = i - 1;
                 char prev_char = str[i - 1];
                 // We want to remove the last . or , because the string could contain for example "click on this link: example.com. There you can..."
                 // and we want those links to work, I guess?
-                if(prev_char == '.' || prev_char == ',')
+                if(prev_char == '.' || prev_char == ',' || prev_char == ':')
                     --tld_end;
                 else if(prev_char == ')' && parentheses_depth != 0)
                     --tld_end;
@@ -1623,8 +1634,30 @@ namespace QuickMedia {
                     --tld_start;
                 }
 
-                if(tld_start > url_start && TLDS.find(str.substr(tld_start + 1, tld_end - tld_start)) != TLDS.end())
+                if(tld_start > url_start && TLDS.find(str.substr(tld_start + 1, tld_end - tld_start)) != TLDS.end()) {
                     is_valid_url = true;
+                    is_domain_part = false;
+                }
+            }
+
+            if(url_start != std::string::npos && is_domain_part && c == ':') {
+                if(i + 2 < (size_t)str.size() + 1 && str[i + 1] == '/' && str[i + 2] == '/') {
+                    i += 3;
+                    continue;
+                } else if(i + 1 < (size_t)str.size() + 1 && is_whitespace(str[i + 1])) {
+                    i += 1;
+                } else {
+                    url_start = std::string::npos;
+                    is_valid_url = false;
+                    is_domain_part = true;
+                    contains_dot = false;
+                }
+            }
+
+            if(url_start != std::string::npos && is_domain_part && !is_url_domain_char(c)) {
+                url_start = std::string::npos;
+                is_valid_url = false;
+                contains_dot = false;
             }
 
             if(url_start == std::string::npos && is_url_start_char(c)) {
@@ -1636,7 +1669,7 @@ namespace QuickMedia {
                     char prev_char = str[i - 1];
                     // We want to remove the last . or , because the string could contain for example "click on this link: example.com. There you can..."
                     // and we want those links to work, I guess?
-                    if(prev_char == '.' || prev_char == ',')
+                    if(prev_char == '.' || prev_char == ',' || prev_char == ':')
                         --url_length;
                     else if(prev_char == ')' && parentheses_depth != 0)
                         --url_length;
@@ -1646,7 +1679,11 @@ namespace QuickMedia {
 
                 url_start = std::string::npos;
                 is_valid_url = false;
+                is_domain_part = true;
+                contains_dot = false;
             }
+
+            ++i;
         }
 
         return ranges;
-- 
cgit v1.2.3-70-g09d2