Add .onion to the list of valid TLDs; properly parse the protocol part of URLs

author: dec05eba <dec05eba@protonmail.com> 2021-04-09 21:35:39 +0200
committer: dec05eba <dec05eba@protonmail.com> 2021-04-09 21:45:28 +0200
commit: 6040726f92784978dd91eec4c540e92c4ca54236 (patch)
tree: ce37ba29b40bf100c5bd577834ea4b6c90f4ab86
parent: 20a437763e56e5429ebd7f38940c4107418e3dee (diff)
5 files changed, 57 insertions, 7 deletions
diff --git a/include/Body.hpp b/include/Body.hpp
index eb74837..7c8226e 100644
--- a/include/Body.hpp
+++ b/include/Body.hpp
@@ -263,6 +263,7 @@ namespace QuickMedia {
         std::function<void(BodyItem*)> body_item_select_callback;
         sf::Shader *thumbnail_mask_shader;
         AttachSide attach_side = AttachSide::TOP;
+        bool title_mark_urls = false;
     private:
         void draw_item(sf::RenderWindow &window, BodyItem *item, const sf::Vector2f &pos, const sf::Vector2f &size, const float item_height, const int item_index, const Json::Value &content_progress, bool include_embedded_item = true, bool merge_with_previous = false);
         void update_dirty_state(BodyItem *body_item, float width);
diff --git a/src/Body.cpp b/src/Body.cpp
index 1e45dcb..32d70fc 100644
--- a/src/Body.cpp
+++ b/src/Body.cpp
@@ -789,7 +789,7 @@ namespace QuickMedia {
             if(body_item->title_text)
                 body_item->title_text->setString(std::move(str));
             else
-                body_item->title_text = std::make_unique<Text>(std::move(str), false, std::floor(16 * get_ui_scale()), width);
+                body_item->title_text = std::make_unique<Text>(std::move(str), false, std::floor(16 * get_ui_scale()), width, title_mark_urls);
             body_item->title_text->setFillColor(body_item->get_title_color());
             body_item->title_text->updateGeometry();
         }
diff --git a/src/NetUtils.cpp b/src/NetUtils.cpp
index d5795c2..8bb5a0e 100644
--- a/src/NetUtils.cpp
+++ b/src/NetUtils.cpp
@@ -902,6 +902,7 @@ namespace QuickMedia {
         "ong",
         "onl",
         "online",
+        "onion",
         "onyourside",
         "ooo",
         "open",
@@ -1583,7 +1584,11 @@ namespace QuickMedia {
     }
 
     static bool is_url_start_char(char c) {
-        return is_alpha(c) || is_digit(c);
+        return is_alpha(c) || is_digit(c) || c == '-' || c == '.' || c == '_' || c == '~';
+    }
+
+    static bool is_url_domain_char(char c) {
+        return is_url_start_char(c);
     }
 
     // Implementation follows URI standard in general: https://tools.ietf.org/html/rfc3986#section-2.2.
@@ -1593,10 +1598,12 @@ namespace QuickMedia {
 
         int parentheses_depth = 0;
         bool is_valid_url = false;
+        bool is_domain_part = true;
+        bool contains_dot = false;
         size_t url_start = std::string::npos;
 
         // str.size() is fine, we want to include the NULL character so we can extract url at the end of the string
-        for(size_t i = 0; i < (size_t)str.size() + 1; ++i) {
+        for(size_t i = 0; i < (size_t)str.size() + 1;) {
             char c = str[i];
 
             if(url_start != std::string::npos) {
@@ -1606,12 +1613,16 @@ namespace QuickMedia {
                     --parentheses_depth;
             }
 
-            if(url_start != std::string::npos && !is_valid_url && (is_whitespace(c) || c == '/' || c == ',' || c == ':' || c == ')' || c == '\0' || (c == '.' && i == str.size()))) {
+            if(url_start != std::string::npos && c == '.') {
+                contains_dot = true;
+            }
+
+            if(url_start != std::string::npos && !is_valid_url && contains_dot && (is_whitespace(c) || c == '/' || c == ',' || c == ':' || c == ')' || c == '\0' || (c == '.' && i == str.size()))) {
                 size_t tld_end = i - 1;
                 char prev_char = str[i - 1];
                 // We want to remove the last . or , because the string could contain for example "click on this link: example.com. There you can..."
                 // and we want those links to work, I guess?
-                if(prev_char == '.' || prev_char == ',')
+                if(prev_char == '.' || prev_char == ',' || prev_char == ':')
                     --tld_end;
                 else if(prev_char == ')' && parentheses_depth != 0)
                     --tld_end;
@@ -1623,8 +1634,30 @@ namespace QuickMedia {
                     --tld_start;
                 }
 
-                if(tld_start > url_start && TLDS.find(str.substr(tld_start + 1, tld_end - tld_start)) != TLDS.end())
+                if(tld_start > url_start && TLDS.find(str.substr(tld_start + 1, tld_end - tld_start)) != TLDS.end()) {
                     is_valid_url = true;
+                    is_domain_part = false;
+                }
+            }
+
+            if(url_start != std::string::npos && is_domain_part && c == ':') {
+                if(i + 2 < (size_t)str.size() + 1 && str[i + 1] == '/' && str[i + 2] == '/') {
+                    i += 3;
+                    continue;
+                } else if(i + 1 < (size_t)str.size() + 1 && is_whitespace(str[i + 1])) {
+                    i += 1;
+                } else {
+                    url_start = std::string::npos;
+                    is_valid_url = false;
+                    is_domain_part = true;
+                    contains_dot = false;
+                }
+            }
+
+            if(url_start != std::string::npos && is_domain_part && !is_url_domain_char(c)) {
+                url_start = std::string::npos;
+                is_valid_url = false;
+                contains_dot = false;
             }
 
             if(url_start == std::string::npos && is_url_start_char(c)) {
@@ -1636,7 +1669,7 @@ namespace QuickMedia {
                     char prev_char = str[i - 1];
                     // We want to remove the last . or , because the string could contain for example "click on this link: example.com. There you can..."
                     // and we want those links to work, I guess?
-                    if(prev_char == '.' || prev_char == ',')
+                    if(prev_char == '.' || prev_char == ',' || prev_char == ':')
                         --url_length;
                     else if(prev_char == ')' && parentheses_depth != 0)
                         --url_length;
@@ -1646,7 +1679,11 @@ namespace QuickMedia {
 
                 url_start = std::string::npos;
                 is_valid_url = false;
+                is_domain_part = true;
+                contains_dot = false;
             }
+
+            ++i;
         }
 
         return ranges;
diff --git a/src/QuickMedia.cpp b/src/QuickMedia.cpp
index 4598742..a5318be 100644
--- a/src/QuickMedia.cpp
+++ b/src/QuickMedia.cpp
@@ -2722,6 +2722,8 @@ namespace QuickMedia {
             VIEWING_ATTACHED_IMAGE
         };
 
+        thread_body->title_mark_urls = true;
+
         NavigationStage navigation_stage = NavigationStage::VIEWING_COMMENTS;
         AsyncTask<bool> captcha_request_future;
         AsyncTask<bool> captcha_post_solution_future;
diff --git a/tests/main.cpp b/tests/main.cpp
index c5138e3..306cdf2 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -28,6 +28,11 @@ int main() {
     assert_equals(urls.size(), 1);
     assert_equals(urls[0], "example.com");
 
+    str = "example.com: the best test website";
+    urls = QuickMedia::ranges_get_strings(str, QuickMedia::extract_urls(str));
+    assert_equals(urls.size(), 1);
+    assert_equals(urls[0], "example.com");
+
     str = "these. are. not. websites.";
     urls = QuickMedia::ranges_get_strings(str, QuickMedia::extract_urls(str));
     assert_equals(urls.size(), 0);
@@ -47,5 +52,10 @@ int main() {
     urls = QuickMedia::ranges_get_strings(str, QuickMedia::extract_urls(str));
     assert_equals(urls.size(), 1);
     assert_equals(urls[0], "https://emojipedia.org/emoji/%23%EF%B8%8F%E2%83%A3/");
+
+    str = "[sneed](https://sneedville.com)";
+    urls = QuickMedia::ranges_get_strings(str, QuickMedia::extract_urls(str));
+    assert_equals(urls.size(), 1);
+    assert_equals(urls[0], "https://sneedville.com");
     return 0;
 }
author	dec05eba <dec05eba@protonmail.com>	2021-04-09 21:35:39 +0200
committer	dec05eba <dec05eba@protonmail.com>	2021-04-09 21:45:28 +0200
commit	6040726f92784978dd91eec4c540e92c4ca54236 (patch)
tree	ce37ba29b40bf100c5bd577834ea4b6c90f4ab86
parent	20a437763e56e5429ebd7f38940c4107418e3dee (diff)