From 6040726f92784978dd91eec4c540e92c4ca54236 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Fri, 9 Apr 2021 21:35:39 +0200 Subject: Add .onion to list of valid tld, properly parse urls protocol part --- src/NetUtils.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) (limited to 'src/NetUtils.cpp') diff --git a/src/NetUtils.cpp b/src/NetUtils.cpp index d5795c2..8bb5a0e 100644 --- a/src/NetUtils.cpp +++ b/src/NetUtils.cpp @@ -902,6 +902,7 @@ namespace QuickMedia { "ong", "onl", "online", + "onion", "onyourside", "ooo", "open", @@ -1583,7 +1584,11 @@ namespace QuickMedia { } static bool is_url_start_char(char c) { - return is_alpha(c) || is_digit(c); + return is_alpha(c) || is_digit(c) || c == '-' || c == '.' || c == '_' || c == '~'; + } + + static bool is_url_domain_char(char c) { + return is_url_start_char(c); } // Implementation follows URI standard in general: https://tools.ietf.org/html/rfc3986#section-2.2. @@ -1593,10 +1598,12 @@ namespace QuickMedia { int parentheses_depth = 0; bool is_valid_url = false; + bool is_domain_part = true; + bool contains_dot = false; size_t url_start = std::string::npos; // str.size() is fine, we want to include the NULL character so we can extract url at the end of the string - for(size_t i = 0; i < (size_t)str.size() + 1; ++i) { + for(size_t i = 0; i < (size_t)str.size() + 1;) { char c = str[i]; if(url_start != std::string::npos) { @@ -1606,12 +1613,16 @@ namespace QuickMedia { --parentheses_depth; } - if(url_start != std::string::npos && !is_valid_url && (is_whitespace(c) || c == '/' || c == ',' || c == ':' || c == ')' || c == '\0' || (c == '.' && i == str.size()))) { + if(url_start != std::string::npos && c == '.') { + contains_dot = true; + } + + if(url_start != std::string::npos && !is_valid_url && contains_dot && (is_whitespace(c) || c == '/' || c == ',' || c == ':' || c == ')' || c == '\0' || (c == '.' && i == str.size()))) { size_t tld_end = i - 1; char prev_char = str[i - 1]; // We want to remove the last . or , because the string could contain for example "click on this link: example.com. There you can..." // and we want those links to work, I guess? - if(prev_char == '.' || prev_char == ',') + if(prev_char == '.' || prev_char == ',' || prev_char == ':') --tld_end; else if(prev_char == ')' && parentheses_depth != 0) --tld_end; @@ -1623,8 +1634,30 @@ namespace QuickMedia { --tld_start; } - if(tld_start > url_start && TLDS.find(str.substr(tld_start + 1, tld_end - tld_start)) != TLDS.end()) + if(tld_start > url_start && TLDS.find(str.substr(tld_start + 1, tld_end - tld_start)) != TLDS.end()) { is_valid_url = true; + is_domain_part = false; + } + } + + if(url_start != std::string::npos && is_domain_part && c == ':') { + if(i + 2 < (size_t)str.size() + 1 && str[i + 1] == '/' && str[i + 2] == '/') { + i += 3; + continue; + } else if(i + 1 < (size_t)str.size() + 1 && is_whitespace(str[i + 1])) { + i += 1; + } else { + url_start = std::string::npos; + is_valid_url = false; + is_domain_part = true; + contains_dot = false; + } + } + + if(url_start != std::string::npos && is_domain_part && !is_url_domain_char(c)) { + url_start = std::string::npos; + is_valid_url = false; + contains_dot = false; } if(url_start == std::string::npos && is_url_start_char(c)) { @@ -1636,7 +1669,7 @@ namespace QuickMedia { char prev_char = str[i - 1]; // We want to remove the last . or , because the string could contain for example "click on this link: example.com. There you can..." // and we want those links to work, I guess? - if(prev_char == '.' || prev_char == ',') + if(prev_char == '.' || prev_char == ',' || prev_char == ':') --url_length; else if(prev_char == ')' && parentheses_depth != 0) --url_length; @@ -1646,7 +1679,11 @@ namespace QuickMedia { url_start = std::string::npos; is_valid_url = false; + is_domain_part = true; + contains_dot = false; } + + ++i; } return ranges; -- cgit v1.2.3