about | summary | refs | log | tree | commit | diff
path: root/src/NetUtils.cpp
diff options
context:
space:
mode:
author dec05eba <dec05eba@protonmail.com> 2021-04-09 21:35:39 +0200
committer dec05eba <dec05eba@protonmail.com> 2021-04-09 21:45:28 +0200
commit 6040726f92784978dd91eec4c540e92c4ca54236 (patch)
tree ce37ba29b40bf100c5bd577834ea4b6c90f4ab86 /src/NetUtils.cpp
parent 20a437763e56e5429ebd7f38940c4107418e3dee (diff)
Add .onion to the list of valid TLDs; properly parse the protocol part of URLs
Diffstat (limited to 'src/NetUtils.cpp')
-rw-r--r-- src/NetUtils.cpp 49
1 files changed, 43 insertions, 6 deletions
diff --git a/src/NetUtils.cpp b/src/NetUtils.cpp
index d5795c2..8bb5a0e 100644
--- a/src/NetUtils.cpp
+++ b/src/NetUtils.cpp
@@ -902,6 +902,7 @@ namespace QuickMedia {
"ong",
"onl",
"online",
+ "onion",
"onyourside",
"ooo",
"open",
@@ -1583,7 +1584,11 @@ namespace QuickMedia {
}
static bool is_url_start_char(char c) {
- return is_alpha(c) || is_digit(c);
+ return is_alpha(c) || is_digit(c) || c == '-' || c == '.' || c == '_' || c == '~';
+ }
+
+ static bool is_url_domain_char(char c) {
+ return is_url_start_char(c);
}
// Implementation follows URI standard in general: https://tools.ietf.org/html/rfc3986#section-2.2.
@@ -1593,10 +1598,12 @@ namespace QuickMedia {
int parentheses_depth = 0;
bool is_valid_url = false;
+ bool is_domain_part = true;
+ bool contains_dot = false;
size_t url_start = std::string::npos;
// str.size() is fine, we want to include the NULL character so we can extract url at the end of the string
- for(size_t i = 0; i < (size_t)str.size() + 1; ++i) {
+ for(size_t i = 0; i < (size_t)str.size() + 1;) {
char c = str[i];
if(url_start != std::string::npos) {
@@ -1606,12 +1613,16 @@ namespace QuickMedia {
--parentheses_depth;
}
- if(url_start != std::string::npos && !is_valid_url && (is_whitespace(c) || c == '/' || c == ',' || c == ':' || c == ')' || c == '\0' || (c == '.' && i == str.size()))) {
+ if(url_start != std::string::npos && c == '.') {
+ contains_dot = true;
+ }
+
+ if(url_start != std::string::npos && !is_valid_url && contains_dot && (is_whitespace(c) || c == '/' || c == ',' || c == ':' || c == ')' || c == '\0' || (c == '.' && i == str.size()))) {
size_t tld_end = i - 1;
char prev_char = str[i - 1];
// We want to remove the last . or , because the string could contain for example "click on this link: example.com. There you can..."
// and we want those links to work, I guess?
- if(prev_char == '.' || prev_char == ',')
+ if(prev_char == '.' || prev_char == ',' || prev_char == ':')
--tld_end;
else if(prev_char == ')' && parentheses_depth != 0)
--tld_end;
@@ -1623,8 +1634,30 @@ namespace QuickMedia {
--tld_start;
}
- if(tld_start > url_start && TLDS.find(str.substr(tld_start + 1, tld_end - tld_start)) != TLDS.end())
+ if(tld_start > url_start && TLDS.find(str.substr(tld_start + 1, tld_end - tld_start)) != TLDS.end()) {
is_valid_url = true;
+ is_domain_part = false;
+ }
+ }
+
+ if(url_start != std::string::npos && is_domain_part && c == ':') {
+ if(i + 2 < (size_t)str.size() + 1 && str[i + 1] == '/' && str[i + 2] == '/') {
+ i += 3;
+ continue;
+ } else if(i + 1 < (size_t)str.size() + 1 && is_whitespace(str[i + 1])) {
+ i += 1;
+ } else {
+ url_start = std::string::npos;
+ is_valid_url = false;
+ is_domain_part = true;
+ contains_dot = false;
+ }
+ }
+
+ if(url_start != std::string::npos && is_domain_part && !is_url_domain_char(c)) {
+ url_start = std::string::npos;
+ is_valid_url = false;
+ contains_dot = false;
}
if(url_start == std::string::npos && is_url_start_char(c)) {
@@ -1636,7 +1669,7 @@ namespace QuickMedia {
char prev_char = str[i - 1];
// We want to remove the last . or , because the string could contain for example "click on this link: example.com. There you can..."
// and we want those links to work, I guess?
- if(prev_char == '.' || prev_char == ',')
+ if(prev_char == '.' || prev_char == ',' || prev_char == ':')
--url_length;
else if(prev_char == ')' && parentheses_depth != 0)
--url_length;
@@ -1646,7 +1679,11 @@ namespace QuickMedia {
url_start = std::string::npos;
is_valid_url = false;
+ is_domain_part = true;
+ contains_dot = false;
}
+
+ ++i;
}
return ranges;