Exclude last parenthesis in url extraction if unbalanced, fix tests

author: dec05eba <dec05eba@protonmail.com> 2020-11-16 23:47:21 +0100
committer: dec05eba <dec05eba@protonmail.com> 2020-11-16 23:47:24 +0100
commit: 338694c827320467dc5bff124c25ff82603e51a6 (patch)
tree: e83502c7166ddfb8f760e147a90c71d8f49cf63f
parent: 459f11326feb68947905e267960b736ba0dff8a2 (diff)
3 files changed, 56 insertions, 28 deletions
diff --git a/TODO b/TODO
index d6bc31f..cdf4c3d 100644
--- a/TODO
+++ b/TODO
@@ -128,7 +128,10 @@ Update room name/avatar with new data in /sync.
 Read marker is incorrect if the last message is an edit/redact, because they are hidden and replaces other body items instead.
 Scroll tabs if there are more than 3 tab items and show arrow on left/right side when there are more items to see.
 Make a shader for Text for changing color instead of updating the text geometry. Or loop vertices and set their color to the new color without updating the text geometry.
-Automatically retry sending messages that fails to send (after timeout).
+Automatically retry sending messages that fail to send (after timeout). These failed-to-send messages should be stored on disk and retried when going back to the room or restarting QuickMedia.
 Also make message deletion provisional (make it gray while its deleting the message).
 Continue matrix requests when switching room, instead of resetting them when going from chat page to room list page (such as post message request).
-Pinned messages authors doesn't seem to be updated when fetching users, if the pinned messages are fetched before the users (by navigating to the pending messages tab quickly).
-\ No newline at end of file
+Pinned message authors don't seem to be updated when fetching users, if the pinned messages are fetched before the users (by navigating to the pending messages tab quickly).
+Improve /sync by not removing cached data on initial sync, and also always append data to the sync file instead of overwriting the sync file on "initial sync". Also cache "since", but take into consideration that not all messages are fetched on the initial sync,
+then add a gap between old messages from before sync and after sync so we can fetch the messages between the old messages and new messages and remove the gap when the fetched messages contain any of the old messages.
+Implement m.sticker.
+\ No newline at end of file
diff --git a/src/NetUtils.cpp b/src/NetUtils.cpp
index f8b118b..5ca6d3e 100644
--- a/src/NetUtils.cpp
+++ b/src/NetUtils.cpp
@@ -70,6 +70,10 @@ namespace QuickMedia {
         return result.str();
     }
 
+    static bool is_url_start_char(char c) {
+        return is_alpha(c) || is_digit(c);
+    }
+
     static bool is_url_character(char c) {
         switch(c) {
             case '%':
@@ -103,9 +107,10 @@ namespace QuickMedia {
         }
     }
 
-    // Implementation follows URI standard: https://tools.ietf.org/html/rfc3986#section-2.2
-    // TODO: Maybe check if the TLD only contains valid characters (is_alpha)?
+    // Implementation follows URI standard in general: https://tools.ietf.org/html/rfc3986#section-2.2.
+    // Also checks for balanced parentheses to allow text such as: (see: example.com/) that excludes the last parenthesis.
     void extract_urls(const std::string &str, std::vector<std::string> &urls) {
+        int parentheses_depth = 0;
         size_t url_start = std::string::npos;
         size_t url_dot_index = std::string::npos;
         // str.size() is fine, we want to include the NULL character so we can extract url at the end of the string
@@ -113,25 +118,33 @@ namespace QuickMedia {
             char c = str[i];
             if(c == '.' && url_start != std::string::npos && url_dot_index == std::string::npos)
                 url_dot_index = i;
-            if(is_url_character(c)) {
-                if(url_start == std::string::npos)
-                    url_start = i;
-            } else {
-                if(url_start != std::string::npos) {
-                    // Its only an url if there is a dot and the dot is not the last character in the url, for example "example.com" is an url but "example." is not.
-                    if(url_dot_index != std::string::npos && url_dot_index != i - 1) {
-                        size_t url_length = i - url_start;
-                        char prev_char = str[i - 1];
-                        // We want to remove the last . or , because the string could contain for example "click on this like: example.com. There you can..."
-                        // and we want those links to work, I guess?
-                        if(prev_char == '.' || prev_char == ',')
-                            --url_length;
+
+            if(url_start != std::string::npos) {
+                if(c == '(')
+                    ++parentheses_depth;
+                else if(c == ')')
+                    --parentheses_depth;
+            }
+
+            if(url_start == std::string::npos && is_url_start_char(c)) {
+                url_start = i;
+            } else if(url_start != std::string::npos && !is_url_character(c)) {
+                // Its only an url if there is a dot and the dot is not the last character in the url, for example "example.com" is an url but "example." is not.
+                if(url_dot_index != std::string::npos && url_dot_index != i - 1) {
+                    size_t url_length = i - url_start;
+                    char prev_char = str[i - 1];
+                    // We want to remove the last . or , because the string could contain for example "click on this like: example.com. There you can..."
+                    // and we want those links to work, I guess?
+                    if(prev_char == '.' || prev_char == ',')
+                        --url_length;
+                    if(prev_char == ')' && parentheses_depth != 0)
+                        --url_length;
+                    if(url_length > 0)
                         urls.push_back(str.substr(url_start, url_length));
-                    }
-                    url_start = std::string::npos;
-                    url_dot_index = std::string::npos;
                 }
+                url_start = std::string::npos;
+                url_dot_index = std::string::npos;
             }
-        }
+    }
     }
 }
 \ No newline at end of file
diff --git a/tests/main.cpp b/tests/main.cpp
index 38dd534..647fdff 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -7,32 +7,44 @@
 int main() {
     std::vector<std::string> urls;
 
-    urls = QuickMedia::extract_urls("example.com");
+    urls.clear();
+    QuickMedia::extract_urls("example.com", urls);
     assert_equals(urls.size(), 1);
     assert_equals(urls[0], "example.com");
 
-    urls = QuickMedia::extract_urls("example.com, is where I like to go");
+    urls.clear();
+    QuickMedia::extract_urls("example.com, is where I like to go", urls);
     assert_equals(urls.size(), 1);
     assert_equals(urls[0], "example.com");
 
-    urls = QuickMedia::extract_urls("The website I like to go to is example.com");
+    urls.clear();
+    QuickMedia::extract_urls("The website I like to go to is example.com", urls);
     assert_equals(urls.size(), 1);
     assert_equals(urls[0], "example.com");
 
-    urls = QuickMedia::extract_urls("example.com. Is also a website");
+    urls.clear();
+    QuickMedia::extract_urls("example.com. Is also a website", urls);
     assert_equals(urls.size(), 1);
     assert_equals(urls[0], "example.com");
 
-    urls = QuickMedia::extract_urls("these. are. not. websites.");
+    urls.clear();
+    QuickMedia::extract_urls("these. are. not. websites.", urls);
     assert_equals(urls.size(), 0);
 
-    urls = QuickMedia::extract_urls("This is not an url: example.");
+    urls.clear();
+    QuickMedia::extract_urls("This is not an url: example.", urls);
     assert_equals(urls.size(), 0);
 
-    urls = QuickMedia::extract_urls("the.se/~#423-_/2f.no/3df a.re considered sub.websit.es");
+    urls.clear();
+    QuickMedia::extract_urls("the.se/~#423-_/2f.no/3df a.re considered sub.websit.es", urls);
     assert_equals(urls.size(), 3);
     assert_equals(urls[0], "the.se/~#423-_/2f.no/3df");
     assert_equals(urls[1], "a.re");
     assert_equals(urls[2], "sub.websit.es");
+
+    urls.clear();
+    QuickMedia::extract_urls("(see https://emojipedia.org/emoji/%23%EF%B8%8F%E2%83%A3/)", urls);
+    assert_equals(urls.size(), 1);
+    assert_equals(urls[0], "https://emojipedia.org/emoji/%23%EF%B8%8F%E2%83%A3/");
     return 0;
 }
author	dec05eba <dec05eba@protonmail.com>	2020-11-16 23:47:21 +0100
committer	dec05eba <dec05eba@protonmail.com>	2020-11-16 23:47:24 +0100
commit	338694c827320467dc5bff124c25ff82603e51a6 (patch)
tree	e83502c7166ddfb8f760e147a90c71d8f49cf63f
parent	459f11326feb68947905e267960b736ba0dff8a2 (diff)