From 4fe0a037c82332e84b16a6f0e2847a2f9a0bd5d7 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Sun, 18 Oct 2020 18:27:11 +0200 Subject: Matrix: add a proper URI parser instead of regex for opening links. Show related body item image when pressing enter (for showing replied to media) --- include/NetUtils.hpp | 11 ++++ plugins/Plugin.hpp | 5 -- src/NetUtils.cpp | 140 ++++++++++++++++++++++++++++++++++++++++++++++ src/QuickMedia.cpp | 34 +++++------ src/plugins/Fourchan.cpp | 1 + src/plugins/Mangadex.cpp | 1 + src/plugins/Manganelo.cpp | 1 + src/plugins/Mangatown.cpp | 1 + src/plugins/Matrix.cpp | 1 + src/plugins/NyaaSi.cpp | 1 + src/plugins/Plugin.cpp | 61 -------------------- src/plugins/Pornhub.cpp | 1 + src/plugins/Youtube.cpp | 1 + tests/main.cpp | 38 +++++++++++++ 14 files changed, 215 insertions(+), 82 deletions(-) create mode 100644 include/NetUtils.hpp create mode 100644 src/NetUtils.cpp create mode 100644 tests/main.cpp diff --git a/include/NetUtils.hpp b/include/NetUtils.hpp new file mode 100644 index 0000000..84b9d18 --- /dev/null +++ b/include/NetUtils.hpp @@ -0,0 +1,11 @@ +#pragma once + +#include <string> +#include <vector> + +namespace QuickMedia { + void html_escape_sequences(std::string &str); + void html_unescape_sequences(std::string &str); + std::string url_param_encode(const std::string &param); + std::vector<std::string> extract_urls(const std::string &str); +} \ No newline at end of file diff --git a/plugins/Plugin.hpp b/plugins/Plugin.hpp index 1427233..d754445 100644 --- a/plugins/Plugin.hpp +++ b/plugins/Plugin.hpp @@ -3,7 +3,6 @@ #include "../include/Body.hpp" #include "../include/DownloadUtils.hpp" #include -#include namespace QuickMedia { enum class PluginResult { @@ -36,10 +35,6 @@ namespace QuickMedia { size_t index; }; - void html_escape_sequences(std::string &str); - void html_unescape_sequences(std::string &str); - std::string url_param_encode(const std::string &param); - SuggestionResult download_result_to_suggestion_result(DownloadResult download_result); PluginResult
download_result_to_plugin_result(DownloadResult download_result); SearchResult download_result_to_search_result(DownloadResult download_result); diff --git a/src/NetUtils.cpp b/src/NetUtils.cpp new file mode 100644 index 0000000..e87c42c --- /dev/null +++ b/src/NetUtils.cpp @@ -0,0 +1,140 @@ +#include "../include/NetUtils.hpp" +#include "../include/StringUtils.hpp" +#include <array> +#include <sstream> +#include <iomanip> + +namespace QuickMedia { + struct HtmlEscapeSequence { + char unescape_char; + std::string escape_sequence; + }; + + void html_escape_sequences(std::string &str) { + const std::array<HtmlEscapeSequence, 6> escape_sequences = { + HtmlEscapeSequence { '&', "&amp;" }, // This should be first, to not accidentally replace a new sequence caused by replacing this + HtmlEscapeSequence { '"', "&quot;" }, + HtmlEscapeSequence { '\'', "&#39;" }, + HtmlEscapeSequence { '<', "&lt;" }, + HtmlEscapeSequence { '>', "&gt;" }, + HtmlEscapeSequence { '\n', "&#10;" }
+ }; + + for(const HtmlEscapeSequence &escape_sequence : escape_sequences) { + string_replace_all(str, escape_sequence.unescape_char, escape_sequence.escape_sequence); + } + } + + struct HtmlUnescapeSequence { + std::string escape_sequence; + std::string unescaped_str; + }; + + void html_unescape_sequences(std::string &str) { + const std::array<HtmlUnescapeSequence, 6> unescape_sequences = { + HtmlUnescapeSequence { "&quot;", "\"" }, + HtmlUnescapeSequence { "&#039;", "'" }, + HtmlUnescapeSequence { "&#39;", "'" }, + HtmlUnescapeSequence { "&lt;", "<" }, + HtmlUnescapeSequence { "&gt;", ">" }, + HtmlUnescapeSequence { "&amp;", "&" } // This should be last, to not accidentally replace a new sequence caused by replacing this + }; + + for(const HtmlUnescapeSequence &unescape_sequence : unescape_sequences) { + string_replace_all(str, unescape_sequence.escape_sequence, unescape_sequence.unescaped_str); + } + } + + std::string url_param_encode(const std::string &param) { + std::ostringstream result; + result.fill('0'); + result << std::hex; + + for(char c : param) { + if(isalnum(c) || c == '-' || c == '_' || c == '.'
|| c == '~') { + result << c; + } else { + result << std::uppercase; + result << "%" << std::setw(2) << (int)(unsigned char)(c); + } + } + + return result.str(); + } + + static bool is_alpha(char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); + } + + static bool is_digit(char c) { + return c >= '0' && c <= '9'; + } + + static bool is_url_character(char c) { + switch(c) { + case '%': + // Reserved + case ':': + case '/': + case '?': + case '#': + case '[': + case ']': + case '@': + case '!': + case '$': + case '&': + case '\'': + case '(': + case ')': + case '*': + case '+': + case ',': + case ';': + case '=': + // Unreserved: + case '-': + case '.': + case '_': + case '~': + return true; + default: + return is_alpha(c) || is_digit(c); + } + } + + // Implementation follows URI standard: https://tools.ietf.org/html/rfc3986#section-2.2 + // TODO: Maybe check if the TLD only contains valid characters (is_alpha)? + std::vector<std::string> extract_urls(const std::string &str) { + std::vector<std::string> urls; + + size_t url_start = std::string::npos; + size_t url_dot_index = std::string::npos; + // str.size() is fine, we want to include the NULL character so we can extract url at the end of the string + for(size_t i = 0; i < (size_t)str.size() + 1; ++i) { + char c = str[i]; + if(c == '.' && url_start != std::string::npos && url_dot_index == std::string::npos) + url_dot_index = i; + if(is_url_character(c)) { + if(url_start == std::string::npos) + url_start = i; + } else { + if(url_start != std::string::npos) { + // Its only an url if there is a dot and the dot is not the last character in the url, for example "example.com" is an url but "example." is not. + if(url_dot_index != std::string::npos && url_dot_index != i - 1) { + size_t url_length = i - url_start; + char prev_char = str[i - 1]; + // We want to remove the last . or , because the string could contain for example "click on this like: example.com. There you can..." + // and we want those links to work, I guess?
+ if(prev_char == '.' || prev_char == ',') + --url_length; + urls.push_back(str.substr(url_start, url_length)); + } + url_start = std::string::npos; + url_dot_index = std::string::npos; + } + } + } + return urls; + } +} \ No newline at end of file diff --git a/src/QuickMedia.cpp b/src/QuickMedia.cpp index 070b8fe..e374dfb 100644 --- a/src/QuickMedia.cpp +++ b/src/QuickMedia.cpp @@ -18,12 +18,12 @@ #include "../include/ImageUtils.hpp" #include "../include/base64_url.hpp" #include "../include/Entry.hpp" +#include "../include/NetUtils.hpp" #include #include #include #include -#include #include #include @@ -3225,8 +3225,6 @@ namespace QuickMedia { const float chat_input_padding_x = 15.0f; const float chat_input_padding_y = 15.0f; - std::regex url_extract_regex("(http(s?):\\/\\/)?([a-zA-Z0-9\\-_]+\\.)+[a-zA-Z]+[^\\s.,]+"); - Body url_selection_body(this, font.get(), bold_font.get(), cjk_font.get()); sf::Clock read_marker_timer; @@ -3476,36 +3474,40 @@ namespace QuickMedia { if(tabs[selected_tab].type == ChatTabType::MESSAGES && event.key.code == sf::Keyboard::Enter) { BodyItem *selected = tabs[selected_tab].body->get_selected(); if(selected) { - if(!selected->url.empty()) { - const char *content_type = link_get_content_type(selected->url); + std::string selected_url = selected->url; + if(selected_url.empty() && selected->embedded_item) + selected_url = selected->embedded_item->url; + if(!selected_url.empty()) { + const char *content_type = link_get_content_type(selected_url); if(content_type && (strcmp(content_type, "audio") == 0 || strcmp(content_type, "video") == 0 || strcmp(content_type, "image") == 0)) { page_stack.push(PageType::CHAT); watched_videos.clear(); current_page = PageType::VIDEO_CONTENT; + bool is_audio = strcmp(content_type, "audio") == 0; + bool prev_no_video = no_video; + no_video = is_audio; // TODO: Add title - video_content_page(video_page.get(), selected->url, "No title"); + video_content_page(video_page.get(), selected_url, "No title"); + 
no_video = prev_no_video; redraw = true; continue; } - launch_url(selected->url.c_str()); + launch_url(selected_url); continue; } // TODO: If content type is a file, show file-manager prompt where it should be saved and asynchronously save it instead - // TODO: Change this when messages are not stored in the description const std::string &message_str = selected->get_description(); - auto urls_begin = std::sregex_iterator(message_str.begin(), message_str.end(), url_extract_regex); - auto urls_end = std::sregex_iterator(); - size_t num_urls = std::distance(urls_begin, urls_end); - if(num_urls == 1) { - launch_url(urls_begin->str()); - } else if(num_urls > 1) { + std::vector<std::string> urls = extract_urls(message_str); + if(urls.size() == 1) { + launch_url(urls[0]); + } else if(urls.size() > 1) { chat_state = ChatState::URL_SELECTION; url_selection_body.clear_items(); - for(auto it = urls_begin; it != urls_end; ++it) { - auto body_item = BodyItem::create(it->str()); + for(const std::string &url : urls) { + auto body_item = BodyItem::create(url); url_selection_body.items.push_back(std::move(body_item)); } } diff --git a/src/plugins/Fourchan.cpp b/src/plugins/Fourchan.cpp index 1d3681a..2938319 100644 --- a/src/plugins/Fourchan.cpp +++ b/src/plugins/Fourchan.cpp @@ -2,6 +2,7 @@ #include "../../include/DataView.hpp" #include "../../include/Storage.hpp" #include "../../include/StringUtils.hpp" +#include "../../include/NetUtils.hpp" #include #include #include diff --git a/src/plugins/Mangadex.cpp b/src/plugins/Mangadex.cpp index f0e163e..0ccecdd 100644 --- a/src/plugins/Mangadex.cpp +++ b/src/plugins/Mangadex.cpp @@ -2,6 +2,7 @@ #include "../../include/Storage.hpp" #include "../../include/Notification.hpp" #include "../../include/StringUtils.hpp" +#include "../../include/NetUtils.hpp" #include #include #include diff --git a/src/plugins/Manganelo.cpp b/src/plugins/Manganelo.cpp index e96bc65..b260dea 100644 --- a/src/plugins/Manganelo.cpp +++ b/src/plugins/Manganelo.cpp @@ -1,6 +1,7
@@ #include "../../plugins/Manganelo.hpp" #include "../../include/Notification.hpp" #include "../../include/StringUtils.hpp" +#include "../../include/NetUtils.hpp" #include namespace QuickMedia { diff --git a/src/plugins/Mangatown.cpp b/src/plugins/Mangatown.cpp index 5d6f97f..992e0cc 100644 --- a/src/plugins/Mangatown.cpp +++ b/src/plugins/Mangatown.cpp @@ -1,6 +1,7 @@ #include "../../plugins/Mangatown.hpp" #include "../../include/Notification.hpp" #include "../../include/StringUtils.hpp" +#include "../../include/NetUtils.hpp" #include static const std::string mangatown_url = "https://www.mangatown.com"; diff --git a/src/plugins/Matrix.cpp b/src/plugins/Matrix.cpp index 5cf4611..d81c53c 100644 --- a/src/plugins/Matrix.cpp +++ b/src/plugins/Matrix.cpp @@ -1,6 +1,7 @@ #include "../../plugins/Matrix.hpp" #include "../../include/Storage.hpp" #include "../../include/StringUtils.hpp" +#include "../../include/NetUtils.hpp" #include #include #include diff --git a/src/plugins/NyaaSi.cpp b/src/plugins/NyaaSi.cpp index dc6e19f..860eb71 100644 --- a/src/plugins/NyaaSi.cpp +++ b/src/plugins/NyaaSi.cpp @@ -3,6 +3,7 @@ #include "../../include/Storage.hpp" #include "../../include/Notification.hpp" #include "../../include/StringUtils.hpp" +#include "../../include/NetUtils.hpp" #include namespace QuickMedia { diff --git a/src/plugins/Plugin.cpp b/src/plugins/Plugin.cpp index 3f76b4c..0b554ae 100644 --- a/src/plugins/Plugin.cpp +++ b/src/plugins/Plugin.cpp @@ -1,67 +1,6 @@ #include "../../plugins/Plugin.hpp" -#include "../../include/StringUtils.hpp" -#include <array> -#include <sstream> -#include <iomanip> namespace QuickMedia { - struct HtmlEscapeSequence { - char unescape_char; - std::string escape_sequence; - }; - - void html_escape_sequences(std::string &str) { - const std::array<HtmlEscapeSequence, 6> escape_sequences = { - HtmlEscapeSequence { '&', "&amp;" }, // This should be first, to not accidentally replace a new sequence caused by replacing this - HtmlEscapeSequence { '"', "&quot;" }, - HtmlEscapeSequence { '\'', "&#39;" }, -
HtmlEscapeSequence { '<', "&lt;" }, - HtmlEscapeSequence { '>', "&gt;" }, - HtmlEscapeSequence { '\n', "&#10;" }
- }; - - for(const HtmlEscapeSequence &escape_sequence : escape_sequences) { - string_replace_all(str, escape_sequence.unescape_char, escape_sequence.escape_sequence); - } - } - - struct HtmlUnescapeSequence { - std::string escape_sequence; - std::string unescaped_str; - }; - - void html_unescape_sequences(std::string &str) { - const std::array<HtmlUnescapeSequence, 6> unescape_sequences = { - HtmlUnescapeSequence { "&quot;", "\"" }, - HtmlUnescapeSequence { "&#039;", "'" }, - HtmlUnescapeSequence { "&#39;", "'" }, - HtmlUnescapeSequence { "&lt;", "<" }, - HtmlUnescapeSequence { "&gt;", ">" }, - HtmlUnescapeSequence { "&amp;", "&" } // This should be last, to not accidentally replace a new sequence caused by replacing this - }; - - for(const HtmlUnescapeSequence &unescape_sequence : unescape_sequences) { - string_replace_all(str, unescape_sequence.escape_sequence, unescape_sequence.unescaped_str); - } - } - - std::string url_param_encode(const std::string &param) { - std::ostringstream result; - result.fill('0'); - result << std::hex; - - for(char c : param) { - if(isalnum(c) || c == '-' || c == '_' || c == '.'
|| c == '~') { - result << c; - } else { - result << std::uppercase; - result << "%" << std::setw(2) << (int)(unsigned char)(c); - } - } - - return result.str(); - } - SuggestionResult download_result_to_suggestion_result(DownloadResult download_result) { return (SuggestionResult)download_result; } PluginResult download_result_to_plugin_result(DownloadResult download_result) { return (PluginResult)download_result; } SearchResult download_result_to_search_result(DownloadResult download_result) { return (SearchResult)download_result; } diff --git a/src/plugins/Pornhub.cpp b/src/plugins/Pornhub.cpp index afdd8fc..e8df9d7 100644 --- a/src/plugins/Pornhub.cpp +++ b/src/plugins/Pornhub.cpp @@ -1,5 +1,6 @@ #include "../../plugins/Pornhub.hpp" #include "../../include/StringUtils.hpp" +#include "../../include/NetUtils.hpp" #include #include diff --git a/src/plugins/Youtube.cpp b/src/plugins/Youtube.cpp index 1711e41..424a8d2 100644 --- a/src/plugins/Youtube.cpp +++ b/src/plugins/Youtube.cpp @@ -1,5 +1,6 @@ #include "../../plugins/Youtube.hpp" #include "../../include/Storage.hpp" +#include "../../include/NetUtils.hpp" #include #include diff --git a/tests/main.cpp b/tests/main.cpp new file mode 100644 index 0000000..38dd534 --- /dev/null +++ b/tests/main.cpp @@ -0,0 +1,38 @@ +#include +#include "../include/NetUtils.hpp" + +#define assert_fail(str) do { fprintf(stderr, "Assert failed on line %d, reason: %s\n", __LINE__, (str)); exit(1); } while(0) +#define assert_equals(a, b) do { if((a) != (b)) { fprintf(stderr, "Assert failed on line %d, %s == %s\n", __LINE__, #a, #b); exit(1); } } while(0) + +int main() { + std::vector<std::string> urls; + + urls = QuickMedia::extract_urls("example.com"); + assert_equals(urls.size(), 1); + assert_equals(urls[0], "example.com"); + + urls = QuickMedia::extract_urls("example.com, is where I like to go"); + assert_equals(urls.size(), 1); + assert_equals(urls[0], "example.com"); + + urls = QuickMedia::extract_urls("The website I like to go to is
example.com"); + assert_equals(urls.size(), 1); + assert_equals(urls[0], "example.com"); + + urls = QuickMedia::extract_urls("example.com. Is also a website"); + assert_equals(urls.size(), 1); + assert_equals(urls[0], "example.com"); + + urls = QuickMedia::extract_urls("these. are. not. websites."); + assert_equals(urls.size(), 0); + + urls = QuickMedia::extract_urls("This is not an url: example."); + assert_equals(urls.size(), 0); + + urls = QuickMedia::extract_urls("the.se/~#423-_/2f.no/3df a.re considered sub.websit.es"); + assert_equals(urls.size(), 3); + assert_equals(urls[0], "the.se/~#423-_/2f.no/3df"); + assert_equals(urls[1], "a.re"); + assert_equals(urls[2], "sub.websit.es"); + return 0; +} -- cgit v1.2.3