#include "../include/NetUtils.hpp"
#include "../include/StringUtils.hpp"
#include <array>
#include <iomanip>
#include <sstream>

namespace QuickMedia {

// Pairs a literal character with the HTML sequence that replaces it when escaping.
struct HtmlEscapeSequence {
    char unescape_char;
    std::string escape_sequence;
};

// Escapes HTML-significant characters in |str| in place by replacing each one
// with its character reference.
//
// NOTE(review): the entity literals below were reconstructed from an
// entity-decoded copy of this file (where e.g. "&amp;" appeared as "&").
// Confirm the exact spellings, in particular the newline entity, against the
// upstream source.
void html_escape_sequences(std::string &str) {
    const std::array<HtmlEscapeSequence, 6> escape_sequences = {
        HtmlEscapeSequence { '&', "&amp;" }, // This should be first, to not accidentally replace a new sequence caused by replacing this
        HtmlEscapeSequence { '"', "&quot;" },
        HtmlEscapeSequence { '\'', "&#39;" },
        HtmlEscapeSequence { '<', "&lt;" },
        HtmlEscapeSequence { '>', "&gt;" },
        HtmlEscapeSequence { '\n', "&#10;" }
    };

    for(const HtmlEscapeSequence &escape_sequence : escape_sequences) {
        string_replace_all(str, escape_sequence.unescape_char, escape_sequence.escape_sequence);
    }
}

// Pairs an HTML character reference with the text it decodes to.
struct HtmlUnescapeSequence {
    std::string escape_sequence;
    std::string unescaped_str;
};

// Replaces the HTML character references listed below with their literal
// characters, in place. Any other reference is left untouched.
//
// NOTE(review): the entity literals below were reconstructed from an
// entity-decoded copy of this file; the two apostrophe spellings are assumed
// to be "&#039;" and "&#39;". Confirm against the upstream source.
void html_unescape_sequences(std::string &str) {
    const std::array<HtmlUnescapeSequence, 6> unescape_sequences = {
        HtmlUnescapeSequence { "&quot;", "\"" },
        HtmlUnescapeSequence { "&#039;", "'" },
        HtmlUnescapeSequence { "&#39;", "'" },
        HtmlUnescapeSequence { "&lt;", "<" },
        HtmlUnescapeSequence { "&gt;", ">" },
        HtmlUnescapeSequence { "&amp;", "&" } // This should be last, to not accidentally replace a new sequence caused by replacing this
    };

    for(const HtmlUnescapeSequence &unescape_sequence : unescape_sequences) {
        string_replace_all(str, unescape_sequence.escape_sequence, unescape_sequence.unescaped_str);
    }
}

// ASCII-only letter test. Unlike isalpha() it does not depend on the locale
// and is well defined for every char value, including negative ones.
static bool is_alpha(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

// ASCII-only decimal digit test, see is_alpha().
static bool is_digit(char c) {
    return c >= '0' && c <= '9';
}

// Percent-encodes |param| for use as a URL parameter value.
// ASCII letters, digits and '-', '_', '.', '~' are copied unchanged; every
// other byte is written as '%' followed by two uppercase hex digits.
std::string url_param_encode(const std::string &param) {
    std::ostringstream result;
    result.fill('0');
    // Both manipulators are sticky and only affect integer output, so they can
    // be set once up front.
    result << std::hex << std::uppercase;

    for(char c : param) {
        // Use the ASCII-only helpers instead of isalnum(): passing a negative
        // char (any non-ASCII byte) to isalnum() is undefined behaviour, and
        // its result depends on the current locale.
        if(is_alpha(c) || is_digit(c) || c == '-' || c == '_' || c == '.' || c == '~') {
            result << c;
        } else {
            // Go through unsigned char so bytes >= 0x80 are not sign-extended.
            result << "%" << std::setw(2) << static_cast<int>(static_cast<unsigned char>(c));
        }
    }

    return result.str();
}

// Returns true if |c| may appear in a URL: '%', one of the reserved or
// unreserved punctuation characters listed below, or an ASCII letter or digit.
static bool is_url_character(char c) {
    switch(c) {
        case '%':
        // Reserved
        case ':':
        case '/':
        case '?':
        case '#':
        case '[':
        case ']':
        case '@':
        case '!':
        case '$':
        case '&':
        case '\'':
        case '(':
        case ')':
        case '*':
        case '+':
        case ',':
        case ';':
        case '=':
        // Unreserved:
        case '-':
        case '.':
        case '_':
        case '~':
            return true;
        default:
            return is_alpha(c) || is_digit(c);
    }
}

// Implementation follows URI standard: https://tools.ietf.org/html/rfc3986#section-2.2
// TODO: Maybe check if the TLD only contains valid characters (is_alpha)?
std::vector extract_urls(const std::string &str) { std::vector urls; size_t url_start = std::string::npos; size_t url_dot_index = std::string::npos; // str.size() is fine, we want to include the NULL character so we can extract url at the end of the string for(size_t i = 0; i < (size_t)str.size() + 1; ++i) { char c = str[i]; if(c == '.' && url_start != std::string::npos && url_dot_index == std::string::npos) url_dot_index = i; if(is_url_character(c)) { if(url_start == std::string::npos) url_start = i; } else { if(url_start != std::string::npos) { // Its only an url if there is a dot and the dot is not the last character in the url, for example "example.com" is an url but "example." is not. if(url_dot_index != std::string::npos && url_dot_index != i - 1) { size_t url_length = i - url_start; char prev_char = str[i - 1]; // We want to remove the last . or , because the string could contain for example "click on this like: example.com. There you can..." // and we want those links to work, I guess? if(prev_char == '.' || prev_char == ',') --url_length; urls.push_back(str.substr(url_start, url_length)); } url_start = std::string::npos; url_dot_index = std::string::npos; } } } return urls; } }