#include "../include/NetUtils.hpp"
#include "../include/StringUtils.hpp"
#include "../generated/Tlds.hpp"
// NOTE(review): the targets of the standard includes were lost in this copy of the file;
// the list below was rebuilt from the names the code uses - confirm against the original.
#include <array>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>
#include <string.h>

// TODO: Add all html sequences: https://html.spec.whatwg.org/multipage/named-characters.html

namespace QuickMedia {

// One entry of the escape table: a raw character and the HTML sequence that replaces it.
struct HtmlEscapeSequence {
    char unescape_char;
    std::string escape_sequence;
};

// Replaces the characters & " ' < > and newline in |str| with HTML escape sequences, in place.
// NOTE(review): the entity literals were entity-decoded in this copy of the file (for example
// the sequence for '"' appeared as an empty-looking literal); the values below are the
// standard sequences - confirm against the original.
void html_escape_sequences(std::string &str) {
    static const std::array<HtmlEscapeSequence, 6> escape_sequences = {
        HtmlEscapeSequence { '&', "&amp;" }, // This should be first, to not accidentally replace a new sequence caused by replacing this
        HtmlEscapeSequence { '"', "&quot;" },
        HtmlEscapeSequence { '\'', "&#39;" },
        HtmlEscapeSequence { '<', "&lt;" },
        HtmlEscapeSequence { '>', "&gt;" },
        HtmlEscapeSequence { '\n', "&#10;" }
    };

    for(const HtmlEscapeSequence &escape_sequence : escape_sequences) {
        string_replace_all(str, escape_sequence.unescape_char, escape_sequence.escape_sequence);
    }
}

// One entry of the unescape table: a named HTML sequence and the text it decodes to.
struct HtmlUnescapeSequence {
    std::string escape_sequence;
    std::string unescaped_str;
};

static bool is_alpha(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

static bool is_digit(char c) {
    return c >= '0' && c <= '9';
}

// Returns -1 if its not a hex value
static int get_hex_value(char c) {
    if(c >= '0' && c <= '9')
        return c - '0';
    else if(c >= 'A' && c <= 'F')
        return 10 + (c - 'A');
    else if(c >= 'a' && c <= 'f')
        return 10 + (c - 'a');
    else
        return -1;
}

static bool is_whitespace(char c) {
    return c == ' ' || c == '\t' || c == '\n';
}

// Appends the UTF-8 encoding of |codepoint| to |out|.
// The caller must pass a value in the range 1..0x10FFFF that is not a surrogate.
static void html_append_utf8(std::string &out, unsigned int codepoint) {
    if(codepoint < 0x80) {
        out += (char)codepoint;
    } else if(codepoint < 0x800) {
        out += (char)(0xC0 | (codepoint >> 6));
        out += (char)(0x80 | (codepoint & 0x3F));
    } else if(codepoint < 0x10000) {
        out += (char)(0xE0 | (codepoint >> 12));
        out += (char)(0x80 | ((codepoint >> 6) & 0x3F));
        out += (char)(0x80 | (codepoint & 0x3F));
    } else {
        out += (char)(0xF0 | (codepoint >> 18));
        out += (char)(0x80 | ((codepoint >> 12) & 0x3F));
        out += (char)(0x80 | ((codepoint >> 6) & 0x3F));
        out += (char)(0x80 | (codepoint & 0x3F));
    }
}

// Decodes the numeric character reference ("&#123;", "&#x7B;" or "&#X7B;") whose '&' is at
// |start| in |str|. On success the decoded character is appended to |out| as UTF-8 and the
// index just past the terminating ';' is returned. Returns std::string::npos, leaving |out|
// untouched, when the text is not a well-formed reference or does not name a valid code point.
static size_t html_decode_numeric_reference(const std::string &str, size_t start, std::string &out) {
    // 7 digits cover the largest code point in decimal (1114111) and cannot overflow
    // an unsigned int in either base.
    const size_t max_digits = 7;

    size_t i = start + 2; // skip "&#"
    unsigned int base = 10;
    if(i < str.size() && (str[i] == 'x' || str[i] == 'X')) {
        base = 16;
        ++i;
    }

    const size_t digits_start = i;
    unsigned int codepoint = 0;
    while(i < str.size() && str[i] != ';') {
        const int digit = get_hex_value(str[i]);
        if(digit == -1 || (unsigned int)digit >= base || i - digits_start >= max_digits)
            return std::string::npos;
        codepoint = codepoint * base + (unsigned int)digit;
        ++i;
    }

    // Missing ';' or no digits at all ("&#;", "&#x;")
    if(i >= str.size() || i == digits_start)
        return std::string::npos;

    // NUL, surrogates and values past the end of unicode have no valid UTF-8 encoding
    if(codepoint == 0 || codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF))
        return std::string::npos;

    html_append_utf8(out, codepoint);
    return i + 1;
}

// Decodes HTML escape sequences in |str| in place: numeric character references and the
// named sequences listed in the table below. Anything that is not recognized is kept as is.
// The text is decoded in a single left-to-right pass so the output of one replacement is
// never decoded again (for example "&#38;lt;" becomes "&lt;", not "<").
// NOTE(review): the entity literals were entity-decoded in this copy of the file; the names
// below (including which spelling of the apostrophe sequence was used, and whether nbsp maps
// to a regular space) are reconstructed - confirm against the original.
void html_unescape_sequences(std::string &str) {
    static const std::array<HtmlUnescapeSequence, 7> unescape_sequences = {
        HtmlUnescapeSequence { "&quot;", "\"" },
        HtmlUnescapeSequence { "&apos;", "'" },
        HtmlUnescapeSequence { "&lt;", "<" },
        HtmlUnescapeSequence { "&gt;", ">" },
        HtmlUnescapeSequence { "&mdash;", "—" },
        HtmlUnescapeSequence { "&nbsp;", " " },
        HtmlUnescapeSequence { "&amp;", "&" }
    };

    size_t index = str.find('&');
    if(index == std::string::npos)
        return; // Nothing to decode, avoid copying the string

    std::string result;
    result.reserve(str.size());
    result.append(str, 0, index);

    while(index < str.size()) {
        // Copy everything up to the next '&' in one go
        const size_t amp_index = str.find('&', index);
        if(amp_index == std::string::npos) {
            result.append(str, index, std::string::npos);
            break;
        }
        result.append(str, index, amp_index - index);
        index = amp_index;

        bool decoded = false;
        if(index + 1 < str.size() && str[index + 1] == '#') {
            const size_t next_index = html_decode_numeric_reference(str, index, result);
            if(next_index != std::string::npos) {
                index = next_index;
                decoded = true;
            }
        } else {
            for(const HtmlUnescapeSequence &unescape_sequence : unescape_sequences) {
                const std::string &name = unescape_sequence.escape_sequence;
                if(str.compare(index, name.size(), name) == 0) {
                    result += unescape_sequence.unescaped_str;
                    index += name.size();
                    decoded = true;
                    break;
                }
            }
        }

        if(!decoded) {
            // Not a sequence we know about, keep the '&' literally
            result += '&';
            ++index;
        }
    }

    str.swap(result);
}

// Percent-encodes |param| for use as an url parameter. Ascii letters, digits and - _ . ~ are
// copied as they are, every other byte is written as %XX with uppercase hex digits.
std::string url_param_encode(const std::string &param) {
    std::ostringstream result;
    result.fill('0');
    result << std::hex << std::uppercase;

    for(char c : param) {
        if(is_alpha(c) || is_digit(c) || c == '-' || c == '_' || c == '.' || c == '~') {
            result << c;
        } else {
            // Go through unsigned char so bytes >= 0x80 are not sign extended
            result << "%" << std::setw(2) << (int)(unsigned char)(c);
        }
    }

    return result.str();
}

// Decodes a percent-encoded url parameter.
// "%XX" becomes the byte with hex value XX, "%%" becomes a single '%' and '+' becomes a space.
// A '%' that is not followed by two hex digits is dropped and the characters after it are
// copied as they are.
std::string url_param_decode(const std::string &param) {
    std::string result;
    const size_t size = param.size();
    for(size_t i = 0; i < size;) {
        const char c = param[i];
        if(c == '%') {
            if(i + 1 < size && param[i + 1] == '%') {
                result += c;
                i += 2;
            } else if(i + 2 < size) {
                const int first_c = get_hex_value(param[i + 1]);
                const int second_c = get_hex_value(param[i + 2]);
                if(first_c != -1 && second_c != -1) {
                    result += (char)((first_c << 4) | second_c);
                    i += 3;
                } else {
                    i += 1;
                }
            } else {
                i += 1;
            }
        } else if(c == '+') {
            result += ' ';
            i += 1;
        } else {
            result += c;
            i += 1;
        }
    }
    return result;
}

// Characters that may begin an url candidate
static bool is_url_start_char(char c) {
    return is_alpha(c) || is_digit(c) || c == '-' || c == '.' || c == '_' || c == '~';
}

// Characters that may appear in the domain part of an url candidate
static bool is_url_domain_char(char c) {
    return is_url_start_char(c);
}

// Implementation follows URI standard in general: https://tools.ietf.org/html/rfc3986#section-2.2.
// Also checks for balanced parentheses to allow text such as: (see: example.com/) that excludes the last parenthesis.
std::vector extract_urls(const std::string &str) { std::vector ranges; if(str.empty()) return ranges; int parentheses_depth = 0; bool is_valid_url = false; bool is_domain_part = true; bool contains_dot = false; size_t url_start = std::string::npos; // str.size() is fine, we want to include the NULL character so we can extract url at the end of the string for(size_t i = 0; i < (size_t)str.size() + 1;) { char c = str[i]; if(url_start != std::string::npos) { if(c == '(') ++parentheses_depth; else if(c == ')') --parentheses_depth; } if(url_start != std::string::npos && c == '.') { contains_dot = true; } if(url_start != std::string::npos && !is_valid_url && contains_dot && (is_whitespace(c) || c == '/' || c == ',' || c == ':' || c == '?' || c == ')' || c == '\0' || (c == '.' && i == str.size()))) { size_t tld_end = i - 1; char prev_char = str[i - 1]; // We want to remove the last . or , because the string could contain for example "click on this link: example.com. There you can..." // and we want those links to work, I guess? if(prev_char == '.' 
|| prev_char == ',' || prev_char == ':' || prev_char == '?') --tld_end; else if(prev_char == ')' && parentheses_depth != 0) --tld_end; size_t tld_start = tld_end; while(tld_start > url_start) { if(str[tld_start] == '.') break; --tld_start; } if(tld_start > url_start && is_tld(str.substr(tld_start + 1, tld_end - tld_start))) { is_valid_url = true; is_domain_part = false; } } if(url_start != std::string::npos && is_domain_part && c == ':') { if(i + 2 < (size_t)str.size() + 1 && str[i + 1] == '/' && str[i + 2] == '/') { i += 3; continue; } else if(i + 1 < (size_t)str.size() + 1 && is_whitespace(str[i + 1])) { i += 1; } else { url_start = std::string::npos; is_valid_url = false; is_domain_part = true; contains_dot = false; } } if(url_start != std::string::npos && is_domain_part && !is_url_domain_char(c)) { url_start = std::string::npos; is_valid_url = false; contains_dot = false; } if(url_start == std::string::npos && is_url_start_char(c)) { url_start = i; } else if(url_start != std::string::npos && (is_whitespace(c) || c == '\0')) { // Its only an url if there is a dot and the dot is not the last character in the url, for example "example.com" is an url but "example." is not. if(is_valid_url) { size_t url_length = i - url_start; char prev_char = str[i - 1]; // We want to remove the last . or , because the string could contain for example "click on this link: example.com. There you can..." // and we want those links to work, I guess? if(prev_char == '.' 
|| prev_char == ',' || prev_char == ':' || prev_char == '?') --url_length; else if(prev_char == ')' && parentheses_depth != 0) --url_length; else if(prev_char < 32) --url_length; if(url_length > 0) ranges.push_back({url_start, url_length}); } url_start = std::string::npos; is_valid_url = false; is_domain_part = true; contains_dot = false; } ++i; } return ranges; } std::vector ranges_get_strings(const std::string &str, const std::vector &ranges) { std::vector strings(ranges.size()); for(size_t i = 0; i < ranges.size(); ++i) { const Range &range = ranges[i]; strings[i].assign(str.begin() + range.start, str.begin() + range.start + range.length); } return strings; } std::string header_extract_value(const std::string &header, const std::string &type) { std::string result; string_split(header, '\n', [&type, &result](const char *str, size_t size) { while(size > 0 && (*str == ' ' || *str == '\t')) { ++str; --size; } if(size < type.size() || !strncase_equals(str, type.c_str(), type.size()) || size == type.size()) return true; str += type.size(); size -= type.size(); const void *colon_ptr = memchr(str, ':', size); if(!colon_ptr) return true; const size_t colon_offset = (const char*)colon_ptr - str; str += (colon_offset + 1); size -= (colon_offset + 1); // lstrip space while(size > 0 && (*str == ' ' || *str == '\t')) { ++str; --size; } // rstrip whitespace while(size > 0 && (str[size - 1] == ' ' || str[size - 1] == '\t' || str[size - 1] == '\r' || str[size - 1] == '\n')) { --size; } result.assign(str, size); return false; }); return result; } }