src/NetUtils.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150

#include "../include/NetUtils.hpp"
#include "../include/StringUtils.hpp"
#include <array>
#include <sstream>
#include <iomanip>

namespace QuickMedia {
    // One entry of the escape table: a raw character and the text that replaces it.
    struct HtmlEscapeSequence {
        char unescape_char;          // Character looked for in the input
        std::string escape_sequence; // Text written in its place
    };

    // Escapes |str| in place by calling string_replace_all once per table entry, in table order.
    // Line feeds are turned into "<br>".
    void html_escape_sequences(std::string &str) {
        // Order matters:
        // '&' goes first, otherwise the '&' introduced by the other replacements would be replaced again.
        // '\n' goes after '<' and '>', so the "<br>" it produces is left untouched.
        const std::array<HtmlEscapeSequence, 6> replacements = {{
            { '&', "&amp;" },
            { '"', "&quot;" },
            { '\'', "&#39;" },
            { '<', "&lt;" },
            { '>', "&gt;" },
            { '\n', "<br>" }
        }};

        for(size_t i = 0; i < replacements.size(); ++i) {
            const HtmlEscapeSequence &replacement = replacements[i];
            string_replace_all(str, replacement.unescape_char, replacement.escape_sequence);
        }
    }

    // One entry of the unescape table: an html escape sequence and the text it stands for.
    struct HtmlUnescapeSequence {
        std::string escape_sequence; // Text looked for in the input
        std::string unescaped_str;   // Text written in its place
    };

    // Unescapes |str| in place by calling string_replace_all once per table entry, in table order.
    // Both "&#039;" and "&#39;" are accepted as spellings of the apostrophe.
    void html_unescape_sequences(std::string &str) {
        // "&amp;" is kept last: replacing it earlier could create a new escape sequence
        // (for example "&amp;lt;" -> "&lt;") that a later entry would then replace by mistake.
        const std::array<HtmlUnescapeSequence, 6> replacements = {{
            { "&quot;", "\"" },
            { "&#039;", "'" },
            { "&#39;", "'" },
            { "&lt;", "<" },
            { "&gt;", ">" },
            { "&amp;", "&" }
        }};

        for(size_t i = 0; i < replacements.size(); ++i) {
            const HtmlUnescapeSequence &replacement = replacements[i];
            string_replace_all(str, replacement.escape_sequence, replacement.unescaped_str);
        }
    }

    // Returns true only for the ASCII letters a-z and A-Z.
    static bool is_alpha(char c) {
        const bool lowercase = 'a' <= c && c <= 'z';
        const bool uppercase = 'A' <= c && c <= 'Z';
        return lowercase || uppercase;
    }

    // Returns true only for the ASCII digits 0-9.
    static bool is_digit(char c) {
        if(c < '0')
            return false;
        return c <= '9';
    }

    // Returns a percent-encoded copy of |param|.
    // ASCII letters, digits and '-', '_', '.', '~' are copied unchanged; every other byte
    // is written as '%' followed by its value as two uppercase hexadecimal digits.
    std::string url_param_encode(const std::string &param) {
        static const char hex_digits[] = "0123456789ABCDEF";
        std::string encoded;

        for(const char c : param) {
            const bool keep_as_is = is_alpha(c) || is_digit(c) || c == '-' || c == '_' || c == '.' || c == '~';
            if(keep_as_is) {
                encoded += c;
                continue;
            }

            // Go through unsigned char so bytes >= 0x80 are not sign-extended
            const unsigned char byte = static_cast<unsigned char>(c);
            encoded += '%';
            encoded += hex_digits[byte >> 4];
            encoded += hex_digits[byte & 0x0F];
        }

        return encoded;
    }

    // Returns true if |c| may be the first character of an url candidate: an ASCII letter or digit.
    static bool is_url_start_char(char c) {
        if(is_alpha(c))
            return true;
        return is_digit(c);
    }

    // Returns true if |c| may appear inside an url candidate:
    // '%', the reserved and unreserved punctuation listed below, or an ASCII letter or digit.
    static bool is_url_character(char c) {
        // Deliberately a plain char array without a terminating NUL, so '\0' never matches
        static const char url_symbols[] = {
            '%',
            // Reserved
            ':', '/', '?', '#', '[', ']', '@',
            '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=',
            // Unreserved
            '-', '.', '_', '~'
        };

        for(const char symbol : url_symbols) {
            if(c == symbol)
                return true;
        }

        return is_alpha(c) || is_digit(c);
    }

    // Finds url-like substrings in |str| and appends them to |urls| in order of appearance; existing elements of |urls| are kept.
    // Implementation follows URI standard in general: https://tools.ietf.org/html/rfc3986#section-2.2.
    // A candidate begins at an ASCII letter or digit and runs until the first character that is not allowed in an url.
    // It is only accepted when it contains a dot and its first dot is not its final character.
    // Also checks for balanced parentheses to allow text such as: (see: example.com/) that excludes the last parenthesis.
    void extract_urls(const std::string &str, std::vector<std::string> &urls) {
        const size_t npos = std::string::npos;
        int parentheses_depth = 0;  // '(' minus ')' seen inside the current candidate
        size_t url_start = npos;    // Index of the first character of the current candidate, npos when outside a candidate
        size_t url_dot_index = npos; // Index of the first '.' inside the current candidate

        // Iterating up to and including str.size() is intentional: str[str.size()] is the NULL character,
        // which is not an url character and therefore terminates an url located at the very end of the string.
        for(size_t i = 0; i <= str.size(); ++i) {
            const char c = str[i];

            if(url_start == npos) {
                if(is_url_start_char(c))
                    url_start = i;
                continue;
            }

            if(c == '.') {
                if(url_dot_index == npos)
                    url_dot_index = i;
            } else if(c == '(') {
                ++parentheses_depth;
            } else if(c == ')') {
                --parentheses_depth;
            }

            if(is_url_character(c))
                continue;

            // |c| ends the current candidate.
            // Its only an url if there is a dot and the dot is not the last character in the url, for example "example.com" is an url but "example." is not.
            if(url_dot_index != npos && url_dot_index != i - 1) {
                size_t url_length = i - url_start;
                const char prev_char = str[i - 1];
                // Drop a trailing '.' or ',' since the text could be for example "click on this link: example.com. There you can..."
                // and the link should still work.
                if(prev_char == '.' || prev_char == ',')
                    --url_length;
                // Drop a trailing ')' that has no matching '(' inside the url, it belongs to the surrounding text
                if(prev_char == ')' && parentheses_depth != 0)
                    --url_length;
                if(url_length > 0)
                    urls.push_back(str.substr(url_start, url_length));
            }

            // Reset all per-candidate state. The parenthesis depth has to be reset as well, otherwise an unbalanced
            // parenthesis in one word/url would change how the trailing ')' of every following url is treated.
            url_start = npos;
            url_dot_index = npos;
            parentheses_depth = 0;
        }
    }
}