aboutsummaryrefslogtreecommitdiff
path: root/src/NetUtils.cpp
blob: e87c42c3897f7c359b8662ed3188129ca7b4d378 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#include "../include/NetUtils.hpp"
#include "../include/StringUtils.hpp"
#include <array>
#include <sstream>
#include <iomanip>

namespace QuickMedia {
    // One entry of the escape table: a raw character and the text that takes its place.
    struct HtmlEscapeSequence {
        char unescape_char;          // Character as it appears in the unescaped input
        std::string escape_sequence; // Text substituted for that character
    };

    // Rewrites |str| in place, handing every (character, replacement) pair of the table
    // below to string_replace_all, in table order.
    // Besides the usual HTML entities, a newline is turned into "<br>".
    void html_escape_sequences(std::string &str) {
        // '&' has to stay at the top of the table: every later replacement introduces a new '&'
        // (or, for "<br>", new '<' and '>' which are already handled by then... so '\n' stays at the bottom),
        // and those must not be escaped a second time.
        const std::array<HtmlEscapeSequence, 6> replacements = {{
            { '&', "&amp;" },
            { '"', "&quot;" },
            { '\'', "&#39;" },
            { '<', "&lt;" },
            { '>', "&gt;" },
            { '\n', "<br>" }
        }};

        for(auto it = replacements.begin(); it != replacements.end(); ++it) {
            string_replace_all(str, it->unescape_char, it->escape_sequence);
        }
    }

    // One entry of the unescape table: an HTML entity and the plain text it stands for.
    struct HtmlUnescapeSequence {
        std::string escape_sequence; // Entity as it appears in the escaped input, e.g. "&lt;"
        std::string unescaped_str;   // Plain text substituted for the entity
    };

    // Rewrites |str| in place, handing every (entity, plain text) pair of the table
    // below to string_replace_all, in table order.
    // Both spellings of the apostrophe entity ("&#039;" and "&#39;") are recognised.
    void html_unescape_sequences(std::string &str) {
        // "&amp;" has to stay at the bottom of the table: turning it into '&' earlier could
        // form a new entity (for example "&amp;lt;" -> "&lt;") that a later entry would then
        // wrongly unescape a second time.
        const std::array<HtmlUnescapeSequence, 6> replacements = {{
            { "&quot;", "\"" },
            { "&#039;", "'" },
            { "&#39;", "'" },
            { "&lt;", "<" },
            { "&gt;", ">" },
            { "&amp;", "&" }
        }};

        for(auto it = replacements.begin(); it != replacements.end(); ++it) {
            string_replace_all(str, it->escape_sequence, it->unescaped_str);
        }
    }

    // Percent-encodes |param| so it can be used as a URL query parameter value.
    //
    // ASCII letters, ASCII digits and the RFC 3986 "unreserved" punctuation ('-', '_', '.', '~')
    // are copied through unchanged. Every other byte is written as "%XX", where XX is the byte
    // value as two uppercase hexadecimal digits.
    //
    // |param| may be empty and may contain arbitrary bytes (including non-ASCII / UTF-8 bytes).
    // Returns the encoded string.
    std::string url_param_encode(const std::string &param) {
        static const char hex_digits[] = "0123456789ABCDEF";

        std::string result;
        // Lower bound on the output size; escaped bytes grow the string further.
        result.reserve(param.size());

        for(const char c : param) {
            // Classify with explicit ASCII ranges rather than isalnum(): passing a negative char
            // (any byte >= 0x80 where char is signed) to isalnum() is undefined behaviour, and
            // isalnum() depends on the current locale, which could let non-ASCII bytes through unencoded.
            const bool is_unreserved =
                (c >= 'a' && c <= 'z') ||
                (c >= 'A' && c <= 'Z') ||
                (c >= '0' && c <= '9') ||
                c == '-' || c == '_' || c == '.' || c == '~';

            if(is_unreserved) {
                result += c;
            } else {
                // Go through unsigned char so bytes >= 0x80 are not sign-extended.
                const unsigned char byte = static_cast<unsigned char>(c);
                result += '%';
                result += hex_digits[byte >> 4];
                result += hex_digits[byte & 0x0F];
            }
        }

        return result;
    }

    // Returns true if |c| is an ASCII letter. Locale independent.
    static bool is_alpha(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    // Returns true if |c| is an ASCII decimal digit. Locale independent.
    static bool is_digit(char c) {
        return c >= '0' && c <= '9';
    }

    // Returns true if |c| is allowed to be part of an url candidate: '%', the reserved and
    // unreserved punctuation listed below, ASCII letters and ASCII digits.
    // Everything else (whitespace, quotes, '<', '>', the null character, non-ASCII bytes, ...)
    // terminates a candidate.
    static bool is_url_character(char c) {
        switch(c) {
            case '%':
            // Reserved
            case ':':
            case '/':
            case '?':
            case '#':
            case '[':
            case ']':
            case '@':
            case '!':
            case '$':
            case '&':
            case '\'':
            case '(':
            case ')':
            case '*':
            case '+':
            case ',':
            case ';':
            case '=':
            // Unreserved:
            case '-':
            case '.':
            case '_':
            case '~':
                return true;
            default:
                return is_alpha(c) || is_digit(c);
        }
    }

    // Extracts everything in |str| that looks like an url, in order of appearance.
    //
    // The text is split into maximal runs of url characters (see is_url_character, which follows the
    // URI standard: https://tools.ietf.org/html/rfc3986#section-2.2). A run is reported as an url if,
    // after removing trailing '.' and ',' characters, it still contains a '.' that is neither its first
    // nor its last character. For example "example.com" is an url but "example." and "wait..." are not.
    //
    // Returns the urls without the removed trailing punctuation. The result is empty if none are found.
    // TODO: Maybe check if the TLD only contains valid characters (is_alpha)?
    std::vector<std::string> extract_urls(const std::string &str) {
        std::vector<std::string> urls;

        size_t url_start = std::string::npos;
        size_t url_dot_index = std::string::npos; // First '.' after the first character of the current run

        // Iterate one past the last character on purpose: str[str.size()] is the null character, which is
        // not an url character, so a run that reaches the end of the string is still flushed.
        for(size_t i = 0; i <= str.size(); ++i) {
            const char c = str[i];

            if(is_url_character(c)) {
                if(url_start == std::string::npos)
                    url_start = i;
                else if(c == '.' && url_dot_index == std::string::npos)
                    url_dot_index = i;
                continue;
            }

            if(url_start == std::string::npos)
                continue;

            // Drop the whole run of trailing '.' and ',' because the text could contain for example
            // "click on this link: example.com. There you can..." or "example.com..." and the
            // punctuation belongs to the sentence, not to the link.
            size_t url_end = i;
            while(url_end > url_start && (str[url_end - 1] == '.' || str[url_end - 1] == ','))
                --url_end;

            // It's only an url if a dot is left inside the trimmed run. Since the trimmed run can't end
            // with a dot, such a dot is guaranteed to be followed by at least one more character.
            if(url_dot_index != std::string::npos && url_dot_index < url_end)
                urls.push_back(str.substr(url_start, url_end - url_start));

            url_start = std::string::npos;
            url_dot_index = std::string::npos;
        }

        return urls;
    }
}