generate-emoji-sequences.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

#!/usr/bin/env python3

import requests

# Location of the Unicode emoji test data the lookup table is generated from.
EMOJI_TEST_URL = "https://unicode.org/Public/emoji/15.0/emoji-test.txt"

# Seconds to wait for the server before failing; without a timeout
# requests may block indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# Must match the `uint32_t sequence[32]` out-parameter of the generated
# match_emoji_sequence(); a longer sequence would overflow the caller's buffer.
MAX_SEQUENCE_LENGTH = 32

# Full contents of generated/Emoji.hpp (written without a trailing newline).
HEADER_TEXT = """#pragma once

#include <stdint.h>
#include <stddef.h>

// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!

namespace QuickMedia {
    bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length);
}"""

# Start of generated/Emoji.cpp; the single %d is the std::array size, i.e. the
# number of code points that can follow the first one in the longest sequence.
SOURCE_PROLOGUE = """#include "Emoji.hpp"
#include <unordered_map>
#include <array>
#include <mglpp/system/Utf8.hpp>

// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!

namespace QuickMedia {
    static std::unordered_multimap<uint32_t, std::array<uint32_t, %d>> emoji_sequences = {
"""

# End of generated/Emoji.cpp: closes the table and defines the matcher, which
# treats a 0 entry in the array as the end of a sequence.
SOURCE_EPILOGUE = """    };

    bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length) {
        uint32_t codepoint;
        size_t clen;
        if(!mgl::utf8_decode(str, size, &codepoint, &clen))
            return false;

        const size_t str_start_index = clen;
        sequence[0] = codepoint;

        auto range = emoji_sequences.equal_range(codepoint);
        if(range.first == range.second)
            return false;

        auto longest_match_it = range.first;
        size_t longest_match_byte_length = str_start_index;
        bool match_found = false;

        for(auto it = range.first, end = range.second; it != end; ++it) {
            size_t str_index = str_start_index;

            for(size_t i = 0; i < it->second.size(); ++i) {
                const uint32_t codepoint_in_sequence = it->second[i];
                if(codepoint_in_sequence == 0)
                    break;

                if(str_index >= size)
                    goto next;

                if(!mgl::utf8_decode(str + str_index, size - str_index, &codepoint, &clen))
                    goto next;

                if(codepoint != codepoint_in_sequence)
                    goto next;

                str_index += clen;
            }

            if(str_index >= longest_match_byte_length) {
                longest_match_it = it;
                longest_match_byte_length = str_index;
            }

            match_found = true;
            next:;
        }

        if(!match_found)
            return false;

        size_t sequence_index = 1;
        for(size_t i = 0; i < longest_match_it->second.size(); ++i) {
            const uint32_t codepoint_in_sequence = longest_match_it->second[i];
            if(codepoint_in_sequence == 0)
                break;

            sequence[sequence_index] = codepoint_in_sequence;
            ++sequence_index;
        }

        sequence_len = sequence_index;
        byte_length = longest_match_byte_length;
        return true;
    }
}
"""


def parse_emoji_sequences(text):
    """Return the fully-qualified emoji sequences found in *text*.

    Each data line is expected to look like
    ``<hex code points> ; <status> # <comment>``. Only lines whose status
    column is exactly ``fully-qualified`` are kept; blank lines, comment
    lines and every other status are skipped.

    Returns a list of sequences in file order, each a non-empty list of
    integer code points.

    Raises ValueError if a code point on a kept line is not valid hexadecimal.
    """
    sequences = []
    for line in text.splitlines():
        # Drop the trailing comment first so that the phrase
        # "fully-qualified" inside a comment can never select a line.
        data = line.split("#", 1)[0]
        columns = data.split(";")
        if len(columns) < 2 or columns[1].strip() != "fully-qualified":
            continue

        sequence = [int(codepoint, 16) for codepoint in columns[0].split()]
        if sequence:
            sequences.append(sequence)
    return sequences


def render_source(sequences):
    """Return the text of Emoji.cpp for the given emoji *sequences*.

    Every sequence becomes one multimap entry keyed by its first code point,
    with the remaining code points as the array value.

    Raises ValueError if *sequences* is empty (the array size would be
    negative) or if a sequence is longer than MAX_SEQUENCE_LENGTH (the
    generated matcher would write past its output buffer).
    """
    if not sequences:
        raise ValueError("no fully-qualified emoji sequences were found")

    longest_sequence = max(len(sequence) for sequence in sequences)
    if longest_sequence > MAX_SEQUENCE_LENGTH:
        raise ValueError(
            "longest emoji sequence has %d code points, but the generated API only holds %d"
            % (longest_sequence, MAX_SEQUENCE_LENGTH)
        )

    # The first code point is the map key, so the array only needs room
    # for the code points that follow it.
    parts = [SOURCE_PROLOGUE % (longest_sequence - 1)]
    for first, *rest in sequences:
        remaining = ", ".join(hex(codepoint) for codepoint in rest)
        parts.append("        { %s, { %s } },\n" % (hex(first), remaining))
    parts.append(SOURCE_EPILOGUE)
    return "".join(parts)


def main():
    """Download the emoji test data and write generated/Emoji.hpp and Emoji.cpp."""
    response = requests.get(EMOJI_TEST_URL, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()

    # Render (and thereby validate) everything before touching the output
    # files, so bad input never leaves half-written generated code behind.
    source_text = render_source(parse_emoji_sequences(response.text))

    with open("generated/Emoji.hpp", "w") as header_file:
        header_file.write(HEADER_TEXT)

    with open("generated/Emoji.cpp", "w") as source_file:
        source_file.write(source_text)


if __name__ == "__main__":
    main()