1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
#!/usr/bin/env python3
import os

import requests
# Source of the emoji sequence data that the generated C++ table is built from.
EMOJI_TEST_URL = "https://unicode.org/Public/emoji/15.0/emoji-test.txt"

# Upper bound (seconds) on waiting for the server; without it a stalled
# connection would block the generator forever.
DOWNLOAD_TIMEOUT_SECONDS = 30

# The generated C++ function receives `uint32_t sequence[32]` and writes one
# entry per codepoint of the matched sequence, so no sequence may be longer.
MAX_SEQUENCE_LENGTH = 32

# Full text of generated/Emoji.hpp.
HEADER_TEXT = """#pragma once
#include <stdint.h>
#include <stddef.h>
// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!
namespace QuickMedia {
bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length);
}"""

# Start of generated/Emoji.cpp; `%d` receives the std::array length, i.e. the
# longest sequence length minus the first codepoint (which is the map key).
SOURCE_PROLOGUE = """#include "Emoji.hpp"
#include <unordered_map>
#include <array>
#include <mglpp/system/Utf8.hpp>
// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!
namespace QuickMedia {
static std::unordered_multimap<uint32_t, std::array<uint32_t, %d>> emoji_sequences = {
"""

# One table row: first codepoint (key), then the remaining codepoints.
SOURCE_ROW = " { %s, { %s } },\n"

# End of generated/Emoji.cpp: closes the table and defines the matcher.
SOURCE_EPILOGUE = """ };
bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length) {
uint32_t codepoint;
size_t clen;
if(!mgl::utf8_decode(str, size, &codepoint, &clen))
return false;
const size_t str_start_index = clen;
sequence[0] = codepoint;
auto range = emoji_sequences.equal_range(codepoint);
if(range.first == range.second)
return false;
auto longest_match_it = range.first;
size_t longest_match_byte_length = str_start_index;
bool match_found = false;
for(auto it = range.first, end = range.second; it != end; ++it) {
size_t str_index = str_start_index;
for(size_t i = 0; i < it->second.size(); ++i) {
const uint32_t codepoint_in_sequence = it->second[i];
if(codepoint_in_sequence == 0)
break;
if(str_index >= size)
goto next;
if(!mgl::utf8_decode(str + str_index, size - str_index, &codepoint, &clen))
goto next;
if(codepoint != codepoint_in_sequence)
goto next;
str_index += clen;
}
if(str_index >= longest_match_byte_length) {
longest_match_it = it;
longest_match_byte_length = str_index;
}
match_found = true;
next:;
}
if(!match_found)
return false;
size_t sequence_index = 1;
for(size_t i = 0; i < longest_match_it->second.size(); ++i) {
const uint32_t codepoint_in_sequence = longest_match_it->second[i];
if(codepoint_in_sequence == 0)
break;
sequence[sequence_index] = codepoint_in_sequence;
++sequence_index;
}
sequence_len = sequence_index;
byte_length = longest_match_byte_length;
return true;
}
}
"""


def download_emoji_test_data(url=EMOJI_TEST_URL, timeout=DOWNLOAD_TIMEOUT_SECONDS):
    """Fetch the emoji test data and return its body as text.

    Raises whatever `requests` raises on connection failure, timeout, or a
    non-success HTTP status (via `raise_for_status`).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def parse_emoji_sequences(text):
    """Extract codepoint sequences from the text of the emoji test file.

    Empty lines, lines starting with '#', and lines mentioning
    "minimally-qualified" are skipped. For every other line, the part before
    the first ';' is split on whitespace and each token is parsed as a
    hexadecimal codepoint.

    Returns a list of sequences, each a non-empty list of ints, in file order.
    Raises ValueError if a token is not valid hexadecimal.
    """
    sequences = []
    for line in text.splitlines():
        if not line or line[0] == '#' or "minimally-qualified" in line:
            continue
        codepoints = line.split(";")[0].split()
        if not codepoints:
            # A line with nothing before ';' (e.g. whitespace only) carries no
            # sequence; keeping an empty entry would crash table generation.
            continue
        sequences.append([int(codepoint, base=16) for codepoint in codepoints])
    return sequences


def render_source(sequences):
    """Build the complete text of Emoji.cpp for the given sequences.

    Raises ValueError when `sequences` is empty (the table's array length
    would be negative) or when a sequence exceeds MAX_SEQUENCE_LENGTH (the
    generated matcher would write past its fixed-size output buffer).
    """
    if not sequences:
        raise ValueError("no emoji sequences were parsed; refusing to generate an empty table")
    longest_sequence = max(len(sequence) for sequence in sequences)
    if longest_sequence > MAX_SEQUENCE_LENGTH:
        raise ValueError(
            "longest emoji sequence has %d codepoints, but the generated API only has room for %d"
            % (longest_sequence, MAX_SEQUENCE_LENGTH))

    parts = [SOURCE_PROLOGUE % (longest_sequence - 1)]
    for sequence in sequences:
        remaining_sequences = [hex(c) for c in sequence[1:]]
        parts.append(SOURCE_ROW % (hex(sequence[0]), ", ".join(remaining_sequences)))
    parts.append(SOURCE_EPILOGUE)
    return "".join(parts)


def main():
    """Download the emoji data and write generated/Emoji.hpp and generated/Emoji.cpp."""
    sequences = parse_emoji_sequences(download_emoji_test_data())
    # Render everything first so bad input never leaves half-written files behind.
    source_text = render_source(sequences)

    os.makedirs("generated", exist_ok=True)
    with open("generated/Emoji.hpp", "w") as header_file:
        header_file.write(HEADER_TEXT)
    with open("generated/Emoji.cpp", "w") as source_file:
        source_file.write(source_text)


if __name__ == "__main__":
    main()
|