diff options
author | dec05eba <dec05eba@protonmail.com> | 2022-11-05 15:53:28 +0100 |
---|---|---|
committer | dec05eba <dec05eba@protonmail.com> | 2022-11-05 15:53:28 +0100 |
commit | 4daa57f6d139f51a62ea4bcffa738bd5035df33a (patch) | |
tree | ba767bbbf1c7153a12e3d9e866a079c30d1c69e0 /generate-emoji-sequences.py | |
parent | f89117b5cf36797b04291942b2f2494895fc58dd (diff) |
Support as many emoji as possible, using separate emoji images in text
Diffstat (limited to 'generate-emoji-sequences.py')
-rwxr-xr-x | generate-emoji-sequences.py | 117 |
1 files changed, 117 insertions, 0 deletions
#!/usr/bin/env python3
"""Generate generated/Emoji.hpp and generated/Emoji.cpp from Unicode's emoji-test.txt.

The script downloads the emoji test data, keeps every "fully-qualified" emoji
sequence and emits a C++ lookup table (first codepoint -> remaining codepoints
of the sequence) together with QuickMedia::match_emoji_sequence, which finds
the longest emoji sequence at the start of a UTF-8 string.
"""

import os

EMOJI_TEST_URL = "https://unicode.org/Public/emoji/15.0/emoji-test.txt"
OUTPUT_DIR = "generated"

# Without a timeout requests waits forever on a stalled connection.
DOWNLOAD_TIMEOUT_SECONDS = 30

# Capacity of the `sequence` out-parameter in the generated C++ signature
# (`uint32_t sequence[32]`). The generated function copies a whole matched
# sequence into that buffer, so no emoji sequence may be longer than this.
# Keep this value in sync with the literal in HEADER_TEXT and SOURCE_EPILOGUE.
MAX_SEQUENCE_LENGTH = 32

HEADER_TEXT = """#pragma once

#include <stdint.h>
#include <stddef.h>

// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!

namespace QuickMedia {
    bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length);
}"""

# The single %d is the std::array size: the longest sequence minus its first
# codepoint, which is stored as the multimap key instead.
SOURCE_PROLOGUE = """#include "Emoji.hpp"
#include <unordered_map>
#include <array>
#include <mglpp/system/Utf8.hpp>

// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!

namespace QuickMedia {
    static std::unordered_multimap<uint32_t, std::array<uint32_t, %d>> emoji_sequences = {
"""

# Shorter sequences leave the tail of their std::array zero-initialized; the
# generated matcher treats a 0 codepoint as the end of the sequence.
SOURCE_EPILOGUE = """    };

    bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length) {
        uint32_t codepoint;
        size_t clen;
        if(!mgl::utf8_decode(str, size, &codepoint, &clen))
            return false;

        const size_t str_start_index = clen;
        sequence[0] = codepoint;

        auto range = emoji_sequences.equal_range(codepoint);
        if(range.first == range.second)
            return false;

        auto longest_match_it = range.first;
        size_t longest_match_byte_length = str_start_index;
        bool match_found = false;

        for(auto it = range.first, end = range.second; it != end; ++it) {
            size_t str_index = str_start_index;

            for(size_t i = 0; i < it->second.size(); ++i) {
                const uint32_t codepoint_in_sequence = it->second[i];
                if(codepoint_in_sequence == 0)
                    break;

                if(str_index >= size)
                    goto next;

                if(!mgl::utf8_decode(str + str_index, size - str_index, &codepoint, &clen))
                    goto next;

                if(codepoint != codepoint_in_sequence)
                    goto next;

                str_index += clen;
            }

            if(str_index >= longest_match_byte_length) {
                longest_match_it = it;
                longest_match_byte_length = str_index;
            }

            match_found = true;
            next:;
        }

        if(!match_found)
            return false;

        size_t sequence_index = 1;
        for(size_t i = 0; i < longest_match_it->second.size(); ++i) {
            const uint32_t codepoint_in_sequence = longest_match_it->second[i];
            if(codepoint_in_sequence == 0)
                break;

            sequence[sequence_index] = codepoint_in_sequence;
            ++sequence_index;
        }

        sequence_len = sequence_index;
        byte_length = longest_match_byte_length;
        return true;
    }
}
"""


def download_emoji_test(url=EMOJI_TEST_URL, timeout=DOWNLOAD_TIMEOUT_SECONDS):
    """Download the emoji test data and return it as text.

    Raises whatever requests raises for connection problems, timeouts and
    non-success HTTP status codes (raise_for_status).
    """
    # Imported here rather than at module level so that the parsing and
    # rendering helpers stay usable without the third-party package installed.
    import requests

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def parse_emoji_sequences(text):
    """Return the fully-qualified emoji sequences found in emoji-test.txt data.

    Each data line has the form "<hex codepoints> ; <status> # <comment>".
    The result is a list of sequences in file order, each sequence being a
    list of integer codepoints. Blank lines, comment lines, lines without a
    ';' separator and lines with any other status are skipped.
    """
    sequences = []
    for line in text.splitlines():
        if not line or line.startswith("#"):
            continue

        codepoint_field, separator, rest = line.partition(";")
        if not separator:
            continue

        # Compare the status field itself; the free-text comment after '#'
        # must not be able to influence which lines are accepted.
        status = rest.split("#", 1)[0].strip()
        if status != "fully-qualified":
            continue

        sequence = [int(codepoint, 16) for codepoint in codepoint_field.split()]
        if sequence:
            sequences.append(sequence)
    return sequences


def render_header():
    """Return the content of generated/Emoji.hpp."""
    return HEADER_TEXT


def render_source(sequences):
    """Return the content of generated/Emoji.cpp for the given sequences.

    Raises ValueError when there are no sequences (the table type would get a
    negative array size) or when a sequence is longer than
    MAX_SEQUENCE_LENGTH (the generated function would write past the caller's
    `sequence` buffer).
    """
    if not sequences:
        raise ValueError("no fully-qualified emoji sequences found, has the input format changed?")

    longest_sequence = max(len(sequence) for sequence in sequences)
    if longest_sequence > MAX_SEQUENCE_LENGTH:
        raise ValueError(
            "longest emoji sequence has %d codepoints but the generated match_emoji_sequence only has room for %d"
            % (longest_sequence, MAX_SEQUENCE_LENGTH))

    parts = [SOURCE_PROLOGUE % (longest_sequence - 1)]
    for sequence in sequences:
        remaining_codepoints = ", ".join(hex(codepoint) for codepoint in sequence[1:])
        parts.append("        { %s, { %s } },\n" % (hex(sequence[0]), remaining_codepoints))
    parts.append(SOURCE_EPILOGUE)
    return "".join(parts)


def main():
    """Download the emoji data and (re)write the generated C++ files."""
    sequences = parse_emoji_sequences(download_emoji_test())

    # Render everything before touching the filesystem so that a validation
    # error cannot leave truncated or half-written files behind.
    header_text = render_header()
    source_text = render_source(sequences)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(os.path.join(OUTPUT_DIR, "Emoji.hpp"), "w") as header_file:
        header_file.write(header_text)
    with open(os.path.join(OUTPUT_DIR, "Emoji.cpp"), "w") as source_file:
        source_file.write(source_text)


if __name__ == "__main__":
    main()
\ No newline at end of file |