Support as many emoji as possible, using separate emoji images in text

author: dec05eba <dec05eba@protonmail.com> 2022-11-05 15:53:28 +0100
committer: dec05eba <dec05eba@protonmail.com> 2022-11-05 15:53:28 +0100
commit: 4daa57f6d139f51a62ea4bcffa738bd5035df33a (patch)
tree: ba767bbbf1c7153a12e3d9e866a079c30d1c69e0 /generate-emoji-sequences.py
parent: f89117b5cf36797b04291942b2f2494895fc58dd (diff)
1 files changed, 117 insertions, 0 deletions
diff --git a/generate-emoji-sequences.py b/generate-emoji-sequences.py
new file mode 100755
index 0000000..b796548
--- /dev/null
+++ b/generate-emoji-sequences.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+
+import requests
+
+# Fetch the Unicode 15.0 emoji test data; the timeout stops a stalled connection from hanging the run.
+response = requests.get("https://unicode.org/Public/emoji/15.0/emoji-test.txt", timeout=30)
+response.raise_for_status()
+
+all_sequences = []    # one list of integer code points per fully-qualified emoji
+longest_sequence = 0  # number of code points in the longest sequence seen
+
+for line in response.text.splitlines():
+    # Skip blank lines, '#' comment lines and entries that are not marked fully-qualified.
+    if not line or line.startswith("#") or "fully-qualified" not in line:
+        continue
+    # The first ';'-separated column lists the code points as space-separated hex numbers.
+    sequence = [int(codepoint, 16) for codepoint in line.split(";")[0].split()]
+    longest_sequence = max(longest_sequence, len(sequence))
+    all_sequences.append(sequence)
+if not all_sequences:  # nothing parsed: stop before the generated files get overwritten
+    raise SystemExit("no fully-qualified emoji sequences found in emoji-test.txt")
+
+with open("generated/Emoji.hpp", "w") as header_file:  # C++ header declaring the matcher; ends with a newline
+    header_file.write("""#pragma once
+
+#include <stdint.h>
+#include <stddef.h>
+
+// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!
+
+namespace QuickMedia {
+    bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length);
+}\n""")
+
+with open("generated/Emoji.cpp", "w") as source_file:  # lookup table plus the matcher that walks it
+    source_file.write("""#include "Emoji.hpp"
+#include <unordered_map>
+#include <array>
+#include <mglpp/system/Utf8.hpp>
+
+// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!
+
+namespace QuickMedia {
+    static const std::unordered_multimap<uint32_t, std::array<uint32_t, %d>> emoji_sequences = {
+""" % (longest_sequence - 1))  # the array holds every code point after the first, which is the map key
+    for sequence in all_sequences:
+        # Key is the first code point; the C++ array zero-fills the slots a shorter sequence leaves unused.
+        source_file.write("        { %s, { %s } },\n" % (hex(sequence[0]), ", ".join(hex(c) for c in sequence[1:])))
+
+    source_file.write(
+"""    };
+
+    bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length) {
+        uint32_t codepoint;
+        size_t clen;
+        if(!mgl::utf8_decode(str, size, &codepoint, &clen))
+            return false;
+
+        const size_t str_start_index = clen;
+        sequence[0] = codepoint;
+
+        auto range = emoji_sequences.equal_range(codepoint);
+        if(range.first == range.second)
+            return false;
+
+        auto longest_match_it = range.first;
+        size_t longest_match_byte_length = str_start_index;
+        bool match_found = false;
+
+        for(auto it = range.first, end = range.second; it != end; ++it) {
+            size_t str_index = str_start_index;
+
+            for(size_t i = 0; i < it->second.size(); ++i) {
+                const uint32_t codepoint_in_sequence = it->second[i];
+                if(codepoint_in_sequence == 0)
+                    break;
+
+                if(str_index >= size)
+                    goto next;
+
+                if(!mgl::utf8_decode(str + str_index, size - str_index, &codepoint, &clen))
+                    goto next;
+
+                if(codepoint != codepoint_in_sequence)
+                    goto next;
+
+                str_index += clen;
+            }
+
+            if(str_index >= longest_match_byte_length) {
+                longest_match_it = it;
+                longest_match_byte_length = str_index;
+            }
+
+            match_found = true;
+            next:;
+        }
+
+        if(!match_found)
+            return false;
+
+        size_t sequence_index = 1;
+        for(size_t i = 0; i < longest_match_it->second.size(); ++i) {
+            const uint32_t codepoint_in_sequence = longest_match_it->second[i];
+            if(codepoint_in_sequence == 0)
+                break;
+
+            sequence[sequence_index] = codepoint_in_sequence;
+            ++sequence_index;
+        }
+
+        sequence_len = sequence_index;
+        byte_length = longest_match_byte_length;
+        return true;
+    }
+}
+""")
+\ No newline at end of file
author	dec05eba <dec05eba@protonmail.com>	2022-11-05 15:53:28 +0100
committer	dec05eba <dec05eba@protonmail.com>	2022-11-05 15:53:28 +0100
commit	4daa57f6d139f51a62ea4bcffa738bd5035df33a (patch)
tree	ba767bbbf1c7153a12e3d9e866a079c30d1c69e0 /generate-emoji-sequences.py
parent	f89117b5cf36797b04291942b2f2494895fc58dd (diff)