#!/usr/bin/env python3
"""Generate the QuickMedia emoji lookup table.

Downloads the Unicode ``emoji-test.txt`` data file, extracts every emoji
codepoint sequence that is not marked ``minimally-qualified`` and writes two
C++ files:

* ``generated/Emoji.hpp`` -- declaration of ``match_emoji_sequence``.
* ``generated/Emoji.cpp`` -- a multimap keyed by the first codepoint of each
  sequence (the value holds the remaining codepoints, zero padded) together
  with the matcher that returns the longest sequence found at the start of a
  UTF-8 string.

Paths are relative to the current working directory.
"""

import os

EMOJI_TEST_URL = "https://unicode.org/Public/emoji/15.0/emoji-test.txt"
HEADER_PATH = "generated/Emoji.hpp"
SOURCE_PATH = "generated/Emoji.cpp"

# NOTE(review): the header names in the two templates below were reconstructed
# from the identifiers the generated code uses (uint32_t/size_t,
# std::unordered_multimap, std::array, mgl::utf8_decode). Confirm the mgl
# include path against the mglpp version the project builds with.
HEADER_TEXT = """#pragma once

#include <stdint.h>
#include <stddef.h>

// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!

namespace QuickMedia {
    bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length);
}"""

# Contains exactly one "%d": the std::array length, i.e. the number of
# codepoints that can follow the first codepoint of the longest sequence.
SOURCE_PRELUDE = """#include "Emoji.hpp"
#include <unordered_map>
#include <array>
#include <mglpp/system/Utf8.hpp>

// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!

namespace QuickMedia {
    static std::unordered_multimap<uint32_t, std::array<uint32_t, %d>> emoji_sequences = {
"""

# Never passed through "%" formatting, so it is emitted verbatim.
SOURCE_EPILOGUE = """    };

    bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length) {
        uint32_t codepoint;
        size_t clen;
        if(!mgl::utf8_decode(str, size, &codepoint, &clen))
            return false;

        const size_t str_start_index = clen;
        sequence[0] = codepoint;

        auto range = emoji_sequences.equal_range(codepoint);
        if(range.first == range.second)
            return false;

        auto longest_match_it = range.first;
        size_t longest_match_byte_length = str_start_index;
        bool match_found = false;
        for(auto it = range.first, end = range.second; it != end; ++it) {
            size_t str_index = str_start_index;
            for(size_t i = 0; i < it->second.size(); ++i) {
                const uint32_t codepoint_in_sequence = it->second[i];
                if(codepoint_in_sequence == 0)
                    break;

                if(str_index >= size)
                    goto next;

                if(!mgl::utf8_decode(str + str_index, size - str_index, &codepoint, &clen))
                    goto next;

                if(codepoint != codepoint_in_sequence)
                    goto next;

                str_index += clen;
            }

            if(str_index >= longest_match_byte_length) {
                longest_match_it = it;
                longest_match_byte_length = str_index;
            }

            match_found = true;
            next:;
        }

        if(!match_found)
            return false;

        size_t sequence_index = 1;
        for(size_t i = 0; i < longest_match_it->second.size(); ++i) {
            const uint32_t codepoint_in_sequence = longest_match_it->second[i];
            if(codepoint_in_sequence == 0)
                break;

            sequence[sequence_index] = codepoint_in_sequence;
            ++sequence_index;
        }

        sequence_len = sequence_index;
        byte_length = longest_match_byte_length;
        return true;
    }
}
"""


def parse_emoji_sequences(text):
    """Return the codepoint sequences listed in *text*.

    *text* is the content of ``emoji-test.txt``. Each data line has the form
    ``<hex codepoints> ; <status> # <comment>``. Empty lines, lines starting
    with ``#`` and lines containing ``minimally-qualified`` are skipped.

    Returns a list of non-empty lists of ints, in file order.

    Raises:
        ValueError: if a codepoint field is not valid hexadecimal.
    """
    sequences = []
    for line in text.splitlines():
        if not line or line.startswith("#") or "minimally-qualified" in line:
            continue
        codepoints = line.split(";")[0].split()
        if not codepoints:
            # Whitespace-only (or ";"-leading) line: there is no first
            # codepoint to key the table on, so ignore it.
            continue
        sequences.append([int(codepoint, 16) for codepoint in codepoints])
    return sequences


def render_header():
    """Return the text of ``Emoji.hpp``."""
    return HEADER_TEXT


def render_source(sequences):
    """Return the text of ``Emoji.cpp`` for *sequences*.

    *sequences* is a list of non-empty codepoint lists as produced by
    ``parse_emoji_sequences``. Every sequence becomes one multimap entry
    ``{ first, { rest... } }``; the array length is the longest sequence
    length minus one, so shorter entries are zero padded by C++ aggregate
    initialisation (the matcher stops at the first zero).
    """
    # default=1 keeps the array length at 0 (never negative) for empty input.
    longest_sequence = max((len(sequence) for sequence in sequences), default=1)
    rows = [
        "        { %s, { %s } },\n"
        % (hex(sequence[0]), ", ".join(hex(c) for c in sequence[1:]))
        for sequence in sequences
    ]
    return SOURCE_PRELUDE % (longest_sequence - 1) + "".join(rows) + SOURCE_EPILOGUE


def fetch_emoji_test_data(url=EMOJI_TEST_URL, timeout=30):
    """Download *url* and return the response body as text.

    Raises whatever ``requests`` raises on connection failure, timeout or a
    non-2xx status (``raise_for_status``).
    """
    # Third-party dependency; imported here so the pure helpers above can be
    # imported without it.
    import requests

    # Without a timeout requests may wait on a stalled server indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def write_text_file(path, content):
    """Write *content* to *path*, creating the parent directory if needed."""
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, "w", encoding="utf-8") as output_file:
        output_file.write(content)


def main():
    """Fetch the emoji data and (re)generate both C++ files."""
    sequences = parse_emoji_sequences(fetch_emoji_test_data())
    if not sequences:
        # An empty table means the download was not the expected data file;
        # refuse to overwrite the generated sources with a useless table.
        raise SystemExit("No emoji sequences found in %s" % EMOJI_TEST_URL)

    write_text_file(HEADER_PATH, render_header())
    write_text_file(SOURCE_PATH, render_source(sequences))


if __name__ == "__main__":
    main()