#include "../../include/mgl/system/utf8.h"

/*
    Derives the total length in bytes of a UTF-8 sequence from its lead byte |b|.
    Returns true and stores 1..4 in |codepoint_length| when |b| has the bit pattern of a lead byte.
    Returns false, leaving |codepoint_length| untouched, when |b| is a continuation
    byte (10xxxxxx) or has the pattern 11111xxx.
*/
static inline bool utf8_get_codepoint_length(unsigned char b, size_t *codepoint_length) {
    if((b & 0x80) == 0) {
        *codepoint_length = 1;
        return true;
    } else if((b & 0xE0) == 0xC0) {
        *codepoint_length = 2;
        return true;
    } else if((b & 0xF0) == 0xE0) {
        *codepoint_length = 3;
        return true;
    } else if((b & 0xF8) == 0xF0) {
        *codepoint_length = 4;
        return true;
    } else {
        return false;
    }
}

/*
    Checks that |codepoint|, decoded from a sequence of |clen| bytes (1..4), is something
    UTF-8 is allowed to encode in that many bytes.
*/
static inline bool utf8_is_well_formed(uint32_t codepoint, size_t clen) {
    /* Smallest code point that needs a sequence of the given length; index 0 is unused */
    static const uint32_t min_codepoint[5] = { 0x0, 0x0, 0x80, 0x800, 0x10000 };

    /* Overlong encoding: the value would have fit in a shorter sequence */
    if(codepoint < min_codepoint[clen])
        return false;

    /* UTF-16 surrogates are not valid code points in UTF-8 */
    if(codepoint >= 0xD800 && codepoint <= 0xDFFF)
        return false;

    /* Unicode ends at U+10FFFF */
    return codepoint <= 0x10FFFF;
}

/* Shared failure path of mgl_utf8_decode for non-empty input: report the first byte as a 1-byte unit */
static inline bool utf8_decode_fail(const unsigned char *str, uint32_t *decoded_codepoint, size_t *codepoint_length) {
    *decoded_codepoint = str[0];
    *codepoint_length = 1;
    return false;
}

/*
    Decodes the UTF-8 sequence that starts at |str|, reading at most |size| bytes.

    On success returns true, stores the code point in |decoded_codepoint| and the number
    of bytes consumed (1..4) in |codepoint_length|.

    On failure returns false. If |size| is 0 the outputs are set to 0 and 0. Otherwise
    |decoded_codepoint| is set to the value of the first byte and |codepoint_length| to 1, so a
    caller can skip the offending byte and resynchronize. Failure happens when the first byte is
    not a lead byte, the sequence is truncated by |size|, a continuation byte is missing, or the
    sequence is ill-formed (overlong encoding, surrogate, or above U+10FFFF).
*/
/* TODO: Optimize (remove branching, etc) */
bool mgl_utf8_decode(const unsigned char *str, size_t size, uint32_t *decoded_codepoint, size_t *codepoint_length) {
    /* Payload bits carried by the lead byte, indexed by sequence length; index 0 is unused */
    static const unsigned char lead_payload_mask[5] = { 0x00, 0x7F, 0x1F, 0x0F, 0x07 };

    if(size == 0) {
        *decoded_codepoint = 0;
        *codepoint_length = 0;
        return false;
    }

    size_t clen;
    if(!utf8_get_codepoint_length(str[0], &clen))
        return utf8_decode_fail(str, decoded_codepoint, codepoint_length);

    if(size < clen)
        return utf8_decode_fail(str, decoded_codepoint, codepoint_length);

    /* Each continuation byte must look like 10xxxxxx and contributes its low 6 bits */
    uint32_t codepoint = (uint32_t)(str[0] & lead_payload_mask[clen]);
    for(size_t i = 1; i < clen; ++i) {
        if((str[i] & 0xC0) != 0x80)
            return utf8_decode_fail(str, decoded_codepoint, codepoint_length);
        codepoint = (codepoint << 6) | (uint32_t)(str[i] & 0x3F);
    }

    if(!utf8_is_well_formed(codepoint, clen))
        return utf8_decode_fail(str, decoded_codepoint, codepoint_length);

    *codepoint_length = clen;
    *decoded_codepoint = codepoint;
    return true;
}

/*
    Walks backwards from |offset| (clamped to the last byte of |str|) to the nearest byte
    that is not a continuation byte (10xxxxxx) and returns its index.
    Returns 0 if |size| is 0 or if every byte from |offset| down to index 0 is a continuation byte.
*/
/* TODO: Optimize (remove branching, etc) */
size_t mgl_utf8_get_start_of_codepoint(const unsigned char *str, size_t size, size_t offset) {
    if(size == 0)
        return 0;

    if(offset > size - 1)
        offset = size - 1;

    /* Count down from offset to 0 inclusive; offset + 1 can't overflow since offset < size */
    for(size_t remaining = offset + 1; remaining > 0; --remaining) {
        const size_t i = remaining - 1;
        if((str[i] & 0xC0) != 0x80)
            return i;
    }

    return 0;
}

/*
    Converts a code point index into a byte offset into |str|.
    Bytes that mgl_utf8_decode can't decode count as one code point of one byte each.
    Returns |size| if |str| has |index| or fewer code points.
*/
/* TODO: Optimize (remove branching, etc) */
size_t mgl_utf8_index_to_offset(const unsigned char *str, size_t size, size_t index) {
    size_t offset = 0;

    for(size_t codepoint_index = 0; offset < size && codepoint_index < index; ++codepoint_index) {
        uint32_t codepoint; /* Required by mgl_utf8_decode, the value itself is not needed here */
        size_t clen;
        if(!mgl_utf8_decode(&str[offset], size - offset, &codepoint, &clen))
            clen = 1;
        offset += clen;
    }

    return offset < size ? offset : size;
}