#include "../../include/mgl/system/utf8.h"

/*
    Returns true if the |size| bytes starting at |str| form well-formed UTF-8.

    A sequence is rejected when:
      - a lead byte is not a legal UTF-8 lead byte (0x80..0xC1, 0xF5..0xFF),
      - a multi-byte sequence is truncated by the end of the buffer,
      - a trailing byte is not of the form 10xxxxxx,
      - the sequence is an overlong encoding (e.g. C0 80, E0 80 80, F0 80 80 80),
      - the sequence encodes a UTF-16 surrogate (U+D800..U+DFFF),
      - the sequence encodes a value above U+10FFFF.

    An empty buffer (size == 0) is valid; |str| is not dereferenced in that case.
*/
/* TODO: Optimize (remove branching, etc) */
bool mgl_utf8_is_valid(const unsigned char *str, size_t size) {
    size_t i = 0;
    while(i < size) {
        const unsigned char lead = str[i];
        size_t codepoint_length;
        /*
            Allowed range for the byte that follows the lead byte. It is
            narrower than 0x80..0xBF for a few lead bytes, which is what rules
            out overlong forms, surrogates and values above U+10FFFF.
        */
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;

        if(lead < 0x80) {
            codepoint_length = 1;
        } else if(lead >= 0xC2 && lead <= 0xDF) {
            /* 0xC0 and 0xC1 could only start overlong 2-byte forms */
            codepoint_length = 2;
        } else if((lead & 0xF0) == 0xE0) {
            codepoint_length = 3;
            if(lead == 0xE0)
                second_min = 0xA0; /* below this is an overlong form of U+0000..U+07FF */
            else if(lead == 0xED)
                second_max = 0x9F; /* above this is a surrogate, U+D800..U+DFFF */
        } else if(lead >= 0xF0 && lead <= 0xF4) {
            codepoint_length = 4;
            if(lead == 0xF0)
                second_min = 0x90; /* below this is an overlong form of U+0000..U+FFFF */
            else if(lead == 0xF4)
                second_max = 0x8F; /* above this is beyond U+10FFFF */
        } else {
            /* Stray continuation byte, 0xC0/0xC1, or 0xF5..0xFF */
            return false;
        }

        /* i < size here, so size - i cannot wrap (unlike i + codepoint_length) */
        if(codepoint_length > size - i)
            return false;

        if(codepoint_length > 1) {
            const unsigned char second = str[i + 1];
            if(second < second_min || second > second_max)
                return false;

            for(size_t k = 2; k < codepoint_length; ++k) {
                if((str[i + k] & 0xC0) != 0x80)
                    return false;
            }
        }

        i += codepoint_length;
    }
    return true;
}

/*
    Branch-free classification of a lead byte by its number of leading 1 bits:
      0xxxxxxx -> 1
      10xxxxxx -> 1 (a stray continuation byte is treated as a 1-byte sequence)
      110xxxxx -> 2
      1110xxxx -> 3
      1111xxxx -> 4 (this includes the invalid lead bytes 0xF8..0xFF)
    The result is always in the range 1..4.
*/
static inline size_t utf8_get_codepoint_length(unsigned char b) {
    /* lengthN is 1 only if the top N bits of |b| are all set */
    const unsigned int length1 = b >> 7;
    const unsigned int length2 = length1 & ((b & 0x40) >> 6);
    const unsigned int length3 = length2 & ((b & 0x20) >> 5);
    const unsigned int length4 = length3 & ((b & 0x10) >> 4);
    /* (length1 ^ 0x01) contributes the 1 for bytes whose top bit is clear */
    return (length1 ^ 0x01) + length1 + length2 + length3 + length4;
}

/*
    Decodes the codepoint that starts at |str| and stores it in |decoded_codepoint|.
    Returns the number of bytes the sequence occupies (1..4), as determined by
    the first byte alone.

    No validation is done and no buffer size is taken: up to 3 bytes after
    str[0] are read depending on the first byte, and trailing bytes are not
    checked to be continuation bytes. The caller has to make sure that many
    bytes are readable, for example by validating the buffer with
    mgl_utf8_is_valid first.
*/
/* TODO: Optimize (remove branching, etc) */
size_t mgl_utf8_decode(const unsigned char *str, uint32_t *decoded_codepoint) {
    const size_t length = utf8_get_codepoint_length(str[0]);
    uint32_t codepoint;
    switch(length) {
        case 1:
            codepoint = (uint32_t)(str[0] & 0x7F);
            break;
        case 2:
            codepoint = ((uint32_t)(str[0] & 0x1F) << 6);
            codepoint |= (uint32_t)(str[1] & 0x3F);
            break;
        case 3:
            codepoint = ((uint32_t)(str[0] & 0x0F) << 12);
            codepoint |= ((uint32_t)(str[1] & 0x3F) << 6);
            codepoint |= (uint32_t)(str[2] & 0x3F);
            break;
        case 4:
            codepoint = ((uint32_t)(str[0] & 0x07) << 18);
            codepoint |= ((uint32_t)(str[1] & 0x3F) << 12);
            codepoint |= ((uint32_t)(str[2] & 0x3F) << 6);
            codepoint |= (uint32_t)(str[3] & 0x3F);
            break;
        default:
            /* Not reachable (the length is always 1..4); keeps |codepoint| initialized */
            codepoint = 0;
            break;
    }
    *decoded_codepoint = codepoint;
    return length;
}