diff options
Diffstat (limited to 'src/system')
-rw-r--r-- | src/system/utf8.c | 74 |
1 file changed, 32 insertions, 42 deletions
diff --git a/src/system/utf8.c b/src/system/utf8.c index cb14691..35b0f2f 100644 --- a/src/system/utf8.c +++ b/src/system/utf8.c @@ -1,54 +1,42 @@ #include "../../include/mgl/system/utf8.h" +static inline bool utf8_get_codepoint_length(unsigned char b, size_t *codepoint_length) { + if((b & 0x80) == 0) { + *codepoint_length = 1; + return true; + } else if((b & 0xE0) == 0xC0) { + *codepoint_length = 2; + return true; + } else if((b & 0xF0) == 0xE0) { + *codepoint_length = 3; + return true; + } else if((b & 0xF8) == 0xF0) { + *codepoint_length = 4; + return true; + } else { + return false; + } +} + /* TODO: Optimize (remove branching, etc) */ -bool mgl_utf8_is_valid(const unsigned char *str, size_t size) { - size_t i = 0; - while(i < size) { - size_t codepoint_length = 0; - const unsigned char b = str[i]; +bool mgl_utf8_decode(const unsigned char *str, size_t size, uint32_t *decoded_codepoint, size_t *codepoint_length) { + if(size == 0) + return false; - if((b & 0x80) == 0) - codepoint_length = 1; - else if((b & 0xE0) == 0xC0) - codepoint_length = 2; - else if((b & 0xF0) == 0xE0) - codepoint_length = 3; - else if((b & 0xF8) == 0xF0) - codepoint_length = 4; - else - return false; + size_t clen; + if(!utf8_get_codepoint_length(str[0], &clen)) + return false; - const size_t next = i + codepoint_length; - if(next > size) - return false; + if(size < clen) + return false; - /* TODO: Remove this overflow check? 
*/ - /* Check overflow */ - if(next <= i) + for(size_t i = 1; i < clen; ++i) { + if((str[i] & 0xC0) != 0x80) return false; - - ++i; - for(; i < next; ++i) { - if((str[i] & 0xC0) != 0x80) - return false; - } } - return true; -} - -static inline size_t utf8_get_codepoint_length(unsigned char b) { - const unsigned int length1 = b >> 7; - const unsigned int length2 = length1 & ((b & 0x40) >> 6); - const unsigned int length3 = length2 & ((b & 0x20) >> 5); - const unsigned int length4 = length3 & ((b & 0x10) >> 4); - return (length1 ^ 0x01) + length1 + length2 + length3 + length4; -} -/* TODO: Optimize (remove branching, etc) */ -size_t mgl_utf8_decode(const unsigned char *str, uint32_t *decoded_codepoint) { - const size_t length = utf8_get_codepoint_length(str[0]); uint32_t codepoint; - switch(length) { + switch(clen) { case 1: codepoint = (uint32_t)(str[0] & 0x7F); break; @@ -68,6 +56,8 @@ size_t mgl_utf8_decode(const unsigned char *str, uint32_t *decoded_codepoint) { codepoint |= (uint32_t)(str[3] & 0x3F); break; } + + *codepoint_length = clen; *decoded_codepoint = codepoint; - return length; + return true; } |