diff options
Diffstat (limited to 'src/system')
-rw-r--r-- | src/system/utf8.c | 73 |
1 files changed, 73 insertions, 0 deletions
diff --git a/src/system/utf8.c b/src/system/utf8.c new file mode 100644 index 0000000..cb14691 --- /dev/null +++ b/src/system/utf8.c @@ -0,0 +1,73 @@ +#include "../../include/mgl/system/utf8.h" + +/* TODO: Optimize (remove branching, etc) */ +bool mgl_utf8_is_valid(const unsigned char *str, size_t size) { + size_t i = 0; + while(i < size) { + size_t codepoint_length = 0; + const unsigned char b = str[i]; + + if((b & 0x80) == 0) + codepoint_length = 1; + else if((b & 0xE0) == 0xC0) + codepoint_length = 2; + else if((b & 0xF0) == 0xE0) + codepoint_length = 3; + else if((b & 0xF8) == 0xF0) + codepoint_length = 4; + else + return false; + + const size_t next = i + codepoint_length; + if(next > size) + return false; + + /* TODO: Remove this overflow check? */ + /* Check overflow */ + if(next <= i) + return false; + + ++i; + for(; i < next; ++i) { + if((str[i] & 0xC0) != 0x80) + return false; + } + } + return true; +} + +static inline size_t utf8_get_codepoint_length(unsigned char b) { + const unsigned int length1 = b >> 7; + const unsigned int length2 = length1 & ((b & 0x40) >> 6); + const unsigned int length3 = length2 & ((b & 0x20) >> 5); + const unsigned int length4 = length3 & ((b & 0x10) >> 4); + return (length1 ^ 0x01) + length1 + length2 + length3 + length4; +} + +/* TODO: Optimize (remove branching, etc) */ +size_t mgl_utf8_decode(const unsigned char *str, uint32_t *decoded_codepoint) { + const size_t length = utf8_get_codepoint_length(str[0]); + uint32_t codepoint; + switch(length) { + case 1: + codepoint = (uint32_t)(str[0] & 0x7F); + break; + case 2: + codepoint = ((uint32_t)(str[0] & 0x1F) << 6); + codepoint |= (uint32_t)(str[1] & 0x3F); + break; + case 3: + codepoint = ((uint32_t)(str[0] & 0x0F) << 12); + codepoint |= ((uint32_t)(str[1] & 0x3F) << 6); + codepoint |= (uint32_t)(str[2] & 0x3F); + break; + case 4: + codepoint = ((uint32_t)(str[0] & 0x07) << 18); + codepoint |= ((uint32_t)(str[1] & 0x3F) << 12); + codepoint |= ((uint32_t)(str[2] & 0x3F) << 6); + codepoint |= (uint32_t)(str[3] & 0x3F); + break; + } + *decoded_codepoint = codepoint; + return length; +} |