aboutsummaryrefslogtreecommitdiff
path: root/src/system/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/system/utf8.c')
-rw-r--r-- src/system/utf8.c 74
1 files changed, 32 insertions, 42 deletions
diff --git a/src/system/utf8.c b/src/system/utf8.c
index cb14691..35b0f2f 100644
--- a/src/system/utf8.c
+++ b/src/system/utf8.c
@@ -1,54 +1,42 @@
#include "../../include/mgl/system/utf8.h"
+static inline bool utf8_get_codepoint_length(unsigned char b, size_t *codepoint_length) { /* Classifies the first byte |b| of a UTF-8 sequence: on a recognized bit pattern stores the total sequence length in bytes (1-4) in |*codepoint_length| and returns true; otherwise returns false */
+ if((b & 0x80) == 0) { /* 0xxxxxxx: top bit clear, single-byte sequence */
+ *codepoint_length = 1;
+ return true;
+ } else if((b & 0xE0) == 0xC0) { /* 110xxxxx: first byte of a 2-byte sequence */
+ *codepoint_length = 2;
+ return true;
+ } else if((b & 0xF0) == 0xE0) { /* 1110xxxx: first byte of a 3-byte sequence */
+ *codepoint_length = 3;
+ return true;
+ } else if((b & 0xF8) == 0xF0) { /* 11110xxx: first byte of a 4-byte sequence */
+ *codepoint_length = 4;
+ return true;
+ } else { /* 10xxxxxx or 11111xxx: matches none of the patterns above */
+ return false; /* |*codepoint_length| is left unwritten on this path */
+ }
+}
+
/* TODO: Optimize (remove branching, etc) */
-bool mgl_utf8_is_valid(const unsigned char *str, size_t size) {
- size_t i = 0;
- while(i < size) {
- size_t codepoint_length = 0;
- const unsigned char b = str[i];
+bool mgl_utf8_decode(const unsigned char *str, size_t size, uint32_t *decoded_codepoint, size_t *codepoint_length) {
+ if(size == 0)
+ return false;
- if((b & 0x80) == 0)
- codepoint_length = 1;
- else if((b & 0xE0) == 0xC0)
- codepoint_length = 2;
- else if((b & 0xF0) == 0xE0)
- codepoint_length = 3;
- else if((b & 0xF8) == 0xF0)
- codepoint_length = 4;
- else
- return false;
+ size_t clen;
+ if(!utf8_get_codepoint_length(str[0], &clen))
+ return false;
- const size_t next = i + codepoint_length;
- if(next > size)
- return false;
+ if(size < clen)
+ return false;
- /* TODO: Remove this overflow check? */
- /* Check overflow */
- if(next <= i)
+ for(size_t i = 1; i < clen; ++i) {
+ if((str[i] & 0xC0) != 0x80)
return false;
-
- ++i;
- for(; i < next; ++i) {
- if((str[i] & 0xC0) != 0x80)
- return false;
- }
}
- return true;
-}
-
-static inline size_t utf8_get_codepoint_length(unsigned char b) {
- const unsigned int length1 = b >> 7;
- const unsigned int length2 = length1 & ((b & 0x40) >> 6);
- const unsigned int length3 = length2 & ((b & 0x20) >> 5);
- const unsigned int length4 = length3 & ((b & 0x10) >> 4);
- return (length1 ^ 0x01) + length1 + length2 + length3 + length4;
-}
-/* TODO: Optimize (remove branching, etc) */
-size_t mgl_utf8_decode(const unsigned char *str, uint32_t *decoded_codepoint) {
- const size_t length = utf8_get_codepoint_length(str[0]);
uint32_t codepoint;
- switch(length) {
+ switch(clen) {
case 1:
codepoint = (uint32_t)(str[0] & 0x7F);
break;
@@ -68,6 +56,8 @@ size_t mgl_utf8_decode(const unsigned char *str, uint32_t *decoded_codepoint) {
codepoint |= (uint32_t)(str[3] & 0x3F);
break;
}
+
+ *codepoint_length = clen;
*decoded_codepoint = codepoint;
- return length;
+ return true;
}