about | summary | refs | log | tree | commit | diff
path: root/src/system
diff options
context:
space:
mode:
Diffstat (limited to 'src/system')
-rw-r--r-- src/system/utf8.c 73
1 files changed, 73 insertions, 0 deletions
diff --git a/src/system/utf8.c b/src/system/utf8.c
new file mode 100644
index 0000000..cb14691
--- /dev/null
+++ b/src/system/utf8.c
@@ -0,0 +1,73 @@
+#include "../../include/mgl/system/utf8.h"
+
+/*
+ * Returns true when the |size| bytes starting at |str| are well-formed UTF-8
+ * (RFC 3629), false otherwise. An empty buffer (size == 0) is valid.
+ *
+ * Besides checking the lead/continuation bit patterns, this rejects:
+ *  - sequences cut short by the end of the buffer,
+ *  - overlong encodings (lead bytes 0xC0/0xC1, 0xE0 followed by 0x80..0x9F,
+ *    0xF0 followed by 0x80..0x8F),
+ *  - UTF-16 surrogates U+D800..U+DFFF (0xED followed by 0xA0..0xBF),
+ *  - code points above U+10FFFF (0xF4 followed by 0x90..0xBF, lead bytes 0xF5..0xFF).
+ *
+ * TODO: Optimize (remove branching, etc)
+ */
+bool mgl_utf8_is_valid(const unsigned char *str, size_t size) {
+    size_t i = 0;
+    while(i < size) {
+        const unsigned char lead = str[i];
+        size_t codepoint_length;
+        /* Inclusive range allowed for the first continuation byte; narrowed below for special lead bytes */
+        unsigned char second_min = 0x80;
+        unsigned char second_max = 0xBF;
+
+        if(lead < 0x80) {
+            codepoint_length = 1;
+        } else if(lead >= 0xC2 && lead <= 0xDF) {
+            /* 0xC0 and 0xC1 could only encode values below U+0080 (overlong) */
+            codepoint_length = 2;
+        } else if((lead & 0xF0) == 0xE0) {
+            codepoint_length = 3;
+            if(lead == 0xE0)
+                second_min = 0xA0; /* below this the value would fit in 2 bytes */
+            else if(lead == 0xED)
+                second_max = 0x9F; /* above this lies the surrogate range */
+        } else if(lead >= 0xF0 && lead <= 0xF4) {
+            codepoint_length = 4;
+            if(lead == 0xF0)
+                second_min = 0x90; /* below this the value would fit in 3 bytes */
+            else if(lead == 0xF4)
+                second_max = 0x8F; /* above this the value exceeds U+10FFFF */
+        } else {
+            /* Stray continuation byte (0x80..0xBF) or a byte that never appears in UTF-8 */
+            return false;
+        }
+
+        /* i < size holds here, so size - i cannot wrap around */
+        if(codepoint_length > size - i)
+            return false;
+
+        if(codepoint_length > 1 && (str[i + 1] < second_min || str[i + 1] > second_max))
+            return false;
+
+        /* Any remaining bytes must be plain continuation bytes (10xxxxxx) */
+        for(size_t j = 2; j < codepoint_length; ++j) {
+            if((str[i + j] & 0xC0) != 0x80)
+                return false;
+        }
+
+        i += codepoint_length;
+    }
+    return true;
+}
+
+/*
+ * Branch-free computation of the byte length of a UTF-8 sequence from its first byte |b|.
+ * The result is the number of leading 1 bits in the top four bits of |b|, with a minimum
+ * of 1, so it is always in the range 1..4.
+ * Bytes that cannot start a sequence are not rejected: 0x80..0xBF yield 1 and 0xF8..0xFF yield 4.
+ */
+static inline size_t utf8_get_codepoint_length(unsigned char b) {
+    const unsigned int has_bit7 = (b >> 7) & 1u;
+    const unsigned int has_bit6 = (b >> 6) & 1u;
+    const unsigned int has_bit5 = (b >> 5) & 1u;
+    const unsigned int has_bit4 = (b >> 4) & 1u;
+
+    /* Each flag stays 1 only while every higher bit was 1 as well */
+    const unsigned int two_or_more = has_bit7 & has_bit6;
+    const unsigned int three_or_more = two_or_more & has_bit5;
+    const unsigned int four = three_or_more & has_bit4;
+
+    return (size_t)(1u + two_or_more + three_or_more + four);
+}
+
+/*
+ * Decodes the UTF-8 sequence that starts at |str|, stores the resulting code point in
+ * |decoded_codepoint| and returns the number of bytes the sequence occupies (1 to 4).
+ *
+ * No validation or bounds checking is performed: the length is taken from str[0] alone and
+ * that many bytes are read, with only the payload bits of each byte being used. Callers
+ * should therefore only pass data already known to be well-formed (for example checked with
+ * mgl_utf8_is_valid) and with the whole sequence readable.
+ *
+ * TODO: Optimize (remove branching, etc)
+ */
+size_t mgl_utf8_decode(const unsigned char *str, uint32_t *decoded_codepoint) {
+    const size_t length = utf8_get_codepoint_length(str[0]);
+    /* Start from a defined value so the store below never reads an indeterminate object */
+    uint32_t codepoint = 0;
+    switch(length) {
+        case 1:
+            /* 0xxxxxxx: 7 payload bits */
+            codepoint = (uint32_t)(str[0] & 0x7F);
+            break;
+        case 2:
+            /* 110xxxxx 10xxxxxx: 5 + 6 payload bits */
+            codepoint = ((uint32_t)(str[0] & 0x1F) << 6);
+            codepoint |= (uint32_t)(str[1] & 0x3F);
+            break;
+        case 3:
+            /* 1110xxxx 10xxxxxx 10xxxxxx: 4 + 6 + 6 payload bits */
+            codepoint = ((uint32_t)(str[0] & 0x0F) << 12);
+            codepoint |= ((uint32_t)(str[1] & 0x3F) << 6);
+            codepoint |= (uint32_t)(str[2] & 0x3F);
+            break;
+        case 4:
+            /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: 3 + 6 + 6 + 6 payload bits */
+            codepoint = ((uint32_t)(str[0] & 0x07) << 18);
+            codepoint |= ((uint32_t)(str[1] & 0x3F) << 12);
+            codepoint |= ((uint32_t)(str[2] & 0x3F) << 6);
+            codepoint |= (uint32_t)(str[3] & 0x3F);
+            break;
+        default:
+            /* Not reachable while the length helper returns 1..4; kept so every path is defined */
+            break;
+    }
+    *decoded_codepoint = codepoint;
+    return length;
+}