aboutsummaryrefslogtreecommitdiff
path: root/src/system/utf8.c
blob: cb14691f235b1acff3f0d70f9384f586c1b183f6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#include "../../include/mgl/system/utf8.h"

/* TODO: Optimize (remove branching, etc) */
/*
    Returns true if the |size| bytes starting at |str| are well-formed UTF-8
    (RFC 3629), false otherwise. An empty input (size == 0) is valid.

    Besides checking the lead/continuation byte bit patterns, this rejects:
      - overlong encodings (e.g. C0 80, E0 80 80, F0 80 80 80)
      - UTF-16 surrogates U+D800..U+DFFF (ED A0 80 .. ED BF BF)
      - values above U+10FFFF (F4 90 .. and lead bytes F5..FF)
      - sequences truncated by the end of the buffer
*/
bool mgl_utf8_is_valid(const unsigned char *str, size_t size) {
    size_t i = 0;
    while(i < size) {
        const unsigned char lead = str[i];
        size_t codepoint_length;
        /*
            Allowed range of the byte following the lead byte. A plain
            continuation byte is 80..BF; some lead bytes narrow this range
            to exclude overlong forms, surrogates and values > U+10FFFF.
        */
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;

        if(lead < 0x80) {
            /* ASCII */
            ++i;
            continue;
        } else if(lead >= 0xC2 && lead <= 0xDF) {
            /* C0 and C1 could only encode values < 0x80 (overlong) */
            codepoint_length = 2;
        } else if((lead & 0xF0) == 0xE0) {
            codepoint_length = 3;
            if(lead == 0xE0)
                second_min = 0xA0; /* E0 80..9F would be overlong */
            else if(lead == 0xED)
                second_max = 0x9F; /* ED A0..BF would be a surrogate */
        } else if(lead >= 0xF0 && lead <= 0xF4) {
            codepoint_length = 4;
            if(lead == 0xF0)
                second_min = 0x90; /* F0 80..8F would be overlong */
            else if(lead == 0xF4)
                second_max = 0x8F; /* F4 90..BF would exceed U+10FFFF */
        } else {
            /* Stray continuation byte (80..BF) or invalid lead byte (C0, C1, F5..FF) */
            return false;
        }

        /* i < size holds here, so the subtraction cannot wrap around */
        if(size - i < codepoint_length)
            return false;

        if(str[i + 1] < second_min || str[i + 1] > second_max)
            return false;

        for(size_t j = 2; j < codepoint_length; ++j) {
            if((str[i + j] & 0xC0) != 0x80)
                return false;
        }

        i += codepoint_length;
    }
    return true;
}

/*
    Maps the first byte of a sequence to a length in bytes (always 1..4),
    based only on the number of leading one-bits in |b|:
      0xxxxxxx -> 1
      10xxxxxx -> 1 (a continuation byte is treated as a single byte)
      110xxxxx -> 2
      1110xxxx -> 3
      1111xxxx -> 4 (the low four bits are not inspected)
    Implemented as a table lookup on the high nibble so there is no branching.
*/
static inline size_t utf8_get_codepoint_length(unsigned char b) {
    static const unsigned char length_by_high_nibble[16] = {
        1, 1, 1, 1, 1, 1, 1, 1, /* 0x0_ .. 0x7_ */
        1, 1, 1, 1,             /* 0x8_ .. 0xB_ */
        2, 2,                   /* 0xC_ .. 0xD_ */
        3,                      /* 0xE_ */
        4                       /* 0xF_ */
    };
    return length_by_high_nibble[b >> 4];
}

/* TODO: Optimize (remove branching, etc) */
/*
    Decodes one codepoint starting at |str| and stores it in |decoded_codepoint|.
    Returns the number of bytes the sequence occupies, as reported by
    utf8_get_codepoint_length() for str[0].

    No validation is done here and no buffer size is taken: every byte of the
    sequence is read, so the caller must make sure that many bytes are readable
    (for example by checking the input with mgl_utf8_is_valid first).
*/
size_t mgl_utf8_decode(const unsigned char *str, uint32_t *decoded_codepoint) {
    /* Payload bits carried by the first byte, indexed by (sequence length - 1) */
    static const unsigned char lead_payload_mask[4] = { 0x7F, 0x1F, 0x0F, 0x07 };

    const size_t num_bytes = utf8_get_codepoint_length(str[0]);
    uint32_t result = (uint32_t)(str[0] & lead_payload_mask[num_bytes - 1]);

    /* Every following byte contributes its low 6 bits, most significant first */
    for(size_t i = 1; i < num_bytes; ++i)
        result = (result << 6) | (uint32_t)(str[i] & 0x3F);

    *decoded_codepoint = result;
    return num_bytes;
}