aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordec05eba <dec05eba@protonmail.com>2021-11-28 09:03:23 +0100
committerdec05eba <dec05eba@protonmail.com>2021-11-28 09:03:23 +0100
commit8792ee2cc5b501f0611e6304f529226a495825db (patch)
tree38d98dfd7bbcc5d48d13271a9e8923dd7b61dd91
parent993eea20151d881735c667757e3b64e4f85ac687 (diff)
Add utf8 index to offset function
-rw-r--r--include/mgl/system/utf8.h2
-rw-r--r--src/system/utf8.c21
2 files changed, 23 insertions, 0 deletions
diff --git a/include/mgl/system/utf8.h b/include/mgl/system/utf8.h
index 794884d..f6fe150 100644
--- a/include/mgl/system/utf8.h
+++ b/include/mgl/system/utf8.h
@@ -16,6 +16,8 @@ bool mgl_utf8_decode(const unsigned char *str, size_t size, uint32_t *decoded_co
Returns 0 if start of codepoint is not found.
*/
size_t mgl_utf8_get_start_of_codepoint(const unsigned char *str, size_t size, size_t offset);
+/* Returns the byte offset in |str| of the codepoint at |index| (a byte that fails to decode counts as one codepoint). Returns |size| if not found */
+size_t mgl_utf8_index_to_offset(const unsigned char *str, size_t size, size_t index);
#endif /* MGL_UTF8_H */
diff --git a/src/system/utf8.c b/src/system/utf8.c
index 5d5ec79..d38b292 100644
--- a/src/system/utf8.c
+++ b/src/system/utf8.c
@@ -90,3 +90,24 @@ size_t mgl_utf8_get_start_of_codepoint(const unsigned char *str, size_t size, si
return 0;
}
+
+/* Returns the byte offset of the codepoint at |index| in |str|, or |size| if |str| has fewer codepoints. */
+/* A byte that fails to decode is counted as one codepoint of length 1. TODO: Optimize (remove branching, etc) */
+size_t mgl_utf8_index_to_offset(const unsigned char *str, size_t size, size_t index) {
+    size_t offset = 0;
+    /* Walk |str| one codepoint at a time; running out of bytes first means |index| does not exist */
+    for(size_t codepoint_index = 0; offset < size; ++codepoint_index) {
+        uint32_t codepoint; /* Output slot passed to mgl_utf8_decode; its value is not needed here */
+        size_t clen;
+
+        /* Target reached: return before decoding, the codepoint that starts here is irrelevant */
+        if(codepoint_index == index)
+            return offset;
+
+        if(!mgl_utf8_decode(&str[offset], size - offset, &codepoint, &clen))
+            clen = 1;
+
+        offset += clen;
+    }
+    return size;
+}