bstr: add function for splitting UTF-8

This commit is contained in:
wm4 2014-01-15 16:13:07 +01:00
parent 904060ad7b
commit ca8937d7d2
2 changed files with 18 additions and 1 deletions

View File

@ -296,6 +296,17 @@ int bstr_decode_utf8(struct bstr s, struct bstr *out_next)
return codepoint;
}
struct bstr bstr_split_utf8(struct bstr str, struct bstr *out_next)
{
bstr rest;
int code = bstr_decode_utf8(str, &rest);
if (code < 0)
return (bstr){0};
if (out_next)
*out_next = rest;
return bstr_splice(str, 0, str.len - rest.len);
}
int bstr_validate_utf8(struct bstr s)
{
while (s.len) {

View File

@ -81,13 +81,19 @@ double bstrtod(struct bstr str, struct bstr *rest);
void bstr_lower(struct bstr str);
int bstr_sscanf(struct bstr str, const char *format, ...);
// Decode the UTF-8 code point at the start of the string, and return the
// character.
// After calling this function, *out_next will point to the next character
// (the rest of the string after the decoded one).
// out_next can be NULL.
// On error, -1 is returned, and *out_next is not modified.
int bstr_decode_utf8(struct bstr str, struct bstr *out_next);
// Return the sub-string of str that holds the UTF-8 code point at the start
// of the string (the encoded bytes, not the decoded character value).
// After calling this function, *out_next will point to the next character.
// out_next can be NULL.
// On error (the start of str cannot be decoded), an empty string is
// returned, and *out_next is not modified.
struct bstr bstr_split_utf8(struct bstr str, struct bstr *out_next);
// Return the length of the UTF-8 sequence that starts with the given byte.
// Given a string char *s, the next UTF-8 code point is to be expected at
// s + bstr_parse_utf8_code_length(s[0])