mirror of
https://github.com/mpv-player/mpv
synced 2025-02-03 21:52:12 +00:00
bstr: add UTF-8 validation and sanitation functions
This commit is contained in:
parent
04bc16f6ea
commit
380fa71fc7
@ -273,7 +273,7 @@ int bstr_decode_utf8(struct bstr s, struct bstr *out_next)
|
|||||||
return -1;
|
return -1;
|
||||||
codepoint &= 127 >> bytes;
|
codepoint &= 127 >> bytes;
|
||||||
for (int n = 1; n < bytes; n++) {
|
for (int n = 1; n < bytes; n++) {
|
||||||
int tmp = s.start[0];
|
int tmp = (unsigned char)s.start[0];
|
||||||
if ((tmp & 0xC0) != 0x80)
|
if ((tmp & 0xC0) != 0x80)
|
||||||
return -1;
|
return -1;
|
||||||
codepoint = (codepoint << 6) | (tmp & ~0xC0);
|
codepoint = (codepoint << 6) | (tmp & ~0xC0);
|
||||||
@ -285,6 +285,69 @@ int bstr_decode_utf8(struct bstr s, struct bstr *out_next)
|
|||||||
return codepoint;
|
return codepoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check whether s consists entirely of decodable UTF-8 sequences.
// Returns 0 if every sequence in s decodes successfully (an empty string is
// valid). If decoding fails at a multi-byte sequence whose available
// continuation bytes are all well-formed but which runs past the end of s,
// returns -N, where N is the number of bytes missing from that sequence.
// Every other decoding failure returns -8.
int bstr_validate_utf8(struct bstr s)
{
    while (s.len) {
        if (bstr_decode_utf8(s, &s) >= 0)
            continue;

        // Decoding failed. The code below inspects s.start[0] as the lead
        // byte of the offending sequence, i.e. it relies on
        // bstr_decode_utf8() not advancing s when it reports an error.
        unsigned char lead = (unsigned char)s.start[0];
        int bytes = bstr_parse_utf8_code_length(lead);
        if (bytes > 1 && s.len < 6) {
            // s.len is below 6 here, so converting it to int is lossless.
            // Doing the arithmetic in int keeps the comparison and the
            // negated result signed even if the length field is an unsigned
            // type (NOTE(review): type of bstr.len is not visible here).
            int avail = (int)s.len;
            // Manually check the continuation bytes that are present.
            for (int n = 1; n < bytes; n++) {
                if (n >= avail) {
                    // Everything valid until now - the sequence is just
                    // cut off. Report how many bytes are missing.
                    return -(bytes - avail);
                }
                if (((unsigned char)s.start[n] & 0xC0) != 0x80)
                    break; // not a continuation byte: genuinely invalid
            }
        }
        return -8;
    }
    return 0;
}
|
||||||
|
|
||||||
|
// Append the bytes of s to the end of *buf, growing the buffer as needed.
// buf->start is (re)allocated through talloc_realloc() with a NULL context;
// buf->len is increased by s.len. Appending an empty s leaves *buf untouched.
static void append_bstr(bstr *buf, bstr s)
{
    // Nothing to copy. Returning early also avoids asking talloc_realloc()
    // for a 0-byte buffer and then calling memcpy() with a NULL destination
    // when *buf is still empty; passing NULL to memcpy() is undefined
    // behavior even when the length is 0.
    if (!s.len)
        return;

    // NOTE(review): the result of talloc_realloc() is not checked here;
    // confirm this matches the project's out-of-memory policy.
    buf->start = talloc_realloc(NULL, buf->start, unsigned char, buf->len + s.len);
    memcpy(buf->start + buf->len, s.start, s.len);
    buf->len += s.len;
}
|
||||||
|
|
||||||
|
struct bstr bstr_sanitize_utf8_latin1(void *talloc_ctx, struct bstr s)
|
||||||
|
{
|
||||||
|
bstr new = {0};
|
||||||
|
bstr left = s;
|
||||||
|
unsigned char *first_ok = s.start;
|
||||||
|
while (left.len) {
|
||||||
|
int r = bstr_decode_utf8(left, &left);
|
||||||
|
if (r < 0) {
|
||||||
|
append_bstr(&new, (bstr){first_ok, left.start - first_ok});
|
||||||
|
uint32_t codepoint = (unsigned char)left.start[0];
|
||||||
|
char data[8];
|
||||||
|
uint8_t tmp;
|
||||||
|
char *output = data;
|
||||||
|
PUT_UTF8(codepoint, tmp, *output++ = tmp;);
|
||||||
|
append_bstr(&new, (bstr){data, output - data});
|
||||||
|
left.start += 1;
|
||||||
|
left.len -= 1;
|
||||||
|
first_ok = left.start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!new.start)
|
||||||
|
return s;
|
||||||
|
if (first_ok != left.start)
|
||||||
|
append_bstr(&new, (bstr){first_ok, left.start - first_ok});
|
||||||
|
// For convenience
|
||||||
|
append_bstr(&new, (bstr){"\0", 1});
|
||||||
|
new.len -= 1;
|
||||||
|
talloc_steal(talloc_ctx, new.start);
|
||||||
|
return new;
|
||||||
|
}
|
||||||
|
|
||||||
bool bstr_case_startswith(struct bstr s, struct bstr prefix)
|
bool bstr_case_startswith(struct bstr s, struct bstr prefix)
|
||||||
{
|
{
|
||||||
struct bstr start = bstr_splice(s, 0, prefix.len);
|
struct bstr start = bstr_splice(s, 0, prefix.len);
|
||||||
|
@ -92,6 +92,23 @@ int bstr_decode_utf8(struct bstr str, struct bstr *out_next);
|
|||||||
// On error, -1 is returned. On success, it returns a value in the range [1, 4].
|
// On error, -1 is returned. On success, it returns a value in the range [1, 4].
|
||||||
int bstr_parse_utf8_code_length(unsigned char b);
|
int bstr_parse_utf8_code_length(unsigned char b);
|
||||||
|
|
||||||
|
// Return >= 0 if the string is valid UTF-8, otherwise negative error code.
|
||||||
|
// Embedded \0 bytes are considered valid.
|
||||||
|
// This returns -N if the UTF-8 string was likely just cut-off in the middle of
|
||||||
|
// a UTF-8 sequence: -1 means 1 byte was missing, -5 means 5 bytes were missing.
|
||||||
|
// If the string was likely not cut off, -8 is returned.
|
||||||
|
// Use (return_value > -8) to check whether the string is valid UTF-8 or valid
|
||||||
|
// but cut-off UTF-8.
|
||||||
|
int bstr_validate_utf8(struct bstr s);
|
||||||
|
|
||||||
|
// Force the input string to valid UTF-8. If invalid UTF-8 encoding is
|
||||||
|
// encountered, the invalid bytes are interpreted as Latin-1.
|
||||||
|
// Embedded \0 bytes are considered valid.
|
||||||
|
// If replacement happens, a newly allocated string is returned (with a \0
|
||||||
|
// byte added past its end for convenience). The string is allocated via
|
||||||
|
// talloc, with talloc_ctx as parent.
|
||||||
|
struct bstr bstr_sanitize_utf8_latin1(void *talloc_ctx, struct bstr s);
|
||||||
|
|
||||||
// Return the text before the next line break. Change *rest to
|
// Return the text before the next line break. Change *rest to
|
||||||
// point to the text following this line break. (rest can be NULL.)
|
// point to the text following this line break. (rest can be NULL.)
|
||||||
// Line break characters are not stripped.
|
// Line break characters are not stripped.
|
||||||
|
Loading…
Reference in New Issue
Block a user