mirror of
https://github.com/mpv-player/mpv
synced 2025-04-17 04:40:09 +00:00
charset_conv: use our own UTF-8 check with ENCA only
Some charsets can look like valid UTF-8 without actually being UTF-8. One example is ISO-2022-JP, which is a 7-bit encoding, so its byte stream always passes a UTF-8 validity check. While ENCA apparently tends to misdetect real UTF-8, this is not the case with uchardet. uchardet can detect ISO-2022-JP correctly, but it didn't even get to try, because our own UTF-8 check succeeded first. So run our own UTF-8 check only when using ENCA. Fixes #2195.
This commit is contained in:
parent
3ab6155d21
commit
e5d3180889
@ -107,6 +107,11 @@ static const char *ms_bom_guess(bstr buf)
|
|||||||
#if HAVE_ENCA
|
#if HAVE_ENCA
|
||||||
static const char *enca_guess(struct mp_log *log, bstr buf, const char *language)
|
static const char *enca_guess(struct mp_log *log, bstr buf, const char *language)
|
||||||
{
|
{
|
||||||
|
// Do our own UTF-8 detection, because ENCA seems to get it wrong sometimes
|
||||||
|
// (suggested by divVerent). Explicitly allow cut-off UTF-8.
|
||||||
|
if (bstr_validate_utf8(buf) > -8)
|
||||||
|
return "UTF-8";
|
||||||
|
|
||||||
if (!language || !language[0])
|
if (!language || !language[0])
|
||||||
language = "__"; // neutral language
|
language = "__"; // neutral language
|
||||||
|
|
||||||
@ -202,12 +207,6 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// Do our own UTF-8 detection, because at least ENCA seems to get it
|
|
||||||
// wrong sometimes (suggested by divVerent).
|
|
||||||
int r = bstr_validate_utf8(buf);
|
|
||||||
if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF)))
|
|
||||||
return "UTF-8";
|
|
||||||
|
|
||||||
bstr params[3] = {{0}};
|
bstr params[3] = {{0}};
|
||||||
split_colon(user_cp, 3, params);
|
split_colon(user_cp, 3, params);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user