mirror of
https://github.com/mpv-player/mpv
synced 2025-04-11 04:01:31 +00:00
sub: if charset detection fails, treat it as broken UTF-8
"Broken UTF-8" in this context means that we treat the data as UTF-8, but interpret any invalid UTF-8 sequences as Latin1. Also, run our own UTF-8 check function before the charset detectors. This prevents ENCA's UTF-8 check from possibly messing up (like detecting 7-bit clean UTF-8 as ASCII, or other things). It also takes care of UTF-8 detection if no charset detector (ENCA, libguess) is compiled in, and it lets us deal better with cut-off UTF-8 sequences.
This commit is contained in:
parent
380fa71fc7
commit
acb51c9243
@ -110,9 +110,6 @@ static const char *enca_guess(bstr buf, const char *language)
|
|||||||
#ifdef CONFIG_LIBGUESS
|
#ifdef CONFIG_LIBGUESS
|
||||||
static const char *libguess_guess(bstr buf, const char *language)
|
static const char *libguess_guess(bstr buf, const char *language)
|
||||||
{
|
{
|
||||||
if (libguess_validate_utf8(buf.start, buf.len))
|
|
||||||
return "UTF-8";
|
|
||||||
|
|
||||||
if (!language || !language[0] || strcmp(language, "help") == 0) {
|
if (!language || !language[0] || strcmp(language, "help") == 0) {
|
||||||
mp_msg(MSGT_SUBREADER, MSGL_ERR, "libguess needs a language: "
|
mp_msg(MSGT_SUBREADER, MSGL_ERR, "libguess needs a language: "
|
||||||
"japanese taiwanese chinese korean russian arabic turkish "
|
"japanese taiwanese chinese korean russian arabic turkish "
|
||||||
@ -129,11 +126,17 @@ static const char *libguess_guess(bstr buf, const char *language)
|
|||||||
// If user_cp doesn't refer to any known auto-detection (for example because
|
// If user_cp doesn't refer to any known auto-detection (for example because
|
||||||
// it's a real iconv codepage), user_cp is returned without even looking at
|
// it's a real iconv codepage), user_cp is returned without even looking at
|
||||||
// the buf data.
|
// the buf data.
|
||||||
const char *mp_charset_guess(bstr buf, const char *user_cp)
|
const char *mp_charset_guess(bstr buf, const char *user_cp, int flags)
|
||||||
{
|
{
|
||||||
if (!mp_charset_requires_guess(user_cp))
|
if (!mp_charset_requires_guess(user_cp))
|
||||||
return user_cp;
|
return user_cp;
|
||||||
|
|
||||||
|
// Do our own UTF-8 detection, because at least ENCA seems to get it
|
||||||
|
// wrong sometimes (suggested by divVerent).
|
||||||
|
int r = bstr_validate_utf8(buf);
|
||||||
|
if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF)))
|
||||||
|
return "UTF-8";
|
||||||
|
|
||||||
bstr params[3] = {{0}};
|
bstr params[3] = {{0}};
|
||||||
split_colon(user_cp, 3, params);
|
split_colon(user_cp, 3, params);
|
||||||
|
|
||||||
@ -160,9 +163,12 @@ const char *mp_charset_guess(bstr buf, const char *user_cp)
|
|||||||
res = fallback;
|
res = fallback;
|
||||||
mp_msg(MSGT_SUBREADER, MSGL_DBG2,
|
mp_msg(MSGT_SUBREADER, MSGL_DBG2,
|
||||||
"Detection with %.*s failed: fallback to %s\n",
|
"Detection with %.*s failed: fallback to %s\n",
|
||||||
BSTR_P(type), res && res[0] ? res : "no conversion");
|
BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!res && !(flags & MP_STRICT_UTF8))
|
||||||
|
res = "UTF-8-BROKEN";
|
||||||
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -176,7 +182,7 @@ const char *mp_charset_guess(bstr buf, const char *user_cp)
|
|||||||
// returns: same as mp_iconv_to_utf8()
|
// returns: same as mp_iconv_to_utf8()
|
||||||
bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags)
|
bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags)
|
||||||
{
|
{
|
||||||
return mp_iconv_to_utf8(buf, mp_charset_guess(buf, user_cp), flags);
|
return mp_iconv_to_utf8(buf, mp_charset_guess(buf, user_cp, flags), flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use iconv to convert buf to UTF-8.
|
// Use iconv to convert buf to UTF-8.
|
||||||
@ -201,6 +207,9 @@ bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags)
|
|||||||
if (strcasecmp(cp, "ASCII") == 0)
|
if (strcasecmp(cp, "ASCII") == 0)
|
||||||
return buf;
|
return buf;
|
||||||
|
|
||||||
|
if (strcasecmp(cp, "UTF-8-BROKEN") == 0)
|
||||||
|
return bstr_sanitize_utf8_latin1(NULL, buf);
|
||||||
|
|
||||||
iconv_t icdsc;
|
iconv_t icdsc;
|
||||||
if ((icdsc = iconv_open(tocp, cp)) == (iconv_t) (-1)) {
|
if ((icdsc = iconv_open(tocp, cp)) == (iconv_t) (-1)) {
|
||||||
if (flags & MP_ICONV_VERBOSE)
|
if (flags & MP_ICONV_VERBOSE)
|
||||||
|
@ -7,10 +7,11 @@
|
|||||||
enum {
|
enum {
|
||||||
MP_ICONV_VERBOSE = 1, // print errors instead of failing silently
|
MP_ICONV_VERBOSE = 1, // print errors instead of failing silently
|
||||||
MP_ICONV_ALLOW_CUTOFF = 2, // allow partial input data
|
MP_ICONV_ALLOW_CUTOFF = 2, // allow partial input data
|
||||||
|
MP_STRICT_UTF8 = 4, // don't fall back to UTF-8-BROKEN when guessing
|
||||||
};
|
};
|
||||||
|
|
||||||
bool mp_charset_requires_guess(const char *user_cp);
|
bool mp_charset_requires_guess(const char *user_cp);
|
||||||
const char *mp_charset_guess(bstr buf, const char *user_cp);
|
const char *mp_charset_guess(bstr buf, const char *user_cp, int flags);
|
||||||
bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags);
|
bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags);
|
||||||
bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags);
|
bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags);
|
||||||
|
|
||||||
|
@ -286,7 +286,7 @@ static const char *guess_sub_cp(struct packet_list *subs, const char *usercp)
|
|||||||
memcpy(text.start + text.len + pkt->len, sep, sep_len);
|
memcpy(text.start + text.len + pkt->len, sep, sep_len);
|
||||||
text.len += pkt->len + sep_len;
|
text.len += pkt->len + sep_len;
|
||||||
}
|
}
|
||||||
const char *guess = mp_charset_guess(text, usercp);
|
const char *guess = mp_charset_guess(text, usercp, 0);
|
||||||
talloc_free(text.start);
|
talloc_free(text.start);
|
||||||
return guess;
|
return guess;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user