mirror of https://github.com/mpv-player/mpv
sub: allow specifying a fallback codepage if input is not UTF-8
Normally, --subcp always forces conversion, even if the UTF-8 check on the input succeeds. Extend --subcp to allow specifying a codepage that is used only as a fallback when the input is not UTF-8. For example, --subcp=utf8:cp1250 will use UTF-8 if the input looks like UTF-8, and will fall back to cp1250 if the UTF-8 check fails. I think this should actually be the default, but on the other hand, this changes the semantics of the option, and a user would actually expect --subcp to force conversion, rather than silently using UTF-8 if that happens to work.
This commit is contained in:
parent
00f735d5cb
commit
fe3c445112
|
@ -2035,10 +2035,14 @@
|
||||||
If your system supports ``iconv(3)``, you can use this option to specify
|
If your system supports ``iconv(3)``, you can use this option to specify
|
||||||
the subtitle codepage.
|
the subtitle codepage.
|
||||||
|
|
||||||
|
Warning: if you force the charset, even subtitles that are known to be
|
||||||
|
UTF-8 will be recoded, which is perhaps not what you expect.
|
||||||
|
|
||||||
.. admonition:: Examples
|
.. admonition:: Examples
|
||||||
|
|
||||||
- ``--subcp=latin2``
|
- ``--subcp=utf8:latin2`` Use Latin 2 if input is not UTF-8.
|
||||||
- ``--subcp=cp1250``
|
- ``--subcp=utf8:cp1250`` Use CP1250 if input is not UTF-8.
|
||||||
|
- ``--subcp=cp1250`` Always force recoding from CP1250, even if input is valid UTF-8.
|
||||||
|
|
||||||
If the player was compiled with ENCA support, you can use special syntax
|
If the player was compiled with ENCA support, you can use special syntax
|
||||||
to use that::
|
to use that::
|
||||||
|
@ -2049,6 +2053,8 @@
|
||||||
ENCA detect the codepage automatically. If unsure, enter anything (if the
|
ENCA detect the codepage automatically. If unsure, enter anything (if the
|
||||||
language is invalid, mpv will complain and list valid languages).
|
language is invalid, mpv will complain and list valid languages).
|
||||||
Fallback codepage specifies the codepage to use if autodetection fails.
|
Fallback codepage specifies the codepage to use if autodetection fails.
|
||||||
|
If no fallback is specified, the subtitle will be interpreted as UTF-8,
|
||||||
|
but with "Latin 1" as fallback for bytes that are not valid UTF-8 sequences.
|
||||||
|
|
||||||
.. admonition:: Examples
|
.. admonition:: Examples
|
||||||
|
|
||||||
|
|
|
@ -70,9 +70,13 @@ static int split_colon(const char *user_cp, int max, bstr *out_arr)
|
||||||
bool mp_charset_requires_guess(const char *user_cp)
|
bool mp_charset_requires_guess(const char *user_cp)
|
||||||
{
|
{
|
||||||
bstr res[2] = {{0}};
|
bstr res[2] = {{0}};
|
||||||
split_colon(user_cp, 2, res);
|
int r = split_colon(user_cp, 2, res);
|
||||||
|
// Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
|
||||||
|
// by default, plus a codepage that is used if the input is not UTF-8.
|
||||||
return bstrcasecmp0(res[0], "enca") == 0 ||
|
return bstrcasecmp0(res[0], "enca") == 0 ||
|
||||||
bstrcasecmp0(res[0], "guess") == 0;
|
bstrcasecmp0(res[0], "guess") == 0 ||
|
||||||
|
(r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
|
||||||
|
(r > 1 && bstrcasecmp0(res[0], "utf8") == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_ENCA
|
#ifdef CONFIG_ENCA
|
||||||
|
@ -155,6 +159,10 @@ const char *mp_charset_guess(bstr buf, const char *user_cp, int flags)
|
||||||
if (bstrcasecmp0(type, "guess") == 0)
|
if (bstrcasecmp0(type, "guess") == 0)
|
||||||
res = libguess_guess(buf, lang);
|
res = libguess_guess(buf, lang);
|
||||||
#endif
|
#endif
|
||||||
|
if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
|
||||||
|
if (!fallback)
|
||||||
|
fallback = params[1].start; // must be already 0-terminated
|
||||||
|
}
|
||||||
|
|
||||||
if (res) {
|
if (res) {
|
||||||
mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n",
|
mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n",
|
||||||
|
|
Loading…
Reference in New Issue