mirror of https://github.com/mpv-player/mpv
sub: allow specifying a fallback codepage if input is not UTF-8
Normally, --subcp always forces conversion: the input is recoded even if the UTF-8 check on it succeeds. Extend the --subcp option to allow specifying a codepage that is used only as a fallback if the input is not UTF-8. So, for example, --subcp=utf8:cp1250 will use UTF-8 if the input looks like UTF-8, and will fall back to cp1250 if the UTF-8 check fails. I think this should actually be the default, but on the other hand, that would change the semantics of the option, and a user would actually expect --subcp to force conversion, rather than silently using UTF-8 if that happens to work.
This commit is contained in:
parent
00f735d5cb
commit
fe3c445112
|
@ -2035,10 +2035,14 @@
|
|||
If your system supports ``iconv(3)``, you can use this option to specify
|
||||
the subtitle codepage.
|
||||
|
||||
Warning: if you force the charset, even subtitles that are known to be
|
||||
UTF-8 will be recoded, which is perhaps not what you expect.
|
||||
|
||||
.. admonition:: Examples
|
||||
|
||||
- ``--subcp=latin2``
|
||||
- ``--subcp=cp1250``
|
||||
- ``--subcp=utf8:latin2`` Use Latin 2 if input is not UTF-8.
|
||||
- ``--subcp=utf8:cp1250`` Use CP1250 if input is not UTF-8.
|
||||
- ``--subcp=cp1250`` Always force recoding to cp1250.
|
||||
|
||||
If the player was compiled with ENCA support, you can use special syntax
|
||||
to use that::
|
||||
|
@ -2049,6 +2053,8 @@
|
|||
ENCA detect the codepage automatically. If unsure, enter anything (if the
|
||||
language is invalid, mpv will complain and list valid languages).
|
||||
Fallback codepage specifies the codepage to use if autodetection fails.
|
||||
If no fallback is specified, the subtitle will be interpreted as UTF-8,
|
||||
but with "Latin 1" as fallback for bytes that are not valid UTF-8 sequences.
|
||||
|
||||
.. admonition:: Examples
|
||||
|
||||
|
|
|
@ -70,9 +70,13 @@ static int split_colon(const char *user_cp, int max, bstr *out_arr)
|
|||
bool mp_charset_requires_guess(const char *user_cp)
|
||||
{
|
||||
bstr res[2] = {{0}};
|
||||
split_colon(user_cp, 2, res);
|
||||
int r = split_colon(user_cp, 2, res);
|
||||
// Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
|
||||
// by default, plus a codepage that is used if the input is not UTF-8.
|
||||
return bstrcasecmp0(res[0], "enca") == 0 ||
|
||||
bstrcasecmp0(res[0], "guess") == 0;
|
||||
bstrcasecmp0(res[0], "guess") == 0 ||
|
||||
(r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
|
||||
(r > 1 && bstrcasecmp0(res[0], "utf8") == 0);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ENCA
|
||||
|
@ -155,6 +159,10 @@ const char *mp_charset_guess(bstr buf, const char *user_cp, int flags)
|
|||
if (bstrcasecmp0(type, "guess") == 0)
|
||||
res = libguess_guess(buf, lang);
|
||||
#endif
|
||||
if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
|
||||
if (!fallback)
|
||||
fallback = params[1].start; // must be already 0-terminated
|
||||
}
|
||||
|
||||
if (res) {
|
||||
mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n",
|
||||
|
|
Loading…
Reference in New Issue