mirror of https://github.com/mpv-player/mpv
charset_conv: add uchardet support
For now, it needs to be explicitly selected. ENCA is still the default. This assumes uchardet returns iconv names. This doesn't always seem to be the case, and the result is lots of iconv errors. So explicitly check for this situation, and print a warning if it occurs. It's entirely possible that uchardet support is actually useless, because names are not necessarily iconv-compatible (but uchardet doesn't seem to document whether it attempts to return iconv-compatible names if possible). Fixes #908.
This commit is contained in:
parent
11f2be2bcc
commit
a74914a057
|
@ -1491,6 +1491,12 @@ Subtitles
|
||||||
mode. Use ``--sub-codepage=guess:help`` to get a list of
|
mode. Use ``--sub-codepage=guess:help`` to get a list of
|
||||||
languages subject to the same caveat as with ENCA above.
|
languages subject to the same caveat as with ENCA above.
|
||||||
|
|
||||||
|
If the player was compiled with uchardet support you can use it with:
|
||||||
|
|
||||||
|
``--sub-codepage=uchardet``
|
||||||
|
|
||||||
|
This mode doesn't take language or fallback codepage.
|
||||||
|
|
||||||
``--sub-fix-timing``, ``--no-sub-fix-timing``
|
``--sub-fix-timing``, ``--no-sub-fix-timing``
|
||||||
By default, external text subtitles are preprocessed to remove minor gaps
|
By default, external text subtitles are preprocessed to remove minor gaps
|
||||||
or overlaps between subtitles (if the difference is smaller than 200 ms,
|
or overlaps between subtitles (if the difference is smaller than 200 ms,
|
||||||
|
|
|
@ -176,6 +176,7 @@ options_state_machine() {
|
||||||
opt_yes_no _dvdread "libdvdread"
|
opt_yes_no _dvdread "libdvdread"
|
||||||
opt_yes_no _dvdnav "libdvdnav"
|
opt_yes_no _dvdnav "libdvdnav"
|
||||||
opt_yes_no _enca "ENCA charset oracle library"
|
opt_yes_no _enca "ENCA charset oracle library"
|
||||||
|
opt_yes_no _uchardet "uchardet charset detection library"
|
||||||
opt_yes_no _libass "subtitle rendering with libass"
|
opt_yes_no _libass "subtitle rendering with libass"
|
||||||
opt_yes_no _libavdevice "libavdevice demuxers"
|
opt_yes_no _libavdevice "libavdevice demuxers"
|
||||||
opt_yes_no _libavfilter "libavfilter"
|
opt_yes_no _libavfilter "libavfilter"
|
||||||
|
@ -732,6 +733,7 @@ echo "LIBASS_OSD = $_libass" >> $CONFIG_MAK
|
||||||
echo "DUMMY_OSD = $_dummy_osd" >> $CONFIG_MAK
|
echo "DUMMY_OSD = $_dummy_osd" >> $CONFIG_MAK
|
||||||
|
|
||||||
check_pkg_config "ENCA" $_enca ENCA 'enca'
|
check_pkg_config "ENCA" $_enca ENCA 'enca'
|
||||||
|
check_pkg_config "uchardet" $_uchardet UCHARDET 'uchardet'
|
||||||
|
|
||||||
check_pkg_config "zlib" auto ZLIB 'zlib'
|
check_pkg_config "zlib" auto ZLIB 'zlib'
|
||||||
test $(defretval) = no && die "Unable to find development files for zlib."
|
test $(defretval) = no && die "Unable to find development files for zlib."
|
||||||
|
|
|
@ -36,6 +36,10 @@
|
||||||
#include <libguess.h>
|
#include <libguess.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if HAVE_UCHARDET
|
||||||
|
#include <uchardet.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#if HAVE_ICONV
|
#if HAVE_ICONV
|
||||||
#include <iconv.h>
|
#include <iconv.h>
|
||||||
#endif
|
#endif
|
||||||
|
@ -81,6 +85,7 @@ bool mp_charset_requires_guess(const char *user_cp)
|
||||||
// Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
|
// Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
|
||||||
// by default, plus a codepage that is used if the input is not UTF-8.
|
// by default, plus a codepage that is used if the input is not UTF-8.
|
||||||
return bstrcasecmp0(res[0], "enca") == 0 ||
|
return bstrcasecmp0(res[0], "enca") == 0 ||
|
||||||
|
bstrcasecmp0(res[0], "uchardet") == 0 ||
|
||||||
bstrcasecmp0(res[0], "auto") == 0 ||
|
bstrcasecmp0(res[0], "auto") == 0 ||
|
||||||
bstrcasecmp0(res[0], "guess") == 0 ||
|
bstrcasecmp0(res[0], "guess") == 0 ||
|
||||||
(r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
|
(r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
|
||||||
|
@ -145,6 +150,35 @@ static const char *libguess_guess(struct mp_log *log, bstr buf,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if HAVE_UCHARDET
|
||||||
|
// Run uchardet charset detection on buf.
// Returns a copy of the detected charset name, allocated on talloc_ctx with
// talloc_strdup(), or NULL if:
//  - the detector could not be created,
//  - uchardet_handle_data() reported an error,
//  - the detector reported no (or an empty) charset name,
//  - iconv_open() does not accept the name as a source for conversion to
//    UTF-8 (a warning is printed to log in this case).
static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
{
    uchardet_t det = uchardet_new();
    if (!det)
        return NULL;

    char *res = NULL;

    if (uchardet_handle_data(det, buf.start, buf.len) != 0)
        goto done;
    uchardet_data_end(det);

    // Validate the detector's string before copying it, so that a rejected
    // name does not leave an unused allocation behind on talloc_ctx.
    const char *name = uchardet_get_charset(det);
    if (!name || !name[0])
        goto done;

#if HAVE_ICONV
    // The name is only useful if iconv can actually open it; probe with a
    // throwaway conversion descriptor.
    iconv_t icdsc = iconv_open("UTF-8", name);
    if (icdsc == (iconv_t)(-1)) {
        mp_warn(log, "Charset detected as %s, but not supported by iconv.\n",
                name);
        goto done;
    }
    iconv_close(icdsc);
#endif
    // Without iconv the name cannot be probed here and is returned as-is.

    // Copy while the detector (and thus the string it returned) is still
    // alive; the detector is deleted below.
    res = talloc_strdup(talloc_ctx, name);

done:
    uchardet_delete(det);
    return res;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Runs charset auto-detection on the input buffer, and returns the result.
|
// Runs charset auto-detection on the input buffer, and returns the result.
|
||||||
// If auto-detection fails, NULL is returned.
|
// If auto-detection fails, NULL is returned.
|
||||||
// If user_cp doesn't refer to any known auto-detection (for example because
|
// If user_cp doesn't refer to any known auto-detection (for example because
|
||||||
|
@ -196,6 +230,11 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
|
||||||
if (bstrcasecmp0(type, "guess") == 0)
|
if (bstrcasecmp0(type, "guess") == 0)
|
||||||
res = libguess_guess(log, buf, lang);
|
res = libguess_guess(log, buf, lang);
|
||||||
#endif
|
#endif
|
||||||
|
#if HAVE_UCHARDET
|
||||||
|
if (bstrcasecmp0(type, "uchardet") == 0)
|
||||||
|
res = mp_uchardet(talloc_ctx, log, buf);
|
||||||
|
#endif
|
||||||
|
|
||||||
if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
|
if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
|
||||||
if (!fallback)
|
if (!fallback)
|
||||||
fallback = params[1].start; // must be already 0-terminated
|
fallback = params[1].start; // must be already 0-terminated
|
||||||
|
|
Loading…
Reference in New Issue