sub: add subtitle charset conversion

This code was once part of subreader.c, then traveled to libass, and now
made its way back to the fork of the fork of the original code, MPlayer.

It works pretty much the same as subreader.c, except that we have to
concatenate some packets to do auto-detection. This is rather annoying,
but for all we know the actual source file could be a binary format.

Unlike subreader.c, the iconv context is reopened on each packet. This
is simpler, and with respect to multibyte encodings, more robust.
Reopening is probably not very fast, but I suspect subtitle charset
conversion is not an operation that happens often or has to be fast.

Also, this auto-detection is disabled for movtext - this is the only
format we know that has binary data in its packets, but is actually
decoded to text. FFmpeg doesn't really allow us to solve this properly,
because a) the input packets can be binary, and b) the output will be
checked whether it's UTF-8, and if it's not, the output is thrown away
and an error message is printed. We could just recode the decoded
subtitles before sd_ass if it weren't for that.
This commit is contained in:
wm4 2013-06-23 22:15:04 +02:00
parent feb64c2717
commit f735a03346
5 changed files with 332 additions and 5 deletions

View File

@ -2031,9 +2031,9 @@
``--subcp=enca:<language>:<fallback codepage>``
You can specify your language using a two letter language code to make
ENCA detect the codepage automatically. If unsure, enter anything and
watch mpv ``-v`` output for available languages. Fallback codepage
specifies the codepage to use, when autodetection fails.
ENCA detect the codepage automatically. If unsure, enter anything (if the
language is invalid, mpv will complain and list valid languages).
Fallback codepage specifies the codepage to use if autodetection fails.
*EXAMPLE*:
@ -2041,6 +2041,8 @@
are Czech, fall back on latin 2, if the detection fails.
- ``--subcp=enca:pl:cp1250`` guess the encoding for Polish, fall back on
cp1250.
- ``--subcp=enca:pl`` guess the encoding for Polish, fall back on UTF-8.
- ``--subcp=enca`` try universal detection, fall back on UTF-8.
--sub-delay=<sec>
Delays subtitles by <sec> seconds. Can be negative.

View File

@ -170,6 +170,7 @@ SOURCES = talloc.c \
core/av_log.c \
core/av_opts.c \
core/bstr.c \
core/charset_conv.c \
core/codecs.c \
core/command.c \
core/cpudetect.c \

240
core/charset_conv.c Normal file
View File

@ -0,0 +1,240 @@
/*
* This file is part of mpv.
*
* Based on code taken from libass (ISC license), which was originally part
* of MPlayer (GPL).
* Copyright (C) 2006 Evgeniy Stepanov <eugeni.stepanov@gmail.com>
*
* mpv is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* mpv is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with mpv. If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include "config.h"
#include "core/mp_msg.h"
#ifdef CONFIG_ENCA
#include <enca.h>
#endif
#ifdef CONFIG_ICONV
#include <iconv.h>
#endif
#include "charset_conv.h"
// Break user_cp into ':'-separated components without copying anything.
// At most max components are produced; once only one slot is left, the
// remainder of the string (including any further ':') goes into that slot.
// The returned bstrs point into user_cp; only the last one is 0-terminated.
// out_arr must have room for at least max entries.
// Returns the number of out_arr entries written (0 if user_cp is NULL or
// max < 1).
static int split_colon(const char *user_cp, int max, bstr *out_arr)
{
    if (!user_cp || max < 1)
        return 0;

    int filled = 0;
    const char *rest = user_cp;

    // Cut off leading components while a slot remains for the tail.
    for (const char *colon = strchr(rest, ':');
         colon && filled < max - 1;
         colon = strchr(rest, ':'))
    {
        out_arr[filled] = (bstr){(char *)rest, colon - rest};
        filled++;
        rest = colon + 1;
    }

    // Whatever is left becomes the final component.
    out_arr[filled] = (bstr){(char *)rest, strlen(rest)};
    filled++;

    return filled;
}
// Tells whether user_cp is a magic value that requests charset
// auto-detection (currently only the "enca" prefix) instead of naming a real
// iconv codepage. If this returns true, mp_charset_guess() has to be run on
// the input data to find out the actual codepage.
bool mp_charset_requires_guess(const char *user_cp)
{
    bstr parts[2] = {{0}};
    split_colon(user_cp, 2, parts);

    // Only the first component selects the detection method.
    bstr method = parts[0];
    if (bstrcasecmp0(method, "enca") != 0)
        return false;
    return true;
}
#ifdef CONFIG_ENCA
// Run ENCA charset detection on buf.
// language: two-letter ENCA language code; NULL or "" selects the neutral
//           language "__".
// Returns the detected charset as iconv-style name, or NULL if the charset
// could not be determined or ENCA could not create an analyser for the
// language. In the latter case the list of supported languages is printed.
static const char *enca_guess(bstr buf, const char *language)
{
    if (!language || !language[0])
        language = "__"; // neutral language

    const char *detected_cp = NULL;

    EncaAnalyser analyser = enca_analyser_alloc(language);
    if (analyser) {
        enca_set_termination_strictness(analyser, 0);
        EncaEncoding enc = enca_analyse_const(analyser, buf.start, buf.len);
        const char *tmp = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV);
        // Only accept a name if ENCA actually recognized the charset.
        if (tmp && enc.charset != ENCA_CS_UNKNOWN)
            detected_cp = tmp;
        enca_analyser_free(analyser);
    } else {
        mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA doesn't know language '%s'\n",
               language);
        size_t langcnt;
        const char **languages = enca_get_languages(&langcnt);
        mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA supported languages:");
        // size_t index: langcnt is a size_t, avoid a signed/unsigned compare.
        for (size_t i = 0; i < langcnt; i++)
            mp_msg(MSGT_SUBREADER, MSGL_ERR, " %s", languages[i]);
        mp_msg(MSGT_SUBREADER, MSGL_ERR, "\n");
        // The array itself is owned by us; the strings are not freed.
        free(languages);
    }

    return detected_cp;
}
#endif
// Determine the charset of buf according to user_cp.
// If user_cp does not request any known auto-detection (for example because
// it is a real iconv codepage), user_cp is returned as-is and buf is never
// looked at.
// Otherwise user_cp has the form "<method>[:<language>[:<fallback>]]". On
// successful detection the detected charset name is returned. If detection
// fails (or the method is not compiled in), the fallback part of user_cp is
// returned, which is NULL if no fallback was given.
const char *mp_charset_guess(bstr buf, const char *user_cp)
{
    if (!mp_charset_requires_guess(user_cp))
        return user_cp;

    bstr fields[3] = {{0}};
    split_colon(user_cp, 3, fields);
    bstr method = fields[0];

    // The language field is not 0-terminated inside user_cp, so copy it.
    char lang[100];
    snprintf(lang, sizeof(lang), "%.*s", BSTR_P(fields[1]));

    // The last field extends to the end of user_cp: already 0-terminated.
    const char *fallback = fields[2].start;

    const char *detected = NULL;

#ifdef CONFIG_ENCA
    if (bstrcasecmp0(method, "enca") == 0)
        detected = enca_guess(buf, lang);
#endif

    if (detected) {
        mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n",
               BSTR_P(method), detected);
        return detected;
    }

    mp_msg(MSGT_SUBREADER, MSGL_DBG2,
           "Detection with %.*s failed: fallback to %s\n",
           BSTR_P(method),
           fallback && fallback[0] ? fallback : "no conversion");
    return fallback;
}
// Convert the data in buf to UTF-8, resolving the charset first.
// user_cp can be an iconv codepage, a value returned by mp_charset_guess(),
// or a special value that triggers autodetection of the charset (e.g. using
// ENCA). The auto-detection step is the only difference to
// mp_iconv_to_utf8().
//  buf:     same as mp_iconv_to_utf8()
//  user_cp: iconv codepage, special value, NULL
//  flags:   same as mp_iconv_to_utf8()
//  returns: same as mp_iconv_to_utf8()
bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags)
{
    const char *resolved_cp = mp_charset_guess(buf, user_cp);
    return mp_iconv_to_utf8(buf, resolved_cp, flags);
}
// Use iconv to convert buf to UTF-8.
// Returns buf.start==NULL on error. Returns buf if cp is NULL, or if there is
// obviously no conversion required (e.g. if cp is "UTF-8" or "ASCII").
// Returns a newly allocated buffer if conversion is done and succeeds. The
// buffer will be terminated with 0 for convenience (the terminating 0 is not
// included in the returned length). An empty input converts to an empty
// (but allocated and 0-terminated) output.
// Free the returned buffer with talloc_free().
//  buf:     input data
//  cp:      iconv codepage (or NULL)
//  flags:   combination of MP_ICONV_* flags
//  returns: buf (no conversion), .start==NULL (error), or allocated buffer
// Without CONFIG_ICONV this always returns the error value.
bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags)
{
#ifdef CONFIG_ICONV
    const char *tocp = "UTF-8";

    if (!cp || !cp[0] || strcasecmp(cp, tocp) == 0)
        return buf;

    if (strcasecmp(cp, "ASCII") == 0)
        return buf;

    iconv_t icdsc;
    if ((icdsc = iconv_open(tocp, cp)) == (iconv_t) (-1)) {
        if (flags & MP_ICONV_VERBOSE)
            mp_msg(MSGT_SUBREADER, MSGL_ERR,
                   "Error opening iconv with codepage '%s'\n", cp);
        goto failure;
    }

    size_t size = buf.len;
    // Initial output size and growth step. The +1 guarantees a non-zero
    // allocation even for empty input, so that "osize - 1" below can't wrap
    // around and there is always room for the terminating 0.
    size_t step = size + 1;
    size_t osize = step;
    size_t ileft = size;
    // One byte is always held back for the terminating 0. Invariant:
    // (osize - oleft - 1) == number of bytes written to outbuf so far.
    size_t oleft = osize - 1;

    char *outbuf = talloc_size(NULL, osize);
    char *ip = buf.start;
    char *op = outbuf;

    while (1) {
        int clear = 0;
        size_t rc;
        if (ileft)
            rc = iconv(icdsc, &ip, &ileft, &op, &oleft);
        else {
            clear = 1; // clear the conversion state and leave
            rc = iconv(icdsc, NULL, NULL, &op, &oleft);
        }
        if (rc == (size_t) (-1)) {
            if (errno == E2BIG) {
                // Output buffer full: enlarge it and retry. realloc may move
                // the buffer, so re-derive the write pointer from its offset.
                size_t offset = op - outbuf;
                outbuf = talloc_realloc_size(NULL, outbuf, osize + step);
                op = outbuf + offset;
                osize += step;
                oleft += step;
            } else {
                if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) {
                    // This is intended for cases where the input buffer is cut
                    // at a random byte position. If this happens in the middle
                    // of the buffer, it should still be an error. We say it's
                    // fine if the error is within 10 bytes of the end.
                    if (ileft <= 10)
                        break;
                }
                if (flags & MP_ICONV_VERBOSE) {
                    mp_msg(MSGT_SUBREADER, MSGL_ERR,
                           "Error recoding text with codepage '%s'\n", cp);
                }
                talloc_free(outbuf);
                iconv_close(icdsc);
                goto failure;
            }
        } else if (clear)
            break;
    }

    iconv_close(icdsc);

    outbuf[osize - oleft - 1] = 0;
    return (bstr){outbuf, osize - oleft - 1};
#endif

failure:
    return (bstr){0};
}

17
core/charset_conv.h Normal file
View File

@ -0,0 +1,17 @@
#ifndef MP_CHARSET_CONV_H
#define MP_CHARSET_CONV_H
#include <stdbool.h>
#include "core/bstr.h"
// Bit flags for the "flags" parameter of mp_iconv_to_utf8() and
// mp_charset_guess_and_conv_to_utf8(). They are tested with bitwise AND, so
// they can be combined with bitwise OR.
enum {
MP_ICONV_VERBOSE = 1, // print errors instead of failing silently
MP_ICONV_ALLOW_CUTOFF = 2, // allow partial input data: an EINVAL conversion
                           // error within the last 10 input bytes is not
                           // treated as failure
};
// True if user_cp is a magic value requesting charset auto-detection (e.g.
// "enca:...") rather than a real iconv codepage.
bool mp_charset_requires_guess(const char *user_cp);
// Resolve user_cp to a charset name, running auto-detection on buf if
// user_cp requests it. Returns user_cp itself if no detection is requested.
// May return NULL if detection fails and user_cp specifies no fallback.
const char *mp_charset_guess(bstr buf, const char *user_cp);
// mp_charset_guess() followed by mp_iconv_to_utf8().
bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags);
// Convert buf from codepage cp to UTF-8. Returns buf itself if no conversion
// is needed, .start==NULL on error, or a new buffer that must be freed with
// talloc_free(). flags is a combination of MP_ICONV_* values.
bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags);
#endif

View File

@ -18,6 +18,7 @@
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <assert.h>
#include "config.h"
@ -27,6 +28,7 @@
#include "dec_sub.h"
#include "core/options.h"
#include "core/mp_msg.h"
#include "core/charset_conv.h"
extern const struct sd_functions sd_ass;
extern const struct sd_functions sd_lavc;
@ -56,6 +58,7 @@ struct dec_sub {
struct sd init_sd;
double video_fps;
const char *charset;
struct sd *sd[MAX_NUM_SD];
int num_sd;
@ -196,6 +199,37 @@ void sub_init_from_sh(struct dec_sub *sub, struct sh_sub *sh)
sh->gsh->codec ? sh->gsh->codec : "<unknown>");
}
// Resolve usercp to a charset for the given list of subtitle packets.
// If usercp doesn't request auto-detection, it is returned unchanged.
// Otherwise the packet payloads are glued together into one temporary
// buffer, which is handed to mp_charset_guess(); its result is returned.
static const char *guess_sub_cp(struct packet_list *subs, const char *usercp)
{
    if (!mp_charset_requires_guess(usercp))
        return usercp;

    // Concat all subs into a buffer. We probably can't do much better without
    // having the original data (which we don't, not anymore).
    const int size_limit = 2 * 1024 * 1024;
    const char *sep = "\n\n"; // In utf-16: U+0A0A GURMUKHI LETTER UU
    const int sep_len = strlen(sep);

    // First pass: find out how many leading packets fit below the size
    // limit, and how large the buffer has to be for them.
    int used_pkts = 0;
    int total = 0;
    while (used_pkts < subs->num_packets) {
        struct demux_packet *pkt = subs->packets[used_pkts];
        if (total + pkt->len > size_limit)
            break;
        total += pkt->len + sep_len;
        used_pkts++;
    }

    // Second pass: copy each payload, followed by the separator.
    bstr text = {talloc_size(NULL, total), 0};
    for (int n = 0; n < used_pkts; n++) {
        struct demux_packet *pkt = subs->packets[n];
        memcpy(text.start + text.len, pkt->buffer, pkt->len);
        text.len += pkt->len;
        memcpy(text.start + text.len, sep, sep_len);
        text.len += sep_len;
    }

    const char *result = mp_charset_guess(text, usercp);
    talloc_free(text.start);
    return result;
}
static void multiply_timings(struct packet_list *subs, double factor)
{
for (int n = 0; n < subs->num_packets; n++) {
@ -262,6 +296,7 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_sub *sh)
if (!sub_accept_packets_in_advance(sub) || sh->track)
return false;
const char *codec = sh->gsh->codec ? sh->gsh->codec : "";
void *tmp = talloc_new(NULL);
struct packet_list subs = {0};
@ -275,6 +310,14 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_sub *sh)
MP_TARRAY_APPEND(tmp, subs.packets, subs.num_packets, pkt);
}
// Can't run auto-detection on movtext packets: it's the only codec that
// even though it decodes to text has binary input data.
if (opts->sub_cp && strcmp(codec, "movtext") != 0)
sub->charset = guess_sub_cp(&subs, opts->sub_cp);
if (sub->charset)
mp_msg(MSGT_OSD, MSGL_INFO, "Using subtitle charset: %s\n", sub->charset);
// 23.976 FPS is used as default timebase for frame based formats
if (sub->video_fps && sh->frame_based)
multiply_timings(&subs, sub->video_fps / 23.976);
@ -313,10 +356,34 @@ static void decode_next(struct dec_sub *sub, int n, struct demux_packet *packet)
}
}
// Convert the payload of a packet from charset to UTF-8.
// Returns a newly allocated packet that owns the converted data and carries
// over pts, duration and avpacket from the input packet. Returns NULL if the
// conversion failed or if no conversion was necessary; the input packet is
// never modified.
static struct demux_packet *recode_packet(struct demux_packet *in,
                                          const char *charset)
{
    bstr src = {in->buffer, in->len};
    bstr utf8 = mp_iconv_to_utf8(src, charset, MP_ICONV_VERBOSE);

    // NULL means error, an unchanged pointer means the data was passed
    // through; in both cases there is nothing to wrap into a new packet.
    if (!utf8.start || utf8.start == src.start)
        return NULL;

    struct demux_packet *out = NULL;
    out = talloc_ptrtype(NULL, out);
    // Tie the lifetime of the converted buffer to the new packet.
    talloc_steal(out, utf8.start);
    *out = (struct demux_packet) {
        .buffer = utf8.start,
        .len = utf8.len,
        .pts = in->pts,
        .duration = in->duration,
        .avpacket = in->avpacket, // questionable, but gives us sidedata
    };
    return out;
}
// Feed one packet into the subtitle decoder chain.
// If a subtitle charset is set, the packet payload is converted to UTF-8
// first. If that conversion fails or is not needed, the packet is passed on
// unchanged. Does nothing if no decoder is set up.
void sub_decode(struct dec_sub *sub, struct demux_packet *packet)
{
    if (sub->num_sd <= 0)
        return;

    struct demux_packet *recoded = NULL;
    if (sub->charset)
        recoded = recode_packet(packet, sub->charset);

    // Decode exactly once: either the recoded copy or the original packet.
    decode_next(sub, 0, recoded ? recoded : packet);

    // The recoded packet is a temporary owned by this function. It stands in
    // for the caller-owned input packet, so it is released once decoding
    // returns (presumably decoders don't keep the packet - verify if a new
    // decoder does). No-op if nothing was recoded.
    talloc_free(recoded);
}
void sub_get_bitmaps(struct dec_sub *sub, struct mp_osd_res dim, double pts,