mirror of
https://github.com/mpv-player/mpv
synced 2025-03-25 04:38:01 +00:00
sub: detect charset in demuxer
Slightly simpler, and removes the need to pre-read all subtitle packets. This still does the subtitle charset conversion on the packet level (instead of converting when parsing the file), so in theory this could still provide a way to change the charset at runtime. But maybe even this should be removed, as FFmpeg is somewhat likely to get its own charset detection and conversion mechanism in the future. (Would have to keep the subtitle file in memory to allow changing the charset on the fly, I guess.)
This commit is contained in:
parent
e798cf1ff6
commit
74c11f0c84
@ -40,6 +40,7 @@
|
||||
#include "common/tags.h"
|
||||
#include "common/av_common.h"
|
||||
#include "misc/bstr.h"
|
||||
#include "misc/charset_conv.h"
|
||||
|
||||
#include "stream/stream.h"
|
||||
#include "demux.h"
|
||||
@ -108,16 +109,16 @@ struct format_hack {
|
||||
bool no_stream : 1; // do not wrap struct stream as AVIOContext
|
||||
bool use_stream_ids : 1; // export the native stream IDs
|
||||
bool fully_read : 1; // set demuxer.fully_read flag
|
||||
bool detect_charset : 1; // format is a small text file, possibly not UTF8
|
||||
bool image_format : 1; // expected to contain exactly 1 frame
|
||||
bool utf8_subs : 1; // subtitles are (mostly) guaranteed UTF-8
|
||||
// Do not confuse player's position estimation (position is into external
|
||||
// segment, with e.g. HLS, player knows about the playlist main file only).
|
||||
bool clear_filepos : 1;
|
||||
};
|
||||
|
||||
#define BLACKLIST(fmt) {fmt, .ignore = true}
|
||||
#define TEXTSUB(fmt) {fmt, .fully_read = true}
|
||||
#define TEXTSUB_UTF8(fmt) {fmt, .fully_read = true, .utf8_subs = true}
|
||||
#define TEXTSUB(fmt) {fmt, .fully_read = true, .detect_charset = true}
|
||||
#define TEXTSUB_UTF8(fmt) {fmt, .fully_read = true}
|
||||
#define IMAGEFMT(fmt) {fmt, .image_format = true}
|
||||
|
||||
static const struct format_hack format_hacks[] = {
|
||||
@ -145,10 +146,6 @@ static const struct format_hack format_hacks[] = {
|
||||
TEXTSUB_UTF8("webvtt"),
|
||||
TEXTSUB_UTF8("ass"),
|
||||
|
||||
// Formats which support muxed subtitles, and always use UTF-8 for them.
|
||||
{"mov", .utf8_subs = true},
|
||||
{"mkv", .utf8_subs = true},
|
||||
|
||||
// Useless non-sense, sometimes breaks MLP2 subreader.c fallback
|
||||
BLACKLIST("tty"),
|
||||
// Let's open files with extremely generic extensions (.bin) with a
|
||||
@ -174,6 +171,7 @@ typedef struct lavf_priv {
|
||||
int cur_program;
|
||||
char *mime_type;
|
||||
bool merge_track_metadata;
|
||||
char *file_charset;
|
||||
} lavf_priv_t;
|
||||
|
||||
// At least mp4 has name="mov,mp4,m4a,3gp,3g2,mj2", so we split the name
|
||||
@ -262,6 +260,23 @@ static void list_formats(struct demuxer *demuxer)
|
||||
MP_INFO(demuxer, "%15s : %s\n", fmt->name, fmt->long_name);
|
||||
}
|
||||
|
||||
static void detect_charset(struct demuxer *demuxer)
|
||||
{
|
||||
lavf_priv_t *priv = demuxer->priv;
|
||||
char *cp = demuxer->opts->sub_cp;
|
||||
if (mp_charset_requires_guess(cp)) {
|
||||
bstr data = stream_peek(demuxer->stream, STREAM_MAX_BUFFER_SIZE);
|
||||
cp = (char *)mp_charset_guess(priv, demuxer->log, data, cp, 0);
|
||||
MP_VERBOSE(demuxer, "Detected charset: %s\n", cp ? cp : "(unknown)");
|
||||
}
|
||||
if (cp && !mp_charset_is_utf8(cp))
|
||||
MP_INFO(demuxer, "Using subtitle charset: %s\n", cp);
|
||||
// libavformat transparently converts UTF-16 to UTF-8
|
||||
if (mp_charset_is_utf16(priv->file_charset))
|
||||
cp = NULL;
|
||||
priv->file_charset = cp;
|
||||
}
|
||||
|
||||
static char *remove_prefix(char *s, const char *const *prefixes)
|
||||
{
|
||||
for (int n = 0; prefixes[n]; n++) {
|
||||
@ -402,6 +417,9 @@ static int lavf_check_file(demuxer_t *demuxer, enum demux_check check)
|
||||
|
||||
demuxer->filetype = priv->avif->name;
|
||||
|
||||
if (priv->format_hack.detect_charset)
|
||||
detect_charset(demuxer);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -622,7 +640,7 @@ static void handle_stream(demuxer_t *demuxer, int i)
|
||||
}
|
||||
}
|
||||
|
||||
sh_sub->is_utf8 = priv->format_hack.utf8_subs;
|
||||
sh_sub->charset = priv->file_charset;
|
||||
|
||||
break;
|
||||
}
|
||||
|
@ -93,7 +93,7 @@ typedef struct sh_video {
|
||||
typedef struct sh_sub {
|
||||
double frame_based; // timestamps are frame-based (and this is the
|
||||
// fallback framerate used for timestamps)
|
||||
bool is_utf8; // if false, subtitle packet charset is unknown
|
||||
char *charset; // assumed 8 bit subtitle charset (can be NULL)
|
||||
struct dec_sub *dec_sub; // decoder context
|
||||
} sh_sub_t;
|
||||
|
||||
|
@ -52,6 +52,13 @@ bool mp_charset_is_utf8(const char *user_cp)
|
||||
strcasecmp(user_cp, "utf-8") == 0);
|
||||
}
|
||||
|
||||
bool mp_charset_is_utf16(const char *user_cp)
|
||||
{
|
||||
bstr s = bstr0(user_cp);
|
||||
return bstr_case_startswith(s, bstr0("utf16")) ||
|
||||
bstr_case_startswith(s, bstr0("utf-16"));
|
||||
}
|
||||
|
||||
// Split the string on ':' into components.
|
||||
// out_arr is at least max entries long.
|
||||
// Return number of out_arr entries filled.
|
||||
|
@ -13,6 +13,7 @@ enum {
|
||||
};
|
||||
|
||||
bool mp_charset_is_utf8(const char *user_cp);
|
||||
bool mp_charset_is_utf16(const char *user_cp);
|
||||
bool mp_charset_requires_guess(const char *user_cp);
|
||||
const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
|
||||
const char *user_cp, int flags);
|
||||
|
@ -54,7 +54,7 @@ struct dec_sub {
|
||||
struct MPOpts *opts;
|
||||
struct sd init_sd;
|
||||
|
||||
const char *charset;
|
||||
struct sh_stream *sh;
|
||||
|
||||
struct sd *sd[MAX_NUM_SD];
|
||||
int num_sd;
|
||||
@ -195,6 +195,8 @@ void sub_init_from_sh(struct dec_sub *sub, struct sh_stream *sh)
|
||||
|
||||
pthread_mutex_lock(&sub->lock);
|
||||
|
||||
sub->sh = sh;
|
||||
|
||||
if (sh->extradata && !sub->init_sd.extradata)
|
||||
sub_set_extradata(sub, sh->extradata, sh->extradata_size);
|
||||
struct sd init_sd = sub->init_sd;
|
||||
@ -282,8 +284,8 @@ static void decode_chain_recode(struct dec_sub *sub, struct demux_packet *packet
|
||||
{
|
||||
if (sub->num_sd > 0) {
|
||||
struct demux_packet *recoded = NULL;
|
||||
if (sub->charset)
|
||||
recoded = recode_packet(sub->log, packet, sub->charset);
|
||||
if (sub->sh && sub->sh->sub->charset)
|
||||
recoded = recode_packet(sub->log, packet, sub->sh->sub->charset);
|
||||
decode_chain(sub->sd, sub->num_sd, recoded ? recoded : packet);
|
||||
talloc_free(recoded);
|
||||
}
|
||||
@ -296,38 +298,6 @@ void sub_decode(struct dec_sub *sub, struct demux_packet *packet)
|
||||
pthread_mutex_unlock(&sub->lock);
|
||||
}
|
||||
|
||||
static const char *guess_sub_cp(struct mp_log *log, void *talloc_ctx,
|
||||
struct packet_list *subs, const char *usercp)
|
||||
{
|
||||
if (!mp_charset_requires_guess(usercp))
|
||||
return usercp;
|
||||
|
||||
// Concat all subs into a buffer. We can't probably do much better without
|
||||
// having the original data (which we don't, not anymore).
|
||||
int max_size = 2 * 1024 * 1024;
|
||||
const char *sep = "\n\n"; // In utf-16: U+0A0A GURMUKHI LETTER UU
|
||||
int sep_len = strlen(sep);
|
||||
int num_pkt = 0;
|
||||
int size = 0;
|
||||
for (int n = 0; n < subs->num_packets; n++) {
|
||||
struct demux_packet *pkt = subs->packets[n];
|
||||
if (size + pkt->len > max_size)
|
||||
break;
|
||||
size += pkt->len + sep_len;
|
||||
num_pkt++;
|
||||
}
|
||||
bstr text = {talloc_size(NULL, size), 0};
|
||||
for (int n = 0; n < num_pkt; n++) {
|
||||
struct demux_packet *pkt = subs->packets[n];
|
||||
memcpy(text.start + text.len, pkt->buffer, pkt->len);
|
||||
memcpy(text.start + text.len + pkt->len, sep, sep_len);
|
||||
text.len += pkt->len + sep_len;
|
||||
}
|
||||
const char *guess = mp_charset_guess(talloc_ctx, log, text, usercp, 0);
|
||||
talloc_free(text.start);
|
||||
return guess;
|
||||
}
|
||||
|
||||
static void add_sub_list(struct dec_sub *sub, struct packet_list *subs)
|
||||
{
|
||||
struct sd *sd = sub_get_last_sd(sub);
|
||||
@ -362,7 +332,6 @@ static void add_packet(struct packet_list *subs, struct demux_packet *pkt)
|
||||
bool sub_read_all_packets(struct dec_sub *sub, struct sh_stream *sh)
|
||||
{
|
||||
assert(sh && sh->sub);
|
||||
struct MPOpts *opts = sub->opts;
|
||||
|
||||
pthread_mutex_lock(&sub->lock);
|
||||
|
||||
@ -383,12 +352,6 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_stream *sh)
|
||||
talloc_free(pkt);
|
||||
}
|
||||
|
||||
if (opts->sub_cp && !sh->sub->is_utf8)
|
||||
sub->charset = guess_sub_cp(sub->log, sub, subs, opts->sub_cp);
|
||||
|
||||
if (sub->charset && sub->charset[0] && !mp_charset_is_utf8(sub->charset))
|
||||
MP_INFO(sub, "Using subtitle charset: %s\n", sub->charset);
|
||||
|
||||
add_sub_list(sub, subs);
|
||||
|
||||
pthread_mutex_unlock(&sub->lock);
|
||||
|
Loading…
Reference in New Issue
Block a user