demux: add an option to control tag charset

Fucking gross that you need this in almost-2020.

Fixes: #7255
This commit is contained in:
wm4 2019-12-20 12:37:26 +01:00
parent 0e98b2ad8e
commit 8448fe0b62
3 changed files with 96 additions and 1 deletions

View File

@ -6266,6 +6266,19 @@ Miscellaneous
See the FFmpeg libavfilter documentation for details on the available
filters.
``--metadata-codepage=<codepage>``
Codepage for various input metadata (default: ``utf-8``). This affects how
file tags, chapter titles, etc. are interpreted. You can for example set
this to ``auto`` to enable autodetection of the codepage. (This is not the
default because non-UTF-8 codepages are an obscure fringe use-case.)
See ``--sub-codepage`` option on how codepages are specified and further
details regarding autodetection and codepage conversion. (The underlying
code is the same.)
Conversion is not applied to metadata that is updated at runtime.
Debugging
---------

View File

@ -38,6 +38,7 @@
#include "common/msg.h"
#include "common/global.h"
#include "common/recorder.h"
#include "misc/charset_conv.h"
#include "misc/thread_tools.h"
#include "osdep/atomic.h"
#include "osdep/timer.h"
@ -97,6 +98,7 @@ struct demux_opts {
int audio_back_preroll;
int back_batch[STREAM_TYPE_COUNT];
double back_seek_size;
char *meta_cp;
};
#define OPT_BASE_STRUCT struct demux_opts
@ -128,6 +130,7 @@ const struct m_sub_options demux_conf = {
OPT_INTRANGE("audio-backward-batch", back_batch[STREAM_AUDIO], 0, 0, 1024),
OPT_DOUBLE("demuxer-backward-playback-step", back_seek_size, M_OPT_MIN,
.min = 0),
OPT_STRING("metadata-codepage", meta_cp, 0),
{0}
},
.size = sizeof(struct demux_opts),
@ -146,6 +149,7 @@ const struct m_sub_options demux_conf = {
[STREAM_VIDEO] = 1,
[STREAM_AUDIO] = 10,
},
.meta_cp = "utf-8",
},
};
@ -181,6 +185,8 @@ struct demux_internal {
struct sh_stream **streams;
int num_streams;
char *meta_charset;
// If non-NULL, a stream which is used for global (timed) metadata. It will
// be an arbitrary stream, which hopefully will happen to work.
struct sh_stream *metadata_stream;
@ -443,6 +449,7 @@ static struct demux_packet *find_seek_target(struct demux_queue *queue,
double pts, int flags);
static void prune_old_packets(struct demux_internal *in);
static void dumper_close(struct demux_internal *in);
static void demux_convert_tags_charset(struct demuxer *demuxer);
static uint64_t get_foward_buffered_bytes(struct demux_stream *ds)
{
@ -3232,6 +3239,7 @@ static struct demuxer *open_given_type(struct mpv_global *global,
}
demux_init_cuesheet(in->d_thread);
demux_init_ccs(demuxer, opts);
demux_convert_tags_charset(in->d_thread);
demux_copy(in->d_user, in->d_thread);
in->duration = in->d_thread->duration;
demuxer_sort_chapters(demuxer);
@ -4402,3 +4410,77 @@ struct demux_chapter *demux_copy_chapter_data(struct demux_chapter *c, int num)
}
return new;
}
static void visit_tags(void *ctx, void (*visit)(void *ctx, void *ta, char **s),
struct mp_tags *tags)
{
for (int n = 0; n < (tags ? tags->num_keys : 0); n++)
visit(ctx, tags, &tags->values[n]);
}
static void visit_meta(struct demuxer *demuxer, void *ctx,
void (*visit)(void *ctx, void *ta, char **s))
{
struct demux_internal *in = demuxer->in;
for (int n = 0; n < in->num_streams; n++) {
struct sh_stream *sh = in->streams[n];
visit(ctx, sh, &sh->title);
visit_tags(ctx, visit, sh->tags);
}
for (int n = 0; n < demuxer->num_chapters; n++)
visit_tags(ctx, visit, demuxer->chapters[n].metadata);
visit_tags(ctx, visit, demuxer->metadata);
}
static void visit_detect(void *ctx, void *ta, char **s)
{
char **all = ctx;
abort();
if (*s)
*all = talloc_asprintf_append_buffer(*all, "%s\n", *s);
}
static void visit_convert(void *ctx, void *ta, char **s)
{
struct demuxer *demuxer = ctx;
struct demux_internal *in = demuxer->in;
if (!*s)
return;
bstr data = bstr0(*s);
bstr conv = mp_iconv_to_utf8(in->log, data, in->meta_charset,
MP_ICONV_VERBOSE);
if (conv.start && conv.start != data.start) {
char *ns = conv.start; // 0-termination is guaranteed
// (The old string might not be an alloc, but if it is, it's a talloc
// child, and will not leak, even if it stays allocated uselessly.)
*s = ns;
talloc_steal(ta, *s);
}
}
static void demux_convert_tags_charset(struct demuxer *demuxer)
{
struct demux_internal *in = demuxer->in;
char *cp = in->opts->meta_cp;
if (!cp || mp_charset_is_utf8(cp))
return;
char *data = talloc_strdup(NULL, "");
visit_meta(demuxer, &data, visit_detect);
in->meta_charset = (char *)mp_charset_guess(in, in->log, bstr0(data), cp, 0);
if (in->meta_charset && !mp_charset_is_utf8(in->meta_charset)) {
MP_INFO(demuxer, "Using tag charset: %s\n", in->meta_charset);
visit_meta(demuxer, demuxer, visit_convert);
}
talloc_free(data);
}

View File

@ -365,7 +365,7 @@ static void convert_charset(struct demuxer *demuxer)
{
lavf_priv_t *priv = demuxer->priv;
char *cp = priv->opts->sub_cp;
if (!cp || mp_charset_is_utf8(cp))
if (!cp || !cp[0] || mp_charset_is_utf8(cp))
return;
bstr data = stream_read_complete(priv->stream, NULL, 128 * 1024 * 1024);
if (!data.start) {