core: completely change handling of attached picture pseudo video

Before this commit, we tried to play along with libavformat and tried to pretend that attached pictures are video streams with a single frame, and that the frame magically appeared at the seek position when seeking. The playback core would then switch to a mode where the video has ended, and the "remaining" audio is played.

This didn't work very well:
- we needed a hack in demux.c, because we tried to read more packets in order to find the "next" video frame (libavformat doesn't tell us if a stream has ended)
- switching the video stream didn't work, because we can't tell libavformat to send the packet again
- seeking and resuming afterwards was hacky (for some reason libavformat sets the returned packet's PTS to that of the previously returned audio packet in generic code not related to attached pictures, and this happened to work)
- if the user did something stupid and e.g. inserted a deinterlacer by default, a picture was never displayed, only an inactive VO window
- same when using a command that reconfigured the VO (like switching aspect or video filters)
- hr-seek didn't work

For this reason, handle attached pictures as a separate case with a separate video decoding function, which doesn't read packets. Also, do not synchronize audio to video start in this case.
2025-04-18 05:07:18 +00:00 · 2013-07-11 19:23:56 +02:00 · 2013-07-11 19:23:56 +02:00 · 4cda1d113e
commit 4cda1d113e
parent 86cc3bd9be
5 changed files with 63 additions and 50 deletions
--- a/core/mp_core.h
+++ b/core/mp_core.h
@ -172,6 +172,9 @@ typedef struct MPContext {
    /* We're starting playback from scratch or after a seek. Show first
     * video frame immediately and reinitialize sync. */
    bool restart_playback;
+    /* Set if audio should be timed to start with video frame after seeking,
+     * not set when e.g. playing cover art */
+    bool sync_audio_to_video;
    /* After playback restart (above) or audio stream change, adjust audio
     * stream by cutting samples or adding silence at the beginning to make
     * audio playback position match video position. */
--- a/core/mplayer.c
+++ b/core/mplayer.c
@ -467,6 +467,7 @@ void uninit_player(struct MPContext *mpctx, unsigned int mask)
        if (mpctx->sh_video)
            uninit_video(mpctx->sh_video);
        cleanup_demux_stream(mpctx, STREAM_VIDEO);
+        mpctx->sync_audio_to_video = false;
    }

    if (mask & INITIALIZED_DEMUXER) {
@ -946,7 +947,7 @@ static struct track *add_stream_track(struct MPContext *mpctx,
        .demuxer_id = stream->demuxer_id,
        .title = stream->title,
        .default_track = stream->default_track,
-        .attached_picture = stream->attached_picture,
+        .attached_picture = stream->attached_picture != NULL,
        .lang = stream->lang,
        .under_timeline = under_timeline,
        .demuxer = stream->demuxer,
@ -1134,7 +1135,7 @@ static void print_status(struct MPContext *mpctx)
        saddf(&line, " x%4.2f", opts->playback_speed);

    // A-V sync
-    if (mpctx->sh_audio && sh_video) {
+    if (mpctx->sh_audio && sh_video && mpctx->sync_audio_to_video) {
        if (mpctx->last_av_difference != MP_NOPTS_VALUE)
            saddf(&line, " A-V:%7.3f", mpctx->last_av_difference);
        else
@ -2173,7 +2174,7 @@ static int fill_audio_out_buffers(struct MPContext *mpctx, double endpts)
        playsize = ao_get_space(ao);

    // Coming here with hrseek_active still set means audio-only
-    if (!mpctx->sh_video)
+    if (!mpctx->sh_video || !mpctx->sync_audio_to_video)
        mpctx->syncing_audio = false;
    if (!opts->initial_audio_sync || !modifiable_audio_format) {
        mpctx->syncing_audio = false;
@ -2353,6 +2354,7 @@ int reinit_video_chain(struct MPContext *mpctx)
    sh_video->num_buffered_pts = 0;
    sh_video->next_frame_time = 0;
    mpctx->restart_playback = true;
+    mpctx->sync_audio_to_video = !sh_video->gsh->attached_picture;
    mpctx->delay = 0;
    mpctx->vo_pts_history_seek_ts++;

@ -2365,6 +2367,7 @@ err_out:
    cleanup_demux_stream(mpctx, STREAM_VIDEO);
 no_video:
    mpctx->current_track[STREAM_VIDEO] = NULL;
+    mpctx->sync_audio_to_video = false;
    mp_tmsg(MSGT_CPLAYER, MSGL_INFO, "Video: no video\n");
    return 0;
 }
@ -2416,6 +2419,15 @@ static bool filter_output_queued_frame(struct MPContext *mpctx)
    return !!img;
 }

+static bool load_next_vo_frame(struct MPContext *mpctx, bool eof) // true if either source below yielded a frame
+{
+    if (vo_get_buffered_frame(mpctx->video_out, eof) >= 0) // non-negative result counts as success; eof is forwarded as-is
+        return true;
+    if (filter_output_queued_frame(mpctx)) // fallback: reports true when it obtained an image
+        return true;
+    return false; // neither call produced a frame
+}
+
 static void filter_video(struct MPContext *mpctx, struct mp_image *frame)
 {
    struct sh_video *sh_video = mpctx->sh_video;
@ -2467,12 +2479,9 @@ static double update_video_nocorrect_pts(struct MPContext *mpctx)
 {
    struct sh_video *sh_video = mpctx->sh_video;
    double frame_time = 0;
-    struct vo *video_out = mpctx->video_out;
    while (1) {
        // In nocorrect-pts mode there is no way to properly time these frames
-        if (vo_get_buffered_frame(video_out, 0) >= 0)
-            break;
-        if (filter_output_queued_frame(mpctx))
+        if (load_next_vo_frame(mpctx, false))
            break;
        frame_time = sh_video->next_frame_time;
        if (mpctx->restart_playback)
@ -2497,6 +2506,23 @@ static double update_video_nocorrect_pts(struct MPContext *mpctx)
    return frame_time;
 }

+static double update_video_attached_pic(struct MPContext *mpctx) // returns -1 once the VO has a frame, otherwise 0
+{
+    struct sh_video *sh_video = mpctx->sh_video;
+
+    // Each call retries decoding the stored attached-picture packet until the
+    // VO reports that it has a frame. No packet is read from the demuxer here.
+    if (mpctx->video_out->hasframe)
+        return -1; // NOTE(review): same value update_video() returns when no frame is left at EOF - confirm callers treat it as "video ended"
+
+    struct mp_image *decoded_frame =
+            decode_video(sh_video, sh_video->gsh->attached_picture, 0, 0); // input is the packet kept on the stream header
+    if (decoded_frame)
+        filter_video(mpctx, decoded_frame);
+    load_next_vo_frame(mpctx, true); // called with eof=true; its return value is ignored
+    mpctx->sh_video->pts = MP_NOPTS_VALUE; // same object as the local sh_video; pts is reset on every attempt
+    return 0;
+}
+
 static void determine_frame_pts(struct MPContext *mpctx)
 {
    struct sh_video *sh_video = mpctx->sh_video;
@ -2537,15 +2563,16 @@ static double update_video(struct MPContext *mpctx, double endpts)
    if (!mpctx->opts.correct_pts)
        return update_video_nocorrect_pts(mpctx);

+    if (sh_video->gsh->attached_picture)
+        return update_video_attached_pic(mpctx);
+
    double pts;

    while (1) {
-        if (vo_get_buffered_frame(video_out, false) >= 0)
-            break;
-        if (filter_output_queued_frame(mpctx))
+        if (load_next_vo_frame(mpctx, false))
            break;
        pts = MP_NOPTS_VALUE;
-        struct demux_packet *pkt;
+        struct demux_packet *pkt = NULL;
        while (1) {
            pkt = demux_read_packet(mpctx->sh_video->gsh);
            if (!pkt || pkt->len)
@ -2570,7 +2597,7 @@ static double update_video(struct MPContext *mpctx, double endpts)
            determine_frame_pts(mpctx);
            filter_video(mpctx, decoded_frame);
        } else if (!pkt) {
-            if (vo_get_buffered_frame(video_out, true) < 0)
+            if (!load_next_vo_frame(mpctx, true))
                return -1;
        }
        break;
@ -2580,6 +2607,8 @@ static double update_video(struct MPContext *mpctx, double endpts)
        return 0;

    pts = video_out->next_pts;
+    if (sh_video->gsh->attached_picture)
+        pts = mpctx->last_seek_pts;
    if (pts == MP_NOPTS_VALUE) {
        mp_msg(MSGT_CPLAYER, MSGL_ERR, "Video pts after filters MISSING\n");
        // Try to use decoder pts from before filters
@ -3458,10 +3487,12 @@ static void run_playloop(struct MPContext *mpctx)
            mpctx->time_frame -= get_relative_time(mpctx);
        }
        if (mpctx->restart_playback) {
-            mpctx->syncing_audio = true;
-            if (mpctx->sh_audio)
-                fill_audio_out_buffers(mpctx, endpts);
-            mpctx->restart_playback = false;
+            if (mpctx->sync_audio_to_video) {
+                mpctx->syncing_audio = true;
+                if (mpctx->sh_audio)
+                    fill_audio_out_buffers(mpctx, endpts);
+                mpctx->restart_playback = false;
+            }
            mpctx->time_frame = 0;
            get_relative_time(mpctx);
        }
@ -3473,6 +3504,8 @@ static void run_playloop(struct MPContext *mpctx)
        break;
    } // video

+    video_left &= mpctx->sync_audio_to_video; // force no-video semantics
+
    if (mpctx->sh_audio && (mpctx->restart_playback ? !video_left :
                            mpctx->ao->untimed && (mpctx->delay <= 0 ||
                                                   !video_left))) {
--- a/demux/demux.c
+++ b/demux/demux.c
@ -92,7 +92,6 @@ const demuxer_desc_t *const demuxer_list[] = {
 struct demux_stream {
    int selected;          // user wants packets from this stream
    int eof;               // end of demuxed stream? (true if all buffer empty)
-    int fill_count;        // number of unsuccessful tries to get a packet
    int packs;            // number of packets in buffer
    int bytes;            // total bytes of packets in buffer
    struct demux_packet *head;
@ -441,16 +440,6 @@ overflow:
    return true;
 }

-static bool need_coverart_hack(struct demuxer *demux)
-{
-    for (int n = 0; n < demux->num_streams; n++) {
-        struct sh_stream *sh = demux->streams[n];
-        if (sh->attached_picture && sh->ds->selected)
-            return true;
-    }
-    return false;
-}
-
 // return value:
 //     0 = EOF or no stream found or invalid type
 //     1 = successfully read a packet
@ -476,31 +465,14 @@ static void ds_get_packets(struct sh_stream *sh)
             * despite the eof flag then it's better to clear it to avoid
             * weird behavior. */
            ds->eof = 0;
-            ds->fill_count = 0;
            return;
        }
-        // avoid buffering too far ahead in e.g. badly interleaved files
-        // or when one stream is shorter, without breaking large audio
-        // delay with well interleaved files.
-        // This needs to be enough for at least 1 second of packets
-        // since libavformat mov demuxer does not try to interleave
-        // with more than 1s precision.
-        if (ds->fill_count > 80)
-            break;

        if (demux_check_queue_full(demux))
            break;

-        int apacks = count_packs(demux, STREAM_AUDIO);
-        int vpacks = count_packs(demux, STREAM_VIDEO);
-
        if (!demux_fill_buffer(demux))
            break; // EOF
-
-        if (need_coverart_hack(demux)) {
-            ds->fill_count += count_packs(demux, STREAM_AUDIO) - apacks;
-            ds->fill_count += count_packs(demux, STREAM_VIDEO) - vpacks;
-        }
    }
    mp_msg(MSGT_DEMUXER, MSGL_V, "ds_get_packets: EOF reached (stream: %s)\n",
           stream_type_name(sh->type));
--- a/demux/demux_lavf.c
+++ b/demux/demux_lavf.c
@ -351,7 +351,8 @@ static void select_tracks(struct demuxer *demuxer, int start)
    for (int n = start; n < priv->num_streams; n++) {
        struct sh_stream *stream = priv->streams[n];
        AVStream *st = priv->avfc->streams[n];
-        bool selected = stream && demuxer_stream_is_selected(demuxer, stream);
+        bool selected = stream && demuxer_stream_is_selected(demuxer, stream) &&
+                        !stream->attached_picture;
        st->discard = selected ? AVDISCARD_DEFAULT : AVDISCARD_ALL;
    }
 }
@ -388,8 +389,12 @@ static void handle_stream(demuxer_t *demuxer, int i)
            break;
        sh_video_t *sh_video = sh->video;

-        if (st->disposition & AV_DISPOSITION_ATTACHED_PIC)
-            sh_video->gsh->attached_picture = true;
+        if (st->disposition & AV_DISPOSITION_ATTACHED_PIC) {
+            sh->attached_picture = new_demux_packet_from(st->attached_pic.data,
+                                                         st->attached_pic.size);
+            sh->attached_picture->pts = 0;
+            talloc_steal(sh, sh->attached_picture);
+        }

        sh_video->format = codec->codec_tag;
        sh_video->disp_w = codec->width;
@ -679,8 +684,6 @@ static int demux_lavf_fill_buffer(demuxer_t *demux)
    dp->avpacket = pkt;

    int64_t ts = priv->use_dts ? pkt->dts : pkt->pts;
-    if (ts == AV_NOPTS_VALUE && (st->disposition & AV_DISPOSITION_ATTACHED_PIC))
-        ts = 0;
    if (ts != AV_NOPTS_VALUE) {
        dp->pts = ts * av_q2d(st->time_base);
        priv->last_pts = dp->pts * AV_TIME_BASE;
--- a/demux/stheader.h
+++ b/demux/stheader.h
@ -61,7 +61,9 @@ struct sh_stream {
    char *title;
    char *lang;                 // language code
    bool default_track;         // container default track flag
-    bool attached_picture;      // stream is a picture (such as album art)
+
+    // stream is a picture (such as album art)
+    struct demux_packet *attached_picture;

    // Human readable description of the running decoder, or NULL
    char *decoder_desc;