From 9a820ec8b1e2323b70a1cebd204bf459bf7daa1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Ekstr=C3=B6m?= <jan.ekstrom@24i.com>
Date: Fri, 22 Jul 2022 13:57:54 +0300
Subject: [PATCH] ffmpeg: add video heartbeat capability to fix_sub_duration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Splits the currently handled subtitle at random access point
packets that can be configured to follow a specific output stream.
Currently only subtitle streams which are directly mapped into the
same output in which the heartbeat stream resides are affected.

This way the subtitle - which is known to be shown at this time -
can be split and passed to the muxer before its full duration is
known. This is also a drawback, as this essentially outputs
multiple subtitles from a single input subtitle that continues
over multiple random access points. Thus this feature should not
be utilized in cases where subtitle output latency does not matter.

Co-authored-by: Andrzej Nadachowski <andrzej.nadachowski@24i.com>
Co-authored-by: Bernard Boulay <bernard.boulay@24i.com>

Signed-off-by: Jan Ekström <jan.ekstrom@24i.com>
---
 Changelog                                     |   1 +
 doc/ffmpeg.texi                               |  16 ++
 fftools/ffmpeg.c                              | 148 ++++++++++++++++++
 fftools/ffmpeg.h                              |   8 +
 fftools/ffmpeg_mux_init.c                     |   4 +
 fftools/ffmpeg_opt.c                          |   5 +
 tests/fate/ffmpeg.mak                         |  15 ++
 .../fate/ffmpeg-fix_sub_duration_heartbeat    |  48 ++++++
 8 files changed, 245 insertions(+)
 create mode 100644 tests/ref/fate/ffmpeg-fix_sub_duration_heartbeat
diff --git a/Changelog b/Changelog
index c3ca10a3bb..cdbe43eac1 100644
--- a/Changelog
+++ b/Changelog
@@ -36,6 +36,7 @@ version <next>:
 - hstack_vaapi, vstack_vaapi and xstack_vaapi filters
 - XMD ADPCM decoder and demuxer
 - media100 to mjpegb bsf
+- ffmpeg CLI new option: -fix_sub_duration_heartbeat
 
 
 version 5.1:
diff --git a/doc/ffmpeg.texi b/doc/ffmpeg.texi
index 81e04f6983..592c4b4393 100644
--- a/doc/ffmpeg.texi
+++ b/doc/ffmpeg.texi
@@ -1342,6 +1342,22 @@ List all hardware acceleration components enabled in this build of ffmpeg.
 Actual runtime availability depends on the hardware and its suitable driver
 being installed.
 
+@item -fix_sub_duration_heartbeat[:@var{stream_specifier}]
+Set a specific output video stream as the heartbeat stream according to which
+the currently in-progress subtitle is split and pushed through upon receipt
+of a random access packet.
+
+This lowers the latency of subtitles for which the end packet or the following
+subtitle has not yet been received. As a drawback, this will most likely lead
+to duplication of subtitle events in order to cover the full duration, so
+this option should not be used when the latency with which subtitle events
+are passed on to the output is not relevant.
+
+For this option to have any effect, @option{-fix_sub_duration} must be set
+for the relevant input subtitle stream, and that input subtitle stream must
+be directly mapped to the same output in which the heartbeat stream
+resides.
+
 @end table
 
 @section Audio Options
diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
index e0a02260f5..743bc0c6b6 100644
--- a/fftools/ffmpeg.c
+++ b/fftools/ffmpeg.c
@@ -126,6 +126,7 @@ typedef struct BenchmarkTimeStamps {
     int64_t sys_usec;
 } BenchmarkTimeStamps;
 
+static int trigger_fix_sub_duration_heartbeat(OutputStream *ost, const AVPacket *pkt);
 static BenchmarkTimeStamps get_benchmark_time_stamps(void);
 static int64_t getmaxrss(void);
 static int ifilter_has_all_input_formats(FilterGraph *fg);
@@ -953,6 +954,13 @@ static int encode_frame(OutputFile *of, OutputStream *ost, AVFrame *frame)
                    av_ts2str(pkt->duration), av_ts2timestr(pkt->duration, &enc->time_base));
         }
 
+        if ((ret = trigger_fix_sub_duration_heartbeat(ost, pkt)) < 0) {
+            av_log(NULL, AV_LOG_ERROR,
+                   "Subtitle heartbeat logic failed in %s! (%s)\n",
+                   __func__, av_err2str(ret));
+            exit_program(1);
+        }
+
         ost->data_size_enc += pkt->size;
 
         ost->packets_encoded++;
@@ -1912,6 +1920,16 @@ static void do_streamcopy(InputStream *ist, OutputStream *ost, const AVPacket *p
 
     opkt->duration = av_rescale_q(pkt->duration, ist->st->time_base, ost->mux_timebase);
 
+    {
+        int ret = trigger_fix_sub_duration_heartbeat(ost, pkt);
+        if (ret < 0) {
+            av_log(NULL, AV_LOG_ERROR,
+                   "Subtitle heartbeat logic failed in %s! (%s)\n",
+                   __func__, av_err2str(ret));
+            exit_program(1);
+        }
+    }
+
     of_output_packet(of, opkt, ost, 0);
 
     ost->streamcopy_started = 1;
@@ -2355,6 +2373,136 @@ out:
     return ret;
 }
 
+/**
+ * Deep-copy src (rects with their text, ass and data planes) into dst.
+ * dst is written only on success; returns 0 or AVERROR(ENOMEM).
+ */
+static int copy_av_subtitle(AVSubtitle *dst, const AVSubtitle *src)
+{
+    int ret = AVERROR_BUG;
+    AVSubtitle tmp = {
+        .format             = src->format,
+        .start_display_time = src->start_display_time,
+        .end_display_time   = src->end_display_time,
+        .pts                = src->pts
+    };
+
+    if (!src->num_rects)
+        goto success;
+
+    if (!(tmp.rects = av_calloc(src->num_rects, sizeof(*tmp.rects))))
+        return AVERROR(ENOMEM);
+
+    for (unsigned i = 0; i < src->num_rects; i++) {
+        const AVSubtitleRect *src_rect = src->rects[i];
+        AVSubtitleRect *dst_rect;
+
+        if (!(dst_rect = tmp.rects[i] = av_mallocz(sizeof(*tmp.rects[0])))) {
+            ret = AVERROR(ENOMEM);
+            goto cleanup;
+        }
+
+        // count the rect right away so that cleanup frees it on later failure
+        tmp.num_rects++;
+
+        dst_rect->type      = src_rect->type;
+        dst_rect->flags     = src_rect->flags;
+
+        dst_rect->x         = src_rect->x;
+        dst_rect->y         = src_rect->y;
+        dst_rect->w         = src_rect->w;
+        dst_rect->h         = src_rect->h;
+        dst_rect->nb_colors = src_rect->nb_colors;
+
+        if (src_rect->text && !(dst_rect->text = av_strdup(src_rect->text))) {
+            ret = AVERROR(ENOMEM);
+            goto cleanup;
+        }
+
+        if (src_rect->ass && !(dst_rect->ass = av_strdup(src_rect->ass))) {
+            ret = AVERROR(ENOMEM);
+            goto cleanup;
+        }
+
+        for (int j = 0; j < 4; j++) {
+            // SUBTITLE_BITMAP images are special in the sense that they
+            // are like PAL8 images. first pointer to data, second to
+            // palette. This makes the size calculation match this.
+            // The product is formed in size_t so that it cannot overflow int.
+            size_t buf_size = src_rect->type == SUBTITLE_BITMAP && j == 1 ?
+                              AVPALETTE_SIZE :
+                              (size_t)src_rect->h * src_rect->linesize[j];
+
+            if (!src_rect->data[j])
+                continue;
+
+            if (!(dst_rect->data[j] = av_memdup(src_rect->data[j], buf_size))) {
+                ret = AVERROR(ENOMEM);
+                goto cleanup;
+            }
+            dst_rect->linesize[j] = src_rect->linesize[j];
+        }
+    }
+
+success:
+    *dst = tmp;
+    return 0;
+
+cleanup:
+    avsubtitle_free(&tmp);
+    return ret;
+}
+
+// If ist has fix_sub_duration set and its prev_sub subtitle has rects and a pts
+// before signal_pts, pass a copy re-stamped to signal_pts to process_subtitle().
+static int fix_sub_duration_heartbeat(InputStream *ist, int64_t signal_pts)
+{
+    AVSubtitle *held = &ist->prev_sub.subtitle;
+    AVSubtitle  split;
+    int got_output = 1, ret;
+
+    if (!(ist->fix_sub_duration && held->num_rects && held->pts < signal_pts))
+        return 0;
+
+    ret = copy_av_subtitle(&split, held);
+    if (ret < 0)
+        return ret;
+
+    split.pts = signal_pts;
+    return process_subtitle(ist, &split, &got_output);
+}
+
+// For a keyframe packet on a stream configured as heartbeat, invoke
+// fix_sub_duration_heartbeat() on the other streams of the same output file.
+static int trigger_fix_sub_duration_heartbeat(OutputStream *ost, const AVPacket *pkt)
+{
+    OutputFile *of = output_files[ost->file_index];
+    int64_t signal_pts = av_rescale_q(pkt->pts, ost->mux_timebase,
+                                      AV_TIME_BASE_Q);
+
+    if (!(ost->fix_sub_duration_heartbeat && (pkt->flags & AV_PKT_FLAG_KEY)))
+        return 0;
+
+    for (int idx = 0; idx < of->nb_streams; idx++) {
+        OutputStream *candidate = of->streams[idx];
+        InputStream  *sub_ist   = candidate->ist;
+        int err;
+
+        // skip the heartbeat stream itself and anything but decoded subtitles
+        if (candidate == ost)
+            continue;
+        if (!sub_ist || !sub_ist->decoding_needed ||
+            sub_ist->dec_ctx->codec_type != AVMEDIA_TYPE_SUBTITLE)
+            continue;
+
+        err = fix_sub_duration_heartbeat(sub_ist, signal_pts);
+        if (err < 0)
+            return err;
+    }
+
+    return 0;
+}
+
 static int transcode_subtitles(InputStream *ist, AVPacket *pkt, int *got_output,
                                int *decode_failed)
 {
diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h
index 8c976af97a..933312dba7 100644
--- a/fftools/ffmpeg.h
+++ b/fftools/ffmpeg.h
@@ -224,6 +224,8 @@ typedef struct OptionsContext {
     int        nb_reinit_filters;
     SpecifierOpt *fix_sub_duration;
     int        nb_fix_sub_duration;
+    SpecifierOpt *fix_sub_duration_heartbeat;
+    int        nb_fix_sub_duration_heartbeat;
     SpecifierOpt *canvas_sizes;
     int        nb_canvas_sizes;
     SpecifierOpt *pass;
@@ -675,6 +677,12 @@ typedef struct OutputStream {
 
     EncStats enc_stats_pre;
     EncStats enc_stats_post;
+
+    /*
+     * bool on whether this stream should be utilized for splitting
+     * subtitles utilizing fix_sub_duration at random access points.
+     */
+    unsigned int fix_sub_duration_heartbeat;
 } OutputStream;
 
 typedef struct OutputFile {
diff --git a/fftools/ffmpeg_mux_init.c b/fftools/ffmpeg_mux_init.c
index f8ccf4a3e9..834cdbcc9f 100644
--- a/fftools/ffmpeg_mux_init.c
+++ b/fftools/ffmpeg_mux_init.c
@@ -61,6 +61,7 @@ static const char *const opt_name_enc_stats_pre_fmt[]         = {"enc_stats_pre_
 static const char *const opt_name_enc_stats_post_fmt[]        = {"enc_stats_post_fmt", NULL};
 static const char *const opt_name_filters[]                   = {"filter", "af", "vf", NULL};
 static const char *const opt_name_filter_scripts[]            = {"filter_script", NULL};
+static const char *const opt_name_fix_sub_duration_heartbeat[] = {"fix_sub_duration_heartbeat", NULL};
 static const char *const opt_name_fps_mode[]                  = {"fps_mode", NULL};
 static const char *const opt_name_force_fps[]                 = {"force_fps", NULL};
 static const char *const opt_name_forced_key_frames[]         = {"forced_key_frames", NULL};
@@ -614,6 +615,9 @@ static OutputStream *new_output_stream(Muxer *mux, const OptionsContext *o,
     MATCH_PER_STREAM_OPT(bits_per_raw_sample, i, ost->bits_per_raw_sample,
                          oc, st);
 
+    MATCH_PER_STREAM_OPT(fix_sub_duration_heartbeat, i, ost->fix_sub_duration_heartbeat,
+                         oc, st);
+
     if (oc->oformat->flags & AVFMT_GLOBALHEADER && ost->enc_ctx)
         ost->enc_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
 
diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c
index c0ae55a359..204be38c94 100644
--- a/fftools/ffmpeg_opt.c
+++ b/fftools/ffmpeg_opt.c
@@ -1658,6 +1658,11 @@ const OptionDef options[] = {
     { "autoscale",        HAS_ARG | OPT_BOOL | OPT_SPEC |
                           OPT_EXPERT | OPT_OUTPUT,                               { .off = OFFSET(autoscale) },
         "automatically insert a scale filter at the end of the filter graph" },
+    { "fix_sub_duration_heartbeat", OPT_VIDEO | OPT_BOOL | OPT_EXPERT |
+                                    OPT_SPEC | OPT_OUTPUT,                       { .off = OFFSET(fix_sub_duration_heartbeat) },
+        "set this video output stream to be a heartbeat stream for "
+        "fix_sub_duration, according to which subtitles should be split at "
+        "random access points" },
 
     /* audio options */
     { "aframes",        OPT_AUDIO | HAS_ARG  | OPT_PERFILE | OPT_OUTPUT,           { .func_arg = opt_audio_frames },
diff --git a/tests/fate/ffmpeg.mak b/tests/fate/ffmpeg.mak
index d87639c596..0f33c2a0ed 100644
--- a/tests/fate/ffmpeg.mak
+++ b/tests/fate/ffmpeg.mak
@@ -117,6 +117,21 @@ fate-ffmpeg-fix_sub_duration: CMD = fmtstdout srt -fix_sub_duration \
   -real_time 1 -f lavfi \
   -i "movie=$(TARGET_SAMPLES)/sub/Closedcaption_rollup.m2v[out0+subcc]"
 
+# Basic test for fix_sub_duration_heartbeat: the buffered subtitle of the subcc
+# stream is pushed out whenever the mpeg2video encoder emits a video keyframe.
+FATE_SAMPLES_FFMPEG-$(call FILTERDEMDECENCMUX, MOVIE, MPEGVIDEO, \
+                           MPEG2VIDEO, SUBRIP, SRT, LAVFI_INDEV  \
+                           MPEGVIDEO_PARSER CCAPTION_DECODER \
+                           MPEG2VIDEO_ENCODER NULL_MUXER PIPE_PROTOCOL) \
+                           += fate-ffmpeg-fix_sub_duration_heartbeat
+fate-ffmpeg-fix_sub_duration_heartbeat: CMD = fmtstdout srt -fix_sub_duration \
+  -real_time 1 -f lavfi \
+  -i "movie=$(TARGET_SAMPLES)/sub/Closedcaption_rollup.m2v[out0+subcc]" \
+  -map 0:v  -map 0:s -fix_sub_duration_heartbeat:v:0 \
+  -c:v mpeg2video -b:v 2M -g 30 -sc_threshold 1000000000 \
+  -c:s srt \
+  -f null -
+
 FATE_STREAMCOPY-$(call REMUX, MP4 MOV, EAC3_DEMUXER) += fate-copy-trac3074
 fate-copy-trac3074: CMD = transcode eac3 $(TARGET_SAMPLES)/eac3/csi_miami_stereo_128_spx.eac3\
                      mp4 "-codec copy -map 0" "-codec copy"
diff --git a/tests/ref/fate/ffmpeg-fix_sub_duration_heartbeat b/tests/ref/fate/ffmpeg-fix_sub_duration_heartbeat
new file mode 100644
index 0000000000..957a410921
--- /dev/null
+++ b/tests/ref/fate/ffmpeg-fix_sub_duration_heartbeat
@@ -0,0 +1,48 @@
+1
+00:00:00,968 --> 00:00:01,001
+<font face="Monospace">{\an7}(</font>
+
+2
+00:00:01,001 --> 00:00:01,168
+<font face="Monospace">{\an7}(</font>
+
+3
+00:00:01,168 --> 00:00:01,368
+<font face="Monospace">{\an7}(<i> inaudibl</i></font>
+
+4
+00:00:01,368 --> 00:00:01,568
+<font face="Monospace">{\an7}(<i> inaudible radio chat</i></font>
+
+5
+00:00:01,568 --> 00:00:02,002
+<font face="Monospace">{\an7}(<i> inaudible radio chatter</i> )</font>
+
+6
+00:00:02,002 --> 00:00:03,003
+<font face="Monospace">{\an7}(<i> inaudible radio chatter</i> )</font>
+
+7
+00:00:03,003 --> 00:00:03,103
+<font face="Monospace">{\an7}(<i> inaudible radio chatter</i> )</font>
+
+8
+00:00:03,103 --> 00:00:03,303
+<font face="Monospace">{\an7}(<i> inaudible radio chatter</i> )
+>></font>
+
+9
+00:00:03,303 --> 00:00:03,503
+<font face="Monospace">{\an7}(<i> inaudible radio chatter</i> )
+>> Safety rema</font>
+
+10
+00:00:03,504 --> 00:00:03,704
+<font face="Monospace">{\an7}(<i> inaudible radio chatter</i> )
+>> Safety remains our numb</font>
+
+11
+00:00:03,704 --> 00:00:04,004
+<font face="Monospace">{\an7}(<i> inaudible radio chatter</i> )
+>> Safety remains our number one</font>
+