From 3235de4883b694058f48ac4e13a9207c1fd94c04 Mon Sep 17 00:00:00 2001 From: Paul B Mahol Date: Tue, 23 May 2023 00:30:22 +0200 Subject: [PATCH] avfilter/af_silenceremove: switch to activate Do full rewrite, new code is much faster for typical filter usages. Also previous code was hard to follow and basically very-hard to maintain. --- doc/filters.texi | 2 +- libavfilter/af_silenceremove.c | 971 ++++++--------------------- libavfilter/silenceremove_template.c | 293 ++++++++ 3 files changed, 499 insertions(+), 767 deletions(-) create mode 100644 libavfilter/silenceremove_template.c diff --git a/doc/filters.texi b/doc/filters.texi index 0c3ac116b8..47b26fe92f 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -6431,7 +6431,7 @@ Set the count for trimming silence from the end of audio. To remove silence from the middle of a file, specify a @var{stop_periods} that is negative. This value is then treated as a positive value and is used to indicate the effect should restart processing as specified by -@var{start_periods}, making it suitable for removing periods of silence +@var{stop_periods}, making it suitable for removing periods of silence in the middle of the audio. Default value is @code{0}. diff --git a/libavfilter/af_silenceremove.c b/libavfilter/af_silenceremove.c index d5a2ac6a41..e0592c2368 100644 --- a/libavfilter/af_silenceremove.c +++ b/libavfilter/af_silenceremove.c @@ -23,11 +23,11 @@ #include /* DBL_MAX */ -#include "libavutil/audio_fifo.h" #include "libavutil/avassert.h" #include "libavutil/opt.h" #include "libavutil/timestamp.h" #include "audio.h" +#include "filters.h" #include "formats.h" #include "avfilter.h" #include "internal.h" @@ -42,69 +42,66 @@ enum ThresholdMode { T_ALL, }; -enum SilenceMode { - SILENCE_TRIM, - SILENCE_TRIM_FLUSH, - SILENCE_COPY, - SILENCE_COPY_FLUSH, - SILENCE_STOP -}; - typedef struct SilenceRemoveContext { const AVClass *class; - enum SilenceMode mode; - + int start_mode; int start_periods; int64_t start_duration; int64_t start_duration_opt; double start_threshold; int64_t start_silence; int64_t start_silence_opt; - int start_mode; + int stop_mode; int stop_periods; int64_t stop_duration; int64_t stop_duration_opt; double stop_threshold; int64_t stop_silence; int64_t stop_silence_opt; - int stop_mode; int64_t window_duration_opt; - AVFrame *start_holdoff; - AVFrame *start_silence_hold; - size_t start_holdoff_offset; - size_t start_holdoff_end; - size_t start_silence_offset; - size_t start_silence_end; - int start_found_periods; + int start_found_periods; + int stop_found_periods; - AVFrame *stop_holdoff; - AVFrame *stop_silence_hold; - size_t stop_holdoff_offset; - size_t stop_holdoff_end; - size_t stop_silence_offset; - size_t stop_silence_end; - int stop_found_periods; + int start_sample_count; + int start_silence_count; + + int stop_sample_count; + int stop_silence_count; + + AVFrame *start_window; + AVFrame *stop_window; - AVFrame *window; - int window_offset; int64_t window_duration; - double sum; - int one_period; + int start_window_pos; + int start_window_size; + + int stop_window_pos; + int stop_window_size; + + double *start_cache; + double *stop_cache; + + AVFrame *start_queuef; + int start_queue_pos; + int start_queue_size; + + AVFrame *stop_queuef; + int stop_queue_pos; + int stop_queue_size; + int restart; + int found_nonsilence; int64_t next_pts; int detection; - void (*update)(struct SilenceRemoveContext *s, AVFrame *frame, int ch, int offset); - double (*compute)(struct SilenceRemoveContext *s, AVFrame *frame, int ch, int 
offset); - void (*copy)(struct SilenceRemoveContext *s, AVFrame *out, AVFrame *in, - int ch, int out_offset, int in_offset); - AVAudioFifo *fifo; + float (*compute_flt)(float *c, float s, float ws, int size); + double (*compute_dbl)(double *c, double s, double ws, int size); } SilenceRemoveContext; #define OFFSET(x) offsetof(SilenceRemoveContext, x) @@ -119,7 +116,7 @@ static const AVOption silenceremove_options[] = { { "any", 0, 0, AV_OPT_TYPE_CONST, {.i64=T_ANY}, 0, 0, AF, "mode" }, { "all", 0, 0, AV_OPT_TYPE_CONST, {.i64=T_ALL}, 0, 0, AF, "mode" }, { "stop_periods", "set periods of silence parts to skip from end", OFFSET(stop_periods), AV_OPT_TYPE_INT, {.i64=0}, -9000, 9000, AF }, - { "stop_duration", "set stop duration of non-silence part", OFFSET(stop_duration_opt), AV_OPT_TYPE_DURATION, {.i64=0}, 0, INT32_MAX, AF }, + { "stop_duration", "set stop duration of silence part", OFFSET(stop_duration_opt), AV_OPT_TYPE_DURATION, {.i64=0}, 0, INT32_MAX, AF }, { "stop_threshold", "set threshold for stop silence detection", OFFSET(stop_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, AF }, { "stop_silence", "set stop duration of silence part to keep", OFFSET(stop_silence_opt), AV_OPT_TYPE_DURATION, {.i64=0}, 0, INT32_MAX, AF }, { "stop_mode", "set which channel will trigger trimming from end", OFFSET(stop_mode), AV_OPT_TYPE_INT, {.i64=T_ANY}, T_ANY, T_ALL, AF, "mode" }, @@ -132,281 +129,12 @@ static const AVOption silenceremove_options[] = { AVFILTER_DEFINE_CLASS(silenceremove); -static void copy_double(SilenceRemoveContext *s, AVFrame *out, AVFrame *in, - int ch, int out_offset, int in_offset) -{ - const double *srcp = (const double *)in->data[0]; - const double src = srcp[in->ch_layout.nb_channels * in_offset + ch]; - double *dstp = (double *)out->data[0]; +#define DEPTH 32 +#include "silenceremove_template.c" - dstp[out->ch_layout.nb_channels * out_offset + ch] = src; -} - -static void copy_doublep(SilenceRemoveContext *s, AVFrame *out, AVFrame *in, - int ch, int out_offset, int in_offset) -{ - const double *srcp = (const double *)in->extended_data[ch]; - const double src = srcp[in_offset]; - double *dstp = (double *)out->extended_data[ch]; - - dstp[out_offset] = src; -} - -static void copy_float(SilenceRemoveContext *s, AVFrame *out, AVFrame *in, - int ch, int out_offset, int in_offset) -{ - const float *srcp = (const float *)in->data[0]; - const float src = srcp[in->ch_layout.nb_channels * in_offset + ch]; - float *dstp = (float *)out->data[0]; - - dstp[out->ch_layout.nb_channels * out_offset + ch] = src; -} - -static void copy_floatp(SilenceRemoveContext *s, AVFrame *out, AVFrame *in, - int ch, int out_offset, int in_offset) -{ - const float *srcp = (const float *)in->extended_data[ch]; - const float src = srcp[in_offset]; - float *dstp = (float *)out->extended_data[ch]; - - dstp[out_offset] = src; -} - -static double compute_peak_double(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const double *samples = (const double *)frame->data[0]; - const double *wsamples = (const double *)s->window->data[0]; - double sample = samples[frame->ch_layout.nb_channels * offset + ch]; - double wsample = wsamples[frame->ch_layout.nb_channels * s->window_offset + ch]; - double new_sum; - - new_sum = s->sum; - new_sum -= wsample; - new_sum = fmax(new_sum, 0.); - new_sum += fabs(sample); - - return new_sum / s->window_duration; -} - -static void update_peak_double(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const double *samples = (const double *)frame->data[0]; 
- double *wsamples = (double *)s->window->data[0]; - double sample = samples[frame->ch_layout.nb_channels * offset + ch]; - double *wsample = &wsamples[frame->ch_layout.nb_channels * s->window_offset + ch]; - - s->sum -= *wsample; - s->sum = fmax(s->sum, 0.); - *wsample = fabs(sample); - s->sum += *wsample; -} - -static double compute_peak_float(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const float *samples = (const float *)frame->data[0]; - const float *wsamples = (const float *)s->window->data[0]; - float sample = samples[frame->ch_layout.nb_channels * offset + ch]; - float wsample = wsamples[frame->ch_layout.nb_channels * s->window_offset + ch]; - float new_sum; - - new_sum = s->sum; - new_sum -= wsample; - new_sum = fmaxf(new_sum, 0.f); - new_sum += fabsf(sample); - - return new_sum / s->window_duration; -} - -static void update_peak_float(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const float *samples = (const float *)frame->data[0]; - float *wsamples = (float *)s->window->data[0]; - float sample = samples[frame->ch_layout.nb_channels * offset + ch]; - float *wsample = &wsamples[frame->ch_layout.nb_channels * s->window_offset + ch]; - - s->sum -= *wsample; - s->sum = fmaxf(s->sum, 0.f); - *wsample = fabsf(sample); - s->sum += *wsample; -} - -static double compute_rms_double(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const double *samples = (const double *)frame->data[0]; - const double *wsamples = (const double *)s->window->data[0]; - double sample = samples[frame->ch_layout.nb_channels * offset + ch]; - double wsample = wsamples[frame->ch_layout.nb_channels * s->window_offset + ch]; - double new_sum; - - new_sum = s->sum; - new_sum -= wsample; - new_sum = fmax(new_sum, 0.); - new_sum += sample * sample; - - av_assert2(new_sum >= 0.); - return sqrt(new_sum / s->window_duration); -} - -static void update_rms_double(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const double *samples = (const double *)frame->data[0]; - double *wsamples = (double *)s->window->data[0]; - double sample = samples[frame->ch_layout.nb_channels * offset + ch]; - double *wsample = &wsamples[frame->ch_layout.nb_channels * s->window_offset + ch]; - - s->sum -= *wsample; - s->sum = fmax(s->sum, 0.); - *wsample = sample * sample; - s->sum += *wsample; -} - -static double compute_rms_float(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const float *samples = (const float *)frame->data[0]; - const float *wsamples = (const float *)s->window->data[0]; - float sample = samples[frame->ch_layout.nb_channels * offset + ch]; - float wsample = wsamples[frame->ch_layout.nb_channels * s->window_offset + ch]; - float new_sum; - - new_sum = s->sum; - new_sum -= wsample; - new_sum = fmaxf(new_sum, 0.f); - new_sum += sample * sample; - - av_assert2(new_sum >= 0.f); - return sqrtf(new_sum / s->window_duration); -} - -static void update_rms_float(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const float *samples = (const float *)frame->data[0]; - float sample = samples[frame->ch_layout.nb_channels * offset + ch]; - float *wsamples = (float *)s->window->data[0]; - float *wsample = &wsamples[frame->ch_layout.nb_channels * s->window_offset + ch]; - - s->sum -= *wsample; - s->sum = fmaxf(s->sum, 0.f); - *wsample = sample * sample; - s->sum += *wsample; -} - -static double compute_peak_doublep(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const double *samples = (const double 
*)frame->extended_data[ch]; - const double *wsamples = (const double *)s->window->extended_data[ch]; - double sample = samples[offset]; - double wsample = wsamples[s->window_offset]; - double new_sum; - - new_sum = s->sum; - new_sum -= wsample; - new_sum = fmax(new_sum, 0.); - new_sum += fabs(sample); - - return new_sum / s->window_duration; -} - -static void update_peak_doublep(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const double *samples = (const double *)frame->extended_data[ch]; - double *wsamples = (double *)s->window->extended_data[ch]; - double sample = samples[offset]; - double *wsample = &wsamples[s->window_offset]; - - s->sum -= *wsample; - s->sum = fmax(s->sum, 0.); - *wsample = fabs(sample); - s->sum += *wsample; -} - -static double compute_peak_floatp(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const float *samples = (const float *)frame->extended_data[ch]; - const float *wsamples = (const float *)s->window->extended_data[ch]; - float sample = samples[offset]; - float wsample = wsamples[s->window_offset]; - float new_sum; - - new_sum = s->sum; - new_sum -= wsample; - new_sum = fmaxf(new_sum, 0.f); - new_sum += fabsf(sample); - - return new_sum / s->window_duration; -} - -static void update_peak_floatp(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const float *samples = (const float *)frame->extended_data[ch]; - float *wsamples = (float *)s->window->extended_data[ch]; - float sample = samples[offset]; - float *wsample = &wsamples[s->window_offset]; - - s->sum -= *wsample; - s->sum = fmaxf(s->sum, 0.f); - *wsample = fabsf(sample); - s->sum += *wsample; -} - -static double compute_rms_doublep(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const double *samples = (const double *)frame->extended_data[ch]; - const double *wsamples = (const double *)s->window->extended_data[ch]; - double sample = samples[offset]; - double wsample = wsamples[s->window_offset]; - double new_sum; - - new_sum = s->sum; - new_sum -= wsample; - new_sum = fmax(new_sum, 0.); - new_sum += sample * sample; - - av_assert2(new_sum >= 0.); - return sqrt(new_sum / s->window_duration); -} - -static void update_rms_doublep(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const double *samples = (const double *)frame->extended_data[ch]; - double *wsamples = (double *)s->window->extended_data[ch]; - double sample = samples[offset]; - double *wsample = &wsamples[s->window_offset]; - - s->sum -= *wsample; - s->sum = fmax(s->sum, 0.); - *wsample = sample * sample; - s->sum += *wsample; -} - -static double compute_rms_floatp(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const float *samples = (const float *)frame->extended_data[ch]; - const float *wsamples = (const float *)s->window->extended_data[ch]; - float sample = samples[offset]; - float wsample = wsamples[s->window_offset]; - float new_sum; - - new_sum = s->sum; - new_sum -= wsample; - new_sum = fmaxf(new_sum, 0.f); - new_sum += sample * sample; - - av_assert2(new_sum >= 0.f); - return sqrtf(new_sum / s->window_duration); -} - -static void update_rms_floatp(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset) -{ - const float *samples = (const float *)frame->extended_data[ch]; - float *wsamples = (float *)s->window->extended_data[ch]; - float sample = samples[offset]; - float *wsample = &wsamples[s->window_offset]; - - s->sum -= *wsample; - s->sum = fmaxf(s->sum, 0.f); - *wsample = sample * sample; - s->sum += *wsample; -} +#undef 
DEPTH +#define DEPTH 64 +#include "silenceremove_template.c" static av_cold int init(AVFilterContext *ctx) { @@ -420,13 +148,25 @@ static av_cold int init(AVFilterContext *ctx) return 0; } -static void clear_window(SilenceRemoveContext *s) +static void clear_windows(SilenceRemoveContext *s) { - av_samples_set_silence(s->window->extended_data, 0, s->window_duration, - s->window->ch_layout.nb_channels, s->window->format); + av_samples_set_silence(s->start_window->extended_data, 0, + s->start_window->nb_samples, + s->start_window->ch_layout.nb_channels, + s->start_window->format); + av_samples_set_silence(s->stop_window->extended_data, 0, + s->stop_window->nb_samples, + s->stop_window->ch_layout.nb_channels, + s->stop_window->format); - s->window_offset = 0; - s->sum = 0; + s->start_window_pos = 0; + s->start_window_size = 0; + s->stop_window_pos = 0; + s->stop_window_size = 0; + s->start_queue_pos = 0; + s->start_queue_size = 0; + s->stop_queue_pos = 0; + s->stop_queue_size = 0; } static int config_input(AVFilterLink *inlink) @@ -438,505 +178,204 @@ static int config_input(AVFilterLink *inlink) s->window_duration = av_rescale(s->window_duration_opt, inlink->sample_rate, AV_TIME_BASE); s->window_duration = FFMAX(1, s->window_duration); - s->window = ff_get_audio_buffer(ctx->outputs[0], s->window_duration); - if (!s->window) - return AVERROR(ENOMEM); - - clear_window(s); s->start_duration = av_rescale(s->start_duration_opt, inlink->sample_rate, AV_TIME_BASE); s->start_silence = av_rescale(s->start_silence_opt, inlink->sample_rate, AV_TIME_BASE); - s->stop_duration = av_rescale(s->stop_duration_opt, inlink->sample_rate, + s->stop_duration = av_rescale(s->stop_duration_opt, inlink->sample_rate, AV_TIME_BASE); - s->stop_silence = av_rescale(s->stop_silence_opt, inlink->sample_rate, + s->stop_silence = av_rescale(s->stop_silence_opt, inlink->sample_rate, AV_TIME_BASE); - s->start_holdoff = ff_get_audio_buffer(ctx->outputs[0], - FFMAX(s->start_duration, 1)); - if (!s->start_holdoff) - return AVERROR(ENOMEM); - - s->start_silence_hold = ff_get_audio_buffer(ctx->outputs[0], - FFMAX(s->start_silence, 1)); - if (!s->start_silence_hold) - return AVERROR(ENOMEM); - - s->start_holdoff_offset = 0; - s->start_holdoff_end = 0; - s->start_found_periods = 0; - - s->stop_holdoff = ff_get_audio_buffer(ctx->outputs[0], - FFMAX(s->stop_duration, 1)); - if (!s->stop_holdoff) - return AVERROR(ENOMEM); - - s->stop_silence_hold = ff_get_audio_buffer(ctx->outputs[0], - FFMAX(s->stop_silence, 1)); - if (!s->stop_silence_hold) - return AVERROR(ENOMEM); - - s->stop_holdoff_offset = 0; - s->stop_holdoff_end = 0; + s->start_found_periods = 0; s->stop_found_periods = 0; - if (s->start_periods) { - s->mode = SILENCE_TRIM; - s->one_period = 1; - } else { - s->mode = SILENCE_COPY; - } - - switch (inlink->format) { - case AV_SAMPLE_FMT_DBL: - s->copy = copy_double; - switch (s->detection) { - case D_PEAK: - s->update = update_peak_double; - s->compute = compute_peak_double; - break; - case D_RMS: - s->update = update_rms_double; - s->compute = compute_rms_double; - break; - } - break; - case AV_SAMPLE_FMT_FLT: - s->copy = copy_float; - switch (s->detection) { - case D_PEAK: - s->update = update_peak_float; - s->compute = compute_peak_float; - break; - case D_RMS: - s->update = update_rms_float; - s->compute = compute_rms_float; - break; - } - break; - case AV_SAMPLE_FMT_DBLP: - s->copy = copy_doublep; - switch (s->detection) { - case D_PEAK: - s->update = update_peak_doublep; - s->compute = compute_peak_doublep; - break; - case 
D_RMS: - s->update = update_rms_doublep; - s->compute = compute_rms_doublep; - break; - } - break; - case AV_SAMPLE_FMT_FLTP: - s->copy = copy_floatp; - switch (s->detection) { - case D_PEAK: - s->update = update_peak_floatp; - s->compute = compute_peak_floatp; - break; - case D_RMS: - s->update = update_rms_floatp; - s->compute = compute_rms_floatp; - break; - } - break; - default: - return AVERROR_BUG; - } - - s->fifo = av_audio_fifo_alloc(inlink->format, inlink->ch_layout.nb_channels, 1024); - if (!s->fifo) - return AVERROR(ENOMEM); - return 0; } -static void flush(SilenceRemoveContext *s, - AVFrame *out, AVFilterLink *outlink, - int *nb_samples_written, int flush_silence) +static int config_output(AVFilterLink *outlink) { - AVFrame *silence; + AVFilterContext *ctx = outlink->src; + SilenceRemoveContext *s = ctx->priv; - if (*nb_samples_written) { - out->nb_samples = *nb_samples_written; + s->start_window = ff_get_audio_buffer(outlink, s->window_duration); + s->stop_window = ff_get_audio_buffer(outlink, s->window_duration); + if (!s->start_window || !s->stop_window) + return AVERROR(ENOMEM); - av_audio_fifo_write(s->fifo, (void **)out->extended_data, out->nb_samples); - *nb_samples_written = 0; + s->start_queuef = ff_get_audio_buffer(outlink, s->start_silence + 1); + s->stop_queuef = ff_get_audio_buffer(outlink, s->stop_silence + 1); + if (!s->start_queuef || !s->stop_queuef) + return AVERROR(ENOMEM); + + s->start_cache = av_calloc(outlink->ch_layout.nb_channels, sizeof(*s->start_cache)); + s->stop_cache = av_calloc(outlink->ch_layout.nb_channels, sizeof(*s->stop_cache)); + if (!s->start_cache || !s->stop_cache) + return AVERROR(ENOMEM); + + clear_windows(s); + + switch (s->detection) { + case D_PEAK: + s->compute_flt = compute_peak_flt; + s->compute_dbl = compute_peak_dbl; + break; + case D_RMS: + s->compute_flt = compute_rms_flt; + s->compute_dbl = compute_rms_dbl; + break; } - av_frame_free(&out); - - if (s->stop_silence_end <= 0 || !flush_silence) - return; - - silence = ff_get_audio_buffer(outlink, s->stop_silence_end); - if (!silence) - return; - - if (s->stop_silence_offset < s->stop_silence_end) { - av_samples_copy(silence->extended_data, s->stop_silence_hold->extended_data, 0, - s->stop_silence_offset, - s->stop_silence_end - s->stop_silence_offset, - outlink->ch_layout.nb_channels, outlink->format); - } - - if (s->stop_silence_offset > 0) { - av_samples_copy(silence->extended_data, s->stop_silence_hold->extended_data, - s->stop_silence_end - s->stop_silence_offset, - 0, s->stop_silence_offset, - outlink->ch_layout.nb_channels, outlink->format); - } - - s->stop_silence_offset = 0; - s->stop_silence_end = 0; - - av_audio_fifo_write(s->fifo, (void **)silence->extended_data, silence->nb_samples); - av_frame_free(&silence); + return 0; } -static int filter_frame(AVFilterLink *inlink, AVFrame *in) +static int filter_frame(AVFilterLink *outlink, AVFrame *in) { - AVFilterContext *ctx = inlink->dst; - AVFilterLink *outlink = ctx->outputs[0]; + const int nb_channels = outlink->ch_layout.nb_channels; + AVFilterContext *ctx = outlink->src; SilenceRemoveContext *s = ctx->priv; - int nbs, nb_samples_read, nb_samples_written; - int i, j, threshold, ret = 0; + int max_out_nb_samples; + int out_nb_samples = 0; + int in_nb_samples; + const double *srcd; + const float *srcf; AVFrame *out; - - nb_samples_read = nb_samples_written = 0; + double *dstd; + float *dstf; if (s->next_pts == AV_NOPTS_VALUE) s->next_pts = in->pts; - switch (s->mode) { - case SILENCE_TRIM: -silence_trim: - nbs = 
in->nb_samples - nb_samples_read; - if (!nbs) - break; + in_nb_samples = in->nb_samples; + max_out_nb_samples = in->nb_samples + + s->start_silence + + s->stop_silence; + if (max_out_nb_samples <= 0) { + av_frame_free(&in); + ff_filter_set_ready(ctx, 100); + return 0; + } - for (i = 0; i < nbs; i++) { - if (s->start_mode == T_ANY) { - threshold = 0; - for (j = 0; j < outlink->ch_layout.nb_channels; j++) { - threshold |= s->compute(s, in, j, nb_samples_read) > s->start_threshold; - } - } else { - threshold = 1; - for (j = 0; j < outlink->ch_layout.nb_channels; j++) { - threshold &= s->compute(s, in, j, nb_samples_read) > s->start_threshold; - } + out = ff_get_audio_buffer(outlink, max_out_nb_samples); + if (!out) { + av_frame_free(&in); + return AVERROR(ENOMEM); + } + + out->pts = s->next_pts; + + switch (outlink->format) { + case AV_SAMPLE_FMT_FLT: + srcf = (const float *)in->data[0]; + dstf = (float *)out->data[0]; + if (s->start_periods > 0 && s->stop_periods > 0) { + for (int n = 0; n < in_nb_samples; n++) { + filter_start_flt(ctx, srcf + n * nb_channels, + dstf, &out_nb_samples, + nb_channels); } - - if (threshold) { - for (j = 0; j < outlink->ch_layout.nb_channels; j++) { - s->update(s, in, j, nb_samples_read); - s->copy(s, s->start_holdoff, in, j, s->start_holdoff_end, nb_samples_read); - } - - s->window_offset++; - if (s->window_offset >= s->window_duration) - s->window_offset = 0; - s->start_holdoff_end++; - nb_samples_read++; - - if (s->start_holdoff_end >= s->start_duration) { - s->start_found_periods += s->one_period >= 1; - s->one_period = 0; - if (s->start_found_periods >= s->start_periods) { - s->mode = SILENCE_TRIM_FLUSH; - goto silence_trim_flush; - } - - s->start_holdoff_offset = 0; - s->start_holdoff_end = 0; - s->start_silence_offset = 0; - s->start_silence_end = 0; - } - } else { - s->start_holdoff_end = 0; - s->one_period++; - - for (j = 0; j < outlink->ch_layout.nb_channels; j++) { - s->update(s, in, j, nb_samples_read); - if (s->start_silence) - s->copy(s, s->start_silence_hold, in, j, s->start_silence_offset, nb_samples_read); - } - - s->window_offset++; - if (s->window_offset >= s->window_duration) - s->window_offset = 0; - nb_samples_read++; - s->start_silence_offset++; - - if (s->start_silence) { - s->start_silence_end = FFMIN(s->start_silence_end + 1, s->start_silence); - if (s->start_silence_offset >= s->start_silence) - s->start_silence_offset = 0; - } + in_nb_samples = out_nb_samples; + out_nb_samples = 0; + for (int n = 0; n < in_nb_samples; n++) { + filter_stop_flt(ctx, dstf + n * nb_channels, + dstf, &out_nb_samples, + nb_channels); + } + } else if (s->start_periods > 0) { + for (int n = 0; n < in_nb_samples; n++) { + filter_start_flt(ctx, srcf + n * nb_channels, + dstf, &out_nb_samples, + nb_channels); + } + } else if (s->stop_periods > 0) { + for (int n = 0; n < in_nb_samples; n++) { + filter_stop_flt(ctx, srcf + n * nb_channels, + dstf, &out_nb_samples, + nb_channels); } } break; - - case SILENCE_TRIM_FLUSH: -silence_trim_flush: - nbs = s->start_holdoff_end - s->start_holdoff_offset; - if (!nbs) - break; - - out = ff_get_audio_buffer(outlink, nbs + s->start_silence_end); - if (!out) { - av_frame_free(&in); - return AVERROR(ENOMEM); - } - - if (s->start_silence_end > 0) { - if (s->start_silence_offset < s->start_silence_end) { - av_samples_copy(out->extended_data, s->start_silence_hold->extended_data, 0, - s->start_silence_offset, - s->start_silence_end - s->start_silence_offset, - outlink->ch_layout.nb_channels, outlink->format); + case 
AV_SAMPLE_FMT_DBL: + srcd = (const double *)in->data[0]; + dstd = (double *)out->data[0]; + if (s->start_periods > 0 && s->stop_periods > 0) { + for (int n = 0; n < in_nb_samples; n++) { + filter_start_dbl(ctx, srcd + n * nb_channels, + dstd, &out_nb_samples, + nb_channels); } - - if (s->start_silence_offset > 0) { - av_samples_copy(out->extended_data, s->start_silence_hold->extended_data, - s->start_silence_end - s->start_silence_offset, - 0, s->start_silence_offset, - outlink->ch_layout.nb_channels, outlink->format); + in_nb_samples = out_nb_samples; + out_nb_samples = 0; + for (int n = 0; n < in_nb_samples; n++) { + filter_stop_dbl(ctx, dstd + n * nb_channels, + dstd, &out_nb_samples, + nb_channels); + } + } else if (s->start_periods > 0) { + for (int n = 0; n < in_nb_samples; n++) { + filter_start_dbl(ctx, srcd + n * nb_channels, + dstd, &out_nb_samples, + nb_channels); + } + } else if (s->stop_periods > 0) { + for (int n = 0; n < in_nb_samples; n++) { + filter_stop_dbl(ctx, srcd + n * nb_channels, + dstd, &out_nb_samples, + nb_channels); } } - - av_samples_copy(out->extended_data, s->start_holdoff->extended_data, - s->start_silence_end, - s->start_holdoff_offset, nbs, - outlink->ch_layout.nb_channels, outlink->format); - - s->start_holdoff_offset += nbs; - - av_audio_fifo_write(s->fifo, (void **)out->extended_data, out->nb_samples); - av_frame_free(&out); - - if (s->start_holdoff_offset == s->start_holdoff_end) { - s->start_holdoff_offset = 0; - s->start_holdoff_end = 0; - s->start_silence_offset = 0; - s->start_silence_end = 0; - s->mode = SILENCE_COPY; - goto silence_copy; - } break; - - case SILENCE_COPY: -silence_copy: - nbs = in->nb_samples - nb_samples_read; - if (!nbs) - break; - - out = ff_get_audio_buffer(outlink, nbs); - if (!out) { - av_frame_free(&in); - return AVERROR(ENOMEM); - } - - if (s->stop_periods) { - for (i = 0; i < nbs; i++) { - if (s->stop_mode == T_ANY) { - threshold = 0; - for (j = 0; j < outlink->ch_layout.nb_channels; j++) { - threshold |= s->compute(s, in, j, nb_samples_read) > s->stop_threshold; - } - } else { - threshold = 1; - for (j = 0; j < outlink->ch_layout.nb_channels; j++) { - threshold &= s->compute(s, in, j, nb_samples_read) > s->stop_threshold; - } - } - - if (threshold && s->stop_holdoff_end && !s->stop_silence) { - s->mode = SILENCE_COPY_FLUSH; - flush(s, out, outlink, &nb_samples_written, 0); - s->one_period++; - goto silence_copy_flush; - } else if (threshold) { - for (j = 0; j < outlink->ch_layout.nb_channels; j++) { - s->update(s, in, j, nb_samples_read); - s->copy(s, out, in, j, nb_samples_written, nb_samples_read); - } - - s->window_offset++; - if (s->window_offset >= s->window_duration) - s->window_offset = 0; - nb_samples_read++; - nb_samples_written++; - s->one_period++; - } else if (!threshold) { - for (j = 0; j < outlink->ch_layout.nb_channels; j++) { - s->update(s, in, j, nb_samples_read); - if (s->stop_silence) - s->copy(s, s->stop_silence_hold, in, j, s->stop_silence_offset, nb_samples_read); - - s->copy(s, s->stop_holdoff, in, j, s->stop_holdoff_end, nb_samples_read); - } - - if (s->stop_silence) { - s->stop_silence_offset++; - s->stop_silence_end = FFMIN(s->stop_silence_end + 1, s->stop_silence); - if (s->stop_silence_offset >= s->stop_silence) { - s->stop_silence_offset = 0; - } - } - - s->window_offset++; - if (s->window_offset >= s->window_duration) - s->window_offset = 0; - nb_samples_read++; - s->stop_holdoff_end++; - - if (s->stop_holdoff_end >= s->stop_duration) { - s->stop_found_periods += s->one_period >= 1; - 
s->one_period = 0; - if (s->stop_found_periods >= s->stop_periods) { - s->stop_holdoff_offset = 0; - s->stop_holdoff_end = 0; - - if (!s->restart) { - s->mode = SILENCE_STOP; - flush(s, out, outlink, &nb_samples_written, 1); - goto silence_stop; - } else { - s->stop_found_periods = 0; - s->start_found_periods = 0; - s->start_holdoff_offset = 0; - s->start_holdoff_end = 0; - s->start_silence_offset = 0; - s->start_silence_end = 0; - clear_window(s); - s->mode = SILENCE_TRIM; - flush(s, out, outlink, &nb_samples_written, 1); - goto silence_trim; - } - } - s->mode = SILENCE_COPY_FLUSH; - flush(s, out, outlink, &nb_samples_written, 0); - goto silence_copy_flush; - } - } - } - s->one_period++; - flush(s, out, outlink, &nb_samples_written, 0); - } else { - av_samples_copy(out->extended_data, in->extended_data, - nb_samples_written, - nb_samples_read, nbs, - outlink->ch_layout.nb_channels, outlink->format); - - av_audio_fifo_write(s->fifo, (void **)out->extended_data, out->nb_samples); - av_frame_free(&out); - } - break; - - case SILENCE_COPY_FLUSH: -silence_copy_flush: - nbs = s->stop_holdoff_end - s->stop_holdoff_offset; - if (!nbs) - break; - - out = ff_get_audio_buffer(outlink, nbs); - if (!out) { - av_frame_free(&in); - return AVERROR(ENOMEM); - } - - av_samples_copy(out->extended_data, s->stop_holdoff->extended_data, 0, - s->stop_holdoff_offset, nbs, - outlink->ch_layout.nb_channels, outlink->format); - - s->stop_holdoff_offset += nbs; - - av_audio_fifo_write(s->fifo, (void **)out->extended_data, out->nb_samples); - av_frame_free(&out); - - if (s->stop_holdoff_offset == s->stop_holdoff_end) { - s->stop_holdoff_offset = 0; - s->stop_holdoff_end = 0; - s->stop_silence_offset = 0; - s->stop_silence_end = 0; - s->mode = SILENCE_COPY; - goto silence_copy; - } - break; - case SILENCE_STOP: -silence_stop: - break; - default: - ret = AVERROR_BUG; } av_frame_free(&in); - - if (av_audio_fifo_size(s->fifo) > 0) { - out = ff_get_audio_buffer(outlink, av_audio_fifo_size(s->fifo)); - if (!out) - return AVERROR(ENOMEM); - - av_audio_fifo_read(s->fifo, (void **)out->extended_data, out->nb_samples); - out->pts = s->next_pts; - s->next_pts += av_rescale_q(out->nb_samples, - (AVRational){1, outlink->sample_rate}, - outlink->time_base); - - ret = ff_filter_frame(outlink, out); + if (out_nb_samples > 0) { + s->next_pts += out_nb_samples; + out->nb_samples = out_nb_samples; + return ff_filter_frame(outlink, out); } - return ret; + av_frame_free(&out); + ff_filter_set_ready(ctx, 100); + + return 0; } -static int request_frame(AVFilterLink *outlink) +static int activate(AVFilterContext *ctx) { - AVFilterContext *ctx = outlink->src; + AVFilterLink *outlink = ctx->outputs[0]; + AVFilterLink *inlink = ctx->inputs[0]; SilenceRemoveContext *s = ctx->priv; + AVFrame *in; int ret; - ret = ff_request_frame(ctx->inputs[0]); - if (ret == AVERROR_EOF && (s->mode == SILENCE_COPY_FLUSH || - s->mode == SILENCE_COPY)) { - int nbs = s->stop_holdoff_end - s->stop_holdoff_offset; - if (nbs) { - AVFrame *frame; + FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink); - frame = ff_get_audio_buffer(outlink, nbs); - if (!frame) - return AVERROR(ENOMEM); - - av_samples_copy(frame->extended_data, s->stop_holdoff->extended_data, 0, - s->stop_holdoff_offset, nbs, - outlink->ch_layout.nb_channels, outlink->format); - - frame->pts = s->next_pts; - s->next_pts += av_rescale_q(frame->nb_samples, - (AVRational){1, outlink->sample_rate}, - outlink->time_base); - - ret = ff_filter_frame(outlink, frame); - } - s->mode = SILENCE_STOP; + ret = 
ff_inlink_consume_frame(inlink, &in); + if (ret < 0) + return ret; + if (ret > 0) { + if (s->start_periods == 0 && s->stop_periods == 0) + return ff_filter_frame(outlink, in); + return filter_frame(outlink, in); } - return ret; + + FF_FILTER_FORWARD_STATUS(inlink, outlink); + FF_FILTER_FORWARD_WANTED(outlink, inlink); + + return FFERROR_NOT_READY; } static av_cold void uninit(AVFilterContext *ctx) { SilenceRemoveContext *s = ctx->priv; - av_frame_free(&s->start_holdoff); - av_frame_free(&s->start_silence_hold); - av_frame_free(&s->stop_holdoff); - av_frame_free(&s->stop_silence_hold); - av_frame_free(&s->window); - - av_audio_fifo_free(s->fifo); - s->fifo = NULL; + av_frame_free(&s->start_window); + av_frame_free(&s->stop_window); + av_frame_free(&s->start_queuef); + av_frame_free(&s->stop_queuef); + av_freep(&s->start_cache); + av_freep(&s->stop_cache); } static const AVFilterPad silenceremove_inputs[] = { @@ -944,15 +383,14 @@ static const AVFilterPad silenceremove_inputs[] = { .name = "default", .type = AVMEDIA_TYPE_AUDIO, .config_props = config_input, - .filter_frame = filter_frame, }, }; static const AVFilterPad silenceremove_outputs[] = { { - .name = "default", - .type = AVMEDIA_TYPE_AUDIO, - .request_frame = request_frame, + .name = "default", + .type = AVMEDIA_TYPE_AUDIO, + .config_props = config_output, }, }; @@ -962,9 +400,10 @@ const AVFilter ff_af_silenceremove = { .priv_size = sizeof(SilenceRemoveContext), .priv_class = &silenceremove_class, .init = init, + .activate = activate, .uninit = uninit, FILTER_INPUTS(silenceremove_inputs), FILTER_OUTPUTS(silenceremove_outputs), - FILTER_SAMPLEFMTS(AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP, - AV_SAMPLE_FMT_DBL, AV_SAMPLE_FMT_DBLP), + FILTER_SAMPLEFMTS(AV_SAMPLE_FMT_FLT, + AV_SAMPLE_FMT_DBL), }; diff --git a/libavfilter/silenceremove_template.c b/libavfilter/silenceremove_template.c new file mode 100644 index 0000000000..1a12435ee6 --- /dev/null +++ b/libavfilter/silenceremove_template.c @@ -0,0 +1,293 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#undef ftype +#undef FABS +#undef FMAX +#undef SAMPLE_FORMAT +#undef SQRT +#undef ZERO +#if DEPTH == 32 +#define SAMPLE_FORMAT flt +#define SQRT sqrtf +#define FMAX fmaxf +#define FABS fabsf +#define ftype float +#define ZERO 0.f +#else +#define SAMPLE_FORMAT dbl +#define SQRT sqrt +#define FMAX fmax +#define FABS fabs +#define ftype double +#define ZERO 0.0 +#endif + +#define fn3(a,b) a##_##b +#define fn2(a,b) fn3(a,b) +#define fn(a) fn2(a, SAMPLE_FORMAT) + +static void fn(flush)(ftype *dst, const ftype *src, int src_pos, + int nb_channels, int count, int src_nb_samples, + int *out_nb_samples) +{ + int oidx, out_count = count; + int sidx = src_pos; + + if (count <= 0) + return; + + oidx = *out_nb_samples + out_count - 1; + *out_nb_samples += out_count; + while (out_count-- > 0) { + const int spos = sidx * nb_channels; + const int opos = oidx * nb_channels; + + for (int ch = 0; ch < nb_channels; ch++) + dst[opos + ch] = src[spos + ch]; + + oidx--; + sidx--; + if (sidx < 0) + sidx = src_nb_samples - 1; + } +} + +static void fn(queue_sample)(AVFilterContext *ctx, + const ftype *src, + ftype *queue, + int *queue_pos, + int *queue_size, + int *window_pos, + int *window_size, + const int nb_channels, + const int nb_samples, + const int window_nb_samples) +{ + const int pos = *queue_pos * nb_channels; + + for (int ch = 0; ch < nb_channels; ch++) + queue[pos + ch] = src[ch]; + + (*queue_pos)++; + if (*queue_pos >= nb_samples) + *queue_pos = 0; + + if (*queue_size < nb_samples) + (*queue_size)++; + + if (*window_size < window_nb_samples) + (*window_size)++; + + (*window_pos)++; + if (*window_pos >= window_nb_samples) + *window_pos = 0; +} + +static ftype fn(compute_peak)(ftype *cache, ftype sample, ftype wsample, + int window_size) +{ + ftype r; + + cache[0] += FABS(sample); + cache[0] -= FABS(wsample); + cache[0] = r = FMAX(cache[0], ZERO); + + return r / window_size; +} + +static ftype fn(compute_rms)(ftype *cache, ftype sample, ftype wsample, + int window_size) +{ + ftype r; + + cache[0] += sample * sample; + cache[0] -= wsample * wsample; + cache[0] = r = FMAX(cache[0], ZERO); + + return SQRT(r / window_size); +} + +static void fn(filter_start)(AVFilterContext *ctx, + const ftype *src, ftype *dst, + int *nb_out_samples, + const int nb_channels) +{ + SilenceRemoveContext *s = ctx->priv; + const int start_periods = s->start_periods; + int out_nb_samples = *nb_out_samples; + const int start_window_nb_samples = s->start_window->nb_samples; + const int start_nb_samples = s->start_queuef->nb_samples; + const int start_wpos = s->start_window_pos * nb_channels; + const int start_pos = s->start_queue_pos * nb_channels; + ftype *startw = (ftype *)s->start_window->data[0]; + ftype *start = (ftype *)s->start_queuef->data[0]; + const ftype start_threshold = s->start_threshold; + const int start_mode = s->start_mode; + int start_thres = (start_mode == T_ANY) ? 
0 : 1; + const int start_duration = s->start_duration; + ftype *start_cache = (ftype *)s->start_cache; + const int start_silence = s->start_silence; + + fn(queue_sample)(ctx, src, start, + &s->start_queue_pos, + &s->start_queue_size, + &s->start_window_pos, + &s->start_window_size, + nb_channels, + start_nb_samples, + start_window_nb_samples); + + for (int ch = 0; ch < nb_channels; ch++) { + ftype start_sample = start[start_pos + ch]; + ftype start_ow = startw[start_wpos + ch]; + ftype tstart; + + tstart = fn(s->compute)(start_cache + ch, + start_sample, + start_ow, + s->start_window_size); + + startw[start_wpos + ch] = start_sample; + + if (start_mode == T_ANY) { + start_thres |= tstart > start_threshold; + } else { + start_thres &= tstart > start_threshold; + } + } + + if (s->start_found_periods >= 0) { + if (start_silence > 0) { + s->start_silence_count++; + if (s->start_silence_count > start_silence) + s->start_silence_count = start_silence; + } + + s->start_sample_count += start_thres; + } + + if (s->start_sample_count > start_duration) { + s->start_found_periods++; + if (s->start_found_periods >= start_periods) { + fn(flush)(dst, start, s->start_queue_pos, nb_channels, + s->start_silence_count, start_nb_samples, + &out_nb_samples); + s->start_silence_count = 0; + s->start_found_periods = -1; + } + + s->start_sample_count = 0; + } + + if (s->start_found_periods < 0) { + const int dst_pos = out_nb_samples * nb_channels; + for (int ch = 0; ch < nb_channels; ch++) + dst[dst_pos + ch] = start[start_pos + ch]; + out_nb_samples++; + } + + *nb_out_samples = out_nb_samples; +} + +static void fn(filter_stop)(AVFilterContext *ctx, + const ftype *src, ftype *dst, + int *nb_out_samples, + const int nb_channels) +{ + SilenceRemoveContext *s = ctx->priv; + const int stop_periods = s->stop_periods; + int out_nb_samples = *nb_out_samples; + const int stop_window_nb_samples = s->stop_window->nb_samples; + const int stop_nb_samples = s->stop_queuef->nb_samples; + const int stop_wpos = s->stop_window_pos * nb_channels; + const int stop_pos = s->stop_queue_pos * nb_channels; + ftype *stopw = (ftype *)s->stop_window->data[0]; + const ftype stop_threshold = s->stop_threshold; + ftype *stop = (ftype *)s->stop_queuef->data[0]; + const int stop_mode = s->stop_mode; + int stop_thres = (stop_mode == T_ANY) ? 
0 : 1; + const int stop_duration = s->stop_duration; + ftype *stop_cache = (ftype *)s->stop_cache; + const int stop_silence = s->stop_silence; + const int restart = s->restart; + + fn(queue_sample)(ctx, src, stop, + &s->stop_queue_pos, + &s->stop_queue_size, + &s->stop_window_pos, + &s->stop_window_size, + nb_channels, + stop_nb_samples, + stop_window_nb_samples); + + for (int ch = 0; ch < nb_channels; ch++) { + ftype stop_sample = stop[stop_pos + ch]; + ftype stop_ow = stopw[stop_wpos + ch]; + ftype tstop; + + tstop = fn(s->compute)(stop_cache + ch, + stop_sample, + stop_ow, + s->stop_window_size); + + stopw[stop_wpos + ch] = stop_sample; + + if (stop_mode == T_ANY) { + stop_thres |= tstop <= stop_threshold; + } else { + stop_thres &= tstop <= stop_threshold; + } + } + + s->found_nonsilence = FFMAX(s->found_nonsilence, !stop_thres); + if (restart && !stop_thres) + s->stop_found_periods = 0; + + if (s->stop_found_periods >= 0) { + if (s->found_nonsilence) { + s->stop_sample_count += stop_thres; + s->stop_sample_count *= stop_thres; + } + } else if (s->stop_silence_count > 0) { + const int dst_pos = out_nb_samples * nb_channels; + for (int ch = 0; ch < nb_channels; ch++) + dst[dst_pos + ch] = stop[stop_pos + ch]; + s->stop_silence_count--; + out_nb_samples++; + } + + if (s->stop_sample_count > stop_duration) { + s->stop_found_periods++; + if (s->stop_found_periods >= stop_periods) { + s->stop_found_periods = -1; + s->stop_silence_count = stop_silence; + } + + s->stop_sample_count = 0; + } + + if (s->stop_found_periods >= 0) { + const int dst_pos = out_nb_samples * nb_channels; + for (int ch = 0; ch < nb_channels; ch++) + dst[dst_pos + ch] = stop[stop_pos + ch]; + out_nb_samples++; + } + + *nb_out_samples = out_nb_samples; +}
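
Editor's note (not part of the patch): the rewritten detector in silenceremove_template.c keeps a per-channel running sum ("cache") so the windowed peak/RMS measurement costs O(1) per sample instead of rescanning the whole analysis window. The standalone sketch below applies the same idea to a mono float buffer; the buffer contents, the window length and the 0.1 threshold are made up for illustration and are not taken from the patch.

/* Sliding-window RMS with a running sum, mirroring compute_rms_flt()
 * from silenceremove_template.c: add the newest squared sample, drop
 * the squared sample leaving the window, clamp against rounding drift.
 * Build with: cc sketch.c -lm */
#include <math.h>
#include <stdio.h>

#define WINDOW 4  /* hypothetical analysis window, in samples */

static float windowed_rms(float *cache, float sample, float wsample, int size)
{
    *cache += sample * sample;   /* newest sample enters the window   */
    *cache -= wsample * wsample; /* oldest sample leaves the window   */
    *cache  = fmaxf(*cache, 0.f);/* guard against negative drift      */
    return sqrtf(*cache / size);
}

int main(void)
{
    const float src[] = { 0.f, 0.f, 0.5f, -0.5f, 0.f, 0.f, 0.f, 0.f };
    float window[WINDOW] = { 0.f }; /* circular buffer of past samples */
    float cache = 0.f;              /* running sum of squares          */
    int pos = 0, size = 0;

    for (int n = 0; n < 8; n++) {
        if (size < WINDOW)
            size++;                 /* window grows until it is full   */
        float rms = windowed_rms(&cache, src[n], window[pos], size);
        window[pos] = src[n];       /* overwrite the evicted sample    */
        pos = (pos + 1) % WINDOW;
        printf("n=%d rms=%f silent=%d\n", n, rms, rms <= 0.1f);
    }
    return 0;
}

For reference, the doc/filters.texi hunk above (negative stop_periods restarting detection) corresponds to invocations such as the following, with arbitrary option values:

ffmpeg -i in.wav -af silenceremove=start_periods=1:start_threshold=0.02:stop_periods=-1:stop_duration=1:stop_threshold=0.02 out.wav

which trims leading silence and removes any silent stretch longer than one second from the middle of the stream.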