From 0df0e12d025bf98e5869dc5716e296d0768bccd7 Mon Sep 17 00:00:00 2001
From: Paul B Mahol <onemda@gmail.com>
Date: Thu, 15 Oct 2020 17:29:04 +0200
Subject: [PATCH] avfilter/af_aiir: implement parallel processing

---
 doc/filters.texi      |  12 ++-
 libavfilter/af_aiir.c | 234 +++++++++++++++++++++++++++++++++++++++---
 2 files changed, 228 insertions(+), 18 deletions(-)

diff --git a/doc/filters.texi b/doc/filters.texi
index 8404f4fb9a..50ef692077 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -1421,8 +1421,16 @@ S-plane zeros/poles
 @end table
 
 @item process, r
-Set kind of processing.
-Can be @code{d} - direct or @code{s} - serial cascading. Default is @code{s}.
+Set type of processing.
+
+@table @samp
+@item d
+direct processing
+@item s
+serial processing
+@item p
+parallel processing
+@end table
 
 @item precision, e
 Set filtering precision.
diff --git a/libavfilter/af_aiir.c b/libavfilter/af_aiir.c
index d2f3fc7374..4b6e867819 100644
--- a/libavfilter/af_aiir.c
+++ b/libavfilter/af_aiir.c
@@ -49,6 +49,7 @@ typedef struct IIRChannel {
     double *ab[2];
     double g;
     double *cache[2];
+    double fir;
     BiquadContext *biquads;
     int clippings;
 } IIRChannel;
@@ -183,6 +184,7 @@ static int iir_ch_serial_## name(AVFilterContext *ctx, void *arg, int ch, int nb
     const double ig = s->dry_gain;                                      \
     const double og = s->wet_gain;                                      \
     const double mix = s->mix;                                          \
+    const double imix = 1. - mix;                                       \
     ThreadData *td = arg;                                               \
     AVFrame *in = td->in, *out = td->out;                               \
     const type *src = (const type *)in->extended_data[ch];              \
@@ -205,16 +207,16 @@ static int iir_ch_serial_## name(AVFilterContext *ctx, void *arg, int ch, int nb
         double o2 = iir->biquads[i].o2;                                 \
                                                                         \
         for (n = 0; n < in->nb_samples; n++) {                          \
-            double sample = ig * (i ? dst[n] : src[n]);                 \
-            double o0 = sample * b0 + i1 * b1 + i2 * b2 + o1 * a1 + o2 * a2; \
+            double i0 = ig * (i ? dst[n] : src[n]);                     \
+            double o0 = i0 * b0 + i1 * b1 + i2 * b2 + o1 * a1 + o2 * a2; \
                                                                         \
             i2 = i1;                                                    \
-            i1 = src[n];                                                \
+            i1 = i0;                                                    \
             o2 = o1;                                                    \
             o1 = o0;                                                    \
             o0 *= og * g;                                               \
                                                                         \
-            o0 = o0 * mix + (1. - mix) * sample;                        \
+            o0 = o0 * mix + imix * i0;                                  \
             if (need_clipping && o0 < min) {                            \
                 (*clippings)++;                                         \
                 dst[n] = min;                                           \
@@ -239,6 +241,76 @@ SERIAL_IIR_CH(s32p, int32_t, INT32_MIN, INT32_MAX, 1)
 SERIAL_IIR_CH(fltp, float,         -1.,        1., 0)
 SERIAL_IIR_CH(dblp, double,        -1.,        1., 0)
 
+#define PARALLEL_IIR_CH(name, type, min, max, need_clipping)            \
+static int iir_ch_parallel_## name(AVFilterContext *ctx, void *arg,     \
+                                   int ch, int nb_jobs)                 \
+{                                                                       \
+    AudioIIRContext *s = ctx->priv;                                     \
+    const double ig = s->dry_gain;                                      \
+    const double og = s->wet_gain;                                      \
+    const double mix = s->mix;                                          \
+    const double imix = 1. - mix;                                       \
+    ThreadData *td = arg;                                               \
+    AVFrame *in = td->in, *out = td->out;                               \
+    const type *src = (const type *)in->extended_data[ch];              \
+    type *dst = (type *)out->extended_data[ch];                         \
+    IIRChannel *iir = &s->iir[ch];                                      \
+    const double g = iir->g;                                            \
+    const double fir = iir->fir;                                        \
+    int *clippings = &iir->clippings;                                   \
+    int nb_biquads = (FFMAX(iir->nb_ab[0], iir->nb_ab[1]) + 1) / 2;     \
+    int n, i;                                                           \
+                                                                        \
+    for (i = 0; i < nb_biquads; i++) {                                  \
+        const double a1 = -iir->biquads[i].a[1];                        \
+        const double a2 = -iir->biquads[i].a[2];                        \
+        const double b1 = iir->biquads[i].b[1];                         \
+        const double b2 = iir->biquads[i].b[2];                         \
+        double i1 = iir->biquads[i].i1;                                 \
+        double i2 = iir->biquads[i].i2;                                 \
+        double o1 = iir->biquads[i].o1;                                 \
+        double o2 = iir->biquads[i].o2;                                 \
+                                                                        \
+        for (n = 0; n < in->nb_samples; n++) {                          \
+            double i0 = ig * src[n];                                    \
+            double o0 = i1 * b1 + i2 * b2 + o1 * a1 + o2 * a2;          \
+                                                                        \
+            i2 = i1;                                                    \
+            i1 = i0;                                                    \
+            o2 = o1;                                                    \
+            o1 = o0;                                                    \
+            o0 *= og * g;                                               \
+            o0 += dst[n];                                               \
+                                                                        \
+            if (need_clipping && o0 < min) {                            \
+                (*clippings)++;                                         \
+                dst[n] = min;                                           \
+            } else if (need_clipping && o0 > max) {                     \
+                (*clippings)++;                                         \
+                dst[n] = max;                                           \
+            } else {                                                    \
+                dst[n] = o0;                                            \
+            }                                                           \
+        }                                                               \
+        iir->biquads[i].i1 = i1;                                        \
+        iir->biquads[i].i2 = i2;                                        \
+        iir->biquads[i].o1 = o1;                                        \
+        iir->biquads[i].o2 = o2;                                        \
+    }                                                                   \
+                                                                        \
+    for (n = 0; n < in->nb_samples; n++) {                              \
+        dst[n] += fir * src[n];                                         \
+        dst[n] = dst[n] * mix + imix * src[n];                          \
+    }                                                                   \
+                                                                        \
+    return 0;                                                           \
+}
+
+PARALLEL_IIR_CH(s16p, int16_t, INT16_MIN, INT16_MAX, 1)
+PARALLEL_IIR_CH(s32p, int32_t, INT32_MIN, INT32_MAX, 1)
+PARALLEL_IIR_CH(fltp, float,         -1.,        1., 0)
+PARALLEL_IIR_CH(dblp, double,        -1.,        1., 0)
+
 static void count_coefficients(char *item_str, int *nb_items)
 {
     char *p;
@@ -656,6 +728,128 @@ static int decompose_zp2biquads(AVFilterContext *ctx, int channels)
     return 0;
 }
 
+static void biquad_process(double *x, double *y, int length,
+                           double b0, double b1, double b2,
+                           double a1, double a2)
+{
+    double w1 = 0., w2 = 0.;
+
+    a1 = -a1;
+    a2 = -a2;
+
+    for (int n = 0; n < length; n++) {
+        double out, in = x[n];
+
+        y[n] = out = in * b0 + w1;
+        w1 = b1 * in + w2 + a1 * out;
+        w2 = b2 * in + a2 * out;
+    }
+}
+
+static void solve(double *matrix, double *vector, int n, double *y, double *x, double *lu)
+{
+    double sum = 0.;
+
+    for (int i = 0; i < n; i++) {
+        for (int j = i; j < n; j++) {
+            sum = 0.;
+            for (int k = 0; k < i; k++)
+                sum += lu[i * n + k] * lu[k * n + j];
+            lu[i * n + j] = matrix[j * n + i] - sum;
+        }
+        for (int j = i + 1; j < n; j++) {
+            sum = 0.;
+            for (int k = 0; k < i; k++)
+                sum += lu[j * n + k] * lu[k * n + i];
+            lu[j * n + i] = (1. / lu[i * n + i]) * (matrix[i * n + j] - sum);
+        }
+    }
+
+    for (int i = 0; i < n; i++) {
+        sum = 0.;
+        for (int k = 0; k < i; k++)
+            sum += lu[i * n + k] * y[k];
+        y[i] = vector[i] - sum;
+    }
+
+    for (int i = n - 1; i >= 0; i--) {
+        sum = 0.;
+        for (int k = i + 1; k < n; k++)
+            sum += lu[i * n + k] * x[k];
+        x[i] = (1 / lu[i * n + i]) * (y[i] - sum);
+    }
+}
+
+static int convert_serial2parallel(AVFilterContext *ctx, int channels)
+{
+    AudioIIRContext *s = ctx->priv;
+    int ret = 0;
+
+    for (int ch = 0; ch < channels; ch++) {
+        IIRChannel *iir = &s->iir[ch];
+        int nb_biquads = (FFMAX(iir->nb_ab[0], iir->nb_ab[1]) + 1) / 2;
+        int length = nb_biquads * 2 + 1;
+        double *impulse = av_calloc(length, sizeof(*impulse));
+        double *y = av_calloc(length, sizeof(*y));
+        double *resp = av_calloc(length, sizeof(*resp));
+        double *M = av_calloc((length - 1) * 2 * nb_biquads, sizeof(*M));
+        double *W = av_calloc((length - 1) * 2 * nb_biquads, sizeof(*W));
+
+        if (!impulse || !y || !resp || !M) {
+            av_free(impulse);
+            av_free(y);
+            av_free(resp);
+            av_free(M);
+            av_free(W);
+            return AVERROR(ENOMEM);
+        }
+
+        impulse[0] = 1.;
+
+        for (int n = 0; n < nb_biquads; n++) {
+            BiquadContext *biquad = &iir->biquads[n];
+
+            biquad_process(n ? y : impulse, y, length,
+                           biquad->b[0], biquad->b[1], biquad->b[2],
+                           biquad->a[1], biquad->a[2]);
+        }
+
+        for (int n = 0; n < nb_biquads; n++) {
+            BiquadContext *biquad = &iir->biquads[n];
+
+            biquad_process(impulse, resp, length - 1,
+                           1., 0., 0., biquad->a[1], biquad->a[2]);
+
+            memcpy(M + n * 2 * (length - 1), resp, sizeof(*resp) * (length - 1));
+            memcpy(M + n * 2 * (length - 1) + length, resp, sizeof(*resp) * (length - 2));
+            memset(resp, 0, length * sizeof(*resp));
+        }
+
+        solve(M, &y[1], length - 1, &impulse[1], resp, W);
+
+        iir->fir = y[0];
+
+        for (int n = 0; n < nb_biquads; n++) {
+            BiquadContext *biquad = &iir->biquads[n];
+
+            biquad->b[0] = 0.;
+            biquad->b[1] = resp[n * 2 + 0];
+            biquad->b[2] = resp[n * 2 + 1];
+        }
+
+        av_free(impulse);
+        av_free(y);
+        av_free(resp);
+        av_free(M);
+        av_free(W);
+
+        if (ret < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
 static void convert_pr2zp(AVFilterContext *ctx, int channels)
 {
     AudioIIRContext *s = ctx->priv;
@@ -1034,15 +1228,22 @@ static int config_output(AVFilterLink *outlink)
         if (ret < 0)
             return ret;
     } else if (s->format == 0 && s->process == 1) {
-        av_log(ctx, AV_LOG_ERROR, "Serial cascading is not implemented for transfer function.\n");
+        av_log(ctx, AV_LOG_ERROR, "Serial processing is not implemented for transfer function.\n");
+        return AVERROR_PATCHWELCOME;
+    } else if (s->format == 0 && s->process == 2) {
+        av_log(ctx, AV_LOG_ERROR, "Parallel processing is not implemented for transfer function.\n");
         return AVERROR_PATCHWELCOME;
     } else if (s->format > 0 && s->process == 1) {
-        if (inlink->format == AV_SAMPLE_FMT_S16P)
-            av_log(ctx, AV_LOG_WARNING, "Serial cascading is not recommended for i16 precision.\n");
-
         ret = decompose_zp2biquads(ctx, inlink->channels);
         if (ret < 0)
             return ret;
+    } else if (s->format > 0 && s->process == 2) {
+        ret = decompose_zp2biquads(ctx, inlink->channels);
+        if (ret < 0)
+            return ret;
+        ret = convert_serial2parallel(ctx, inlink->channels);
+        if (ret < 0)
+            return ret;
     }
 
     for (ch = 0; s->format == 0 && ch < inlink->channels; ch++) {
@@ -1061,10 +1262,10 @@ static int config_output(AVFilterLink *outlink)
     }
 
     switch (inlink->format) {
-    case AV_SAMPLE_FMT_DBLP: s->iir_channel = s->process == 1 ? iir_ch_serial_dblp : iir_ch_dblp; break;
-    case AV_SAMPLE_FMT_FLTP: s->iir_channel = s->process == 1 ? iir_ch_serial_fltp : iir_ch_fltp; break;
-    case AV_SAMPLE_FMT_S32P: s->iir_channel = s->process == 1 ? iir_ch_serial_s32p : iir_ch_s32p; break;
-    case AV_SAMPLE_FMT_S16P: s->iir_channel = s->process == 1 ? iir_ch_serial_s16p : iir_ch_s16p; break;
+    case AV_SAMPLE_FMT_DBLP: s->iir_channel = s->process == 2 ? iir_ch_parallel_dblp : s->process == 1 ? iir_ch_serial_dblp : iir_ch_dblp; break;
+    case AV_SAMPLE_FMT_FLTP: s->iir_channel = s->process == 2 ? iir_ch_parallel_fltp : s->process == 1 ? iir_ch_serial_fltp : iir_ch_fltp; break;
+    case AV_SAMPLE_FMT_S32P: s->iir_channel = s->process == 2 ? iir_ch_parallel_s32p : s->process == 1 ? iir_ch_serial_s32p : iir_ch_s32p; break;
+    case AV_SAMPLE_FMT_S16P: s->iir_channel = s->process == 2 ? iir_ch_parallel_s16p : s->process == 1 ? iir_ch_serial_s16p : iir_ch_s16p; break;
     }
 
     return 0;
@@ -1079,7 +1280,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     AVFrame *out;
     int ch, ret;
 
-    if (av_frame_is_writable(in)) {
+    if (av_frame_is_writable(in) && s->process != 2) {
         out = in;
     } else {
         out = ff_get_audio_buffer(outlink, in->nb_samples);
@@ -1232,10 +1433,11 @@ static const AVOption aiir_options[] = {
     { "pr", "Z-plane zeros/poles (polar radians)", 0,                AV_OPT_TYPE_CONST,  {.i64=2},     0, 0, AF, "format" },
     { "pd", "Z-plane zeros/poles (polar degrees)", 0,                AV_OPT_TYPE_CONST,  {.i64=3},     0, 0, AF, "format" },
     { "sp", "S-plane zeros/poles",                 0,                AV_OPT_TYPE_CONST,  {.i64=4},     0, 0, AF, "format" },
-    { "process", "set kind of processing",         OFFSET(process),  AV_OPT_TYPE_INT,    {.i64=1},     0, 1, AF, "process" },
-    { "r", "set kind of processing",               OFFSET(process),  AV_OPT_TYPE_INT,    {.i64=1},     0, 1, AF, "process" },
+    { "process", "set kind of processing",         OFFSET(process),  AV_OPT_TYPE_INT,    {.i64=1},     0, 2, AF, "process" },
+    { "r", "set kind of processing",               OFFSET(process),  AV_OPT_TYPE_INT,    {.i64=1},     0, 2, AF, "process" },
     { "d", "direct",                               0,                AV_OPT_TYPE_CONST,  {.i64=0},     0, 0, AF, "process" },
-    { "s", "serial cascading",                     0,                AV_OPT_TYPE_CONST,  {.i64=1},     0, 0, AF, "process" },
+    { "s", "serial",                               0,                AV_OPT_TYPE_CONST,  {.i64=1},     0, 0, AF, "process" },
+    { "p", "parallel",                             0,                AV_OPT_TYPE_CONST,  {.i64=2},     0, 0, AF, "process" },
     { "precision", "set filtering precision",      OFFSET(precision),AV_OPT_TYPE_INT,    {.i64=0},     0, 3, AF, "precision" },
     { "e", "set precision",                        OFFSET(precision),AV_OPT_TYPE_INT,    {.i64=0},     0, 3, AF, "precision" },
     { "dbl", "double-precision floating-point",    0,                AV_OPT_TYPE_CONST,  {.i64=0},     0, 0, AF, "precision" },