af_scaletempo2: use gcc vectors to speed up inner loop

This brings my scaletempo2 benchmark down from ~22s to ~7s on my machine (-march=native), and down to ~11s with a generic compile. Guarded behind an appropriate #ifdef to avoid being ableist against people who have the clinical need to run obscure platforms. Closes #8848
2025-02-17 21:27:08 +00:00 · 2021-05-22 21:03:52 +02:00 · 2021-05-22 21:03:52 +02:00 · ec0006bfa1
commit ec0006bfa1
parent 353cccfa8c
2 changed files with 76 additions and 3 deletions
--- a/audio/filter/af_scaletempo2_internals.c
+++ b/audio/filter/af_scaletempo2_internals.c
@ -4,6 +4,8 @@
 #include "audio/chmap.h"
 #include "audio/filter/af_scaletempo2_internals.h"
 #include "config.h"
 // Algorithm overview (from chromium):
 // Waveform Similarity Overlap-and-add (WSOLA).
 //
@ -104,6 +106,10 @@ static float multi_channel_similarity_measure(
    return similarity_measure;
 }
 #if HAVE_VECTOR
 // Vector of 8 packed floats (32 bytes total). The alignment requirement is
 // lowered to 1 so that pointers to plain float sample data, at any frame
 // offset, can be reinterpreted as pointers to this type without the data
 // having to be 32-byte aligned.
 typedef float v8sf __attribute__ ((vector_size (32), aligned (1)));
 // Dot-product of channels of two AudioBus. For each AudioBus an offset is
 // given. |dot_product[k]| is the dot-product of channel |k|. The caller should
 // allocate sufficient space for |dot_product|.
@ -116,16 +122,79 @@ static void multi_channel_dot_product(
    assert(frame_offset_a >= 0);
    assert(frame_offset_b >= 0);
    memset(dot_product, 0, sizeof(*dot_product) * channels);
    for (int k = 0; k < channels; ++k) {
        const float* ch_a = a[k] + frame_offset_a;
        const float* ch_b = b[k] + frame_offset_b;
-        for (int n = 0; n < num_frames; ++n) {
+        float sum = 0.0;
-            dot_product[k] += *ch_a++ * *ch_b++;
+        if (num_frames < 32)
            goto rest;
        const v8sf *va = (const v8sf *) ch_a;
        const v8sf *vb = (const v8sf *) ch_b;
        v8sf vsum[4] = {
            // Initialize to product of first 32 floats
            va[0] * vb[0],
            va[1] * vb[1],
            va[2] * vb[2],
            va[3] * vb[3],
        };
        va += 4;
        vb += 4;
        // Process `va` and `vb` across four vertical stripes
        for (int n = 1; n < num_frames / 32; n++) {
            vsum[0] += va[0] * vb[0];
            vsum[1] += va[1] * vb[1];
            vsum[2] += va[2] * vb[2];
            vsum[3] += va[3] * vb[3];
            va += 4;
            vb += 4;
        }
        // Vertical sum across `vsum` entries
        vsum[0] += vsum[1];
        vsum[2] += vsum[3];
        vsum[0] += vsum[2];
        // Horizontal sum across `vsum[0]`, could probably be done better but
        // this section is not super performance critical
        float *vf = (float *) &vsum[0];
        sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7];
        ch_a = (const float *) va;
        ch_b = (const float *) vb;
 rest:
        // Process the remainder
        for (int n = 0; n < num_frames % 32; n++)
            sum += *ch_a++ * *ch_b++;
        dot_product[k] = sum;
    }
 }
 #else // !HAVE_VECTOR
 // Scalar implementation of the per-channel dot-product. For every channel
 // |ch|, multiplies |num_frames| consecutive samples of |a[ch]| (starting at
 // |frame_offset_a|) with those of |b[ch]| (starting at |frame_offset_b|) and
 // stores the accumulated result in |dot_product[ch]|. The caller provides
 // room for |channels| results.
 static void multi_channel_dot_product(
    float **a, int frame_offset_a,
    float **b, int frame_offset_b,
    int channels,
    int num_frames, float *dot_product)
 {
    assert(frame_offset_a >= 0);
    assert(frame_offset_b >= 0);

    int ch = 0;
    while (ch < channels) {
        const float *lhs = &a[ch][frame_offset_a];
        const float *rhs = &b[ch][frame_offset_b];

        // Accumulate strictly in frame order so the rounding of the running
        // sum stays the same.
        float acc = 0.0;
        for (int i = 0; i < num_frames; i++)
            acc += lhs[i] * rhs[i];

        dot_product[ch] = acc;
        ch++;
    }
 }
 #endif // HAVE_VECTOR
 // Fit the curve f(x) = a * x^2 + b * x + c such that
 //   f(-1) = y[0]
 //   f(0) = y[1]
--- a/4
+++ b/4
@ -116,6 +116,10 @@ build_options = [
        'desc': 'inline assembly (currently without effect)',
        'default': 'enable',
        'func': check_true,
    }, {
        'name': '--vector',
        'desc': 'GCC vector instructions',
        'func': check_statement([], 'float v __attribute__((vector_size(32)))'),
    }, {
        'name': '--clang-database',
        'desc': 'generate a clang compilation database',