From ec0006bfa1aaf608a7141929f2871c89ac7a15d6 Mon Sep 17 00:00:00 2001 From: Niklas Haas Date: Sat, 22 May 2021 21:03:52 +0200 Subject: [PATCH] af_scaletempo2: use gcc vectors to speed up inner loop This brings my scaletempo2 benchmark down from ~22s to ~7s on my machine (-march=native), and down to ~11s with a generic compile. Guarded behind an appropriate #ifdef to avoid being ableist against people who have the clinical need to run obscure platforms. Closes #8848 --- audio/filter/af_scaletempo2_internals.c | 75 ++++++++++++++++++++++++- wscript | 4 ++ 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/audio/filter/af_scaletempo2_internals.c b/audio/filter/af_scaletempo2_internals.c index 5eb0e6b8d9..1cee7e469f 100644 --- a/audio/filter/af_scaletempo2_internals.c +++ b/audio/filter/af_scaletempo2_internals.c @@ -4,6 +4,8 @@ #include "audio/chmap.h" #include "audio/filter/af_scaletempo2_internals.h" +#include "config.h" + // Algorithm overview (from chromium): // Waveform Similarity Overlap-and-add (WSOLA). // @@ -104,6 +106,10 @@ static float multi_channel_similarity_measure( return similarity_measure; } +#if HAVE_VECTOR + +typedef float v8sf __attribute__ ((vector_size (32), aligned (1))); + // Dot-product of channels of two AudioBus. For each AudioBus an offset is // given. |dot_product[k]| is the dot-product of channel |k|. The caller should // allocate sufficient space for |dot_product|. @@ -116,16 +122,79 @@ static void multi_channel_dot_product( assert(frame_offset_a >= 0); assert(frame_offset_b >= 0); - memset(dot_product, 0, sizeof(*dot_product) * channels); for (int k = 0; k < channels; ++k) { const float* ch_a = a[k] + frame_offset_a; const float* ch_b = b[k] + frame_offset_b; - for (int n = 0; n < num_frames; ++n) { - dot_product[k] += *ch_a++ * *ch_b++; + float sum = 0.0; + if (num_frames < 32) + goto rest; + + const v8sf *va = (const v8sf *) ch_a; + const v8sf *vb = (const v8sf *) ch_b; + v8sf vsum[4] = { + // Initialize to product of first 32 floats + va[0] * vb[0], + va[1] * vb[1], + va[2] * vb[2], + va[3] * vb[3], + }; + va += 4; + vb += 4; + + // Process `va` and `vb` across four vertical stripes + for (int n = 1; n < num_frames / 32; n++) { + vsum[0] += va[0] * vb[0]; + vsum[1] += va[1] * vb[1]; + vsum[2] += va[2] * vb[2]; + vsum[3] += va[3] * vb[3]; + va += 4; + vb += 4; } + + // Vertical sum across `vsum` entries + vsum[0] += vsum[1]; + vsum[2] += vsum[3]; + vsum[0] += vsum[2]; + + // Horizontal sum across `vsum[0]`, could probably be done better but + // this section is not super performance critical + float *vf = (float *) &vsum[0]; + sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7]; + ch_a = (const float *) va; + ch_b = (const float *) vb; + +rest: + // Process the remainder + for (int n = 0; n < num_frames % 32; n++) + sum += *ch_a++ * *ch_b++; + + dot_product[k] = sum; } } +#else // !HAVE_VECTOR + +static void multi_channel_dot_product( + float **a, int frame_offset_a, + float **b, int frame_offset_b, + int channels, + int num_frames, float *dot_product) +{ + assert(frame_offset_a >= 0); + assert(frame_offset_b >= 0); + + for (int k = 0; k < channels; ++k) { + const float* ch_a = a[k] + frame_offset_a; + const float* ch_b = b[k] + frame_offset_b; + float sum = 0.0; + for (int n = 0; n < num_frames; n++) + sum += *ch_a++ * *ch_b++; + dot_product[k] = sum; + } +} + +#endif // HAVE_VECTOR + // Fit the curve f(x) = a * x^2 + b * x + c such that // f(-1) = y[0] // f(0) = y[1] diff --git a/wscript b/wscript index e9f4d53002..1a5ff5aa65 100644 --- a/wscript +++ b/wscript @@ -116,6 +116,10 @@ build_options = [ 'desc': 'inline assembly (currently without effect)', 'default': 'enable', 'func': check_true, + }, { + 'name': '--vector', + 'desc': 'GCC vector instructions', + 'func': check_statement([], 'float v __attribute__((vector_size(32)))'), }, { 'name': '--clang-database', 'desc': 'generate a clang compilation database',