mirror of
https://github.com/mpv-player/mpv
synced 2025-02-17 21:27:08 +00:00
af_scaletempo2: use gcc vectors to speed up inner loop
This brings my scaletempo2 benchmark down from ~22s to ~7s on my machine (-march=native), and down to ~11s with a generic compile. Guarded behind an appropriate #ifdef to avoid being ableist against people who have the clinical need to run obscure platforms. Closes #8848
This commit is contained in:
parent
353cccfa8c
commit
ec0006bfa1
@ -4,6 +4,8 @@
|
|||||||
#include "audio/chmap.h"
|
#include "audio/chmap.h"
|
||||||
#include "audio/filter/af_scaletempo2_internals.h"
|
#include "audio/filter/af_scaletempo2_internals.h"
|
||||||
|
|
||||||
|
#include "config.h"
|
||||||
|
|
||||||
// Algorithm overview (from chromium):
|
// Algorithm overview (from chromium):
|
||||||
// Waveform Similarity Overlap-and-add (WSOLA).
|
// Waveform Similarity Overlap-and-add (WSOLA).
|
||||||
//
|
//
|
||||||
@ -104,6 +106,10 @@ static float multi_channel_similarity_measure(
|
|||||||
return similarity_measure;
|
return similarity_measure;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if HAVE_VECTOR
|
||||||
|
|
||||||
|
typedef float v8sf __attribute__ ((vector_size (32), aligned (1)));
|
||||||
|
|
||||||
// Dot-product of channels of two AudioBus. For each AudioBus an offset is
|
// Dot-product of channels of two AudioBus. For each AudioBus an offset is
|
||||||
// given. |dot_product[k]| is the dot-product of channel |k|. The caller should
|
// given. |dot_product[k]| is the dot-product of channel |k|. The caller should
|
||||||
// allocate sufficient space for |dot_product|.
|
// allocate sufficient space for |dot_product|.
|
||||||
@ -116,16 +122,79 @@ static void multi_channel_dot_product(
|
|||||||
assert(frame_offset_a >= 0);
|
assert(frame_offset_a >= 0);
|
||||||
assert(frame_offset_b >= 0);
|
assert(frame_offset_b >= 0);
|
||||||
|
|
||||||
memset(dot_product, 0, sizeof(*dot_product) * channels);
|
|
||||||
for (int k = 0; k < channels; ++k) {
|
for (int k = 0; k < channels; ++k) {
|
||||||
const float* ch_a = a[k] + frame_offset_a;
|
const float* ch_a = a[k] + frame_offset_a;
|
||||||
const float* ch_b = b[k] + frame_offset_b;
|
const float* ch_b = b[k] + frame_offset_b;
|
||||||
for (int n = 0; n < num_frames; ++n) {
|
float sum = 0.0;
|
||||||
dot_product[k] += *ch_a++ * *ch_b++;
|
if (num_frames < 32)
|
||||||
|
goto rest;
|
||||||
|
|
||||||
|
const v8sf *va = (const v8sf *) ch_a;
|
||||||
|
const v8sf *vb = (const v8sf *) ch_b;
|
||||||
|
v8sf vsum[4] = {
|
||||||
|
// Initialize to product of first 32 floats
|
||||||
|
va[0] * vb[0],
|
||||||
|
va[1] * vb[1],
|
||||||
|
va[2] * vb[2],
|
||||||
|
va[3] * vb[3],
|
||||||
|
};
|
||||||
|
va += 4;
|
||||||
|
vb += 4;
|
||||||
|
|
||||||
|
// Process `va` and `vb` across four vertical stripes
|
||||||
|
for (int n = 1; n < num_frames / 32; n++) {
|
||||||
|
vsum[0] += va[0] * vb[0];
|
||||||
|
vsum[1] += va[1] * vb[1];
|
||||||
|
vsum[2] += va[2] * vb[2];
|
||||||
|
vsum[3] += va[3] * vb[3];
|
||||||
|
va += 4;
|
||||||
|
vb += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Vertical sum across `vsum` entries
|
||||||
|
vsum[0] += vsum[1];
|
||||||
|
vsum[2] += vsum[3];
|
||||||
|
vsum[0] += vsum[2];
|
||||||
|
|
||||||
|
// Horizontal sum across `vsum[0]`, could probably be done better but
|
||||||
|
// this section is not super performance critical
|
||||||
|
float *vf = (float *) &vsum[0];
|
||||||
|
sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7];
|
||||||
|
ch_a = (const float *) va;
|
||||||
|
ch_b = (const float *) vb;
|
||||||
|
|
||||||
|
rest:
|
||||||
|
// Process the remainder
|
||||||
|
for (int n = 0; n < num_frames % 32; n++)
|
||||||
|
sum += *ch_a++ * *ch_b++;
|
||||||
|
|
||||||
|
dot_product[k] = sum;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else // !HAVE_VECTOR
|
||||||
|
|
||||||
|
// Scalar fallback for builds where HAVE_VECTOR is not set.
//
// For every channel k in [0, channels), multiplies |num_frames| consecutive
// samples of a[k] (starting at |frame_offset_a|) with the corresponding
// samples of b[k] (starting at |frame_offset_b|), accumulates the products in
// sample order, and stores the total in dot_product[k]. The caller provides
// storage for |channels| floats in |dot_product|. Both offsets must be
// non-negative (checked with assert).
static void multi_channel_dot_product(
    float **a, int frame_offset_a,
    float **b, int frame_offset_b,
    int channels,
    int num_frames, float *dot_product)
{
    assert(frame_offset_a >= 0);
    assert(frame_offset_b >= 0);

    for (int ch = 0; ch < channels; ch++) {
        const float *lhs = a[ch] + frame_offset_a;
        const float *rhs = b[ch] + frame_offset_b;

        // Accumulate strictly front-to-back so the rounding behaviour stays
        // the same as a plain sequential sum.
        float acc = 0.0;
        int i = 0;
        while (i < num_frames) {
            acc += lhs[i] * rhs[i];
            i++;
        }

        dot_product[ch] = acc;
    }
}
|
||||||
|
|
||||||
|
#endif // HAVE_VECTOR
|
||||||
|
|
||||||
// Fit the curve f(x) = a * x^2 + b * x + c such that
|
// Fit the curve f(x) = a * x^2 + b * x + c such that
|
||||||
// f(-1) = y[0]
|
// f(-1) = y[0]
|
||||||
// f(0) = y[1]
|
// f(0) = y[1]
|
||||||
|
4
wscript
4
wscript
@ -116,6 +116,10 @@ build_options = [
|
|||||||
'desc': 'inline assembly (currently without effect)',
|
'desc': 'inline assembly (currently without effect)',
|
||||||
'default': 'enable',
|
'default': 'enable',
|
||||||
'func': check_true,
|
'func': check_true,
|
||||||
|
}, {
|
||||||
|
'name': '--vector',
|
||||||
|
'desc': 'GCC vector instructions',
|
||||||
|
'func': check_statement([], 'float v __attribute__((vector_size(32)))'),
|
||||||
}, {
|
}, {
|
||||||
'name': '--clang-database',
|
'name': '--clang-database',
|
||||||
'desc': 'generate a clang compilation database',
|
'desc': 'generate a clang compilation database',
|
||||||
|
Loading…
Reference in New Issue
Block a user