lavu/tx: implement aarch64 NEON SIMD FFT

The fastest fast Fourier transform in not just the west, but the world,
now for the most popular toy ISA.

On a high level, it follows the design of the AVX2 version closely,
with the exception that the input is slightly less permuted as we don't have
to do lane switching with the input on double 4pt and 8pt.

On a low level, the lack of subadd/addsub instructions REALLY penalizes
any attempt at writing an FFT. That single register matters a lot,
and reloading it simply takes unacceptably long.
In x86 land, vendors would've noticed developers need this.
In ARM land, you get a badly designed complex multiplication instruction
we cannot use, that's not present on 95% of devices. Because only
compilers matter, right?

Future optimization options are very few, perhaps better register
management to use more ld1/st1s.

Timings for the A53 are in cycles; timings for the Apple M1 are total seconds for the stated number of repetitions:
A53:
Length | C           | New (lavu)  | Old (lavc)  | FFTW
------ |-------------|-------------|-------------|-----
4      |         842 | 420         | 1210        | 1460
8      |        1538 | 1020        | 1850        | 2520
16     |        3717 | 1900        | 3700        | 3990
32     |        9156 | 4070        | 8289        | 8860
64     |       21160 | 9931        | 18600       | 19625
128    |       49180 | 23278       | 41922       | 41922
256    |      112073 | 53876       | 93202       | 101092
512    |      252864 | 122884      | 205897      | 207868
1024   |      560512 | 278322      | 458071      | 453053
2048   |     1295402 | 775835      | 1038205     | 1020265
4096   |     3281263 | 2021221     | 2409718     | 2577554
8192   |     8577845 | 4780526     | 5673041     | 6802722

Apple M1
New  - Total for len 512 reps 2097152 = 1.459141 s
Old  - Total for len 512 reps 2097152 = 2.251344 s
FFTW - Total for len 512 reps 2097152 = 1.868429 s

New  - Total for len 1024 reps 4194304 = 6.490080 s
Old  - Total for len 1024 reps 4194304 = 9.604949 s
FFTW - Total for len 1024 reps 4194304 = 7.889281 s

New  - Total for len 16384 reps 262144 = 10.374001 s
Old  - Total for len 16384 reps 262144 = 15.266713 s
FFTW - Total for len 16384 reps 262144 = 12.341745 s

New  - Total for len 65536 reps 8192 = 1.769812 s
Old  - Total for len 65536 reps 8192 = 4.209413 s
FFTW - Total for len 65536 reps 8192 = 3.012365 s

New  - Total for len 131072 reps 4096 = 1.942836 s
Old  - Segfaults
FFTW - Total for len 131072 reps 4096 = 3.713713 s

Thanks to wbs for some simplifications, assembler fixes and a review
and to jannau for giving it a look.
This commit is contained in:
Lynne 2022-02-03 11:27:03 +00:00
parent 9bf9d42d01
commit f932b89ea3
No known key found for this signature in database
GPG Key ID: A2FEA5F03F034464
5 changed files with 1366 additions and 1 deletions

View File

@ -1,4 +1,6 @@
OBJS += aarch64/cpu.o \
aarch64/float_dsp_init.o \
aarch64/tx_float_init.o \
NEON-OBJS += aarch64/float_dsp_neon.o
NEON-OBJS += aarch64/float_dsp_neon.o \
aarch64/tx_float_neon.o \

View File

@ -0,0 +1,65 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define TX_FLOAT
#include "libavutil/tx_priv.h"
#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
/*
 * Declarations for the NEON transform entry points that the codelet table
 * in this file refers to, one TX_DECL_FN() per function name.
 * NOTE(review): TX_DECL_FN is defined outside this file (presumably in
 * tx_priv.h) and the implementations presumably live in the accompanying
 * aarch64/tx_float_neon assembly object -- confirm.
 * The "_ns" names are the ones paired with FF_TX_PRESHUFFLE in the table.
 */
TX_DECL_FN(fft2, neon)
TX_DECL_FN(fft4_fwd, neon)
TX_DECL_FN(fft4_inv, neon)
TX_DECL_FN(fft8, neon)
TX_DECL_FN(fft8_ns, neon)
TX_DECL_FN(fft16, neon)
TX_DECL_FN(fft16_ns, neon)
TX_DECL_FN(fft32, neon)
TX_DECL_FN(fft32_ns, neon)
TX_DECL_FN(fft_sr, neon)
TX_DECL_FN(fft_sr_ns, neon)
/**
 * Init callback used by the NEON FFT codelets listed in this file.
 *
 * Calls ff_tx_init_tabs_float() for the requested length and then builds the
 * context's lookup table with one of two generators, chosen by the codelet's
 * maximum length.
 *
 * @param s     transform context handed to the table generator
 * @param cd    codelet being initialized; only cd->max_len is read
 * @param flags unused here
 * @param opts  optional codelet options; when NULL the lookup is inverted
 *              (invert_lookup defaults to 1)
 * @param len   transform length, forwarded to ff_tx_init_tabs_float()
 * @param inv   unused here
 * @param scale unused here
 * @return whatever the selected table generator returns
 */
static av_cold int neon_init(AVTXContext *s, const FFTXCodelet *cd,
                             uint64_t flags, FFTXCodeletOptions *opts,
                             int len, int inv, const void *scale)
{
    int invert = 1;

    if (opts)
        invert = opts->invert_lookup;

    ff_tx_init_tabs_float(len);

    /* Every codelet except the 2-point one uses the split-radix parity table. */
    if (cd->max_len != 2)
        return ff_tx_gen_split_radix_parity_revtab(s, invert, 8, 0);

    return ff_tx_gen_ptwo_revtab(s, invert);
}
/*
 * NULL-terminated list of the float codelets provided for aarch64 NEON.
 *
 * NOTE(review): the meaning of the TX_DEF() argument positions is defined
 * outside this file; the reading below is inferred from the values and should
 * be confirmed against the macro definition:
 *   - 3rd/4th arguments look like the minimum/maximum transform length
 *     (neon_init() compares cd->max_len against 2);
 *   - 7th argument (128 or 192) looks like a selection priority;
 *   - 8th argument is the init callback (NULL or neon_init);
 *   - 11th argument is the codelet flag set.
 *
 * Visible pattern: every entry with the value 192 uses neon_init and carries
 * FF_TX_PRESHUFFLE; the fft_sr entry with the value 128 is the only one
 * without AV_TX_INPLACE.
 */
const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = {
TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE, 0),
TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0),
/* fft4_fwd is listed again without FF_TX_FORWARD_ONLY when preshuffled;
 * there is no matching preshuffled fft4_inv entry. */
TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0),
TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft16, FFT, 16, 16, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0),
TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft32, FFT, 32, 32, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0),
TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
/* The two fft_sr entries span 64..131072 rather than a single size. */
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 128, neon_init, neon, NEON, 0, 0),
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
NULL,
};

File diff suppressed because it is too large Load Diff

View File

@ -457,6 +457,9 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
ff_tx_null_list,
#if HAVE_X86ASM
ff_tx_codelet_list_float_x86,
#endif
#if ARCH_AARCH64
ff_tx_codelet_list_float_aarch64,
#endif
};
int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);

View File

@ -308,6 +308,7 @@ int ff_tx_mdct_gen_exp_int32 (AVTXContext *s, int *pre_tab);
/* Lists of codelets */
extern const FFTXCodelet * const ff_tx_codelet_list_float_c [];
extern const FFTXCodelet * const ff_tx_codelet_list_float_x86 [];
extern const FFTXCodelet * const ff_tx_codelet_list_float_aarch64 [];
extern const FFTXCodelet * const ff_tx_codelet_list_double_c [];