lavu/tx: implement aarch64 NEON SIMD FFT
The fastest fast Fourier transform in not just the west, but the world,
now for the most popular toy ISA.
On a high level, it follows the design of the AVX2 version closely,
with the exception that the input is slightly less permuted as we don't have
to do lane switching with the input on double 4pt and 8pt.
On a low level, the lack of subadd/addsub instructions REALLY penalizes
any attempt at writing an FFT. That single register matters a lot,
and reloading it simply takes unacceptably long.
In x86 land, vendors would've noticed developers need this.
In ARM land, you get a badly designed complex multiplication instruction
we cannot use, that's not present on 95% of devices. Because only
compilers matter, right?
Future optimization options are very few, perhaps better register
management to use more ld1/st1s.
All timings below are in cycles:
A53:
Length | C | New (lavu) | Old (lavc) | FFTW
------ |-------------|-------------|-------------|-----
4 | 842 | 420 | 1210 | 1460
8 | 1538 | 1020 | 1850 | 2520
16 | 3717 | 1900 | 3700 | 3990
32 | 9156 | 4070 | 8289 | 8860
64 | 21160 | 9931 | 18600 | 19625
128 | 49180 | 23278 | 41922 | 41922
256 | 112073 | 53876 | 93202 | 101092
512 | 252864 | 122884 | 205897 | 207868
1024 | 560512 | 278322 | 458071 | 453053
2048 | 1295402 | 775835 | 1038205 | 1020265
4096 | 3281263 | 2021221 | 2409718 | 2577554
8192 | 8577845 | 4780526 | 5673041 | 6802722
Apple M1
New - Total for len 512 reps 2097152 = 1.459141 s
Old - Total for len 512 reps 2097152 = 2.251344 s
FFTW - Total for len 512 reps 2097152 = 1.868429 s
New - Total for len 1024 reps 4194304 = 6.490080 s
Old - Total for len 1024 reps 4194304 = 9.604949 s
FFTW - Total for len 1024 reps 4194304 = 7.889281 s
New - Total for len 16384 reps 262144 = 10.374001 s
Old - Total for len 16384 reps 262144 = 15.266713 s
FFTW - Total for len 16384 reps 262144 = 12.341745 s
New - Total for len 65536 reps 8192 = 1.769812 s
Old - Total for len 65536 reps 8192 = 4.209413 s
FFTW - Total for len 65536 reps 8192 = 3.012365 s
New - Total for len 131072 reps 4096 = 1.942836 s
Old - Segfaults
FFTW - Total for len 131072 reps 4096 = 3.713713 s
Thanks to wbs for some simplifications, assembler fixes and a review
and to jannau for giving it a look.
2022-02-03 11:27:03 +00:00
|
|
|
/*
|
|
|
|
* This file is part of FFmpeg.
|
|
|
|
*
|
|
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define TX_FLOAT
|
|
|
|
#include "libavutil/tx_priv.h"
|
|
|
|
#include "libavutil/attributes.h"
|
|
|
|
#include "libavutil/aarch64/cpu.h"
|
|
|
|
|
|
|
|
TX_DECL_FN(fft2, neon)
|
|
|
|
TX_DECL_FN(fft4_fwd, neon)
|
|
|
|
TX_DECL_FN(fft4_inv, neon)
|
|
|
|
TX_DECL_FN(fft8, neon)
|
|
|
|
TX_DECL_FN(fft8_ns, neon)
|
|
|
|
TX_DECL_FN(fft16, neon)
|
|
|
|
TX_DECL_FN(fft16_ns, neon)
|
|
|
|
TX_DECL_FN(fft32, neon)
|
|
|
|
TX_DECL_FN(fft32_ns, neon)
|
|
|
|
TX_DECL_FN(fft_sr, neon)
|
|
|
|
TX_DECL_FN(fft_sr_ns, neon)
|
|
|
|
|
|
|
|
static av_cold int neon_init(AVTXContext *s, const FFTXCodelet *cd,
|
|
|
|
uint64_t flags, FFTXCodeletOptions *opts,
|
|
|
|
int len, int inv, const void *scale)
|
|
|
|
{
|
|
|
|
ff_tx_init_tabs_float(len);
|
|
|
|
if (cd->max_len == 2)
|
2022-11-18 23:47:45 +00:00
|
|
|
return ff_tx_gen_ptwo_revtab(s, opts);
|
lavu/tx: implement aarch64 NEON SIMD FFT
The fastest fast Fourier transform in not just the west, but the world,
now for the most popular toy ISA.
On a high level, it follows the design of the AVX2 version closely,
with the exception that the input is slightly less permuted as we don't have
to do lane switching with the input on double 4pt and 8pt.
On a low level, the lack of subadd/addsub instructions REALLY penalizes
any attempt at writing an FFT. That single register matters a lot,
and reloading it simply takes unacceptably long.
In x86 land, vendors would've noticed developers need this.
In ARM land, you get a badly designed complex multiplication instruction
we cannot use, that's not present on 95% of devices. Because only
compilers matter, right?
Future optimization options are very few, perhaps better register
management to use more ld1/st1s.
All timings below are in cycles:
A53:
Length | C | New (lavu) | Old (lavc) | FFTW
------ |-------------|-------------|-------------|-----
4 | 842 | 420 | 1210 | 1460
8 | 1538 | 1020 | 1850 | 2520
16 | 3717 | 1900 | 3700 | 3990
32 | 9156 | 4070 | 8289 | 8860
64 | 21160 | 9931 | 18600 | 19625
128 | 49180 | 23278 | 41922 | 41922
256 | 112073 | 53876 | 93202 | 101092
512 | 252864 | 122884 | 205897 | 207868
1024 | 560512 | 278322 | 458071 | 453053
2048 | 1295402 | 775835 | 1038205 | 1020265
4096 | 3281263 | 2021221 | 2409718 | 2577554
8192 | 8577845 | 4780526 | 5673041 | 6802722
Apple M1
New - Total for len 512 reps 2097152 = 1.459141 s
Old - Total for len 512 reps 2097152 = 2.251344 s
FFTW - Total for len 512 reps 2097152 = 1.868429 s
New - Total for len 1024 reps 4194304 = 6.490080 s
Old - Total for len 1024 reps 4194304 = 9.604949 s
FFTW - Total for len 1024 reps 4194304 = 7.889281 s
New - Total for len 16384 reps 262144 = 10.374001 s
Old - Total for len 16384 reps 262144 = 15.266713 s
FFTW - Total for len 16384 reps 262144 = 12.341745 s
New - Total for len 65536 reps 8192 = 1.769812 s
Old - Total for len 65536 reps 8192 = 4.209413 s
FFTW - Total for len 65536 reps 8192 = 3.012365 s
New - Total for len 131072 reps 4096 = 1.942836 s
Old - Segfaults
FFTW - Total for len 131072 reps 4096 = 3.713713 s
Thanks to wbs for some simplifications, assembler fixes and a review
and to jannau for giving it a look.
2022-02-03 11:27:03 +00:00
|
|
|
else
|
2022-11-18 23:47:45 +00:00
|
|
|
return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, 8, 0);
|
lavu/tx: implement aarch64 NEON SIMD FFT
The fastest fast Fourier transform in not just the west, but the world,
now for the most popular toy ISA.
On a high level, it follows the design of the AVX2 version closely,
with the exception that the input is slightly less permuted as we don't have
to do lane switching with the input on double 4pt and 8pt.
On a low level, the lack of subadd/addsub instructions REALLY penalizes
any attempt at writing an FFT. That single register matters a lot,
and reloading it simply takes unacceptably long.
In x86 land, vendors would've noticed developers need this.
In ARM land, you get a badly designed complex multiplication instruction
we cannot use, that's not present on 95% of devices. Because only
compilers matter, right?
Future optimization options are very few, perhaps better register
management to use more ld1/st1s.
All timings below are in cycles:
A53:
Length | C | New (lavu) | Old (lavc) | FFTW
------ |-------------|-------------|-------------|-----
4 | 842 | 420 | 1210 | 1460
8 | 1538 | 1020 | 1850 | 2520
16 | 3717 | 1900 | 3700 | 3990
32 | 9156 | 4070 | 8289 | 8860
64 | 21160 | 9931 | 18600 | 19625
128 | 49180 | 23278 | 41922 | 41922
256 | 112073 | 53876 | 93202 | 101092
512 | 252864 | 122884 | 205897 | 207868
1024 | 560512 | 278322 | 458071 | 453053
2048 | 1295402 | 775835 | 1038205 | 1020265
4096 | 3281263 | 2021221 | 2409718 | 2577554
8192 | 8577845 | 4780526 | 5673041 | 6802722
Apple M1
New - Total for len 512 reps 2097152 = 1.459141 s
Old - Total for len 512 reps 2097152 = 2.251344 s
FFTW - Total for len 512 reps 2097152 = 1.868429 s
New - Total for len 1024 reps 4194304 = 6.490080 s
Old - Total for len 1024 reps 4194304 = 9.604949 s
FFTW - Total for len 1024 reps 4194304 = 7.889281 s
New - Total for len 16384 reps 262144 = 10.374001 s
Old - Total for len 16384 reps 262144 = 15.266713 s
FFTW - Total for len 16384 reps 262144 = 12.341745 s
New - Total for len 65536 reps 8192 = 1.769812 s
Old - Total for len 65536 reps 8192 = 4.209413 s
FFTW - Total for len 65536 reps 8192 = 3.012365 s
New - Total for len 131072 reps 4096 = 1.942836 s
Old - Segfaults
FFTW - Total for len 131072 reps 4096 = 3.713713 s
Thanks to wbs for some simplifications, assembler fixes and a review
and to jannau for giving it a look.
2022-02-03 11:27:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = {
|
|
|
|
TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE, 0),
|
|
|
|
TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
|
|
|
TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0),
|
|
|
|
TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
|
|
|
TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
|
|
|
|
TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0),
|
|
|
|
TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
|
|
|
TX_DEF(fft16, FFT, 16, 16, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0),
|
|
|
|
TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
|
|
|
TX_DEF(fft32, FFT, 32, 32, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0),
|
|
|
|
TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
|
|
|
|
|
|
|
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 128, neon_init, neon, NEON, 0, 0),
|
|
|
|
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
|
|
|
|
|
|
|
NULL,
|
|
|
|
};
|