From 8e94b7cff03539bcb4c360d2550a031a5378df03 Mon Sep 17 00:00:00 2001 From: Lynne Date: Sat, 27 Feb 2021 04:11:04 +0100 Subject: [PATCH] lavu/tx: invert permutation lookups out[lut[i]] = in[i] lookups were 4.04 times(!) slower than out[i] = in[lut[i]] lookups for an out-of-place FFT of length 4096. The permutes remain unchanged for anything but out-of-place monolithic FFT, as those benefit quite a lot from the current order (it means there's only 1 lookup necessary to add to an offset, rather than a full gather). The code was based around non-power-of-two FFTs, so this wasn't benchmarked early on. --- libavutil/tx.c | 7 +++++-- libavutil/tx_priv.h | 2 +- libavutil/tx_template.c | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/libavutil/tx.c b/libavutil/tx.c index ac67b354be..1161df3285 100644 --- a/libavutil/tx.c +++ b/libavutil/tx.c @@ -91,7 +91,7 @@ int ff_tx_gen_compound_mapping(AVTXContext *s) return 0; } -int ff_tx_gen_ptwo_revtab(AVTXContext *s) +int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup) { const int m = s->m, inv = s->inv; @@ -101,7 +101,10 @@ int ff_tx_gen_ptwo_revtab(AVTXContext *s) /* Default */ for (int i = 0; i < m; i++) { int k = -split_radix_permutation(i, m, inv) & (m - 1); - s->revtab[k] = i; + if (invert_lookup) + s->revtab[i] = k; + else + s->revtab[k] = i; } return 0; diff --git a/libavutil/tx_priv.h b/libavutil/tx_priv.h index e9fba02a35..e2f4314a4f 100644 --- a/libavutil/tx_priv.h +++ b/libavutil/tx_priv.h @@ -123,7 +123,7 @@ struct AVTXContext { /* Shared functions */ int ff_tx_type_is_mdct(enum AVTXType type); int ff_tx_gen_compound_mapping(AVTXContext *s); -int ff_tx_gen_ptwo_revtab(AVTXContext *s); +int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup); int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s); /* Also used by SIMD init */ diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c index 711013c352..0c76e0ed6f 100644 --- a/libavutil/tx_template.c +++ b/libavutil/tx_template.c @@ -410,7 +410,7 @@ static void monolithic_fft(AVTXContext *s, void *_out, void *_in, } while ((src = *inplace_idx++)); } else { for (int i = 0; i < m; i++) - out[s->revtab[i]] = in[i]; + out[i] = in[s->revtab[i]]; } fft_dispatch[mb](out); @@ -738,7 +738,7 @@ int TX_NAME(ff_tx_init_mdct_fft)(AVTXContext *s, av_tx_fn *tx, if (n != 1) init_cos_tabs(0); if (m != 1) { - if ((err = ff_tx_gen_ptwo_revtab(s))) + if ((err = ff_tx_gen_ptwo_revtab(s, n == 1 && !(flags & AV_TX_INPLACE)))) return err; if (flags & AV_TX_INPLACE) { if (is_mdct) /* In-place MDCTs are not supported yet */