mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-01-12 18:31:37 +00:00
1a7efafd33
May fix the FATE failures on x64 Windows here: https://fate.ffmpeg.org/report.cgi?slot=x86_64-msvc17-windows-native&time=20221125130443 Reviewed-by: Lynne <dev@lynne.ee> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
918 lines
28 KiB
C
918 lines
28 KiB
C
/*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "avassert.h"
|
|
#include "intmath.h"
|
|
#include "cpu.h"
|
|
#include "qsort.h"
|
|
#include "bprint.h"
|
|
|
|
#include "tx_priv.h"
|
|
|
|
#define TYPE_IS(type, x) \
|
|
(((x) == AV_TX_FLOAT_ ## type) || \
|
|
((x) == AV_TX_DOUBLE_ ## type) || \
|
|
((x) == AV_TX_INT32_ ## type))
|
|
|
|
/* Calculates the modular multiplicative inverse */
|
|
static av_always_inline int mulinv(int n, int m)
|
|
{
|
|
n = n % m;
|
|
for (int x = 1; x < m; x++)
|
|
if (((n * x) % m) == 1)
|
|
return x;
|
|
av_assert0(0); /* Never reached */
|
|
return 0;
|
|
}
|
|
|
|
int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts,
|
|
int d1, int d2)
|
|
{
|
|
const int sl = d1*d2;
|
|
|
|
s->map = av_malloc(s->len*sizeof(*s->map));
|
|
if (!s->map)
|
|
return AVERROR(ENOMEM);
|
|
|
|
for (int k = 0; k < s->len; k += sl) {
|
|
if (s->inv || (opts && opts->map_dir == FF_TX_MAP_SCATTER)) {
|
|
for (int m = 0; m < d2; m++)
|
|
for (int n = 0; n < d1; n++)
|
|
s->map[k + ((m*d1 + n*d2) % (sl))] = m*d1 + n;
|
|
} else {
|
|
for (int m = 0; m < d2; m++)
|
|
for (int n = 0; n < d1; n++)
|
|
s->map[k + m*d1 + n] = (m*d1 + n*d2) % (sl);
|
|
}
|
|
|
|
if (s->inv)
|
|
for (int w = 1; w <= ((sl) >> 1); w++)
|
|
FFSWAP(int, s->map[k + w], s->map[k + sl - w]);
|
|
}
|
|
|
|
s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Guaranteed to work for any n, m where gcd(n, m) == 1 */
|
|
int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts,
|
|
int inv, int n, int m)
|
|
{
|
|
int *in_map, *out_map;
|
|
const int len = n*m; /* Will not be equal to s->len for MDCTs */
|
|
int m_inv, n_inv;
|
|
|
|
/* Make sure the numbers are coprime */
|
|
if (av_gcd(n, m) != 1)
|
|
return AVERROR(EINVAL);
|
|
|
|
m_inv = mulinv(m, n);
|
|
n_inv = mulinv(n, m);
|
|
|
|
if (!(s->map = av_malloc(2*len*sizeof(*s->map))))
|
|
return AVERROR(ENOMEM);
|
|
|
|
in_map = s->map;
|
|
out_map = s->map + len;
|
|
|
|
/* Ruritanian map for input, CRT map for output, can be swapped */
|
|
if (opts && opts->map_dir == FF_TX_MAP_SCATTER) {
|
|
for (int j = 0; j < m; j++) {
|
|
for (int i = 0; i < n; i++) {
|
|
in_map[(i*m + j*n) % len] = j*n + i;
|
|
out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
|
|
}
|
|
}
|
|
} else {
|
|
for (int j = 0; j < m; j++) {
|
|
for (int i = 0; i < n; i++) {
|
|
in_map[j*n + i] = (i*m + j*n) % len;
|
|
out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (inv) {
|
|
for (int i = 0; i < m; i++) {
|
|
int *in = &in_map[i*n + 1]; /* Skip the DC */
|
|
for (int j = 0; j < ((n - 1) >> 1); j++)
|
|
FFSWAP(int, in[j], in[n - j - 2]);
|
|
}
|
|
}
|
|
|
|
s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline int split_radix_permutation(int i, int len, int inv)
|
|
{
|
|
len >>= 1;
|
|
if (len <= 1)
|
|
return i & 1;
|
|
if (!(i & len))
|
|
return split_radix_permutation(i, len, inv) * 2;
|
|
len >>= 1;
|
|
return split_radix_permutation(i, len, inv) * 4 + 1 - 2*(!(i & len) ^ inv);
|
|
}
|
|
|
|
int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts)
|
|
{
|
|
int len = s->len;
|
|
|
|
if (!(s->map = av_malloc(len*sizeof(*s->map))))
|
|
return AVERROR(ENOMEM);
|
|
|
|
if (opts && opts->map_dir == FF_TX_MAP_SCATTER) {
|
|
for (int i = 0; i < s->len; i++)
|
|
s->map[-split_radix_permutation(i, len, s->inv) & (len - 1)] = i;
|
|
} else {
|
|
for (int i = 0; i < s->len; i++)
|
|
s->map[i] = -split_radix_permutation(i, len, s->inv) & (len - 1);
|
|
}
|
|
|
|
s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
|
|
|
|
return 0;
|
|
}
|
|
|
|
int ff_tx_gen_inplace_map(AVTXContext *s, int len)
|
|
{
|
|
int *src_map, out_map_idx = 0;
|
|
|
|
if (!s->sub || !s->sub->map)
|
|
return AVERROR(EINVAL);
|
|
|
|
if (!(s->map = av_mallocz(len*sizeof(*s->map))))
|
|
return AVERROR(ENOMEM);
|
|
|
|
src_map = s->sub->map;
|
|
|
|
/* The first coefficient is always already in-place */
|
|
for (int src = 1; src < s->len; src++) {
|
|
int dst = src_map[src];
|
|
int found = 0;
|
|
|
|
if (dst <= src)
|
|
continue;
|
|
|
|
/* This just checks if a closed loop has been encountered before,
|
|
* and if so, skips it, since to fully permute a loop we must only
|
|
* enter it once. */
|
|
do {
|
|
for (int j = 0; j < out_map_idx; j++) {
|
|
if (dst == s->map[j]) {
|
|
found = 1;
|
|
break;
|
|
}
|
|
}
|
|
dst = src_map[dst];
|
|
} while (dst != src && !found);
|
|
|
|
if (!found)
|
|
s->map[out_map_idx++] = src;
|
|
}
|
|
|
|
s->map[out_map_idx++] = 0;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void parity_revtab_generator(int *revtab, int n, int inv, int offset,
|
|
int is_dual, int dual_high, int len,
|
|
int basis, int dual_stride, int inv_lookup)
|
|
{
|
|
len >>= 1;
|
|
|
|
if (len <= basis) {
|
|
int k1, k2, stride, even_idx, odd_idx;
|
|
|
|
is_dual = is_dual && dual_stride;
|
|
dual_high = is_dual & dual_high;
|
|
stride = is_dual ? FFMIN(dual_stride, len) : 0;
|
|
|
|
even_idx = offset + dual_high*(stride - 2*len);
|
|
odd_idx = even_idx + len + (is_dual && !dual_high)*len + dual_high*len;
|
|
|
|
for (int i = 0; i < len; i++) {
|
|
k1 = -split_radix_permutation(offset + i*2 + 0, n, inv) & (n - 1);
|
|
k2 = -split_radix_permutation(offset + i*2 + 1, n, inv) & (n - 1);
|
|
if (inv_lookup) {
|
|
revtab[even_idx++] = k1;
|
|
revtab[odd_idx++] = k2;
|
|
} else {
|
|
revtab[k1] = even_idx++;
|
|
revtab[k2] = odd_idx++;
|
|
}
|
|
if (stride && !((i + 1) % stride)) {
|
|
even_idx += stride;
|
|
odd_idx += stride;
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
parity_revtab_generator(revtab, n, inv, offset,
|
|
0, 0, len >> 0, basis, dual_stride, inv_lookup);
|
|
parity_revtab_generator(revtab, n, inv, offset + (len >> 0),
|
|
1, 0, len >> 1, basis, dual_stride, inv_lookup);
|
|
parity_revtab_generator(revtab, n, inv, offset + (len >> 0) + (len >> 1),
|
|
1, 1, len >> 1, basis, dual_stride, inv_lookup);
|
|
}
|
|
|
|
int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int len, int inv,
|
|
FFTXCodeletOptions *opts,
|
|
int basis, int dual_stride)
|
|
{
|
|
basis >>= 1;
|
|
if (len < basis)
|
|
return AVERROR(EINVAL);
|
|
|
|
if (!(s->map = av_mallocz(len*sizeof(*s->map))))
|
|
return AVERROR(ENOMEM);
|
|
|
|
av_assert0(!dual_stride || !(dual_stride & (dual_stride - 1)));
|
|
av_assert0(dual_stride <= basis);
|
|
|
|
parity_revtab_generator(s->map, len, inv, 0, 0, 0, len,
|
|
basis, dual_stride,
|
|
opts ? opts->map_dir == FF_TX_MAP_GATHER : FF_TX_MAP_GATHER);
|
|
|
|
s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void reset_ctx(AVTXContext *s, int free_sub)
|
|
{
|
|
if (!s)
|
|
return;
|
|
|
|
if (s->sub)
|
|
for (int i = 0; i < TX_MAX_SUB; i++)
|
|
reset_ctx(&s->sub[i], free_sub + 1);
|
|
|
|
if (s->cd_self && s->cd_self->uninit)
|
|
s->cd_self->uninit(s);
|
|
|
|
if (free_sub)
|
|
av_freep(&s->sub);
|
|
|
|
av_freep(&s->map);
|
|
av_freep(&s->exp);
|
|
av_freep(&s->tmp);
|
|
|
|
/* Nothing else needs to be reset, it gets overwritten if another
|
|
* ff_tx_init_subtx() call is made. */
|
|
s->nb_sub = 0;
|
|
s->opaque = NULL;
|
|
memset(s->fn, 0, sizeof(*s->fn));
|
|
}
|
|
|
|
void ff_tx_clear_ctx(AVTXContext *s)
|
|
{
|
|
reset_ctx(s, 0);
|
|
}
|
|
|
|
av_cold void av_tx_uninit(AVTXContext **ctx)
|
|
{
|
|
if (!(*ctx))
|
|
return;
|
|
|
|
reset_ctx(*ctx, 1);
|
|
av_freep(ctx);
|
|
}
|
|
|
|
static av_cold int ff_tx_null_init(AVTXContext *s, const FFTXCodelet *cd,
|
|
uint64_t flags, FFTXCodeletOptions *opts,
|
|
int len, int inv, const void *scale)
|
|
{
|
|
/* Can only handle one sample+type to one sample+type transforms */
|
|
if (TYPE_IS(MDCT, s->type) || TYPE_IS(RDFT, s->type))
|
|
return AVERROR(EINVAL);
|
|
return 0;
|
|
}
|
|
|
|
/* Null transform when the length is 1 */
|
|
static void ff_tx_null(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride)
|
|
{
|
|
memcpy(_out, _in, stride);
|
|
}
|
|
|
|
static const FFTXCodelet ff_tx_null_def = {
|
|
.name = NULL_IF_CONFIG_SMALL("null"),
|
|
.function = ff_tx_null,
|
|
.type = TX_TYPE_ANY,
|
|
.flags = AV_TX_UNALIGNED | FF_TX_ALIGNED |
|
|
FF_TX_OUT_OF_PLACE | AV_TX_INPLACE,
|
|
.factors[0] = TX_FACTOR_ANY,
|
|
.min_len = 1,
|
|
.max_len = 1,
|
|
.init = ff_tx_null_init,
|
|
.cpu_flags = FF_TX_CPU_FLAGS_ALL,
|
|
.prio = FF_TX_PRIO_MAX,
|
|
};
|
|
|
|
static const FFTXCodelet * const ff_tx_null_list[] = {
|
|
&ff_tx_null_def,
|
|
NULL,
|
|
};
|
|
|
|
/* Array of all compiled codelet lists. Order is irrelevant. */
|
|
static const FFTXCodelet * const * const codelet_list[] = {
|
|
ff_tx_codelet_list_float_c,
|
|
ff_tx_codelet_list_double_c,
|
|
ff_tx_codelet_list_int32_c,
|
|
ff_tx_null_list,
|
|
#if HAVE_X86ASM
|
|
ff_tx_codelet_list_float_x86,
|
|
#endif
|
|
#if ARCH_AARCH64
|
|
ff_tx_codelet_list_float_aarch64,
|
|
#endif
|
|
};
|
|
static const int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);
|
|
|
|
static const int cpu_slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW |
|
|
AV_CPU_FLAG_ATOM | AV_CPU_FLAG_SSSE3SLOW |
|
|
AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER;
|
|
|
|
static const int cpu_slow_penalties[][2] = {
|
|
{ AV_CPU_FLAG_SSE2SLOW, 1 + 64 },
|
|
{ AV_CPU_FLAG_SSE3SLOW, 1 + 64 },
|
|
{ AV_CPU_FLAG_SSSE3SLOW, 1 + 64 },
|
|
{ AV_CPU_FLAG_ATOM, 1 + 128 },
|
|
{ AV_CPU_FLAG_AVXSLOW, 1 + 128 },
|
|
{ AV_CPU_FLAG_SLOW_GATHER, 1 + 32 },
|
|
};
|
|
|
|
static int get_codelet_prio(const FFTXCodelet *cd, int cpu_flags, int len)
|
|
{
|
|
int prio = cd->prio;
|
|
int max_factor = 0;
|
|
|
|
/* If the CPU has a SLOW flag, and the instruction is also flagged
|
|
* as being slow for such, reduce its priority */
|
|
for (int i = 0; i < FF_ARRAY_ELEMS(cpu_slow_penalties); i++) {
|
|
if ((cpu_flags & cd->cpu_flags) & cpu_slow_penalties[i][0])
|
|
prio -= cpu_slow_penalties[i][1];
|
|
}
|
|
|
|
/* Prioritize aligned-only codelets */
|
|
if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED))
|
|
prio += 64;
|
|
|
|
/* Codelets for specific lengths are generally faster */
|
|
if ((len == cd->min_len) && (len == cd->max_len))
|
|
prio += 64;
|
|
|
|
/* Forward-only or inverse-only transforms are generally better */
|
|
if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY)))
|
|
prio += 64;
|
|
|
|
/* Larger factors are generally better */
|
|
for (int i = 0; i < TX_MAX_SUB; i++)
|
|
max_factor = FFMAX(cd->factors[i], max_factor);
|
|
if (max_factor)
|
|
prio += 16*max_factor;
|
|
|
|
return prio;
|
|
}
|
|
|
|
typedef struct FFTXLenDecomp {
|
|
int len;
|
|
int len2;
|
|
int prio;
|
|
const FFTXCodelet *cd;
|
|
} FFTXLenDecomp;
|
|
|
|
static int cmp_decomp(FFTXLenDecomp *a, FFTXLenDecomp *b)
|
|
{
|
|
return FFDIFFSIGN(b->prio, a->prio);
|
|
}
|
|
|
|
int ff_tx_decompose_length(int dst[TX_MAX_DECOMPOSITIONS], enum AVTXType type,
|
|
int len, int inv)
|
|
{
|
|
int nb_decomp = 0;
|
|
FFTXLenDecomp ld[TX_MAX_DECOMPOSITIONS];
|
|
int codelet_list_idx = codelet_list_num;
|
|
|
|
const int cpu_flags = av_get_cpu_flags();
|
|
|
|
/* Loop through all codelets in all codelet lists to find matches
|
|
* to the requirements */
|
|
while (codelet_list_idx--) {
|
|
const FFTXCodelet * const * list = codelet_list[codelet_list_idx];
|
|
const FFTXCodelet *cd = NULL;
|
|
|
|
while ((cd = *list++)) {
|
|
int fl = len;
|
|
int skip = 0, prio;
|
|
int factors_product = 1, factors_mod = 0;
|
|
|
|
if (nb_decomp >= TX_MAX_DECOMPOSITIONS)
|
|
goto sort;
|
|
|
|
/* Check if the type matches */
|
|
if (cd->type != TX_TYPE_ANY && type != cd->type)
|
|
continue;
|
|
|
|
/* Check direction for non-orthogonal codelets */
|
|
if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) ||
|
|
((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv))
|
|
continue;
|
|
|
|
/* Check if the CPU supports the required ISA */
|
|
if (cd->cpu_flags != FF_TX_CPU_FLAGS_ALL &&
|
|
!(cpu_flags & (cd->cpu_flags & ~cpu_slow_mask)))
|
|
continue;
|
|
|
|
for (int i = 0; i < TX_MAX_FACTORS; i++) {
|
|
if (!cd->factors[i] || (fl == 1))
|
|
break;
|
|
|
|
if (cd->factors[i] == TX_FACTOR_ANY) {
|
|
factors_mod++;
|
|
factors_product *= fl;
|
|
} else if (!(fl % cd->factors[i])) {
|
|
factors_mod++;
|
|
if (cd->factors[i] == 2) {
|
|
int b = ff_ctz(fl);
|
|
fl >>= b;
|
|
factors_product <<= b;
|
|
} else {
|
|
do {
|
|
fl /= cd->factors[i];
|
|
factors_product *= cd->factors[i];
|
|
} while (!(fl % cd->factors[i]));
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Disqualify if factor requirements are not satisfied or if trivial */
|
|
if ((factors_mod < cd->nb_factors) || (len == factors_product))
|
|
continue;
|
|
|
|
if (av_gcd(factors_product, fl) != 1)
|
|
continue;
|
|
|
|
/* Check if length is supported and factorization was successful */
|
|
if ((factors_product < cd->min_len) ||
|
|
(cd->max_len != TX_LEN_UNLIMITED && (factors_product > cd->max_len)))
|
|
continue;
|
|
|
|
prio = get_codelet_prio(cd, cpu_flags, factors_product) * factors_product;
|
|
|
|
/* Check for duplicates */
|
|
for (int i = 0; i < nb_decomp; i++) {
|
|
if (factors_product == ld[i].len) {
|
|
/* Update priority if new one is higher */
|
|
if (prio > ld[i].prio)
|
|
ld[i].prio = prio;
|
|
skip = 1;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Add decomposition if unique */
|
|
if (!skip) {
|
|
ld[nb_decomp].cd = cd;
|
|
ld[nb_decomp].len = factors_product;
|
|
ld[nb_decomp].len2 = fl;
|
|
ld[nb_decomp].prio = prio;
|
|
nb_decomp++;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!nb_decomp)
|
|
return AVERROR(EINVAL);
|
|
|
|
sort:
|
|
AV_QSORT(ld, nb_decomp, FFTXLenDecomp, cmp_decomp);
|
|
|
|
for (int i = 0; i < nb_decomp; i++) {
|
|
if (ld[i].cd->nb_factors > 1)
|
|
dst[i] = ld[i].len2;
|
|
else
|
|
dst[i] = ld[i].len;
|
|
}
|
|
|
|
return nb_decomp;
|
|
}
|
|
|
|
int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts)
|
|
{
|
|
s->map = av_malloc(s->len*sizeof(*s->map));
|
|
if (!s->map)
|
|
return AVERROR(ENOMEM);
|
|
|
|
s->map[0] = 0; /* DC is always at the start */
|
|
if (s->inv) /* Reversing the ACs flips the transform direction */
|
|
for (int i = 1; i < s->len; i++)
|
|
s->map[i] = s->len - i;
|
|
else
|
|
for (int i = 1; i < s->len; i++)
|
|
s->map[i] = i;
|
|
|
|
s->map_dir = FF_TX_MAP_GATHER;
|
|
|
|
return 0;
|
|
}
|
|
|
|
#if !CONFIG_SMALL
|
|
static void print_flags(AVBPrint *bp, uint64_t f)
|
|
{
|
|
int prev = 0;
|
|
const char *sep = ", ";
|
|
av_bprintf(bp, "flags: [");
|
|
if ((f & FF_TX_ALIGNED) && ++prev)
|
|
av_bprintf(bp, "aligned");
|
|
if ((f & AV_TX_UNALIGNED) && ++prev)
|
|
av_bprintf(bp, "%sunaligned", prev > 1 ? sep : "");
|
|
if ((f & AV_TX_INPLACE) && ++prev)
|
|
av_bprintf(bp, "%sinplace", prev > 1 ? sep : "");
|
|
if ((f & FF_TX_OUT_OF_PLACE) && ++prev)
|
|
av_bprintf(bp, "%sout_of_place", prev > 1 ? sep : "");
|
|
if ((f & FF_TX_FORWARD_ONLY) && ++prev)
|
|
av_bprintf(bp, "%sfwd_only", prev > 1 ? sep : "");
|
|
if ((f & FF_TX_INVERSE_ONLY) && ++prev)
|
|
av_bprintf(bp, "%sinv_only", prev > 1 ? sep : "");
|
|
if ((f & FF_TX_PRESHUFFLE) && ++prev)
|
|
av_bprintf(bp, "%spreshuf", prev > 1 ? sep : "");
|
|
if ((f & AV_TX_FULL_IMDCT) && ++prev)
|
|
av_bprintf(bp, "%simdct_full", prev > 1 ? sep : "");
|
|
if ((f & FF_TX_ASM_CALL) && ++prev)
|
|
av_bprintf(bp, "%sasm_call", prev > 1 ? sep : "");
|
|
av_bprintf(bp, "]");
|
|
}
|
|
|
|
static void print_type(AVBPrint *bp, enum AVTXType type)
|
|
{
|
|
av_bprintf(bp, "%s",
|
|
type == TX_TYPE_ANY ? "any" :
|
|
type == AV_TX_FLOAT_FFT ? "fft_float" :
|
|
type == AV_TX_FLOAT_MDCT ? "mdct_float" :
|
|
type == AV_TX_FLOAT_RDFT ? "rdft_float" :
|
|
type == AV_TX_DOUBLE_FFT ? "fft_double" :
|
|
type == AV_TX_DOUBLE_MDCT ? "mdct_double" :
|
|
type == AV_TX_DOUBLE_RDFT ? "rdft_double" :
|
|
type == AV_TX_INT32_FFT ? "fft_int32" :
|
|
type == AV_TX_INT32_MDCT ? "mdct_int32" :
|
|
type == AV_TX_INT32_RDFT ? "rdft_int32" :
|
|
"unknown");
|
|
}
|
|
|
|
static void print_cd_info(const FFTXCodelet *cd, int prio, int len, int print_prio)
|
|
{
|
|
AVBPrint bp = { 0 };
|
|
av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
|
|
|
|
av_bprintf(&bp, "%s - type: ", cd->name);
|
|
|
|
print_type(&bp, cd->type);
|
|
|
|
av_bprintf(&bp, ", len: ");
|
|
if (!len) {
|
|
if (cd->min_len != cd->max_len)
|
|
av_bprintf(&bp, "[%i, ", cd->min_len);
|
|
|
|
if (cd->max_len == TX_LEN_UNLIMITED)
|
|
av_bprintf(&bp, "∞");
|
|
else
|
|
av_bprintf(&bp, "%i", cd->max_len);
|
|
} else {
|
|
av_bprintf(&bp, "%i", len);
|
|
}
|
|
|
|
if (cd->factors[1]) {
|
|
av_bprintf(&bp, "%s, factors", !len && cd->min_len != cd->max_len ? "]" : "");
|
|
if (!cd->nb_factors)
|
|
av_bprintf(&bp, ": [");
|
|
else
|
|
av_bprintf(&bp, "[%i]: [", cd->nb_factors);
|
|
|
|
for (int i = 0; i < TX_MAX_FACTORS; i++) {
|
|
if (i && cd->factors[i])
|
|
av_bprintf(&bp, ", ");
|
|
if (cd->factors[i] == TX_FACTOR_ANY)
|
|
av_bprintf(&bp, "any");
|
|
else if (cd->factors[i])
|
|
av_bprintf(&bp, "%i", cd->factors[i]);
|
|
else
|
|
break;
|
|
}
|
|
|
|
av_bprintf(&bp, "], ");
|
|
} else {
|
|
av_bprintf(&bp, "%s, factor: %i, ",
|
|
!len && cd->min_len != cd->max_len ? "]" : "", cd->factors[0]);
|
|
}
|
|
print_flags(&bp, cd->flags);
|
|
|
|
if (print_prio)
|
|
av_bprintf(&bp, ", prio: %i", prio);
|
|
|
|
av_log(NULL, AV_LOG_DEBUG, "%s\n", bp.str);
|
|
}
|
|
|
|
static void print_tx_structure(AVTXContext *s, int depth)
|
|
{
|
|
const FFTXCodelet *cd = s->cd_self;
|
|
|
|
for (int i = 0; i <= depth; i++)
|
|
av_log(NULL, AV_LOG_DEBUG, " ");
|
|
|
|
print_cd_info(cd, cd->prio, s->len, 0);
|
|
|
|
for (int i = 0; i < s->nb_sub; i++)
|
|
print_tx_structure(&s->sub[i], depth + 1);
|
|
}
|
|
#endif /* CONFIG_SMALL */
|
|
|
|
typedef struct TXCodeletMatch {
|
|
const FFTXCodelet *cd;
|
|
int prio;
|
|
} TXCodeletMatch;
|
|
|
|
static int cmp_matches(TXCodeletMatch *a, TXCodeletMatch *b)
|
|
{
|
|
return FFDIFFSIGN(b->prio, a->prio);
|
|
}
|
|
|
|
/* We want all factors to completely cover the length */
|
|
static inline int check_cd_factors(const FFTXCodelet *cd, int len)
|
|
{
|
|
int matches = 0, any_flag = 0;
|
|
|
|
for (int i = 0; i < TX_MAX_FACTORS; i++) {
|
|
int factor = cd->factors[i];
|
|
|
|
if (factor == TX_FACTOR_ANY) {
|
|
any_flag = 1;
|
|
matches++;
|
|
continue;
|
|
} else if (len <= 1 || !factor) {
|
|
break;
|
|
} else if (factor == 2) { /* Fast path */
|
|
int bits_2 = ff_ctz(len);
|
|
if (!bits_2)
|
|
continue; /* Factor not supported */
|
|
|
|
len >>= bits_2;
|
|
matches++;
|
|
} else {
|
|
int res = len % factor;
|
|
if (res)
|
|
continue; /* Factor not supported */
|
|
|
|
while (!res) {
|
|
len /= factor;
|
|
res = len % factor;
|
|
}
|
|
matches++;
|
|
}
|
|
}
|
|
|
|
return (cd->nb_factors <= matches) && (any_flag || len == 1);
|
|
}
|
|
|
|
av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
|
|
uint64_t flags, FFTXCodeletOptions *opts,
|
|
int len, int inv, const void *scale)
|
|
{
|
|
int ret = 0;
|
|
AVTXContext *sub = NULL;
|
|
TXCodeletMatch *cd_tmp, *cd_matches = NULL;
|
|
unsigned int cd_matches_size = 0;
|
|
int codelet_list_idx = codelet_list_num;
|
|
int nb_cd_matches = 0;
|
|
#if !CONFIG_SMALL
|
|
AVBPrint bp = { 0 };
|
|
#endif
|
|
|
|
/* We still accept functions marked with SLOW, even if the CPU is
|
|
* marked with the same flag, but we give them lower priority. */
|
|
const int cpu_flags = av_get_cpu_flags();
|
|
|
|
/* Flags the transform wants */
|
|
uint64_t req_flags = flags;
|
|
|
|
/* Flags the codelet may require to be present */
|
|
uint64_t inv_req_mask = AV_TX_FULL_IMDCT | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL;
|
|
|
|
/* Unaligned codelets are compatible with the aligned flag */
|
|
if (req_flags & FF_TX_ALIGNED)
|
|
req_flags |= AV_TX_UNALIGNED;
|
|
|
|
/* If either flag is set, both are okay, so don't check for an exact match */
|
|
if ((req_flags & AV_TX_INPLACE) && (req_flags & FF_TX_OUT_OF_PLACE))
|
|
req_flags &= ~(AV_TX_INPLACE | FF_TX_OUT_OF_PLACE);
|
|
if ((req_flags & FF_TX_ALIGNED) && (req_flags & AV_TX_UNALIGNED))
|
|
req_flags &= ~(FF_TX_ALIGNED | AV_TX_UNALIGNED);
|
|
|
|
/* Loop through all codelets in all codelet lists to find matches
|
|
* to the requirements */
|
|
while (codelet_list_idx--) {
|
|
const FFTXCodelet * const * list = codelet_list[codelet_list_idx];
|
|
const FFTXCodelet *cd = NULL;
|
|
|
|
while ((cd = *list++)) {
|
|
/* Check if the type matches */
|
|
if (cd->type != TX_TYPE_ANY && type != cd->type)
|
|
continue;
|
|
|
|
/* Check direction for non-orthogonal codelets */
|
|
if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) ||
|
|
((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv))
|
|
continue;
|
|
|
|
/* Check if the requested flags match from both sides */
|
|
if (((req_flags & cd->flags) != (req_flags)) ||
|
|
((inv_req_mask & cd->flags) != (req_flags & inv_req_mask)))
|
|
continue;
|
|
|
|
/* Check if length is supported */
|
|
if ((len < cd->min_len) || (cd->max_len != -1 && (len > cd->max_len)))
|
|
continue;
|
|
|
|
/* Check if the CPU supports the required ISA */
|
|
if (cd->cpu_flags != FF_TX_CPU_FLAGS_ALL &&
|
|
!(cpu_flags & (cd->cpu_flags & ~cpu_slow_mask)))
|
|
continue;
|
|
|
|
/* Check for factors */
|
|
if (!check_cd_factors(cd, len))
|
|
continue;
|
|
|
|
/* Realloc array and append */
|
|
cd_tmp = av_fast_realloc(cd_matches, &cd_matches_size,
|
|
sizeof(*cd_tmp) * (nb_cd_matches + 1));
|
|
if (!cd_tmp) {
|
|
av_free(cd_matches);
|
|
return AVERROR(ENOMEM);
|
|
}
|
|
|
|
cd_matches = cd_tmp;
|
|
cd_matches[nb_cd_matches].cd = cd;
|
|
cd_matches[nb_cd_matches].prio = get_codelet_prio(cd, cpu_flags, len);
|
|
nb_cd_matches++;
|
|
}
|
|
}
|
|
|
|
#if !CONFIG_SMALL
|
|
/* Print debugging info */
|
|
av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
|
|
av_bprintf(&bp, "For transform of length %i, %s, ", len,
|
|
inv ? "inverse" : "forward");
|
|
print_type(&bp, type);
|
|
av_bprintf(&bp, ", ");
|
|
print_flags(&bp, flags);
|
|
av_bprintf(&bp, ", found %i matches%s", nb_cd_matches,
|
|
nb_cd_matches ? ":" : ".");
|
|
#endif
|
|
|
|
/* No matches found */
|
|
if (!nb_cd_matches)
|
|
return AVERROR(ENOSYS);
|
|
|
|
/* Sort the list */
|
|
AV_QSORT(cd_matches, nb_cd_matches, TXCodeletMatch, cmp_matches);
|
|
|
|
#if !CONFIG_SMALL
|
|
av_log(NULL, AV_LOG_DEBUG, "%s\n", bp.str);
|
|
|
|
for (int i = 0; i < nb_cd_matches; i++) {
|
|
av_log(NULL, AV_LOG_DEBUG, " %i: ", i + 1);
|
|
print_cd_info(cd_matches[i].cd, cd_matches[i].prio, 0, 1);
|
|
}
|
|
#endif
|
|
|
|
if (!s->sub) {
|
|
s->sub = sub = av_mallocz(TX_MAX_SUB*sizeof(*sub));
|
|
if (!sub) {
|
|
ret = AVERROR(ENOMEM);
|
|
goto end;
|
|
}
|
|
}
|
|
|
|
/* Attempt to initialize each */
|
|
for (int i = 0; i < nb_cd_matches; i++) {
|
|
const FFTXCodelet *cd = cd_matches[i].cd;
|
|
AVTXContext *sctx = &s->sub[s->nb_sub];
|
|
|
|
sctx->len = len;
|
|
sctx->inv = inv;
|
|
sctx->type = type;
|
|
sctx->flags = cd->flags | flags;
|
|
sctx->cd_self = cd;
|
|
|
|
s->fn[s->nb_sub] = cd->function;
|
|
s->cd[s->nb_sub] = cd;
|
|
|
|
ret = 0;
|
|
if (cd->init)
|
|
ret = cd->init(sctx, cd, flags, opts, len, inv, scale);
|
|
|
|
if (ret >= 0) {
|
|
if (opts && opts->map_dir != FF_TX_MAP_NONE &&
|
|
sctx->map_dir == FF_TX_MAP_NONE) {
|
|
/* If a specific map direction was requested, and it doesn't
|
|
* exist, create one.*/
|
|
sctx->map = av_malloc(len*sizeof(*sctx->map));
|
|
if (!sctx->map) {
|
|
ret = AVERROR(ENOMEM);
|
|
goto end;
|
|
}
|
|
|
|
for (int i = 0; i < len; i++)
|
|
sctx->map[i] = i;
|
|
} else if (opts && (opts->map_dir != sctx->map_dir)) {
|
|
int *tmp = av_malloc(len*sizeof(*sctx->map));
|
|
if (!tmp) {
|
|
ret = AVERROR(ENOMEM);
|
|
goto end;
|
|
}
|
|
|
|
memcpy(tmp, sctx->map, len*sizeof(*sctx->map));
|
|
|
|
for (int i = 0; i < len; i++)
|
|
sctx->map[tmp[i]] = i;
|
|
|
|
av_free(tmp);
|
|
}
|
|
|
|
s->nb_sub++;
|
|
goto end;
|
|
}
|
|
|
|
s->fn[s->nb_sub] = NULL;
|
|
s->cd[s->nb_sub] = NULL;
|
|
|
|
reset_ctx(sctx, 0);
|
|
if (ret == AVERROR(ENOMEM))
|
|
break;
|
|
}
|
|
|
|
if (!s->nb_sub)
|
|
av_freep(&s->sub);
|
|
|
|
end:
|
|
av_free(cd_matches);
|
|
return ret;
|
|
}
|
|
|
|
av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type,
|
|
int inv, int len, const void *scale, uint64_t flags)
|
|
{
|
|
int ret;
|
|
AVTXContext tmp = { 0 };
|
|
const double default_scale_d = 1.0;
|
|
const float default_scale_f = 1.0f;
|
|
|
|
if (!len || type >= AV_TX_NB || !ctx || !tx)
|
|
return AVERROR(EINVAL);
|
|
|
|
if (!(flags & AV_TX_UNALIGNED))
|
|
flags |= FF_TX_ALIGNED;
|
|
if (!(flags & AV_TX_INPLACE))
|
|
flags |= FF_TX_OUT_OF_PLACE;
|
|
|
|
if (!scale && ((type == AV_TX_FLOAT_MDCT) || (type == AV_TX_INT32_MDCT)))
|
|
scale = &default_scale_f;
|
|
else if (!scale && (type == AV_TX_DOUBLE_MDCT))
|
|
scale = &default_scale_d;
|
|
|
|
ret = ff_tx_init_subtx(&tmp, type, flags, NULL, len, inv, scale);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
*ctx = &tmp.sub[0];
|
|
*tx = tmp.fn[0];
|
|
|
|
#if !CONFIG_SMALL
|
|
av_log(NULL, AV_LOG_DEBUG, "Transform tree:\n");
|
|
print_tx_structure(*ctx, 0);
|
|
#endif
|
|
|
|
return ret;
|
|
}
|