x86: use new schema for ASM macros

Signed-off-by: Janne Grunau <janne-libav@jannau.net>
This commit is contained in:
Vitor Sessak 2012-05-27 17:43:56 +00:00 committed by Janne Grunau
parent 7263cd5544
commit bac0729d9e
4 changed files with 81 additions and 75 deletions

View File

@ -27,15 +27,15 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
int has_vectors = av_get_cpu_flags(); int has_vectors = av_get_cpu_flags();
if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) { if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
/* 3DNow! for K6-2/3 */ /* 3DNow! for K6-2/3 */
s->imdct_calc = ff_imdct_calc_3dn; s->imdct_calc = ff_imdct_calc_3dnow;
s->imdct_half = ff_imdct_half_3dn; s->imdct_half = ff_imdct_half_3dnow;
s->fft_calc = ff_fft_calc_3dn; s->fft_calc = ff_fft_calc_3dnow;
} }
if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) { if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) {
/* 3DNowEx for K7 */ /* 3DNowEx for K7 */
s->imdct_calc = ff_imdct_calc_3dn2; s->imdct_calc = ff_imdct_calc_3dnow2;
s->imdct_half = ff_imdct_half_3dn2; s->imdct_half = ff_imdct_half_3dnow2;
s->fft_calc = ff_fft_calc_3dn2; s->fft_calc = ff_fft_calc_3dnow2;
} }
if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) { if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
/* SSE for P3/P4/K8 */ /* SSE for P3/P4/K8 */

View File

@ -24,13 +24,13 @@
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z); void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z); void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z); void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z); void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z); void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z);
void ff_imdct_calc_3dn(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dn(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);

View File

@ -30,30 +30,30 @@ DECLARE_ALIGNED(8, static const unsigned int, m1m1)[2] = { 1U<<31, 1U<<31 };
"movq "#s","#d"\n"\ "movq "#s","#d"\n"\
"psrlq $32,"#d"\n"\ "psrlq $32,"#d"\n"\
"punpckldq "#s","#d"\n" "punpckldq "#s","#d"\n"
#define ff_fft_calc_3dn2 ff_fft_calc_3dn #define ff_fft_calc_3dnow2 ff_fft_calc_3dnow
#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn #define ff_fft_dispatch_3dnow2 ff_fft_dispatch_3dnow
#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn #define ff_fft_dispatch_interleave_3dnow2 ff_fft_dispatch_interleave_3dnow
#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn #define ff_imdct_calc_3dnow2 ff_imdct_calc_3dnow
#define ff_imdct_half_3dn2 ff_imdct_half_3dn #define ff_imdct_half_3dnow2 ff_imdct_half_3dnow
#else #else
#define PSWAPD(s,d) "pswapd "#s","#d"\n" #define PSWAPD(s,d) "pswapd "#s","#d"\n"
#endif #endif
void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits); void ff_fft_dispatch_3dnow2(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits); void ff_fft_dispatch_interleave_3dnow2(FFTComplex *z, int nbits);
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z)
{ {
int n = 1<<s->nbits; int n = 1<<s->nbits;
int i; int i;
ff_fft_dispatch_interleave_3dn2(z, s->nbits); ff_fft_dispatch_interleave_3dnow2(z, s->nbits);
__asm__ volatile("femms"); __asm__ volatile("femms");
if(n <= 8) if(n <= 8)
for(i=0; i<n; i+=2) for(i=0; i<n; i+=2)
FFSWAP(FFTSample, z[i].im, z[i+1].re); FFSWAP(FFTSample, z[i].im, z[i+1].re);
} }
void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input)
{ {
x86_reg j, k; x86_reg j, k;
long n = s->mdct_size; long n = s->mdct_size;
@ -101,7 +101,7 @@ void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
); );
} }
ff_fft_dispatch_3dn2(z, s->nbits); ff_fft_dispatch_3dnow2(z, s->nbits);
#define CMUL(j,mm0,mm1)\ #define CMUL(j,mm0,mm1)\
"movq (%2,"#j",2), %%mm6 \n"\ "movq (%2,"#j",2), %%mm6 \n"\
@ -144,13 +144,13 @@ void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
__asm__ volatile("femms"); __asm__ volatile("femms");
} }
void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input)
{ {
x86_reg j, k; x86_reg j, k;
long n = s->mdct_size; long n = s->mdct_size;
long n4 = n >> 2; long n4 = n >> 2;
ff_imdct_half_3dn2(s, output+n4, input); ff_imdct_half_3dnow2(s, output+n4, input);
j = -n; j = -n;
k = n-8; k = n-8;

View File

@ -297,7 +297,7 @@ IF%1 mova Z(1), m5
%define Z2(x) [r0+mmsize*x] %define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2] %define ZH(x) [r0+mmsize*x+mmsize/2]
INIT_YMM INIT_YMM avx
%if HAVE_AVX %if HAVE_AVX
align 16 align 16
@ -390,7 +390,7 @@ fft32_interleave_avx:
ret ret
%endif %endif
INIT_XMM INIT_XMM sse
%define movdqa movaps %define movdqa movaps
align 16 align 16
@ -439,11 +439,9 @@ fft16_sse:
ret ret
INIT_MMX %macro FFT48_3DN 0
%macro FFT48_3DN 1
align 16 align 16
fft4%1: fft4 %+ SUFFIX:
T2_3DN m0, m1, Z(0), Z(1) T2_3DN m0, m1, Z(0), Z(1)
mova m2, Z(2) mova m2, Z(2)
mova m3, Z(3) mova m3, Z(3)
@ -457,7 +455,7 @@ fft4%1:
ret ret
align 16 align 16
fft8%1: fft8 %+ SUFFIX:
T2_3DN m0, m1, Z(0), Z(1) T2_3DN m0, m1, Z(0), Z(1)
mova m2, Z(2) mova m2, Z(2)
mova m3, Z(3) mova m3, Z(3)
@ -495,7 +493,8 @@ fft8%1:
ret ret
%endmacro %endmacro
FFT48_3DN _3dn2 INIT_MMX 3dnow2
FFT48_3DN
%macro pswapd 2 %macro pswapd 2
%ifidn %1, %2 %ifidn %1, %2
@ -508,7 +507,8 @@ FFT48_3DN _3dn2
%endif %endif
%endmacro %endmacro
FFT48_3DN _3dn INIT_MMX 3dnow
FFT48_3DN
%define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)] %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
@ -532,7 +532,7 @@ DEFINE_ARGS z, w, n, o1, o3
rep ret rep ret
%endmacro %endmacro
INIT_YMM INIT_YMM avx
%if HAVE_AVX %if HAVE_AVX
%macro INTERL_AVX 5 %macro INTERL_AVX 5
@ -550,7 +550,7 @@ DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0 DECL_PASS pass_interleave_avx, PASS_BIG 0
%endif %endif
INIT_XMM INIT_XMM sse
%macro INTERL_SSE 5 %macro INTERL_SSE 5
mova %3, %2 mova %3, %2
@ -565,16 +565,16 @@ INIT_XMM
DECL_PASS pass_sse, PASS_BIG 1 DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0 DECL_PASS pass_interleave_sse, PASS_BIG 0
INIT_MMX INIT_MMX 3dnow
%define mulps pfmul %define mulps pfmul
%define addps pfadd %define addps pfadd
%define subps pfsub %define subps pfsub
%define unpcklps punpckldq %define unpcklps punpckldq
%define unpckhps punpckhdq %define unpckhps punpckhdq
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q] DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dn, PASS_BIG 0 DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dn2 pass_3dn %define pass_3dnow2 pass_3dnow
%define pass_interleave_3dn2 pass_interleave_3dn %define pass_interleave_3dnow2 pass_interleave_3dnow
%ifdef PIC %ifdef PIC
%define SECTION_REL - $$ %define SECTION_REL - $$
@ -592,67 +592,73 @@ DECL_PASS pass_interleave_3dn, PASS_BIG 0
call r2 call r2
%endmacro ; FFT_DISPATCH %endmacro ; FFT_DISPATCH
%macro DECL_FFT 2-3 ; nbits, cpu, suffix %macro DECL_FFT 1-2 ; nbits, suffix
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL %ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5 %if %1>=5
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL %xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif %endif
%if %1>=6 %if %1>=6
%xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL %xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif %endif
%assign n 1<<%1 %assign n 1<<%1
%rep 17-%1 %rep 17-%1
%assign n2 n/2 %assign n2 n/2
%assign n4 n/4 %assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
align 16 align 16
fft %+ n %+ %3%2: fft %+ n %+ fullsuffix:
call fft %+ n2 %+ %2 call fft %+ n2 %+ SUFFIX
add r0, n*4 - (n&(-2<<%1)) add r0, n*4 - (n&(-2<<%1))
call fft %+ n4 %+ %2 call fft %+ n4 %+ SUFFIX
add r0, n*2 - (n2&(-2<<%1)) add r0, n*2 - (n2&(-2<<%1))
call fft %+ n4 %+ %2 call fft %+ n4 %+ SUFFIX
sub r0, n*6 + (n2&(-2<<%1)) sub r0, n*6 + (n2&(-2<<%1))
lea r1, [cos_ %+ n] lea r1, [cos_ %+ n]
mov r2d, n4/2 mov r2d, n4/2
jmp pass%3%2 jmp pass %+ fullsuffix
%assign n n*2 %assign n n*2
%endrep %endrep
%undef n %undef n
align 8 align 8
dispatch_tab%3%2: pointer list_of_fft dispatch_tab %+ fullsuffix: pointer list_of_fft
section .text section .text
; On x86_32, this function does the register saving and restoring for all of fft. ; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything. ; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits cglobal fft_dispatch%2, 2,5,8, z, nbits
FFT_DISPATCH %3%2, nbits FFT_DISPATCH fullsuffix, nbits
%ifidn %2, _avx %if mmsize == 32
vzeroupper vzeroupper
%endif %endif
RET RET
%endmacro ; DECL_FFT %endmacro ; DECL_FFT
%if HAVE_AVX %if HAVE_AVX
INIT_YMM INIT_YMM avx
DECL_FFT 6, _avx DECL_FFT 6
DECL_FFT 6, _avx, _interleave DECL_FFT 6, _interleave
%endif %endif
INIT_XMM INIT_XMM sse
DECL_FFT 5, _sse DECL_FFT 5
DECL_FFT 5, _sse, _interleave DECL_FFT 5, _interleave
INIT_MMX INIT_MMX 3dnow
DECL_FFT 4, _3dn DECL_FFT 4
DECL_FFT 4, _3dn, _interleave DECL_FFT 4, _interleave
DECL_FFT 4, _3dn2 INIT_MMX 3dnow2
DECL_FFT 4, _3dn2, _interleave DECL_FFT 4
DECL_FFT 4, _interleave
INIT_XMM INIT_XMM sse
%undef mulps %undef mulps
%undef addps %undef addps
%undef subps %undef subps
@ -748,8 +754,8 @@ INIT_XMM
jl .post jl .post
%endmacro %endmacro
%macro DECL_IMDCT 2 %macro DECL_IMDCT 1
cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64 %if ARCH_X86_64
%define rrevtab r7 %define rrevtab r7
%define rtcos r8 %define rtcos r8
@ -821,7 +827,7 @@ cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample
mov r0, r1 mov r0, r1
mov r1d, [r5+FFTContext.nbits] mov r1d, [r5+FFTContext.nbits]
FFT_DISPATCH %1, r1 FFT_DISPATCH SUFFIX, r1
mov r0d, [r5+FFTContext.mdctsize] mov r0d, [r5+FFTContext.mdctsize]
add r6, r0 add r6, r0
@ -835,20 +841,20 @@ cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample
neg r0 neg r0
mov r1, -mmsize mov r1, -mmsize
sub r1, r0 sub r1, r0
%2 r0, r1, r6, rtcos, rtsin %1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0 %if ARCH_X86_64 == 0
add esp, 12 add esp, 12
%endif %endif
%ifidn avx_enabled, 1 %if mmsize == 32
vzeroupper vzeroupper
%endif %endif
RET RET
%endmacro %endmacro
DECL_IMDCT _sse, POSROTATESHUF DECL_IMDCT POSROTATESHUF
INIT_YMM INIT_YMM avx
%if HAVE_AVX %if HAVE_AVX
DECL_IMDCT _avx, POSROTATESHUF_AVX DECL_IMDCT POSROTATESHUF_AVX
%endif %endif