dct32: Add SSE2 ASM optimizations

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
Vitor Sessak 2011-07-30 18:39:25 +02:00 committed by Ronald S. Bultje
parent 6f7fe4723b
commit 18b131de04
3 changed files with 33 additions and 9 deletions

View File

@ -63,6 +63,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
mulps %1, %3 mulps %1, %3
%endmacro %endmacro
; BUTTERFLY0_SSE2 val, mask, mult, tmp, shuf_imm
;   %1  in/out register; receives the result
;   %2  operand XORed into %1 (presumably a sign-bit mask such as
;       ps_p1p1m1m1 -- confirm at the call sites)
;   %3  operand %1 is multiplied by at the end
;   %4  scratch register, overwritten
;   %5  8-bit shuffle immediate
; Net effect: %1 = ((%1 ^ %2) + shuffle(%1_original, %5)) * %3
%macro BUTTERFLY0_SSE2 5
; Shuffled copy of %1 into %4 in a single instruction; this has to come
; first because the next instruction modifies %1 in place.
pshufd %4, %1, %5
xorps %1, %2
addps %1, %4
mulps %1, %3
%endmacro
%macro BUTTERFLY0_AVX 5 %macro BUTTERFLY0_AVX 5
vshufps %4, %1, %1, %5 vshufps %4, %1, %1, %5
vxorps %1, %1, %2 vxorps %1, %1, %2
@ -405,18 +412,17 @@ INIT_XMM
INIT_XMM INIT_XMM
%macro DCT32_FUNC 1
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
cglobal dct32_float_sse, 2,3,16, out, in, tmp cglobal dct32_float_%1, 2,3,16, out, in, tmp
; pass 1 ; pass 1
movaps m0, [inq+0] movaps m0, [inq+0]
movaps m1, [inq+112] LOAD_INV m1, [inq+112]
shufps m1, m1, 0x1b
BUTTERFLY m0, m1, [ps_cos_vec], m3 BUTTERFLY m0, m1, [ps_cos_vec], m3
movaps m7, [inq+64] movaps m7, [inq+64]
movaps m4, [inq+48] LOAD_INV m4, [inq+48]
shufps m4, m4, 0x1b
BUTTERFLY m7, m4, [ps_cos_vec+32], m3 BUTTERFLY m7, m4, [ps_cos_vec+32], m3
; pass 2 ; pass 2
@ -427,13 +433,11 @@ cglobal dct32_float_sse, 2,3,16, out, in, tmp
; pass 1 ; pass 1
movaps m1, [inq+16] movaps m1, [inq+16]
movaps m6, [inq+96] LOAD_INV m6, [inq+96]
shufps m6, m6, 0x1b
BUTTERFLY m1, m6, [ps_cos_vec+16], m3 BUTTERFLY m1, m6, [ps_cos_vec+16], m3
movaps m4, [inq+80] movaps m4, [inq+80]
movaps m5, [inq+32] LOAD_INV m5, [inq+32]
shufps m5, m5, 0x1b
BUTTERFLY m4, m5, [ps_cos_vec+48], m3 BUTTERFLY m4, m5, [ps_cos_vec+48], m3
; pass 2 ; pass 2
@ -492,3 +496,20 @@ cglobal dct32_float_sse, 2,3,16, out, in, tmp
PASS5 PASS5
PASS6 PASS6
RET RET
%endmacro
; LOAD_INV_SSE dst, src
;   %1  destination register
;   %2  source operand (moved with movaps, so a memory source must be
;       16-byte aligned)
; Copies %2 into %1 and then reverses the order of the four dwords:
; shuffle immediate 0x1b = 00 01 10 11b selects elements 3,2,1,0.
%macro LOAD_INV_SSE 2
movaps %1, %2
shufps %1, %1, 0x1b
%endmacro
; Select the SSE loader, then instantiate DCT32_FUNC with the "sse" suffix.
; LOAD_INV has to be defined before the invocation because that is where
; the DCT32_FUNC body gets expanded.
%define LOAD_INV LOAD_INV_SSE
DCT32_FUNC sse
; LOAD_INV_SSE2 dst, src
;   %1  destination register
;   %2  source operand
; Same result as a plain load followed by a 0x1b self-shuffle (dword order
; reversed: elements 3,2,1,0), but done by one pshufd, which reads its
; source and writes a separate destination.
; NOTE(review): a memory source for non-VEX pshufd must be 16-byte
; aligned, the same requirement movaps has -- confirm callers keep it.
%macro LOAD_INV_SSE2 2
pshufd %1, %2, 0x1b
%endmacro
; Re-point both helpers at their SSE2 versions, then instantiate
; DCT32_FUNC a second time with the "sse2" suffix.
; NOTE(review): these %defines stay in effect for anything assembled after
; this point in the file.
%define LOAD_INV LOAD_INV_SSE2
%define BUTTERFLY0 BUTTERFLY0_SSE2
DCT32_FUNC sse2

View File

@ -60,6 +60,8 @@ av_cold void ff_dct_init_mmx(DCTContext *s)
int has_vectors = av_get_cpu_flags(); int has_vectors = av_get_cpu_flags();
if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX) if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX)
s->dct32 = ff_dct32_float_avx; s->dct32 = ff_dct32_float_avx;
else if (has_vectors & AV_CPU_FLAG_SSE2 && HAVE_SSE)
s->dct32 = ff_dct32_float_sse2;
else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
s->dct32 = ff_dct32_float_sse; s->dct32 = ff_dct32_float_sse;
#endif #endif

View File

@ -35,6 +35,7 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in); void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in); void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
#endif /* AVCODEC_X86_FFT_H */ #endif /* AVCODEC_X86_FFT_H */