mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-01-11 18:09:36 +00:00
x86: xvid_idct: SSE2 merged add version
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
decd5193e1
commit
15ce160183
@ -384,6 +384,12 @@ SECTION .text
|
||||
; Must now load args as gprs are no longer used for masks
|
||||
; DEST is set to where address of dest was loaded
|
||||
%if ARCH_X86_32
|
||||
%if %2 == 2 ; Not enough xmms, store
|
||||
movdqa [%1+1*16], TAN3
|
||||
movdqa [%1+2*16], xmm3
|
||||
movdqa [%1+5*16], REG0
|
||||
movdqa [%1+6*16], xmm5
|
||||
%endif
|
||||
%xdefine DEST r2q ; BLOCK is r0, stride r1
|
||||
movifnidn DEST, destm
|
||||
movifnidn strideq, stridem
|
||||
@ -397,8 +403,6 @@ SECTION .text
|
||||
movq [DEST + strideq], TAN3
|
||||
movhps [DEST + 2*strideq], TAN3
|
||||
; REG0 and TAN3 are now available (and likely used in second half)
|
||||
%else
|
||||
%warning Unimplemented
|
||||
%endif
|
||||
%endif
|
||||
%endmacro
|
||||
@ -427,7 +431,88 @@ SECTION .text
|
||||
movq [DEST + 2*strideq], xmm5
|
||||
movhps [DEST + strideq], xmm5
|
||||
%elif %2 == 2
|
||||
%warning Unimplemented
|
||||
pxor xmm0, xmm0
|
||||
%if ARCH_X86_32
|
||||
; free: m3 REG0=m4 m5
|
||||
; input: m1, m7, m2, m6
|
||||
movq xmm3, [DEST+0*strideq]
|
||||
movq xmm4, [DEST+1*strideq]
|
||||
punpcklbw xmm3, xmm0
|
||||
punpcklbw xmm4, xmm0
|
||||
paddsw xmm3, %3
|
||||
paddsw xmm4, [%1 + 1*16]
|
||||
movq %3, [DEST+2*strideq]
|
||||
movq xmm5, [DEST+ r3q]
|
||||
punpcklbw %3, xmm0
|
||||
punpcklbw xmm5, xmm0
|
||||
paddsw %3, [%1 + 2*16]
|
||||
paddsw xmm5, %5
|
||||
packuswb xmm3, xmm4
|
||||
packuswb %3, xmm5
|
||||
movq [DEST+0*strideq], xmm3
|
||||
movhps [DEST+1*strideq], xmm3
|
||||
movq [DEST+2*strideq], %3
|
||||
movhps [DEST+ r3q], %3
|
||||
lea DEST, [DEST+4*strideq]
|
||||
movq xmm3, [DEST+0*strideq]
|
||||
movq xmm4, [DEST+1*strideq]
|
||||
movq %3, [DEST+2*strideq]
|
||||
movq xmm5, [DEST+ r3q]
|
||||
punpcklbw xmm3, xmm0
|
||||
punpcklbw xmm4, xmm0
|
||||
punpcklbw %3, xmm0
|
||||
punpcklbw xmm5, xmm0
|
||||
paddsw xmm3, %6
|
||||
paddsw xmm4, [%1 + 5*16]
|
||||
paddsw %3, [%1 + 6*16]
|
||||
paddsw xmm5, %4
|
||||
packuswb xmm3, xmm4
|
||||
packuswb %3, xmm5
|
||||
movq [DEST+0*strideq], xmm3
|
||||
movhps [DEST+1*strideq], xmm3
|
||||
movq [DEST+2*strideq], %3
|
||||
movhps [DEST+ r3q], %3
|
||||
%else
|
||||
; l1:TAN3=m13 l2:m3 l5:REG0=m8 l6=m5
|
||||
; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
|
||||
movq xmm2, [DEST+0*strideq]
|
||||
movq xmm4, [DEST+1*strideq]
|
||||
movq xmm12, [DEST+2*strideq]
|
||||
movq xmm11, [DEST+ r3q]
|
||||
punpcklbw xmm2, xmm0
|
||||
punpcklbw xmm4, xmm0
|
||||
punpcklbw xmm12, xmm0
|
||||
punpcklbw xmm11, xmm0
|
||||
paddsw xmm2, %3
|
||||
paddsw xmm4, TAN3
|
||||
paddsw xmm12, xmm3
|
||||
paddsw xmm11, %5
|
||||
packuswb xmm2, xmm4
|
||||
packuswb xmm12, xmm11
|
||||
movq [DEST+0*strideq], xmm2
|
||||
movhps [DEST+1*strideq], xmm2
|
||||
movq [DEST+2*strideq], xmm12
|
||||
movhps [DEST+ r3q], xmm12
|
||||
lea DEST, [DEST+4*strideq]
|
||||
movq xmm2, [DEST+0*strideq]
|
||||
movq xmm4, [DEST+1*strideq]
|
||||
movq xmm12, [DEST+2*strideq]
|
||||
movq xmm11, [DEST+ r3q]
|
||||
punpcklbw xmm2, xmm0
|
||||
punpcklbw xmm4, xmm0
|
||||
punpcklbw xmm12, xmm0
|
||||
punpcklbw xmm11, xmm0
|
||||
paddsw xmm2, %6
|
||||
paddsw xmm4, REG0
|
||||
paddsw xmm12, xmm5
|
||||
paddsw xmm11, %4
|
||||
packuswb xmm2, xmm4
|
||||
packuswb xmm12, xmm11
|
||||
movq [DEST+0*strideq], xmm2
|
||||
movhps [DEST+1*strideq], xmm2
|
||||
movq [DEST+2*strideq], xmm12
|
||||
movhps [DEST+ r3q], xmm12
|
||||
%endif
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
@ -623,6 +708,7 @@ cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
|
||||
INIT_XMM sse2
|
||||
IDCT_SSE2 0
|
||||
IDCT_SSE2 1
|
||||
IDCT_SSE2 2
|
||||
|
||||
%if ARCH_X86_32
|
||||
|
||||
|
@ -27,12 +27,7 @@
|
||||
#include "xvididct.h"
|
||||
|
||||
void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
|
||||
|
||||
/*
 * Two-step "IDCT then add" wrapper with the (dest, line_size, block)
 * signature: first calls ff_xvid_idct_sse2() on block, then passes the
 * same block together with dest and line_size to ff_add_pixels_clamped().
 * NOTE(review): presumably the first call transforms block in place and
 * the second adds it to the pixels at dest with saturation -- confirm
 * against the definitions of both callees, which are not in this view.
 */
static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
|
||||
{
|
||||
ff_xvid_idct_sse2(block);
|
||||
ff_add_pixels_clamped(block, dest, line_size);
|
||||
}
|
||||
void ff_xvid_idct_add_sse2(uint8_t *dest, int line_size, short *block);
|
||||
|
||||
#if ARCH_X86_32
|
||||
static void xvid_idct_mmx_put(uint8_t *dest, int line_size, short *block)
|
||||
@ -88,7 +83,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
|
||||
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
c->idct_put = ff_xvid_idct_put_sse2;
|
||||
c->idct_add = xvid_idct_sse2_add;
|
||||
c->idct_add = ff_xvid_idct_add_sse2;
|
||||
c->idct = ff_xvid_idct_sse2;
|
||||
c->perm_type = FF_IDCT_PERM_SSE2;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user