x86: vc1dsp: Convert vc1_inv_trans_*_dc to NASM format

This commit is contained in:
Timothy Gu 2016-01-31 11:42:24 -08:00
parent b62825a480
commit 838abfc1d7
3 changed files with 111 additions and 207 deletions

View File

@ -395,3 +395,101 @@ cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
jnz .loop
REP_RET
%endif ; HAVE_MMX_INLINE
%macro INV_TRANS_INIT 0
movsxdifnidn linesizeq, linesized
movd m0, blockd
SPLATW m0, m0
pxor m1, m1
psubw m1, m0
packuswb m0, m0
packuswb m1, m1
DEFINE_ARGS dest, linesize, linesize3
lea linesize3q, [linesizeq*3]
%endmacro
%macro INV_TRANS_PROCESS 1
mov%1 m2, [destq+linesizeq*0]
mov%1 m3, [destq+linesizeq*1]
mov%1 m4, [destq+linesizeq*2]
mov%1 m5, [destq+linesize3q]
paddusb m2, m0
paddusb m3, m0
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
psubusb m3, m1
psubusb m4, m1
psubusb m5, m1
mov%1 [linesizeq*0+destq], m2
mov%1 [linesizeq*1+destq], m3
mov%1 [linesizeq*2+destq], m4
mov%1 [linesize3q +destq], m5
%endmacro
; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, int linesize, int16_t *block)
INIT_MMX mmxext
cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
movsx r3d, WORD [blockq]
mov blockd, r3d ; dc
shl blockd, 4 ; 16 * dc
lea blockd, [blockq+r3+4] ; 17 * dc + 4
sar blockd, 3 ; >> 3
mov r3d, blockd ; dc
shl blockd, 4 ; 16 * dc
lea blockd, [blockq+r3+64] ; 17 * dc + 64
sar blockd, 7 ; >> 7
INV_TRANS_INIT
INV_TRANS_PROCESS h
RET
INIT_MMX mmxext
cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
movsx r3d, WORD [blockq]
mov blockd, r3d ; dc
shl blockd, 4 ; 16 * dc
lea blockd, [blockq+r3+4] ; 17 * dc + 4
sar blockd, 3 ; >> 3
shl blockd, 2 ; 4 * dc
lea blockd, [blockq*3+64] ; 12 * dc + 64
sar blockd, 7 ; >> 7
INV_TRANS_INIT
INV_TRANS_PROCESS h
lea destq, [destq+linesizeq*4]
INV_TRANS_PROCESS h
RET
INIT_MMX mmxext
cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
movsx blockd, WORD [blockq] ; dc
lea blockd, [blockq*3+1] ; 3 * dc + 1
sar blockd, 1 ; >> 1
mov r3d, blockd ; dc
shl blockd, 4 ; 16 * dc
lea blockd, [blockq+r3+64] ; 17 * dc + 64
sar blockd, 7 ; >> 7
INV_TRANS_INIT
INV_TRANS_PROCESS a
RET
INIT_MMX mmxext
cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
movsx blockd, WORD [blockq] ; dc
lea blockd, [blockq*3+1] ; 3 * dc + 1
sar blockd, 1 ; >> 1
lea blockd, [blockq*3+16] ; 3 * dc + 16
sar blockd, 5 ; >> 5
INV_TRANS_INIT
INV_TRANS_PROCESS a
lea destq, [destq+linesizeq*4]
INV_TRANS_PROCESS a
RET

View File

@ -92,6 +92,14 @@ void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block);
void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block);
void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block);
void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block);
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
@ -130,6 +138,11 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmxext;
dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmxext;
dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_mmxext;
dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_mmxext;
dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_mmxext;
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;

View File

@ -481,208 +481,6 @@ DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block)
{
int dc = block[0];
dc = (17 * dc + 4) >> 3;
dc = (17 * dc + 64) >> 7;
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
__asm__ volatile(
"movd %0, %%mm2 \n\t"
"movd %1, %%mm3 \n\t"
"movd %2, %%mm4 \n\t"
"movd %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movd %%mm2, %0 \n\t"
"movd %%mm3, %1 \n\t"
"movd %%mm4, %2 \n\t"
"movd %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
}
static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block)
{
int dc = block[0];
dc = (17 * dc + 4) >> 3;
dc = (12 * dc + 64) >> 7;
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
__asm__ volatile(
"movd %0, %%mm2 \n\t"
"movd %1, %%mm3 \n\t"
"movd %2, %%mm4 \n\t"
"movd %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movd %%mm2, %0 \n\t"
"movd %%mm3, %1 \n\t"
"movd %%mm4, %2 \n\t"
"movd %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
dest += 4*linesize;
__asm__ volatile(
"movd %0, %%mm2 \n\t"
"movd %1, %%mm3 \n\t"
"movd %2, %%mm4 \n\t"
"movd %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movd %%mm2, %0 \n\t"
"movd %%mm3, %1 \n\t"
"movd %%mm4, %2 \n\t"
"movd %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
}
static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block)
{
int dc = block[0];
dc = ( 3 * dc + 1) >> 1;
dc = (17 * dc + 64) >> 7;
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
__asm__ volatile(
"movq %0, %%mm2 \n\t"
"movq %1, %%mm3 \n\t"
"movq %2, %%mm4 \n\t"
"movq %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movq %%mm2, %0 \n\t"
"movq %%mm3, %1 \n\t"
"movq %%mm4, %2 \n\t"
"movq %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
}
static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block)
{
int dc = block[0];
dc = (3 * dc + 1) >> 1;
dc = (3 * dc + 16) >> 5;
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
__asm__ volatile(
"movq %0, %%mm2 \n\t"
"movq %1, %%mm3 \n\t"
"movq %2, %%mm4 \n\t"
"movq %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movq %%mm2, %0 \n\t"
"movq %%mm3, %1 \n\t"
"movq %%mm4, %2 \n\t"
"movq %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
dest += 4*linesize;
__asm__ volatile(
"movq %0, %%mm2 \n\t"
"movq %1, %%mm3 \n\t"
"movq %2, %%mm4 \n\t"
"movq %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movq %%mm2, %0 \n\t"
"movq %%mm3, %1 \n\t"
"movq %%mm4, %2 \n\t"
"movq %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
}
#define FN_ASSIGN(OP, X, Y, INSN) \
dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
@ -729,10 +527,5 @@ av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
FN_ASSIGN(avg_, 3, 1, _mmxext);
FN_ASSIGN(avg_, 3, 2, _mmxext);
FN_ASSIGN(avg_, 3, 3, _mmxext);
dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */