vp3: don't use calls to inline asm in yasm code.

Mixing yasm and inline asm is a bad idea, since if either yasm or inline
asm is not supported by your toolchain, all of the asm stops working.
Thus, better to use either one or the other alone.

Signed-off-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
This commit is contained in:
Ronald S. Bultje 2012-07-22 20:38:56 +00:00 committed by Derek Buitenhuis
parent 79195ce565
commit a1878a88a1
1 changed file with 81 additions and 43 deletions

View File

@ -38,13 +38,11 @@ cextern pb_1
cextern pb_3 cextern pb_3
cextern pb_7 cextern pb_7
cextern pb_1F cextern pb_1F
cextern pb_80
cextern pb_81 cextern pb_81
cextern pw_8 cextern pw_8
cextern put_signed_pixels_clamped_mmx
cextern add_pixels_clamped_mmx
SECTION .text SECTION .text
; this is off by one or two for some cases when filter_limit is greater than 63 ; this is off by one or two for some cases when filter_limit is greater than 63
@ -523,56 +521,96 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%endmacro %endmacro
%macro vp3_idct_funcs 3 %macro vp3_idct_funcs 1
cglobal vp3_idct_put_%1, 3, %3, %2 cglobal vp3_idct_put_%1, 3, 4, 9
VP3_IDCT_%1 r2 VP3_IDCT_%1 r2
%if ARCH_X86_64
mov r3, r2
mov r2, r1
mov r1, r0
mov r0, r3
%else
mov r0m, r2
mov r1m, r0
mov r2m, r1
%endif
%if WIN64
call put_signed_pixels_clamped_mmx
RET
%else
jmp put_signed_pixels_clamped_mmx
%endif
cglobal vp3_idct_add_%1, 3, %3, %2 movsxdifnidn r1, r1d
VP3_IDCT_%1 r2 mova m4, [pb_80]
%if ARCH_X86_64 lea r3, [r1*3]
mov r3, r2 %assign %%i 0
mov r2, r1 %rep 16/mmsize
mov r1, r0 mova m0, [r2+mmsize*0+%%i]
mov r0, r3 mova m1, [r2+mmsize*2+%%i]
mova m2, [r2+mmsize*4+%%i]
mova m3, [r2+mmsize*6+%%i]
packsswb m0, [r2+mmsize*1+%%i]
packsswb m1, [r2+mmsize*3+%%i]
packsswb m2, [r2+mmsize*5+%%i]
packsswb m3, [r2+mmsize*7+%%i]
paddb m0, m4
paddb m1, m4
paddb m2, m4
paddb m3, m4
movq [r0 ], m0
%if mmsize == 8
movq [r0+r1 ], m1
movq [r0+r1*2], m2
movq [r0+r3 ], m3
%else %else
mov r0m, r2 movhps [r0+r1 ], m0
mov r1m, r0 movq [r0+r1*2], m1
mov r2m, r1 movhps [r0+r3 ], m1
%endif %endif
%if WIN64 %if %%i == 0
call add_pixels_clamped_mmx lea r0, [r0+r1*4]
%endif
%if mmsize == 16
movq [r0 ], m2
movhps [r0+r1 ], m2
movq [r0+r1*2], m3
movhps [r0+r3 ], m3
%endif
%assign %%i %%i+64
%endrep
RET RET
%else
jmp add_pixels_clamped_mmx cglobal vp3_idct_add_%1, 3, 4, 9
VP3_IDCT_%1 r2
mov r3, 4
pxor m4, m4
movsxdifnidn r1, r1d
.loop:
movq m0, [r0]
movq m1, [r0+r1]
%if mmsize == 8
mova m2, m0
mova m3, m1
%endif %endif
punpcklbw m0, m4
punpcklbw m1, m4
%if mmsize == 8
punpckhbw m2, m4
punpckhbw m3, m4
%endif
paddsw m0, [r2+ 0]
paddsw m1, [r2+16]
%if mmsize == 8
paddsw m2, [r2+ 8]
paddsw m3, [r2+24]
packuswb m0, m2
packuswb m1, m3
%else ; mmsize == 16
packuswb m0, m1
%endif
movq [r0 ], m0
%if mmsize == 8
movq [r0+r1], m1
%else ; mmsize == 16
movhps [r0+r1], m0
%endif
lea r0, [r0+r1*2]
add r2, 32
dec r3
jg .loop
RET
%endmacro %endmacro
%if ARCH_X86_64
%define REGS 4
%else
%define REGS 3
%endif
INIT_MMX INIT_MMX
vp3_idct_funcs mmx, 0, REGS vp3_idct_funcs mmx
INIT_XMM INIT_XMM
vp3_idct_funcs sse2, 9, REGS vp3_idct_funcs sse2
%undef REGS
%macro DC_ADD 0 %macro DC_ADD 0
movq m2, [r0 ] movq m2, [r0 ]