Merge remote-tracking branch 'qatar/master'

* qatar/master:
  x86: PALIGNR: port to cpuflags
  x86: h264_qpel_10bit: port to cpuflags

Merged-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Michael Niedermayer 2012-11-10 12:44:27 +01:00
commit 2ce64413e2
6 changed files with 164 additions and 223 deletions

View File

@ -1063,10 +1063,8 @@ cglobal pred8x8l_top_dc_8, 4,4
%endmacro %endmacro
INIT_MMX mmxext INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC PRED8x8L_TOP_DC
INIT_MMX ssse3 INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC PRED8x8L_TOP_DC
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
@ -1168,10 +1166,8 @@ cglobal pred8x8l_dc_8, 4,5
%endmacro %endmacro
INIT_MMX mmxext INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC PRED8x8L_DC
INIT_MMX ssse3 INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC PRED8x8L_DC
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
@ -1241,10 +1237,8 @@ cglobal pred8x8l_horizontal_8, 4,4
%endmacro %endmacro
INIT_MMX mmxext INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL PRED8x8L_HORIZONTAL
INIT_MMX ssse3 INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL PRED8x8L_HORIZONTAL
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
@ -1293,10 +1287,8 @@ cglobal pred8x8l_vertical_8, 4,4
%endmacro %endmacro
INIT_MMX mmxext INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL PRED8x8L_VERTICAL
INIT_MMX ssse3 INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL PRED8x8L_VERTICAL
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
@ -1304,7 +1296,6 @@ PRED8x8L_VERTICAL
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_MMX mmxext INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_left_8, 4,5 cglobal pred8x8l_down_left_8, 4,5
sub r0, r3 sub r0, r3
movq mm0, [r0-8] movq mm0, [r0-8]
@ -1496,10 +1487,8 @@ INIT_XMM cpuname
%endmacro %endmacro
INIT_MMX sse2 INIT_MMX sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT PRED8x8L_DOWN_LEFT
INIT_MMX ssse3 INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT PRED8x8L_DOWN_LEFT
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
@ -1507,7 +1496,6 @@ PRED8x8L_DOWN_LEFT
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_MMX mmxext INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_right_8, 4,5 cglobal pred8x8l_down_right_8, 4,5
sub r0, r3 sub r0, r3
lea r4, [r0+r3*2] lea r4, [r0+r3*2]
@ -1750,10 +1738,8 @@ INIT_XMM cpuname
%endmacro %endmacro
INIT_MMX sse2 INIT_MMX sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT PRED8x8L_DOWN_RIGHT
INIT_MMX ssse3 INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT PRED8x8L_DOWN_RIGHT
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
@ -1761,7 +1747,6 @@ PRED8x8L_DOWN_RIGHT
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_MMX mmxext INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_vertical_right_8, 4,5 cglobal pred8x8l_vertical_right_8, 4,5
sub r0, r3 sub r0, r3
lea r4, [r0+r3*2] lea r4, [r0+r3*2]
@ -1980,10 +1965,8 @@ INIT_XMM cpuname
%endmacro %endmacro
INIT_MMX sse2 INIT_MMX sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT PRED8x8L_VERTICAL_RIGHT
INIT_MMX ssse3 INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT PRED8x8L_VERTICAL_RIGHT
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
@ -2071,10 +2054,8 @@ INIT_XMM cpuname
%endmacro %endmacro
INIT_MMX sse2 INIT_MMX sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_LEFT PRED8x8L_VERTICAL_LEFT
INIT_MMX ssse3 INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_LEFT PRED8x8L_VERTICAL_LEFT
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
@ -2160,10 +2141,8 @@ cglobal pred8x8l_horizontal_up_8, 4,4
%endmacro %endmacro
INIT_MMX mmxext INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP PRED8x8L_HORIZONTAL_UP
INIT_MMX ssse3 INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP PRED8x8L_HORIZONTAL_UP
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
@ -2171,7 +2150,6 @@ PRED8x8L_HORIZONTAL_UP
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_MMX mmxext INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_horizontal_down_8, 4,5 cglobal pred8x8l_horizontal_down_8, 4,5
sub r0, r3 sub r0, r3
lea r4, [r0+r3*2] lea r4, [r0+r3*2]
@ -2411,10 +2389,8 @@ INIT_XMM cpuname
%endmacro %endmacro
INIT_MMX sse2 INIT_MMX sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_DOWN PRED8x8L_HORIZONTAL_DOWN
INIT_MMX ssse3 INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN PRED8x8L_HORIZONTAL_DOWN
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
@ -2637,7 +2613,6 @@ cglobal pred4x4_horizontal_up_8, 3,3
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_MMX mmxext INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_horizontal_down_8, 3,3 cglobal pred4x4_horizontal_down_8, 3,3
sub r0, r2 sub r0, r2
lea r1, [r0+r2*2] lea r1, [r0+r2*2]
@ -2673,7 +2648,6 @@ cglobal pred4x4_horizontal_down_8, 3,3
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_MMX mmxext INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_vertical_right_8, 3,3 cglobal pred4x4_vertical_right_8, 3,3
sub r0, r2 sub r0, r2
lea r1, [r0+r2*2] lea r1, [r0+r2*2]
@ -2704,7 +2678,6 @@ cglobal pred4x4_vertical_right_8, 3,3
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_MMX mmxext INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_down_right_8, 3,3 cglobal pred4x4_down_right_8, 3,3
sub r0, r2 sub r0, r2
lea r1, [r0+r2*2] lea r1, [r0+r2*2]

View File

@ -79,10 +79,8 @@ cglobal pred4x4_down_right_10, 3, 3
%endmacro %endmacro
INIT_XMM sse2 INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED4x4_DR PRED4x4_DR
INIT_XMM ssse3 INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED4x4_DR PRED4x4_DR
%if HAVE_AVX_EXTERNAL %if HAVE_AVX_EXTERNAL
INIT_XMM avx INIT_XMM avx
@ -120,10 +118,8 @@ cglobal pred4x4_vertical_right_10, 3, 3, 6
%endmacro %endmacro
INIT_XMM sse2 INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED4x4_VR PRED4x4_VR
INIT_XMM ssse3 INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED4x4_VR PRED4x4_VR
%if HAVE_AVX_EXTERNAL %if HAVE_AVX_EXTERNAL
INIT_XMM avx INIT_XMM avx
@ -164,10 +160,8 @@ cglobal pred4x4_horizontal_down_10, 3, 3
%endmacro %endmacro
INIT_XMM sse2 INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED4x4_HD PRED4x4_HD
INIT_XMM ssse3 INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED4x4_HD PRED4x4_HD
%if HAVE_AVX_EXTERNAL %if HAVE_AVX_EXTERNAL
INIT_XMM avx INIT_XMM avx
@ -726,10 +720,8 @@ cglobal pred8x8l_horizontal_10, 4, 4, 5
%endmacro %endmacro
INIT_XMM sse2 INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL PRED8x8L_HORIZONTAL
INIT_XMM ssse3 INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL PRED8x8L_HORIZONTAL
%if HAVE_AVX_EXTERNAL %if HAVE_AVX_EXTERNAL
INIT_XMM avx INIT_XMM avx
@ -796,10 +788,8 @@ cglobal pred8x8l_down_left_10, 4, 4, 7
%endmacro %endmacro
INIT_XMM sse2 INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT PRED8x8L_DOWN_LEFT
INIT_XMM ssse3 INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT PRED8x8L_DOWN_LEFT
%if HAVE_AVX_EXTERNAL %if HAVE_AVX_EXTERNAL
INIT_XMM avx INIT_XMM avx
@ -872,10 +862,8 @@ cglobal pred8x8l_down_right_10, 4, 5, 8
%endmacro %endmacro
INIT_XMM sse2 INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3 INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT PRED8x8L_DOWN_RIGHT
%if HAVE_AVX_EXTERNAL %if HAVE_AVX_EXTERNAL
INIT_XMM avx INIT_XMM avx
@ -944,10 +932,8 @@ cglobal pred8x8l_vertical_right_10, 4, 5, 7
%endmacro %endmacro
INIT_XMM sse2 INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3 INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT PRED8x8L_VERTICAL_RIGHT
%if HAVE_AVX_EXTERNAL %if HAVE_AVX_EXTERNAL
INIT_XMM avx INIT_XMM avx
@ -1007,10 +993,8 @@ cglobal pred8x8l_horizontal_up_10, 4, 4, 6
%endmacro %endmacro
INIT_XMM sse2 INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3 INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP PRED8x8L_HORIZONTAL_UP
%if HAVE_AVX_EXTERNAL %if HAVE_AVX_EXTERNAL
INIT_XMM avx INIT_XMM avx

View File

@ -97,81 +97,73 @@ SECTION .text
%macro MC 1 %macro MC 1
%define OP_MOV mova %define OP_MOV mova
INIT_MMX INIT_MMX mmxext
%1 mmxext, put, 4 %1 put, 4
INIT_XMM INIT_XMM sse2
%1 sse2 , put, 8 %1 put, 8
%define OP_MOV AVG_MOV %define OP_MOV AVG_MOV
INIT_MMX INIT_MMX mmxext
%1 mmxext, avg, 4 %1 avg, 4
INIT_XMM INIT_XMM sse2
%1 sse2 , avg, 8 %1 avg, 8
%endmacro %endmacro
%macro MCAxA 8 %macro MCAxA_OP 7
%if ARCH_X86_64
%ifnidn %1,mmxext
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endif
%else
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endif
%endmacro
%macro MCAxA_OP 8
%if ARCH_X86_32 %if ARCH_X86_32
cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8 cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
call stub_%2_h264_qpel%4_%3_10_%1 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
mov r0, r0m mov r0, r0m
mov r1, r1m mov r1, r1m
add r0, %4*2 add r0, %3*2
add r1, %4*2 add r1, %3*2
call stub_%2_h264_qpel%4_%3_10_%1 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
mov r0, r0m mov r0, r0m
mov r1, r1m mov r1, r1m
lea r0, [r0+r2*%4] lea r0, [r0+r2*%3]
lea r1, [r1+r2*%4] lea r1, [r1+r2*%3]
call stub_%2_h264_qpel%4_%3_10_%1 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
mov r0, r0m mov r0, r0m
mov r1, r1m mov r1, r1m
lea r0, [r0+r2*%4+%4*2] lea r0, [r0+r2*%3+%3*2]
lea r1, [r1+r2*%4+%4*2] lea r1, [r1+r2*%3+%3*2]
call stub_%2_h264_qpel%4_%3_10_%1 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
RET RET
%else ; ARCH_X86_64 %else ; ARCH_X86_64
cglobal %2_h264_qpel%5_%3_10_%1, %6,%7 + 2,%8 cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
mov r%7, r0 mov r%6, r0
%assign p1 %7+1 %assign p1 %6+1
mov r %+ p1, r1 mov r %+ p1, r1
call stub_%2_h264_qpel%4_%3_10_%1 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
lea r0, [r%7+%4*2] lea r0, [r%6+%3*2]
lea r1, [r %+ p1+%4*2] lea r1, [r %+ p1+%3*2]
call stub_%2_h264_qpel%4_%3_10_%1 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
lea r0, [r%7+r2*%4] lea r0, [r%6+r2*%3]
lea r1, [r %+ p1+r2*%4] lea r1, [r %+ p1+r2*%3]
call stub_%2_h264_qpel%4_%3_10_%1 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
lea r0, [r%7+r2*%4+%4*2] lea r0, [r%6+r2*%3+%3*2]
lea r1, [r %+ p1+r2*%4+%4*2] lea r1, [r %+ p1+r2*%3+%3*2]
%if UNIX64 == 0 ; fall through to function %if UNIX64 == 0 ; fall through to function
call stub_%2_h264_qpel%4_%3_10_%1 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
RET RET
%endif %endif
%endif %endif
%endmacro %endmacro
;cpu, put/avg, mc, 4/8, ... ;cpu, put/avg, mc, 4/8, ...
%macro cglobal_mc 7 %macro cglobal_mc 6
%assign i %4*2 %assign i %3*2
MCAxA %1, %2, %3, %4, i, %5,%6,%7 %if ARCH_X86_32 || cpuflag(sse2)
MCAxA_OP %1, %2, %3, i, %4,%5,%6
%endif
cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7 cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64 %if UNIX64 == 0 ; no prologue or epilogue for UNIX64
call stub_%2_h264_qpel%4_%3_10_%1 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
RET RET
%endif %endif
stub_%2_h264_qpel%4_%3_10_%1: stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
%endmacro %endmacro
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
@ -189,14 +181,14 @@ stub_%2_h264_qpel%4_%3_10_%1:
%endmacro %endmacro
%macro MC00 1 %macro MC00 1
INIT_MMX INIT_MMX mmxext
cglobal_mc mmxext, %1, mc00, 4, 3,4,0 cglobal_mc %1, mc00, 4, 3,4,0
lea r3, [r2*3] lea r3, [r2*3]
COPY4 COPY4
ret ret
INIT_XMM INIT_XMM sse2
cglobal %1_h264_qpel8_mc00_10_sse2, 3,4 cglobal %1_h264_qpel8_mc00_10, 3,4
lea r3, [r2*3] lea r3, [r2*3]
COPY4 COPY4
lea r0, [r0+r2*4] lea r0, [r0+r2*4]
@ -204,7 +196,7 @@ cglobal %1_h264_qpel8_mc00_10_sse2, 3,4
COPY4 COPY4
RET RET
cglobal %1_h264_qpel16_mc00_10_sse2, 3,4 cglobal %1_h264_qpel16_mc00_10, 3,4
mov r3d, 8 mov r3d, 8
.loop: .loop:
movu m0, [r1 ] movu m0, [r1 ]
@ -233,29 +225,29 @@ MC00 avg
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro MC_CACHE 1 %macro MC_CACHE 1
%define OP_MOV mova %define OP_MOV mova
%define PALIGNR PALIGNR_MMX INIT_MMX mmxext
INIT_MMX %1 put, 4
%1 mmxext , put, 4 INIT_XMM sse2, cache64
INIT_XMM %1 put, 8
%1 sse2_cache64 , put, 8 INIT_XMM ssse3, cache64
%define PALIGNR PALIGNR_SSSE3 %1 put, 8
%1 ssse3_cache64, put, 8 INIT_XMM sse2
%1 sse2 , put, 8, 0 %1 put, 8, 0
%define OP_MOV AVG_MOV %define OP_MOV AVG_MOV
%define PALIGNR PALIGNR_MMX INIT_MMX mmxext
INIT_MMX %1 avg, 4
%1 mmxext , avg, 4 INIT_XMM sse2, cache64
INIT_XMM %1 avg, 8
%1 sse2_cache64 , avg, 8 INIT_XMM ssse3, cache64
%define PALIGNR PALIGNR_SSSE3 %1 avg, 8
%1 ssse3_cache64, avg, 8 INIT_XMM sse2
%1 sse2 , avg, 8, 0 %1 avg, 8, 0
%endmacro %endmacro
%macro MC20 3-4 %macro MC20 2-3
cglobal_mc %1, %2, mc20, %3, 3,4,9 cglobal_mc %1, mc20, %2, 3,4,9
mov r3d, %3 mov r3d, %2
mova m1, [pw_pixel_max] mova m1, [pw_pixel_max]
%if num_mmregs > 8 %if num_mmregs > 8
mova m8, [pw_16] mova m8, [pw_16]
@ -315,10 +307,10 @@ MC_CACHE MC20
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro MC30 3-4 %macro MC30 2-3
cglobal_mc %1, %2, mc30, %3, 3,5,9 cglobal_mc %1, mc30, %2, 3,5,9
lea r4, [r1+2] lea r4, [r1+2]
jmp stub_%2_h264_qpel%3_mc10_10_%1.body jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
%endmacro %endmacro
MC_CACHE MC30 MC_CACHE MC30
@ -326,11 +318,11 @@ MC_CACHE MC30
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro MC10 3-4 %macro MC10 2-3
cglobal_mc %1, %2, mc10, %3, 3,5,9 cglobal_mc %1, mc10, %2, 3,5,9
mov r4, r1 mov r4, r1
.body: .body:
mov r3d, %3 mov r3d, %2
mova m1, [pw_pixel_max] mova m1, [pw_pixel_max]
%if num_mmregs > 8 %if num_mmregs > 8
mova m8, [pw_16] mova m8, [pw_16]
@ -393,8 +385,8 @@ MC_CACHE MC10
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro V_FILT 11 %macro V_FILT 10
v_filt%9_%10_10_%11: v_filt%9_%10_10
add r4, r2 add r4, r2
.no_addr4: .no_addr4:
FILT_V m0, m1, m2, m3, m4, m5, m6, m7 FILT_V m0, m1, m2, m3, m4, m5, m6, m7
@ -403,33 +395,33 @@ v_filt%9_%10_10_%11:
ret ret
%endmacro %endmacro
INIT_MMX INIT_MMX mmxext
RESET_MM_PERMUTATION RESET_MM_PERMUTATION
%assign i 0 %assign i 0
%rep 4 %rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
SWAP 0,1,2,3,4,5 SWAP 0,1,2,3,4,5
%assign i i+1 %assign i i+1
%endrep %endrep
INIT_XMM INIT_XMM sse2
RESET_MM_PERMUTATION RESET_MM_PERMUTATION
%assign i 0 %assign i 0
%rep 6 %rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2 V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
SWAP 0,1,2,3,4,5 SWAP 0,1,2,3,4,5
%assign i i+1 %assign i i+1
%endrep %endrep
%macro MC02 3 %macro MC02 2
cglobal_mc %1, %2, mc02, %3, 3,4,8 cglobal_mc %1, mc02, %2, 3,4,8
PRELOAD_V PRELOAD_V
sub r0, r2 sub r0, r2
%assign j 0 %assign j 0
%rep %3 %rep %2
%assign i (j % 6) %assign i (j % 6)
call v_filt%3_ %+ i %+ _10_%1.no_addr4 call v_filt%2_ %+ i %+ _10.no_addr4
OP_MOV [r0], m0 OP_MOV [r0], m0
SWAP 0,1,2,3,4,5 SWAP 0,1,2,3,4,5
%assign j j+1 %assign j j+1
@ -442,8 +434,8 @@ MC MC02
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro MC01 3 %macro MC01 2
cglobal_mc %1, %2, mc01, %3, 3,5,8 cglobal_mc %1, mc01, %2, 3,5,8
mov r4, r1 mov r4, r1
.body: .body:
PRELOAD_V PRELOAD_V
@ -451,9 +443,9 @@ cglobal_mc %1, %2, mc01, %3, 3,5,8
sub r4, r2 sub r4, r2
sub r0, r2 sub r0, r2
%assign j 0 %assign j 0
%rep %3 %rep %2
%assign i (j % 6) %assign i (j % 6)
call v_filt%3_ %+ i %+ _10_%1 call v_filt%2_ %+ i %+ _10
movu m7, [r4] movu m7, [r4]
pavgw m0, m7 pavgw m0, m7
OP_MOV [r0], m0 OP_MOV [r0], m0
@ -468,10 +460,10 @@ MC MC01
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro MC03 3 %macro MC03 2
cglobal_mc %1, %2, mc03, %3, 3,5,8 cglobal_mc %1, mc03, %2, 3,5,8
lea r4, [r1+r2] lea r4, [r1+r2]
jmp stub_%2_h264_qpel%3_mc01_10_%1.body jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
%endmacro %endmacro
MC MC03 MC MC03
@ -479,8 +471,8 @@ MC MC03
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro H_FILT_AVG 3-4 %macro H_FILT_AVG 2-3
h_filt%2_%3_10_%1: h_filt%1_%2_10:
;FILT_H with fewer registers and averaged with the FILT_V result ;FILT_H with fewer registers and averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory ;unfortunately I need three registers, so m5 will have to be re-read from memory
@ -507,32 +499,32 @@ h_filt%2_%3_10_%1:
ret ret
%endmacro %endmacro
INIT_MMX INIT_MMX mmxext
RESET_MM_PERMUTATION RESET_MM_PERMUTATION
%assign i 0 %assign i 0
%rep 3 %rep 3
H_FILT_AVG mmxext, 4, i H_FILT_AVG 4, i
SWAP 0,1,2,3,4,5 SWAP 0,1,2,3,4,5
%assign i i+1 %assign i i+1
%endrep %endrep
H_FILT_AVG mmxext, 4, i, 0 H_FILT_AVG 4, i, 0
INIT_XMM INIT_XMM sse2
RESET_MM_PERMUTATION RESET_MM_PERMUTATION
%assign i 0 %assign i 0
%rep 6 %rep 6
%if i==1 %if i==1
H_FILT_AVG sse2, 8, i, 0 H_FILT_AVG 8, i, 0
%else %else
H_FILT_AVG sse2, 8, i H_FILT_AVG 8, i
%endif %endif
SWAP 0,1,2,3,4,5 SWAP 0,1,2,3,4,5
%assign i i+1 %assign i i+1
%endrep %endrep
%macro MC11 3 %macro MC11 2
; this REALLY needs x86_64 ; this REALLY needs x86_64
cglobal_mc %1, %2, mc11, %3, 3,6,8 cglobal_mc %1, mc11, %2, 3,6,8
mov r4, r1 mov r4, r1
.body: .body:
PRELOAD_V PRELOAD_V
@ -542,11 +534,11 @@ cglobal_mc %1, %2, mc11, %3, 3,6,8
mov r5, r2 mov r5, r2
neg r5 neg r5
%assign j 0 %assign j 0
%rep %3 %rep %2
%assign i (j % 6) %assign i (j % 6)
call v_filt%3_ %+ i %+ _10_%1 call v_filt%2_ %+ i %+ _10
call h_filt%3_ %+ i %+ _10_%1 call h_filt%2_ %+ i %+ _10
%if %3==8 && i==1 %if %2==8 && i==1
movu m5, [r1+r5] movu m5, [r1+r5]
%endif %endif
OP_MOV [r0], m0 OP_MOV [r0], m0
@ -561,11 +553,11 @@ MC MC11
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro MC31 3 %macro MC31 2
cglobal_mc %1, %2, mc31, %3, 3,6,8 cglobal_mc %1, mc31, %2, 3,6,8
mov r4, r1 mov r4, r1
add r1, 2 add r1, 2
jmp stub_%2_h264_qpel%3_mc11_10_%1.body jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro %endmacro
MC MC31 MC MC31
@ -573,10 +565,10 @@ MC MC31
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro MC13 3 %macro MC13 2
cglobal_mc %1, %2, mc13, %3, 3,7,12 cglobal_mc %1, mc13, %2, 3,7,12
lea r4, [r1+r2] lea r4, [r1+r2]
jmp stub_%2_h264_qpel%3_mc11_10_%1.body jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro %endmacro
MC MC13 MC MC13
@ -584,11 +576,11 @@ MC MC13
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro MC33 3 %macro MC33 2
cglobal_mc %1, %2, mc33, %3, 3,6,8 cglobal_mc %1, mc33, %2, 3,6,8
lea r4, [r1+r2] lea r4, [r1+r2]
add r1, 2 add r1, 2
jmp stub_%2_h264_qpel%3_mc11_10_%1.body jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro %endmacro
MC MC33 MC MC33
@ -615,15 +607,15 @@ MC MC33
FILT_H2 %1, %7, %8 FILT_H2 %1, %7, %8
%endmacro %endmacro
%macro HV 2 %macro HV 1
%ifidn %1,sse2 %if mmsize==16
%define PAD 12 %define PAD 12
%define COUNT 2 %define COUNT 2
%else %else
%define PAD 4 %define PAD 4
%define COUNT 3 %define COUNT 3
%endif %endif
put_hv%2_10_%1: put_hv%1_10:
neg r2 ; This actually saves instructions neg r2 ; This actually saves instructions
lea r1, [r1+r2*2-mmsize+PAD] lea r1, [r1+r2*2-mmsize+PAD]
lea r4, [rsp+PAD+gprsize] lea r4, [rsp+PAD+gprsize]
@ -640,7 +632,7 @@ put_hv%2_10_%1:
movu m4, [r1] movu m4, [r1]
sub r1, r2 sub r1, r2
%assign i 0 %assign i 0
%rep %2-1 %rep %1-1
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7 FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
psubw m0, [pad20] psubw m0, [pad20]
movu [r4+i*mmsize*3], m0 movu [r4+i*mmsize*3], m0
@ -653,7 +645,7 @@ put_hv%2_10_%1:
movu [r4+i*mmsize*3], m0 movu [r4+i*mmsize*3], m0
add r4, mmsize add r4, mmsize
lea r1, [r1+r2*8+mmsize] lea r1, [r1+r2*8+mmsize]
%if %2==8 %if %1==8
lea r1, [r1+r2*4] lea r1, [r1+r2*4]
%endif %endif
dec r3d dec r3d
@ -662,12 +654,12 @@ put_hv%2_10_%1:
ret ret
%endmacro %endmacro
INIT_MMX INIT_MMX mmxext
HV mmxext, 4 HV 4
INIT_XMM INIT_XMM sse2
HV sse2 , 8 HV 8
%macro H_LOOP 2 %macro H_LOOP 1
%if num_mmregs > 8 %if num_mmregs > 8
%define s1 m8 %define s1 m8
%define s2 m9 %define s2 m9
@ -679,7 +671,7 @@ HV sse2 , 8
%define s3 [tap3] %define s3 [tap3]
%define d1 [depad] %define d1 [depad]
%endif %endif
h%2_loop_op_%1: h%1_loop_op:
movu m1, [r1+mmsize-4] movu m1, [r1+mmsize-4]
movu m2, [r1+mmsize-2] movu m2, [r1+mmsize-2]
mova m3, [r1+mmsize+0] mova m3, [r1+mmsize+0]
@ -726,21 +718,21 @@ h%2_loop_op_%1:
ret ret
%endmacro %endmacro
INIT_MMX INIT_MMX mmxext
H_LOOP mmxext, 4 H_LOOP 4
INIT_XMM INIT_XMM sse2
H_LOOP sse2 , 8 H_LOOP 8
%macro MC22 3 %macro MC22 2
cglobal_mc %1, %2, mc22, %3, 3,7,12 cglobal_mc %1, mc22, %2, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD sub rsp, PAD
call put_hv%3_10_%1 call put_hv%2_10
mov r3d, %3 mov r3d, %2
mova m7, [pw_pixel_max] mova m7, [pw_pixel_max]
%if num_mmregs > 8 %if num_mmregs > 8
pxor m0, m0 pxor m0, m0
@ -751,7 +743,7 @@ cglobal_mc %1, %2, mc22, %3, 3,7,12
%endif %endif
mov r1, rsp mov r1, rsp
.h_loop: .h_loop:
call h%3_loop_op_%1 call h%2_loop_op
OP_MOV [r0], m1 OP_MOV [r0], m1
add r0, r2 add r0, r2
@ -767,18 +759,18 @@ MC MC22
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro MC12 3 %macro MC12 2
cglobal_mc %1, %2, mc12, %3, 3,7,12 cglobal_mc %1, mc12, %2, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD sub rsp, PAD
call put_hv%3_10_%1 call put_hv%2_10
xor r4d, r4d xor r4d, r4d
.body: .body:
mov r3d, %3 mov r3d, %2
pxor m0, m0 pxor m0, m0
mova m7, [pw_pixel_max] mova m7, [pw_pixel_max]
%if num_mmregs > 8 %if num_mmregs > 8
@ -789,7 +781,7 @@ cglobal_mc %1, %2, mc12, %3, 3,7,12
%endif %endif
mov r1, rsp mov r1, rsp
.h_loop: .h_loop:
call h%3_loop_op_%1 call h%2_loop_op
movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
paddw m3, [depad2] paddw m3, [depad2]
@ -812,17 +804,17 @@ MC MC12
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro MC32 3 %macro MC32 2
cglobal_mc %1, %2, mc32, %3, 3,7,12 cglobal_mc %1, mc32, %2, 3,7,12
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) %define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD sub rsp, PAD
call put_hv%3_10_%1 call put_hv%2_10
mov r4d, 2 ; sizeof(pixel) mov r4d, 2 ; sizeof(pixel)
jmp stub_%2_h264_qpel%3_mc12_10_%1.body jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro %endmacro
MC MC32 MC MC32
@ -830,10 +822,10 @@ MC MC32
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro H_NRD 2 %macro H_NRD 1
put_h%2_10_%1: put_h%1_10:
add rsp, gprsize add rsp, gprsize
mov r3d, %2 mov r3d, %1
xor r4d, r4d xor r4d, r4d
mova m6, [pad20] mova m6, [pad20]
.nextrow: .nextrow:
@ -855,13 +847,13 @@ put_h%2_10_%1:
ret ret
%endmacro %endmacro
INIT_MMX INIT_MMX mmxext
H_NRD mmxext, 4 H_NRD 4
INIT_XMM INIT_XMM sse2
H_NRD sse2 , 8 H_NRD 8
%macro MC21 3 %macro MC21 2
cglobal_mc %1, %2, mc21, %3, 3,7,12 cglobal_mc %1, mc21, %2, 3,7,12
mov r5, r1 mov r5, r1
.body: .body:
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) %define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
@ -869,13 +861,13 @@ cglobal_mc %1, %2, mc21, %3, 3,7,12
and rsp, ~(mmsize-1) ; align stack and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD sub rsp, PAD
call put_h%3_10_%1 call put_h%2_10
sub rsp, PAD sub rsp, PAD
call put_hv%3_10_%1 call put_hv%2_10
mov r4d, PAD-mmsize ; H buffer mov r4d, PAD-mmsize ; H buffer
jmp stub_%2_h264_qpel%3_mc12_10_%1.body jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro %endmacro
MC MC21 MC MC21
@ -883,10 +875,10 @@ MC MC21
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride) ; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro MC23 3 %macro MC23 2
cglobal_mc %1, %2, mc23, %3, 3,7,12 cglobal_mc %1, mc23, %2, 3,7,12
lea r5, [r1+r2] lea r5, [r1+r2]
jmp stub_%2_h264_qpel%3_mc21_10_%1.body jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
%endmacro %endmacro
MC MC23 MC MC23

View File

@ -919,10 +919,8 @@ cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
REP_RET REP_RET
%endmacro %endmacro
%define PALIGNR PALIGNR_MMX
INIT_XMM sse2 INIT_XMM sse2
CONV_S16_TO_S16P_6CH CONV_S16_TO_S16P_6CH
%define PALIGNR PALIGNR_SSSE3
INIT_XMM ssse3 INIT_XMM ssse3
CONV_S16_TO_S16P_6CH CONV_S16_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL %if HAVE_AVX_EXTERNAL
@ -1038,10 +1036,8 @@ cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
REP_RET REP_RET
%endmacro %endmacro
%define PALIGNR PALIGNR_MMX
INIT_XMM sse2 INIT_XMM sse2
CONV_S16_TO_FLTP_6CH CONV_S16_TO_FLTP_6CH
%define PALIGNR PALIGNR_SSSE3
INIT_XMM ssse3 INIT_XMM ssse3
CONV_S16_TO_FLTP_6CH CONV_S16_TO_FLTP_6CH
INIT_XMM sse4 INIT_XMM sse4
@ -1160,10 +1156,8 @@ cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
REP_RET REP_RET
%endmacro %endmacro
%define PALIGNR PALIGNR_MMX
INIT_XMM sse2 INIT_XMM sse2
CONV_FLT_TO_S16P_6CH CONV_FLT_TO_S16P_6CH
%define PALIGNR PALIGNR_SSSE3
INIT_XMM ssse3 INIT_XMM ssse3
CONV_FLT_TO_S16P_6CH CONV_FLT_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL %if HAVE_AVX_EXTERNAL

View File

@ -280,7 +280,14 @@
%endif %endif
%endmacro %endmacro
%macro PALIGNR_MMX 4-5 ; [dst,] src1, src2, imm, tmp %macro PALIGNR 4-5
%if cpuflag(ssse3)
%if %0==5
palignr %1, %2, %3, %4
%else
palignr %1, %2, %3
%endif
%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp
%define %%dst %1 %define %%dst %1
%if %0==5 %if %0==5
%ifnidn %1, %2 %ifnidn %1, %2
@ -299,13 +306,6 @@
psrldq %4, %3 psrldq %4, %3
%endif %endif
por %%dst, %4 por %%dst, %4
%endmacro
%macro PALIGNR_SSSE3 4-5
%if %0==5
palignr %1, %2, %3, %4
%else
palignr %1, %2, %3
%endif %endif
%endmacro %endmacro

View File

@ -246,7 +246,6 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%endif ; %1 == 8/9/10/16 %endif ; %1 == 8/9/10/16
%endmacro %endmacro
%define PALIGNR PALIGNR_MMX
%if ARCH_X86_32 %if ARCH_X86_32
INIT_MMX mmx2 INIT_MMX mmx2
yuv2planeX_fn 8, 0, 7 yuv2planeX_fn 8, 0, 7
@ -259,7 +258,6 @@ yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5 yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5 yuv2planeX_fn 10, 7, 5
%define PALIGNR PALIGNR_SSSE3
INIT_XMM sse4 INIT_XMM sse4
yuv2planeX_fn 8, 10, 7 yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5 yuv2planeX_fn 9, 7, 5
@ -346,7 +344,7 @@ cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
%if mmsize == 16 %if mmsize == 16
punpcklqdq m3, m3 punpcklqdq m3, m3
%endif ; mmsize == 16 %endif ; mmsize == 16
PALIGNR_MMX m3, m3, 3, m2 PALIGNR m3, m3, 3, m2
.no_rot: .no_rot:
%if mmsize == 8 %if mmsize == 8
mova m2, m3 mova m2, m3