diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 75de5690a1..1c59e884ed 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -840,25 +840,6 @@ cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
     jg .nextrow
     REP_RET
 
-%if ARCH_X86_32
-INIT_MMX mmx
-cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
-.nextrow:
-    movq  mm0, [srcq+srcstrideq*0+0]
-    movq  mm1, [srcq+srcstrideq*0+8]
-    movq  mm2, [srcq+srcstrideq*1+0]
-    movq  mm3, [srcq+srcstrideq*1+8]
-    lea   srcq, [srcq+srcstrideq*2]
-    movq [dstq+dststrideq*0+0], mm0
-    movq [dstq+dststrideq*0+8], mm1
-    movq [dstq+dststrideq*1+0], mm2
-    movq [dstq+dststrideq*1+8], mm3
-    lea   dstq, [dstq+dststrideq*2]
-    sub   heightd, 2
-    jg .nextrow
-    REP_RET
-%endif
-
 INIT_XMM sse
 cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
 .nextrow:
@@ -895,32 +876,6 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
     %4 [dst2q+strideq+%3], m5
 %endmacro
 
-%if ARCH_X86_32
-INIT_MMX mmx
-cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
-    ; load data
-    movd       m0, [blockq]
-
-    ; calculate DC
-    paddw      m0, [pw_4]
-    pxor       m1, m1
-    psraw      m0, 3
-    movd [blockq], m1
-    psubw      m1, m0
-    packuswb   m0, m0
-    packuswb   m1, m1
-    punpcklbw  m0, m0
-    punpcklbw  m1, m1
-    punpcklwd  m0, m0
-    punpcklwd  m1, m1
-
-    ; add DC
-    DEFINE_ARGS dst1, dst2, stride
-    lea     dst2q, [dst1q+strideq*2]
-    ADD_DC     m0, m1, 0, movh
-    RET
-%endif
-
 %macro VP8_IDCT_DC_ADD 0
 cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
     ; load data
@@ -971,44 +926,6 @@ VP8_IDCT_DC_ADD
 ; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
 ;-----------------------------------------------------------------------------
 
-%if ARCH_X86_32
-INIT_MMX mmx
-cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
-    ; load data
-    movd      m0, [blockq+32*0] ; A
-    movd      m1, [blockq+32*2] ; C
-    punpcklwd m0, [blockq+32*1] ; A B
-    punpcklwd m1, [blockq+32*3] ; C D
-    punpckldq m0, m1            ; A B C D
-    pxor      m6, m6
-
-    ; calculate DC
-    paddw     m0, [pw_4]
-    movd [blockq+32*0], m6
-    movd [blockq+32*1], m6
-    movd [blockq+32*2], m6
-    movd [blockq+32*3], m6
-    psraw     m0, 3
-    psubw     m6, m0
-    packuswb  m0, m0
-    packuswb  m6, m6
-    punpcklbw m0, m0            ; AABBCCDD
-    punpcklbw m6, m6            ; AABBCCDD
-    movq      m1, m0
-    movq      m7, m6
-    punpcklbw m0, m0            ; AAAABBBB
-    punpckhbw m1, m1            ; CCCCDDDD
-    punpcklbw m6, m6            ; AAAABBBB
-    punpckhbw m7, m7            ; CCCCDDDD
-
-    ; add DC
-    DEFINE_ARGS dst1, dst2, stride
-    lea    dst2q, [dst1q+strideq*2]
-    ADD_DC    m0, m6, 0, mova
-    ADD_DC    m1, m7, 8, mova
-    RET
-%endif
-
 INIT_XMM sse2
 cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
     ; load data
@@ -1117,7 +1034,7 @@ cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
     SWAP                 %4,  %3
 %endmacro
 
-%macro VP8_IDCT_ADD 0
+INIT_MMX sse
 cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
     ; load block data
     movq         m0, [blockq+ 0]
@@ -1126,17 +1043,9 @@ cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
     movq         m1, [blockq+ 8]
     movq         m2, [blockq+16]
     movq         m3, [blockq+24]
     movq         m6, [pw_20091]
     movq         m7, [pw_17734]
-%if cpuflag(sse)
     xorps      xmm0, xmm0
     movaps [blockq+ 0], xmm0
     movaps [blockq+16], xmm0
-%else
-    pxor         m4, m4
-    movq [blockq+ 0], m4
-    movq [blockq+ 8], m4
-    movq [blockq+16], m4
-    movq [blockq+24], m4
-%endif
 
     ; actual IDCT
     VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
@@ -1153,14 +1062,6 @@ cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
     STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
     RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-VP8_IDCT_ADD
-%endif
-INIT_MMX sse
-VP8_IDCT_ADD
 
 ;-----------------------------------------------------------------------------
 ; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
@@ -1193,23 +1094,15 @@ VP8_IDCT_ADD
     SWAP %1, %4, %3
 %endmacro
 
-%macro VP8_DC_WHT 0
+INIT_MMX sse
 cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
     movq          m0, [dc1q]
     movq          m1, [dc1q+8]
     movq          m2, [dc1q+16]
     movq          m3, [dc1q+24]
-%if cpuflag(sse)
     xorps       xmm0, xmm0
     movaps [dc1q+ 0], xmm0
     movaps [dc1q+16], xmm0
-%else
-    pxor          m4, m4
-    movq  [dc1q+ 0], m4
-    movq  [dc1q+ 8], m4
-    movq  [dc1q+16], m4
-    movq  [dc1q+24], m4
-%endif
     HADAMARD4_1D  0, 1, 2, 3
     TRANSPOSE4x4W 0, 1, 2, 3, 4
     paddw         m0, [pw_3]
@@ -1221,11 +1114,3 @@ cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
     SCATTER_WHT   0, 1, 0
     SCATTER_WHT   2, 3, 2
     RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-VP8_DC_WHT
-%endif
-INIT_MMX sse
-VP8_DC_WHT
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index 0238898e58..cceb346ced 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -112,9 +112,6 @@ void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
 void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
                              uint8_t *src, ptrdiff_t srcstride,
                              int height, int mx, int my);
-void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
-                             uint8_t *src, ptrdiff_t srcstride,
-                             int height, int mx, int my);
 void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
                              uint8_t *src, ptrdiff_t srcstride,
                              int height, int mx, int my);
@@ -140,19 +137,6 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
             dst + 4, dststride, src + 4, srcstride, height, mx, my);         \
 }
 
-#if ARCH_X86_32
-TAP_W8 (mmxext, epel, h4)
-TAP_W8 (mmxext, epel, h6)
-TAP_W16(mmxext, epel, h6)
-TAP_W8 (mmxext, epel, v4)
-TAP_W8 (mmxext, epel, v6)
-TAP_W16(mmxext, epel, v6)
-TAP_W8 (mmxext, bilinear, h)
-TAP_W16(mmxext, bilinear, h)
-TAP_W8 (mmxext, bilinear, v)
-TAP_W16(mmxext, bilinear, v)
-#endif
-
 TAP_W16(sse2,  epel, h6)
 TAP_W16(sse2,  epel, v6)
 TAP_W16(sse2,  bilinear, h)
@@ -177,16 +161,8 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
                                       dst, dststride, tmpptr, SIZE, height, mx, my); \
 }
 
-#if ARCH_X86_32
-#define HVTAPMMX(x, y)                                                        \
-HVTAP(mmxext, 8, x, y,  4,  8)                                                \
-HVTAP(mmxext, 8, x, y,  8, 16)
-
-HVTAP(mmxext, 8, 6, 6, 16, 16)
-#else
 #define HVTAPMMX(x, y)                                                        \
 HVTAP(mmxext, 8, x, y,  4,  8)
-#endif
 
 HVTAPMMX(4, 4)
 HVTAPMMX(4, 6)
@@ -221,31 +197,21 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
 }
 
 HVBILIN(mmxext,  8,  4,  8)
-#if ARCH_X86_32
-HVBILIN(mmxext,  8,  8, 16)
-HVBILIN(mmxext,  8, 16, 16)
-#endif
 HVBILIN(sse2,    8,  8, 16)
 HVBILIN(sse2,    8, 16, 16)
 HVBILIN(ssse3,   8,  4,  8)
 HVBILIN(ssse3,   8,  8, 16)
 HVBILIN(ssse3,   8, 16, 16)
 
-void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
-                            ptrdiff_t stride);
 void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16],
                              ptrdiff_t stride);
 void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
                              ptrdiff_t stride);
-void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
-                              ptrdiff_t stride);
 void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
                                ptrdiff_t stride);
 void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
                                ptrdiff_t stride);
-void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
 void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
-void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
 void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
 
 #define DECLARE_LOOP_FILTER(NAME)                                             \
@@ -284,8 +250,6 @@ void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
                                              ptrdiff_t s,                    \
                                              int e, int i, int hvt);
 
-DECLARE_LOOP_FILTER(mmx)
-DECLARE_LOOP_FILTER(mmxext)
 DECLARE_LOOP_FILTER(sse2)
 DECLARE_LOOP_FILTER(ssse3)
 DECLARE_LOOP_FILTER(sse4)
@@ -322,10 +286,6 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMX(cpu_flags)) {
-#if ARCH_X86_32
-        c->put_vp8_epel_pixels_tab[0][0][0]     =
-        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
-#endif
         c->put_vp8_epel_pixels_tab[1][0][0]     =
         c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
     }
@@ -335,12 +295,6 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         VP8_MC_FUNC(2, 4, mmxext);
         VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
-#if ARCH_X86_32
-        VP8_LUMA_MC_FUNC(0, 16, mmxext);
-        VP8_MC_FUNC(1, 8, mmxext);
-        VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
-        VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
-#endif
     }
 
     if (EXTERNAL_SSE(cpu_flags)) {
@@ -373,44 +327,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMX(cpu_flags)) {
         c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
-#if ARCH_X86_32
-        c->vp8_idct_dc_add            = ff_vp8_idct_dc_add_mmx;
-        c->vp8_idct_dc_add4y          = ff_vp8_idct_dc_add4y_mmx;
-        c->vp8_idct_add               = ff_vp8_idct_add_mmx;
-        c->vp8_luma_dc_wht            = ff_vp8_luma_dc_wht_mmx;
-
-        c->vp8_v_loop_filter_simple   = ff_vp8_v_loop_filter_simple_mmx;
-        c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_mmx;
-
-        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
-        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
-        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
-        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
-
-        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_mmx;
-        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_mmx;
-        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_mmx;
-        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_mmx;
-#endif
-    }
-
-    /* note that 4-tap width=16 functions are missing because w=16
-     * is only used for luma, and luma is always a copy or sixtap. */
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-#if ARCH_X86_32
-        c->vp8_v_loop_filter_simple   = ff_vp8_v_loop_filter_simple_mmxext;
-        c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_mmxext;
-
-        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
-        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
-        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
-        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
-
-        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_mmxext;
-        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_mmxext;
-        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
-        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
-#endif
     }
 
     if (EXTERNAL_SSE(cpu_flags)) {
diff --git a/libavcodec/x86/vp8dsp_loopfilter.asm b/libavcodec/x86/vp8dsp_loopfilter.asm
index caeb405267..ef397efd3e 100644
--- a/libavcodec/x86/vp8dsp_loopfilter.asm
+++ b/libavcodec/x86/vp8dsp_loopfilter.asm
@@ -46,30 +46,6 @@ SECTION .text
 ; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, ptrdiff_t stride, int flim);
 ;-----------------------------------------------------------------------------
 
-; macro called with 7 mm register indexes as argument, and 4 regular registers
-;
-; first 4 mm registers will carry the transposed pixel data
-; the other three are scratchspace (one would be sufficient, but this allows
-; for more spreading/pipelining and thus faster execution on OOE CPUs)
-;
-; first two regular registers are buf+4*stride and buf+5*stride
-; third is -stride, fourth is +stride
-%macro READ_8x4_INTERLEAVED 11
-    ; interleave 8 (A-H) rows of 4 pixels each
-    movd          m%1, [%8+%10*4]   ; A0-3
-    movd          m%5, [%9+%10*4]   ; B0-3
-    movd          m%2, [%8+%10*2]   ; C0-3
-    movd          m%6, [%8+%10]     ; D0-3
-    movd          m%3, [%8]         ; E0-3
-    movd          m%7, [%9]         ; F0-3
-    movd          m%4, [%9+%11]     ; G0-3
-    punpcklbw     m%1, m%5          ; A/B interleaved
-    movd          m%5, [%9+%11*2]   ; H0-3
-    punpcklbw     m%2, m%6          ; C/D interleaved
-    punpcklbw     m%3, m%7          ; E/F interleaved
-    punpcklbw     m%4, m%5          ; G/H interleaved
-%endmacro
-
 ; macro called with 7 mm register indexes as argument, and 5 regular registers
 ; first 11 mean the same as READ_8x4_TRANSPOSED above
 ; fifth regular register is scratchspace to reach the bottom 8 rows, it
@@ -112,26 +88,6 @@ SECTION .text
     punpcklbw     m%4, m%5          ; G/H/O/P interleaved
 %endmacro
 
-; write 4 mm registers of 2 dwords each
-; first four arguments are mm register indexes containing source data
-; last four are registers containing buf+4*stride, buf+5*stride,
-; -stride and +stride
-%macro WRITE_4x2D 8
-    ; write out (2 dwords per register)
-    movd    [%5+%7*4], m%1
-    movd    [%5+%7*2], m%2
-    movd         [%5], m%3
-    movd      [%6+%8], m%4
-    punpckhdq     m%1, m%1
-    punpckhdq     m%2, m%2
-    punpckhdq     m%3, m%3
-    punpckhdq     m%4, m%4
-    movd    [%6+%7*4], m%1
-    movd      [%5+%7], m%2
-    movd         [%6], m%3
-    movd    [%6+%8*2], m%4
-%endmacro
-
 ; write 4 xmm registers of 4 dwords each
 ; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular
 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
@@ -192,42 +148,6 @@ SECTION .text
     movd    [%7+%9*2], m%4
 %endmacro
 
-; write 4 or 8 words in the mmx/xmm registers as 8 lines
-; 1 and 2 are the registers to write, this can be the same (for SSE2)
-; for pre-SSE4:
-; 3 is a general-purpose register that we will clobber
-; for SSE4:
-; 3 is a pointer to the destination's 5th line
-; 4 is a pointer to the destination's 4th line
-; 5/6 is -stride and +stride
-%macro WRITE_2x4W 6
-    movd          %3d, %1
-    punpckhdq      %1, %1
-    mov    [%4+%5*4], %3w
-    shr            %3, 16
-    add            %4, %6
-    mov    [%4+%5*4], %3w
-
-    movd          %3d, %1
-    add            %4, %5
-    mov    [%4+%5*2], %3w
-    shr            %3, 16
-    mov    [%4+%5  ], %3w
-
-    movd          %3d, %2
-    punpckhdq      %2, %2
-    mov    [%4     ], %3w
-    shr            %3, 16
-    mov    [%4+%6  ], %3w
-
-    movd          %3d, %2
-    add            %4, %6
-    mov    [%4+%6  ], %3w
-    shr            %3, 16
-    mov    [%4+%6*2], %3w
-    add            %4, %5
-%endmacro
-
 %macro WRITE_8W 5
 %if cpuflag(sse4)
     pextrw    [%3+%4*4], %1, 0
@@ -269,29 +189,19 @@ SECTION .text
 
 %macro SIMPLE_LOOPFILTER 2
 cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
-%if mmsize == 8 ; mmx/mmxext
-    mov         cntrq, 2
-%endif
 %if cpuflag(ssse3)
     pxor           m0, m0
 %endif
     SPLATB_REG     m7, flim, m0     ; splat "flim" into register
 
     ; set up indexes to address 4 rows
-%if mmsize == 8
-    DEFINE_ARGS dst1, mstride, stride, cntr, dst2
-%else
     DEFINE_ARGS dst1, mstride, stride, dst3, dst2
-%endif
     mov       strideq, mstrideq
    neg      mstrideq
 %ifidn %1, h
     lea         dst1q, [dst1q+4*strideq-2]
 %endif
 
-%if mmsize == 8 ; mmx / mmxext
-.next8px:
-%endif
 %ifidn %1, v
     ; read 4 half/full rows of pixels
     mova           m0, [dst1q+mstrideq*2]    ; p1
@@ -301,11 +211,7 @@ cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
 %else ; h
     lea         dst2q, [dst1q+ strideq]
 
-%if mmsize == 8 ; mmx/mmxext
-    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
-%else ; sse2
     READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
-%endif
     TRANSPOSE4x4W         0, 1, 2, 3, 4
 %endif
 
@@ -380,7 +286,6 @@ cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
     inc         dst1q
     SBUTTERFLY  bw, 6, 4, 0
 
-%if mmsize == 16 ; sse2
 %if cpuflag(sse4)
     inc         dst2q
 %endif
@@ -390,35 +295,11 @@ cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
     inc         dst3q
 %endif
     WRITE_8W    m4, dst3q, dst2q, mstrideq, strideq
-%else ; mmx/mmxext
-    WRITE_2x4W  m6, m4, dst2q, dst1q, mstrideq, strideq
-%endif
 %endif
 
-%if mmsize == 8 ; mmx/mmxext
-    ; next 8 pixels
-%ifidn %1, v
-    add         dst1q, 8            ; advance 8 cols = pixels
-%else ; h
-    lea         dst1q, [dst1q+strideq*8-1]  ; advance 8 rows = lines
-%endif
-    dec         cntrq
-    jg .next8px
-    REP_RET
-%else ; sse2
     RET
-%endif
 %endmacro
 
-%if ARCH_X86_32
-INIT_MMX mmx
-SIMPLE_LOOPFILTER v, 4
-SIMPLE_LOOPFILTER h, 5
-INIT_MMX mmxext
-SIMPLE_LOOPFILTER v, 4
-SIMPLE_LOOPFILTER h, 5
-%endif
-
 INIT_XMM sse2
 SIMPLE_LOOPFILTER v, 3
 SIMPLE_LOOPFILTER h, 5
@@ -485,9 +366,6 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
 
 %if %2 == 8 ; chroma
     DEFINE_ARGS dst1, dst8, mstride, stride, dst2
-%elif mmsize == 8
-    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
-    mov         cntrq, 2
 %else
     DEFINE_ARGS dst1, mstride, stride, dst2, dst8
 %endif
@@ -500,9 +378,6 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
 %endif
 %endif
 
-%if mmsize == 8
-.next8px:
-%endif
     ; read
     lea         dst2q, [dst1q+strideq]
 %ifidn %1, v
@@ -527,33 +402,7 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
     movhps         m7, [dst8q+ strideq*2]
     add         dst8q, mstrideq
 %endif
-%elif mmsize == 8 ; mmx/mmxext (h)
-    ; read 8 rows of 8px each
-    movu           m0, [dst1q+mstrideq*4]
-    movu           m1, [dst2q+mstrideq*4]
-    movu           m2, [dst1q+mstrideq*2]
-    movu           m3, [dst1q+mstrideq ]
-    movu           m4, [dst1q]
-    movu           m5, [dst2q]
-    movu           m6, [dst2q+ strideq ]
-
-    ; 8x8 transpose
-    TRANSPOSE4x4B   0, 1, 2, 3, 7
-    mova   m_q0backup, m1
-    movu           m7, [dst2q+ strideq*2]
-    TRANSPOSE4x4B   4, 5, 6, 7, 1
-    SBUTTERFLY     dq, 0, 4, 1      ; p3/p2
-    SBUTTERFLY     dq, 2, 6, 1      ; q0/q1
-    SBUTTERFLY     dq, 3, 7, 1      ; q2/q3
-    mova           m1, m_q0backup
-    mova   m_q0backup, m2           ; store q0
-    SBUTTERFLY     dq, 1, 5, 2      ; p1/p0
-    mova   m_p0backup, m5           ; store p0
-    SWAP            1, 4
-    SWAP            2, 4
-    SWAP            6, 3
-    SWAP            5, 3
-%else ; sse2 (h)
+%else ; h
 %if %2 == 16
     lea         dst8q, [dst1q+ strideq*8]
 %endif
@@ -641,25 +490,9 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
     psubusb        m6, m5           ; q2-q1
     por            m6, m4           ; abs(q2-q1)
 
-%if notcpuflag(mmxext)
-    mova           m4, m_flimI
-    pxor           m3, m3
-    psubusb        m0, m4
-    psubusb        m1, m4
-    psubusb        m7, m4
-    psubusb        m6, m4
-    pcmpeqb        m0, m3           ; abs(p3-p2) <= I
-    pcmpeqb        m1, m3           ; abs(p2-p1) <= I
-    pcmpeqb        m7, m3           ; abs(q3-q2) <= I
-    pcmpeqb        m6, m3           ; abs(q2-q1) <= I
-    pand           m0, m1
-    pand           m7, m6
-    pand           m0, m7
-%else ; mmxext/sse2
     pmaxub         m0, m1
     pmaxub         m6, m7
     pmaxub         m0, m6
-%endif
 
     ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP            7, 3            ; now m7 is zero
@@ -681,18 +514,8 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
     psubusb        m1, m3           ; p1-p0
     psubusb        m6, m2           ; p0-p1
     por            m1, m6           ; abs(p1-p0)
-%if notcpuflag(mmxext)
-    mova           m6, m1
-    psubusb        m1, m4
-    psubusb        m6, m_hevthr
-    pcmpeqb        m1, m7           ; abs(p1-p0) <= I
-    pcmpeqb        m6, m7           ; abs(p1-p0) <= hev_thresh
-    pand           m0, m1
-    mova    m_maskres, m6
-%else ; mmxext/sse2
     pmaxub         m0, m1           ; max_I
     SWAP            1, 4            ; max_hev_thresh
-%endif
 
     SWAP            6, 4            ; now m6 is I
 %ifidn %1, v
@@ -712,17 +535,6 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
     psubusb        m1, m5           ; q0-q1
     psubusb        m7, m4           ; q1-q0
     por            m1, m7           ; abs(q1-q0)
-%if notcpuflag(mmxext)
-    mova           m7, m1
-    psubusb        m1, m6
-    psubusb        m7, m_hevthr
-    pxor           m6, m6
-    pcmpeqb        m1, m6           ; abs(q1-q0) <= I
-    pcmpeqb        m7, m6           ; abs(q1-q0) <= hev_thresh
-    mova           m6, m_maskres
-    pand           m0, m1           ; abs([pq][321]-[pq][210]) <= I
-    pand           m6, m7
-%else ; mmxext/sse2
     pxor           m7, m7
     pmaxub         m0, m1
     pmaxub         m6, m1
@@ -730,7 +542,6 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
     psubusb        m6, m_hevthr
     pcmpeqb        m0, m7           ; max(abs(..)) <= I
     pcmpeqb        m6, m7           ; !(max(abs..) > thresh)
-%endif
 %ifdef m12
     SWAP            6, 12
 %else
@@ -820,25 +631,12 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
 %else
     mova           m6, m_maskres
 %endif
-%if notcpuflag(mmxext)
-    mova           m7, [pb_1]
-%else ; mmxext/sse2
     pxor           m7, m7
-%endif
     pand           m0, m6
     pand           m1, m6
-%if notcpuflag(mmxext)
-    paddusb        m0, m7
-    pand           m1, [pb_FE]
-    pandn          m7, m0
-    psrlq          m1, 1
-    psrlq          m7, 1
-    SWAP            0, 7
-%else ; mmxext/sse2
     psubusb        m1, [pb_1]
     pavgb          m0, m7           ; a
     pavgb          m1, m7           ; -a
-%endif
     psubusb        m5, m0
     psubusb        m2, m1
     paddusb        m5, m1           ; q1-a
     paddusb        m2, m0           ; p1+a
@@ -863,51 +661,13 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
     ; 4x8/16 transpose
     TRANSPOSE4x4B   2, 3, 4, 5, 6
 
-%if mmsize == 8 ; mmx/mmxext (h)
-    WRITE_4x2D      2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
-%else ; sse2 (h)
     lea         dst8q, [dst8q+mstrideq +2]
     WRITE_4x4D      2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
 %endif
-%endif
 
-%if mmsize == 8
-%if %2 == 8 ; chroma
-%ifidn %1, h
-    sub         dst1q, 2
-%endif
-    cmp         dst1q, dst8q
-    mov         dst1q, dst8q
-    jnz .next8px
-%else
-%ifidn %1, h
-    lea         dst1q, [dst1q+ strideq*8-2]
-%else ; v
-    add         dst1q, 8
-%endif
-    dec         cntrq
-    jg .next8px
-%endif
-    REP_RET
-%else ; mmsize == 16
     RET
-%endif
 %endmacro
 
-%if ARCH_X86_32
-INIT_MMX mmx
-INNER_LOOPFILTER v, 16
-INNER_LOOPFILTER h, 16
-INNER_LOOPFILTER v, 8
-INNER_LOOPFILTER h, 8
-
-INIT_MMX mmxext
-INNER_LOOPFILTER v, 16
-INNER_LOOPFILTER h, 16
-INNER_LOOPFILTER v, 8
-INNER_LOOPFILTER h, 8
-%endif
-
 INIT_XMM sse2
 INNER_LOOPFILTER v, 16
 INNER_LOOPFILTER h, 16
@@ -992,9 +752,6 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
 
 %if %2 == 8 ; chroma
     DEFINE_ARGS dst1, dst8, mstride, stride, dst2
-%elif mmsize == 8
-    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
-    mov         cntrq, 2
 %else
     DEFINE_ARGS dst1, mstride, stride, dst2, dst8
 %endif
@@ -1007,9 +764,6 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
 %endif
 %endif
 
-%if mmsize == 8
-.next8px:
-%endif
     ; read
     lea         dst2q, [dst1q+ strideq ]
 %ifidn %1, v
@@ -1034,33 +788,7 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
     movhps         m7, [dst8q+ strideq*2]
     add         dst8q, mstrideq
 %endif
-%elif mmsize == 8 ; mmx/mmxext (h)
-    ; read 8 rows of 8px each
-    movu           m0, [dst1q+mstrideq*4]
-    movu           m1, [dst2q+mstrideq*4]
-    movu           m2, [dst1q+mstrideq*2]
-    movu           m3, [dst1q+mstrideq ]
-    movu           m4, [dst1q]
-    movu           m5, [dst2q]
-    movu           m6, [dst2q+ strideq ]
-
-    ; 8x8 transpose
-    TRANSPOSE4x4B   0, 1, 2, 3, 7
-    mova   m_q0backup, m1
-    movu           m7, [dst2q+ strideq*2]
-    TRANSPOSE4x4B   4, 5, 6, 7, 1
-    SBUTTERFLY     dq, 0, 4, 1      ; p3/p2
-    SBUTTERFLY     dq, 2, 6, 1      ; q0/q1
-    SBUTTERFLY     dq, 3, 7, 1      ; q2/q3
-    mova           m1, m_q0backup
-    mova   m_q0backup, m2           ; store q0
-    SBUTTERFLY     dq, 1, 5, 2      ; p1/p0
-    mova   m_p0backup, m5           ; store p0
-    SWAP            1, 4
-    SWAP            2, 4
-    SWAP            6, 3
-    SWAP            5, 3
-%else ; sse2 (h)
+%else ; h
 %if %2 == 16
     lea         dst8q, [dst1q+ strideq*8 ]
 %endif
@@ -1150,25 +878,9 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
     psubusb        m6, m5           ; q2-q1
     por            m6, m4           ; abs(q2-q1)
 
-%if notcpuflag(mmxext)
-    mova           m4, m_flimI
-    pxor           m3, m3
-    psubusb        m0, m4
-    psubusb        m1, m4
-    psubusb        m7, m4
-    psubusb        m6, m4
-    pcmpeqb        m0, m3           ; abs(p3-p2) <= I
-    pcmpeqb        m1, m3           ; abs(p2-p1) <= I
-    pcmpeqb        m7, m3           ; abs(q3-q2) <= I
-    pcmpeqb        m6, m3           ; abs(q2-q1) <= I
-    pand           m0, m1
-    pand           m7, m6
-    pand           m0, m7
-%else ; mmxext/sse2
     pmaxub         m0, m1
     pmaxub         m6, m7
     pmaxub         m0, m6
-%endif
 
     ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP            7, 3            ; now m7 is zero
@@ -1190,18 +902,8 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
     psubusb        m1, m3           ; p1-p0
     psubusb        m6, m2           ; p0-p1
     por            m1, m6           ; abs(p1-p0)
-%if notcpuflag(mmxext)
-    mova           m6, m1
-    psubusb        m1, m4
-    psubusb        m6, m_hevthr
-    pcmpeqb        m1, m7           ; abs(p1-p0) <= I
-    pcmpeqb        m6, m7           ; abs(p1-p0) <= hev_thresh
-    pand           m0, m1
-    mova    m_maskres, m6
-%else ; mmxext/sse2
     pmaxub         m0, m1           ; max_I
     SWAP            1, 4            ; max_hev_thresh
-%endif
 
     SWAP            6, 4            ; now m6 is I
 %ifidn %1, v
@@ -1221,17 +923,6 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
     psubusb        m1, m5           ; q0-q1
     psubusb        m7, m4           ; q1-q0
     por            m1, m7           ; abs(q1-q0)
-%if notcpuflag(mmxext)
-    mova           m7, m1
-    psubusb        m1, m6
-    psubusb        m7, m_hevthr
-    pxor           m6, m6
-    pcmpeqb        m1, m6           ; abs(q1-q0) <= I
-    pcmpeqb        m7, m6           ; abs(q1-q0) <= hev_thresh
-    mova           m6, m_maskres
-    pand           m0, m1           ; abs([pq][321]-[pq][210]) <= I
-    pand           m6, m7
-%else ; mmxext/sse2
     pxor           m7, m7
     pmaxub         m0, m1
     pmaxub         m6, m1
@@ -1239,7 +930,6 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
     psubusb        m6, m_hevthr
     pcmpeqb        m0, m7           ; max(abs(..)) <= I
     pcmpeqb        m6, m7           ; !(max(abs..) > thresh)
-%endif
 %ifdef m12
     SWAP            6, 12
 %else
@@ -1510,11 +1200,6 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
     TRANSPOSE4x4B   1, 2, 3, 4, 0
     SBUTTERFLY     bw, 5, 6, 0
 
-%if mmsize == 8 ; mmx/mmxext (h)
-    WRITE_4x2D      1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
-    add         dst1q, 4
-    WRITE_2x4W     m5, m6, dst2q, dst1q, mstrideq, strideq
-%else ; sse2 (h)
     lea         dst8q, [dst8q+mstrideq+1]
     WRITE_4x4D      1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
     lea         dst1q, [dst2q+mstrideq+4]
@@ -1528,45 +1213,10 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
 %endif
     WRITE_8W       m6, dst2q, dst8q, mstrideq, strideq
 %endif
-%endif
 
-%if mmsize == 8
-%if %2 == 8 ; chroma
-%ifidn %1, h
-    sub         dst1q, 5
-%endif
-    cmp         dst1q, dst8q
-    mov         dst1q, dst8q
-    jnz .next8px
-%else
-%ifidn %1, h
-    lea         dst1q, [dst1q+ strideq*8-5]
-%else ; v
-    add         dst1q, 8
-%endif
-    dec         cntrq
-    jg .next8px
-%endif
-    REP_RET
-%else ; mmsize == 16
     RET
-%endif
 %endmacro
 
-%if ARCH_X86_32
-INIT_MMX mmx
-MBEDGE_LOOPFILTER v, 16
-MBEDGE_LOOPFILTER h, 16
-MBEDGE_LOOPFILTER v, 8
-MBEDGE_LOOPFILTER h, 8
-
-INIT_MMX mmxext
-MBEDGE_LOOPFILTER v, 16
-MBEDGE_LOOPFILTER h, 16
-MBEDGE_LOOPFILTER v, 8
-MBEDGE_LOOPFILTER h, 8
-%endif
-
 INIT_XMM sse2
 MBEDGE_LOOPFILTER v, 16
 MBEDGE_LOOPFILTER h, 16