From 5705b02079449c685a3dd337fcc3a8b440dca4a0 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser
Date: Wed, 11 May 2011 10:11:55 -0700
Subject: [PATCH] 10-bit H.264 x86 chroma v loopfilter asm

Also delete some unused deblock asm macros.
---
 libavcodec/x86/h264_deblock.asm | 41 ----------
 libavcodec/x86/h264_deblock_10bit.asm | 106 ++++++++++++++++++++++++++
 libavcodec/x86/h264dsp_mmx.c | 16 +++-
 3 files changed, 121 insertions(+), 42 deletions(-)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 37866812e7..0cf013f58f 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -106,47 +106,6 @@ cextern pb_A1
     TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
 %endmacro
 
-%macro TRANSPOSE4x8W_LOAD 8
-%if mmsize==16
-    TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
-%else
-    SWAP 1, 4, 2, 3
-    mova m0, [t5]
-    mova m1, [t5+r1]
-    mova m2, [t5+r1*2]
-    mova m3, [t5+t6]
-    TRANSPOSE4x4W 0, 1, 2, 3, 4
-%endif
-%endmacro
-
-%macro TRANSPOSE8x2W_STORE 8
-    punpckhwd m0, m1, m2
-    punpcklwd m1, m2
-%if mmsize==8
-    movd %3, m0
-    movd %1, m1
-    psrlq m1, 32
-    psrlq m0, 32
-    movd %2, m1
-    movd %4, m0
-%else
-    movd %5, m0
-    movd %1, m1
-    psrldq m1, 4
-    psrldq m0, 4
-    movd %2, m1
-    movd %6, m0
-    psrldq m1, 4
-    psrldq m0, 4
-    movd %3, m1
-    movd %7, m0
-    psrldq m1, 4
-    psrldq m0, 4
-    movd %4, m1
-    movd %8, m0
-%endif
-%endmacro
-
 %macro SBUTTERFLY3 4
     punpckh%1 %4, %2, %3
     punpckl%1 %2, %3
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index 402ed9bfac..c253d02954 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -34,6 +34,7 @@ pw_pixel_max: times 8 dw ((1 << 10)-1)
 SECTION .text
 
 cextern pw_2
+cextern pw_3
 cextern pw_4
 
 ; out: %4 = |%1-%2|-%3
@@ -802,3 +803,108 @@ INIT_AVX
 DEBLOCK_LUMA avx
 DEBLOCK_LUMA_INTRA avx
 %endif
+
+; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
+; out: %1=p0', %2=q0' (only lanes selected by the mask change; %6 and %7 are clobbered)
+%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7 ; p0' = (2*p1+p0+q1+2)>>2, q0' = (2*q1+q0+p1+2)>>2
+    mova %6, [pw_2] ; rounding term (pw_2 is external; presumably packed words of 2)
+    paddw %6, %3 ; %6 = 2 + p1
+    paddw %6, %4 ; %6 = 2 + p1 + q1
+    paddw %7, %6, %2 ; three-operand form: %7 = %6 + q0
+    paddw %6, %1 ; %6 = 2 + p1 + q1 + p0
+    paddw %6, %3 ; %6 = 2 + 2*p1 + p0 + q1
+    paddw %7, %4 ; %7 = 2 + p1 + 2*q1 + q0
+    psraw %6, 2 ; %6 = filtered p0
+    psraw %7, 2 ; %7 = filtered q0
+    psubw %6, %1 ; turn the filtered values into deltas against p0 / q0
+    psubw %7, %2
+    pand %6, %5 ; zero the deltas wherever the mask is clear
+    pand %7, %5
+    paddw %1, %6 ; apply the masked deltas
+    paddw %2, %7
+%endmacro
+
+%macro CHROMA_V_LOAD 1 ; %1 = address of the q0 row; the p1/p0 rows are read from r0 and r0+r1
+    mova m0, [r0] ; p1
+    mova m1, [r0+r1] ; p0
+    mova m2, [%1] ; q0
+    mova m3, [%1+r1] ; q1
+%endmacro
+
+%macro CHROMA_V_STORE 0 ; write m1/m2 back to the rows CHROMA_V_LOAD labels p0 and q0
+    mova [r0+1*r1], m1 ; p0 row
+    mova [r0+2*r1], m2 ; q0 row when r0 = pix - 2*stride, as set up by both callers below
+%endmacro
+
+%macro DEBLOCK_CHROMA 1 ; %1 = suffix appended to both function names defined below
+;-----------------------------------------------------------------------------
+; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16)
+    mov r5, r0 ; r5 = pix, the row CHROMA_V_LOAD reads as q0
+    sub r0, r1
+    sub r0, r1 ; r0 = pix - 2*stride, the row read as p1
+    shl r2d, 2 ; alpha *= 4 (presumably rescaling 8-bit thresholds to 10-bit range; TODO confirm)
+    shl r3d, 2 ; beta *= 4
+%if mmsize < 16
+    mov r6, 16/mmsize ; pass count: 16 bytes of each row, mmsize bytes per pass
+.loop:
+%endif
+    CHROMA_V_LOAD r5
+    LOAD_AB m4, m5, r2, r3 ; LOAD_AB is defined outside this hunk
+    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 ; defined outside this hunk; m7 is used below as the filter mask
+    pxor m4, m4 ; m4 = 0
+    LOAD_TC m6, r4 ; defined outside this hunk; r4 is presumably the tc0 pointer (5th argument)
+    psubw m6, [pw_3]
+    pmaxsw m6, m4 ; clamp m6 to be >= 0
+    pand m7, m6 ; combine the mask with the clamped tc-derived value
+    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 ; defined outside this hunk
+    CHROMA_V_STORE
+%if mmsize < 16
+    add r0, mmsize ; advance both row pointers by one register width
+    add r5, mmsize
+    add r4, mmsize/8 ; NOTE(review): confirm this tc0 step matches how many tc0 entries LOAD_TC consumes per register
+    dec r6
+    jg .loop ; repeat while passes remain
+    REP_RET
+%else
+    RET
+%endif
+
+;-----------------------------------------------------------------------------
+; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16)
+    mov r4, r0 ; r4 = pix, the row CHROMA_V_LOAD reads as q0
+    sub r0, r1
+    sub r0, r1 ; r0 = pix - 2*stride, the row read as p1
+    shl r2d, 2 ; alpha *= 4 (same scaling as the non-intra function above)
+    shl r3d, 2 ; beta *= 4
+%if mmsize < 16
+    mov r5, 16/mmsize ; pass count: 16 bytes of each row, mmsize bytes per pass
+.loop:
+%endif
+    CHROMA_V_LOAD r4
+    LOAD_AB m4, m5, r2, r3 ; LOAD_AB is defined outside this hunk
+    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 ; defined outside this hunk; m7 is passed on as the mask
+    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 ; p0=m1, q0=m2, p1=m0, q1=m3, mask=m7, tmps=m5/m6
+    CHROMA_V_STORE
+%if mmsize < 16
+    add r0, mmsize ; advance both row pointers by one register width
+    add r4, mmsize
+    dec r5
+    jg .loop ; repeat while passes remain
+    REP_RET
+%else
+    RET
+%endif
+%endmacro
+
+%ifndef ARCH_X86_64 ; the mmxext variant is only assembled when ARCH_X86_64 is not defined
+INIT_MMX
+DEBLOCK_CHROMA mmxext ; emits deblock_v_chroma_10_mmxext and deblock_v_chroma_intra_10_mmxext
+%endif
+INIT_XMM
+DEBLOCK_CHROMA sse2 ; emits the _sse2 pair
+INIT_AVX
+DEBLOCK_CHROMA avx ; emits the _avx pair
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 42dae93f2d..01b11163c8 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -236,10 +236,18 @@ LF_FUNC (h, luma, depth, sse2)\
 LF_IFUNC(h, luma_intra, depth, sse2)\
 LF_FUNC (v, luma, depth, sse2)\
 LF_IFUNC(v, luma_intra, depth, sse2)\
+LF_FUNC (h, chroma, depth, sse2) /* NOTE(review): only v chroma asm is added by this patch; the h chroma sse2/avx entries presumably just declare prototypes - confirm nothing references them */\
+LF_IFUNC(h, chroma_intra, depth, sse2)\
+LF_FUNC (v, chroma, depth, sse2)\
+LF_IFUNC(v, chroma_intra, depth, sse2)\
 LF_FUNC (h, luma, depth, avx)\
 LF_IFUNC(h, luma_intra, depth, avx)\
 LF_FUNC (v, luma, depth, avx)\
-LF_IFUNC(v, luma_intra, depth, avx)
+LF_IFUNC(v, luma_intra, depth, avx)\
+LF_FUNC (h, chroma, depth, avx)\
+LF_IFUNC(h, chroma_intra, depth, avx)\
+LF_FUNC (v, chroma, depth, avx)\
+LF_IFUNC(v, chroma_intra, depth, avx)
 
 LF_FUNCS( uint8_t, 8)
 LF_FUNCS(uint16_t, 10)
@@ -401,12 +409,16 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
         if (mm_flags & AV_CPU_FLAG_MMX) {
             if (mm_flags & AV_CPU_FLAG_MMX2) {
 #if ARCH_X86_32
+                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmxext;
+                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmxext;
                 c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext;
                 c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext;
                 c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
                 c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
 #endif
                 if (mm_flags&AV_CPU_FLAG_SSE2) {
+                    c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; /* set outside the HAVE_ALIGNED_STACK guard used for the luma pointers below */
+                    c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
 #if HAVE_ALIGNED_STACK
                     c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
                     c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
@@ -415,6 +427,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
 #endif
                 }
                 if (mm_flags&AV_CPU_FLAG_AVX) {
+                    c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx; /* overrides the SSE2 pointers when the AVX flag is set */
+                    c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx;
 #if HAVE_ALIGNED_STACK
                     c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
                     c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;