From 7042a55c55d7ac04e9a6b3cd150c85d64cedb79f Mon Sep 17 00:00:00 2001 From: James Darnley Date: Wed, 13 Jan 2016 14:34:44 +0000 Subject: [PATCH] avcodec/h264: mmxext 4:2:2 chroma deblock/loop filter 2.6 times faster (366 vs. 142 cycles) --- libavcodec/x86/h264_deblock.asm | 46 ++++++++++++++++++++++++++++++--- libavcodec/x86/h264dsp_init.c | 4 +++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 5151f3c9cd..8f80863fc5 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -864,7 +864,50 @@ ff_chroma_inter_body_mmxext: DEBLOCK_P0_Q0 ret +%define t5 r4 +%define t6 r5 +cglobal deblock_h_chroma422_8, 5, 6, 0, 0-(1+ARCH_X86_64*2)*mmsize + %if ARCH_X86_64 + %define buf0 [rsp+16] + %define buf1 [rsp+8] + %else + %define buf0 r0m + %define buf1 r2m + %endif + + movd m6, [r4] + punpcklbw m6, m6 + movq [rsp], m6 + CHROMA_H_START + + TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6) + movq buf0, m0 + movq buf1, m3 + LOAD_MASK r2d, r3d + movd m6, [rsp] + punpcklwd m6, m6 + pand m7, m6 + DEBLOCK_P0_Q0 + movq m0, buf0 + movq m3, buf1 + TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) + + lea r0, [r0+r1*8] + lea t5, [t5+r1*8] + + TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6) + movq buf0, m0 + movq buf1, m3 + LOAD_MASK r2d, r3d + movd m6, [rsp+4] + punpcklwd m6, m6 + pand m7, m6 + DEBLOCK_P0_Q0 + movq m0, buf0 + movq m3, buf1 + TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) +RET ; in: %1=p0 %2=p1 %3=q1 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 @@ -877,9 +920,6 @@ ff_chroma_inter_body_mmxext: pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) %endmacro -%define t5 r4 -%define t6 r5 - ;------------------------------------------------------------------------------ ; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta) ;------------------------------------------------------------------------------ diff --git a/libavcodec/x86/h264dsp_init.c 
b/libavcodec/x86/h264dsp_init.c index 35db20014a..c8cd0650c7 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -129,6 +129,8 @@ LF_IFUNC(v, chroma_intra, depth, avx) LF_FUNCS(uint8_t, 8) LF_FUNCS(uint16_t, 10) +void ff_deblock_h_chroma422_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); /* 4:2:2 variant of the 8-bit h chroma loop filter (MMXEXT); defined by cglobal deblock_h_chroma422_8 in h264_deblock.asm, which reads four tc0 bytes and runs two 8-row filter passes */ + #if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL LF_FUNC(v8, luma, 8, mmxext) static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, @@ -245,6 +247,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, if (chroma_format_idc <= 1) { c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext; c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext; + } else { /* chroma_format_idc > 1: install the 4:2:2 h chroma filter; this branch assigns no MMXEXT intra counterpart */ + c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext; } #if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;