From 88307b3eec018b86f6adfddeb6e9b384c95b7ca6 Mon Sep 17 00:00:00 2001
From: James Darnley
Date: Wed, 15 Feb 2017 14:54:11 +0100
Subject: [PATCH] avcodec/h264: add avx 8-bit 4:2:2 chroma h deblock/loop filter

~1.21x faster (68 vs. 56 cycles) compared with mmxext function
---
 libavcodec/x86/h264_deblock.asm | 27 +++++++++++++++++++++++++++
 libavcodec/x86/h264dsp_init.c   |  2 ++
 2 files changed, 29 insertions(+)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 0465c9f13b..e2eb002f71 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -1163,6 +1163,33 @@ cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
     STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
 RET
 
+cglobal deblock_h_chroma422_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_,
+    CHROMA_H_START_XMM r5, r6
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+    TRANSPOSE_8x4B_XMM
+    movq [rsp], m0
+    movq [rsp + 8], m3
+    CHROMA_INTER_BODY_XMM 2
+    movq m0, [rsp]
+    movq m3, [rsp + 8]
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+
+    lea pix_q, [pix_q + 8*stride_q]
+    lea r5, [r5 + 8*stride_q]
+    add tc0_q, 2
+
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+    TRANSPOSE_8x4B_XMM
+    movq [rsp], m0
+    movq [rsp + 8], m3
+    CHROMA_INTER_BODY_XMM 2
+    movq m0, [rsp]
+    movq m3, [rsp + 8]
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+RET
+
 %endmacro ; DEBLOCK_CHROMA_XMM
 
 DEBLOCK_CHROMA_XMM avx
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 0b15471675..6073932b25 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -321,6 +321,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_avx;
             if (chroma_format_idc <= 1) {
                 c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_avx;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx;
             }
         }
     } else if (bit_depth == 10) {