From 1dae7ffa0b11293fa3294fc25cd38055915fcf19 Mon Sep 17 00:00:00 2001
From: James Darnley
Date: Mon, 28 Nov 2016 18:22:17 +0100
Subject: [PATCH] avcodec/h264: mmx 4:2:2 idct add8 function

2.87 times faster (1830 vs. 638 cycles)
---
Reviewer notes (this region, between the three-dash line and the diff, is
not part of the commit message or of the patch payload):

* h264_idct.asm gains one entry point, h264_idct_add8_422_8. It starts with
  r5 = 16 and r2 advanced by 512 bytes, then calls the existing helper
  h264_idct_add8_mmx_plane twice, steps the destination pointer slot by
  gprsize (the patch's own comment calls this dest[1]), and calls the
  helper twice more.
* The in-patch comments "set to 32" and "set to i * 16 * sizeof(dctcoef)"
  only hold if h264_idct_add8_mmx_plane itself advances r5 and r2 on each
  call; the helper body is not part of this patch -- TODO confirm against
  h264_idct.asm.
* On x86-64 the destination pointer is kept in dst2q; on other builds the
  patch instead modifies the stack argument through r0mp.
* h264dsp_init.c declares the new symbol via
  IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx) and assigns it to c->h264_idct_add8
  in a new else branch. Before this patch that pointer was only assigned
  in this spot when chroma_format_idc <= 1.
* NOTE(review): the else branch is taken for every chroma_format_idc > 1,
  not only for 4:2:2 -- confirm no other chroma format reaches this
  function pointer.
* NOTE(review): this copy of the patch has lost its original column
  alignment; leading whitespace inside the hunks below is approximate.
  Check against the upstream commit before feeding it to git am.

 libavcodec/x86/h264_idct.asm | 32 ++++++++++++++++++++++++++++++++
 libavcodec/x86/h264dsp_init.c | 7 ++++++-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 27222cbc3c..c36fea59b0 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -697,6 +697,38 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
     call h264_idct_add8_mmx_plane
     RET
 
+cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+    movsxdifnidn r3, r3d
+%ifdef PIC
+    lea picregq, [scan8_mem]
+%endif
+%if ARCH_X86_64
+    mov dst2q, r0
+%endif
+
+    mov r5, 16 ; i
+    add r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t
+
+    call h264_idct_add8_mmx_plane
+    add r5, 4
+    call h264_idct_add8_mmx_plane
+
+%if ARCH_X86_64
+    add dst2q, gprsize ; dest[1]
+%else
+    add r0mp, gprsize
+%endif
+
+    add r5, 4 ; set to 32
+    add r2, 256 ; set to i * 16 * sizeof(dctcoef)
+
+    call h264_idct_add8_mmx_plane
+    add r5, 4
+    call h264_idct_add8_mmx_plane
+
+    RET
+
 h264_idct_add8_mmxext_plane:
     movsxdifnidn r3, r3d
 .nextblock:
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 027c1ae0b3..ed52c4d8ef 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -78,6 +78,8 @@ IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
 IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
 IDCT_ADD_REP_FUNC2(, 8, 10, avx)
 
+IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)
+
 void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
 void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
 
@@ -228,8 +230,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 
     c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
     c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
-    if (chroma_format_idc <= 1)
+    if (chroma_format_idc <= 1) {
         c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
+    } else {
+        c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
+    }
     c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
     if (cpu_flags & AV_CPU_FLAG_CMOV)
         c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;