From 5e586e1befaab7b77883d73c0ebcb19f1d786ed8 Mon Sep 17 00:00:00 2001
From: Timothy Gu
Date: Mon, 19 Oct 2015 02:25:11 +0100
Subject: [PATCH] huffyuvencdsp: Add ff_diff_bytes_{sse2,avx2}

SSE2 version 4%-35% faster than MMX depending on the width.
AVX2 version 1%-13% faster than SSE2 depending on the width.
---
 libavcodec/huffyuvenc.c            |   4 +-
 libavcodec/x86/huffyuvencdsp.asm   | 110 ++++++++++++++++++++++++----
 libavcodec/x86/huffyuvencdsp_mmx.c |  14 +++-
 3 files changed, 107 insertions(+), 21 deletions(-)

diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c
index 49d711a948..e080cd90bb 100644
--- a/libavcodec/huffyuvenc.c
+++ b/libavcodec/huffyuvenc.c
@@ -60,12 +60,12 @@ static inline int sub_left_prediction(HYuvContext *s, uint8_t *dst,
             }
             return left;
         } else {
-            for (i = 0; i < 16; i++) {
+            for (i = 0; i < 32; i++) {
                 const int temp = src[i];
                 dst[i] = temp - left;
                 left   = temp;
             }
-            s->hencdsp.diff_bytes(dst + 16, src + 16, src + 15, w - 16);
+            s->hencdsp.diff_bytes(dst + 32, src + 32, src + 31, w - 32);
             return src[w-1];
         }
     } else {
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index e001906742..699fd38495 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -27,9 +27,9 @@
 
 section .text
 
-INIT_MMX mmx
 ; void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 ;                        intptr_t w);
+%macro DIFF_BYTES_PROLOGUE 0
 %if ARCH_X86_32
 cglobal diff_bytes, 3,5,2, dst, src1, src2
 %define wq r4q
@@ -40,34 +40,108 @@ cglobal diff_bytes, 4,5,2, dst, src1, src2, w
 DECLARE_REG_TMP 4
 %endif ; ARCH_X86_32
 %define i t0q
+%endmacro
+
+; label to jump to if w < 2 * regsize
+%macro DIFF_BYTES_LOOP_PREP 1
     mov     i, wq
-    and     i, -2 * mmsize
-    jz      .setup_loop2
+    and     i, -2 * regsize
+    jz      %1
     add     dstq, i
     add     src1q, i
     add     src2q, i
     neg     i
-.loop:
-    mova    m0, [src1q + i]
-    mova    m1, [src1q + i + mmsize]
-    psubb   m0, [src2q + i]
-    psubb   m1, [src2q + i + mmsize]
-    mova    [dstq + i], m0
-    mova    [mmsize + dstq + i], m1
-    add     i, 2 * mmsize
-    jl      .loop
-.setup_loop2:
-    and     wq, 2 * mmsize - 1
-    jz      .end
+%endmacro
+
+; mov type used for src1q, dstq, first reg, second reg
+%macro DIFF_BYTES_LOOP_CORE 4
+%if regsize != 16
+    mov%1   %3, [src1q + i]
+    mov%1   %4, [src1q + i + regsize]
+    psubb   %3, [src2q + i]
+    psubb   %4, [src2q + i + regsize]
+    mov%2   [dstq + i], %3
+    mov%2   [regsize + dstq + i], %4
+%else
+    ; SSE enforces alignment of psubb operand
+    mov%1   %3, [src1q + i]
+    movu    %4, [src2q + i]
+    psubb   %3, %4
+    mov%2   [dstq + i], %3
+    mov%1   %3, [src1q + i + regsize]
+    movu    %4, [src2q + i + regsize]
+    psubb   %3, %4
+    mov%2   [regsize + dstq + i], %3
+%endif
+%endmacro
+
+%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
+    %define regsize mmsize
+.loop_%1%2:
+    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
+    add     i, 2 * regsize
+    jl      .loop_%1%2
+.skip_main_%1%2:
+    and     wq, 2 * regsize - 1
+    jz      .end_%1%2
+%if mmsize > 16
+    ; fall back to narrower xmm
+    %define regsize mmsize / 2
+    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
+.loop2_%1%2:
+    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
+    add     i, 2 * regsize
+    jl      .loop2_%1%2
+.setup_loop_gpr_%1%2:
+    and     wq, 2 * regsize - 1
+    jz      .end_%1%2
+%endif
     add     dstq, wq
     add     src1q, wq
     add     src2q, wq
     neg     wq
-.loop2:
+.loop_gpr_%1%2:
     mov     t0b, [src1q + wq]
     sub     t0b, [src2q + wq]
     mov     [dstq + wq], t0b
     inc     wq
-    jl      .loop2
-.end:
+    jl      .loop_gpr_%1%2
+.end_%1%2:
     REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    DIFF_BYTES_LOOP_PREP .skip_main_aa
+    DIFF_BYTES_BODY a, a
+%endif
+
+INIT_XMM sse2
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    DIFF_BYTES_LOOP_PREP .skip_main_aa
+    test    dstq, regsize - 1
+    jnz     .loop_uu
+    test    src1q, regsize - 1
+    jnz     .loop_ua
+    DIFF_BYTES_BODY    a, a
+    DIFF_BYTES_BODY    u, a
+    DIFF_BYTES_BODY    u, u
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    ; Directly using unaligned SSE2 version is marginally faster than
+    ; branching based on arguments.
+    DIFF_BYTES_LOOP_PREP .skip_main_uu
+    test    dstq, regsize - 1
+    jnz     .loop_uu
+    test    src1q, regsize - 1
+    jnz     .loop_ua
+    DIFF_BYTES_BODY    a, a
+    DIFF_BYTES_BODY    u, a
+    DIFF_BYTES_BODY    u, u
+%endif
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/huffyuvencdsp_mmx.c
index f28df05dba..0ba4358165 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/huffyuvencdsp_mmx.c
@@ -31,6 +31,10 @@
 
 void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                        intptr_t w);
+void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                        intptr_t w);
+void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                        intptr_t w);
 
 #if HAVE_INLINE_ASM
 
@@ -80,7 +84,7 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
 {
     av_unused int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags)) {
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
         c->diff_bytes = ff_diff_bytes_mmx;
     }
 
@@ -89,4 +93,12 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
         c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext;
     }
 #endif /* HAVE_INLINE_ASM */
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_sse2;
+    }
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_avx2;
+    }
 }
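
For readers working through the asm, the arithmetic being vectorized is simple: diff_bytes() computes a byte-wise dst[i] = src1[i] - src2[i] (modulo 256), and the caller change in huffyuvenc.c peels off the first 32 bytes of left prediction in scalar code so that the vector kernel starts at dst + 32 / src + 32, which remain 32-byte aligned whenever the underlying buffers are. The second source, src + 31 (each byte's left neighbour), is off by one and therefore never aligned, which is why DIFF_BYTES_LOOP_CORE always loads src2 with movu on SSE2/AVX2 and why the aa/ua/uu bodies only test the alignment of dstq and src1q. The C below is a minimal standalone model of that behaviour; the *_ref names and the test driver are illustrative only and are not FFmpeg API.

#include <stdint.h>
#include <stdio.h>

/* Scalar reference for the ff_diff_bytes_{mmx,sse2,avx2} kernels:
 * byte-wise difference, dst[i] = src1[i] - src2[i] (modulo 256). */
static void diff_bytes_ref(uint8_t *dst, const uint8_t *src1,
                           const uint8_t *src2, intptr_t w)
{
    for (intptr_t i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}

/* Sketch of the w >= 32 branch of sub_left_prediction() after this patch
 * (the short-row and >8-bit branches of the real function are not
 * modelled).  The first 32 bytes are differenced against the running
 * "left" value, so the vector kernel starts at offset 32: with 32-byte
 * aligned buffers, dst + 32 and src + 32 stay aligned while src + 31
 * cannot be. */
static int sub_left_prediction_ref(uint8_t *dst, const uint8_t *src,
                                   int w, int left)
{
    int i;
    for (i = 0; i < 32; i++) {
        const int temp = src[i];
        dst[i] = temp - left;
        left   = temp;
    }
    diff_bytes_ref(dst + 32, src + 32, src + 31, w - 32);
    return src[w - 1];
}

int main(void)
{
    uint8_t src[64], dst[64];
    for (int i = 0; i < 64; i++)
        src[i] = (uint8_t)(5 * i + 1);

    sub_left_prediction_ref(dst, src, 64, 0);

    /* Each output byte is the difference to its left neighbour
     * (with an initial "left" of 0 for the first byte): prints 1 5 5. */
    printf("%d %d %d\n", dst[0], dst[1], dst[63]);
    return 0;
}

Seen this way, the init changes in huffyuvencdsp_mmx.c are successive overrides of the same scalar contract: c->diff_bytes is set to the MMX version only on 32-bit builds, then overwritten by the SSE2 version and finally by the AVX2 version when the corresponding CPU flags are present.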
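
The macro-generated control flow is the denser part of the patch. The following C mirrors what one DIFF_BYTES_BODY instance does with its pointers: the vectorizable prefix is walked with a negative index that counts up to zero (the add/neg/jl idiom), and the remaining w & (2 * regsize - 1) bytes are finished one at a time as in .loop_gpr. The half-width xmm pass that the AVX2 body runs before the byte loop is omitted because it does not change the result; this is a reading of the asm, not code taken from the patch, and diff_bytes_model with its regsize parameter is purely illustrative.

#include <stdint.h>
#include <stdio.h>

/* Control-flow model of one DIFF_BYTES_BODY instance (regsize is 8, 16
 * or 32 for MMX, SSE2 or AVX2).  The vector subtraction is modelled one
 * byte at a time; only the loop structure follows the asm. */
static void diff_bytes_model(uint8_t *dst, const uint8_t *src1,
                             const uint8_t *src2, intptr_t w, intptr_t regsize)
{
    intptr_t i = w & -(2 * regsize);            /* and  i, -2 * regsize     */
    if (i) {                                    /* jz   .skip_main          */
        dst  += i;                              /* add  dstq, i  (etc.)     */
        src1 += i;
        src2 += i;
        for (i = -i; i < 0; i += 2 * regsize)   /* neg  i; ...; jl .loop    */
            for (intptr_t k = 0; k < 2 * regsize; k++)
                dst[i + k] = src1[i + k] - src2[i + k];  /* the two psubb   */
    }
    w &= 2 * regsize - 1;                       /* and  wq, 2 * regsize - 1 */
    if (!w)                                     /* jz   .end                */
        return;
    dst  += w;
    src1 += w;
    src2 += w;
    for (w = -w; w < 0; w++)                    /* .loop_gpr: byte tail     */
        dst[w] = src1[w] - src2[w];
}

int main(void)
{
    uint8_t a[100], b[100], out[100];
    for (int i = 0; i < 100; i++) {
        a[i] = (uint8_t)(3 * i);
        b[i] = (uint8_t)i;
    }
    /* 100 bytes with regsize 32: one 64-byte main iteration, 36-byte tail. */
    diff_bytes_model(out, a, b, 100, 32);
    printf("%d %d\n", out[0], out[99]);         /* prints 0 198 */
    return 0;
}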