From 83b67ca056093d3b8fffc5e0b37f84177113a556 Mon Sep 17 00:00:00 2001
From: Michael Niedermayer
Date: Tue, 21 Jan 2014 02:53:43 +0100
Subject: [PATCH] avcodec/x86/lossless_videodsp: Port lorens add_hfyu_left_prediction_ssse3/sse4 to 16bit

Signed-off-by: Michael Niedermayer
---
 libavcodec/x86/lossless_videodsp.asm    | 97 +++++++++++++++++++++++++
 libavcodec/x86/lossless_videodsp_init.c | 10 +++
 2 files changed, 107 insertions(+)

diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 8c429fa4d7..e71de76874 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -1,5 +1,6 @@
 ;******************************************************************************
 ;* SIMD lossless video DSP utils
+;* Copyright (c) 2008 Loren Merritt
 ;* Copyright (c) 2014 Michael Niedermayer
 ;*
 ;* This file is part of FFmpeg.
@@ -21,6 +22,13 @@
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+; pshufb control masks, named after the byte indices they select (z = -1 = write a zero byte)
+pb_ef:               times 8 db 14,15 ; broadcast the highest word of a 16-byte register
+pb_67:               times 8 db  6, 7 ; broadcast the highest word of an 8-byte register
+pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
+pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
+
 SECTION_TEXT
 
 %macro ADD_INT16_LOOP 1 ; %1 = is_aligned
@@ -84,3 +92,92 @@ cglobal add_int16, 4,4,5, dst, src, mask, w
     ADD_INT16_LOOP 1
 .unaligned:
     ADD_INT16_LOOP 0
+
+; Word-wise left prediction: every output word is the incoming "left" value
+; plus the sum of all source words up to and including its own position,
+; ANDed with the mask in m7. Whole vectors of mmsize bytes are processed, so
+; the final iteration may touch up to mmsize-2 bytes beyond 2*w bytes.
+; NOTE(review): presumably callers pad both buffers accordingly - confirm.
+; Register state expected on entry:
+;   m0 = "left" in its highest word
+;   m3 = pb_zzzz2323zzzzabab
+;   m4 = pb_zzzzzzzz67676767 (read only when mmsize == 16)
+;   m5 = pshufb mask that broadcasts the highest word (pb_67 or pb_ef)
+;   m7 = mask replicated into every word
+; Returns in eax the output word belonging to the last of the w samples
+; (only the low 16 bits are meaningful).
+%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
+    add     wd, wd                  ; samples -> bytes; w is a 32-bit int, and the dword add zero-extends into wq on x86-64
+    add     srcq, wq                ; point src and dst at the end of the row ...
+    add     dstq, wq
+    neg     wq                      ; ... and index them with a negative offset counting up to 0
+%%.loop:
+%if %2
+    mova    m1, [srcq+wq]
+%else
+    movu    m1, [srcq+wq]
+%endif
+    mova    m2, m1
+    pslld   m1, 16                  ; move each even word into the odd word above it
+    paddw   m1, m2                  ; prefix sums within every pair of words
+    mova    m2, m1
+
+    pshufb  m1, m3                  ; copy the first pair's total onto the second pair of each group of four, zero elsewhere
+    paddw   m1, m2                  ; prefix sums within every group of four words
+    pshufb  m0, m5                  ; broadcast the highest word of the previous result (or the initial left)
+%if mmsize == 16
+    mova    m2, m1
+    pshufb  m1, m4                  ; copy the low group's total onto the four high words, zero elsewhere
+    paddw   m1, m2                  ; prefix sums across all eight words
+%endif
+    paddw   m0, m1                  ; add the carried-in value to every word
+    pand    m0, m7
+%if %1
+    mova    [dstq+wq], m0
+%else
+    movq    [dstq+wq], m0           ; unaligned dst: store as two 8-byte halves
+    movhps  [dstq+wq+8], m0
+%endif
+    add     wq, mmsize
+    jl %%.loop
+    mov     eax, mmsize-1           ; wd now holds the overshoot past the end of the row (0..mmsize-2)
+    sub     eax, wd                 ; byte index of the high byte of the last valid word in m0
+    mov     wd, eax
+    shl     wd, 8
+    lea     eax, [wd+eax-1]         ; shuffle control: byte 0 = index-1, byte 1 = index, bytes 2-3 = 0
+    movd    m1, eax
+    pshufb  m0, m1                  ; bring the last valid word down into word 0
+    movd    eax, m0
+    RET
+%endmacro
+
+; int add_hfyu_left_prediction_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
+INIT_MMX ssse3
+cglobal add_hfyu_left_prediction_int16, 4,4,8, dst, src, mask, w, left
+.skip_prologue:                     ; NOTE(review): label is not referenced anywhere in this patch
+    mova    m5, [pb_67]             ; broadcast mask for the highest word of an 8-byte register
+    mova    m3, [pb_zzzz2323zzzzabab] ; only the first 8 bytes of the table are loaded here
+    movd    m0, leftm
+    psllq   m0, 48                  ; place left in the highest word, where the loop's first pshufb picks it up
+    movd    m7, maskm
+    SPLATW  m7 ,m7                  ; replicate mask into every word
+    ADD_HFYU_LEFT_LOOP_INT16 1, 1   ; macro ends in RET, so nothing falls through
+
+INIT_XMM sse4
+cglobal add_hfyu_left_prediction_int16, 4,4,8, dst, src, mask, w, left
+    mova    m5, [pb_ef]             ; broadcast mask for the highest word of a 16-byte register
+    mova    m4, [pb_zzzzzzzz67676767]
+    mova    m3, [pb_zzzz2323zzzzabab]
+    movd    m0, leftm
+    pslldq  m0, 14                  ; place left in the highest word, where the loop's first pshufb picks it up
+    movd    m7, maskm
+    SPLATW  m7 ,m7                  ; replicate mask into every word
+    test    srcq, 15                ; pick a loop variant from the 16-byte alignment of src and dst
+    jnz .src_unaligned
+    test    dstq, 15
+    jnz .dst_unaligned
+    ADD_HFYU_LEFT_LOOP_INT16 1, 1   ; both aligned; each expansion ends in RET, so the variants do not fall through
+.dst_unaligned:
+    ADD_HFYU_LEFT_LOOP_INT16 0, 1   ; src aligned, dst not
+.src_unaligned:
+    ADD_HFYU_LEFT_LOOP_INT16 0, 0   ; src unaligned: dst is treated as unaligned too, without being tested
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index bc51d51cc1..88424ba1f9 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -23,6 +23,8 @@
 
 void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
 void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+int ff_add_hfyu_left_prediction_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int acc); /* acc is named "left" in the asm */
+int ff_add_hfyu_left_prediction_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int acc); /* acc is named "left" in the asm */
 
 void ff_llviddsp_init_x86(LLVidDSPContext *c)
 {
@@ -35,4 +37,12 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->add_int16 = ff_add_int16_sse2;
     }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->add_hfyu_left_prediction_int16 = ff_add_hfyu_left_prediction_int16_ssse3;
+    }
+
+    if (EXTERNAL_SSE4(cpu_flags)) { /* checked after SSSE3, so this assignment wins when both flags are set */
+        c->add_hfyu_left_prediction_int16 = ff_add_hfyu_left_prediction_int16_sse4;
+    }
 }