avcodec/x86/lossless_videodsp: Port lorens add_hfyu_left_prediction_ssse3/sse4 to 16bit

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Michael Niedermayer 2014-01-21 02:53:43 +01:00
parent 63d2be7533
commit 83b67ca056
2 changed files with 94 additions and 0 deletions

View File

@ -1,5 +1,6 @@
;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;*
;* This file is part of FFmpeg.
@ -21,6 +22,13 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pb_ef: times 8 db 14,15
pb_67: times 8 db 6, 7
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
SECTION_TEXT
%macro ADD_INT16_LOOP 1 ; %1 = is_aligned
@ -84,3 +92,79 @@ cglobal add_int16, 4,4,5, dst, src, mask, w
ADD_INT16_LOOP 1
.unaligned:
ADD_INT16_LOOP 0
%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
add wq, wq
add srcq, wq
add dstq, wq
neg wq
%%.loop:
%if %2
mova m1, [srcq+wq]
%else
movu m1, [srcq+wq]
%endif
mova m2, m1
pslld m1, 16
paddw m1, m2
mova m2, m1
pshufb m1, m3
paddw m1, m2
pshufb m0, m5
%if mmsize == 16
mova m2, m1
pshufb m1, m4
paddw m1, m2
%endif
paddw m0, m1
pand m0, m7
%if %1
mova [dstq+wq], m0
%else
movq [dstq+wq], m0
movhps [dstq+wq+8], m0
%endif
add wq, mmsize
jl %%.loop
mov eax, mmsize-1
sub eax, wd
mov wd, eax
shl wd, 8
lea eax, [wd+eax-1]
movd m1, eax
pshufb m0, m1
movd eax, m0
RET
%endmacro
; int add_hfyu_left_prediction_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
INIT_MMX ssse3
cglobal add_hfyu_left_prediction_int16, 4,4,8, dst, src, mask, w, left
.skip_prologue:
mova m5, [pb_67]
mova m3, [pb_zzzz2323zzzzabab]
movd m0, leftm
psllq m0, 48
movd m7, maskm
SPLATW m7 ,m7
ADD_HFYU_LEFT_LOOP_INT16 1, 1
INIT_XMM sse4
cglobal add_hfyu_left_prediction_int16, 4,4,8, dst, src, mask, w, left
mova m5, [pb_ef]
mova m4, [pb_zzzzzzzz67676767]
mova m3, [pb_zzzz2323zzzzabab]
movd m0, leftm
pslldq m0, 14
movd m7, maskm
SPLATW m7 ,m7
test srcq, 15
jnz .src_unaligned
test dstq, 15
jnz .dst_unaligned
ADD_HFYU_LEFT_LOOP_INT16 1, 1
.dst_unaligned:
ADD_HFYU_LEFT_LOOP_INT16 0, 1
.src_unaligned:
ADD_HFYU_LEFT_LOOP_INT16 0, 0

View File

@ -23,6 +23,8 @@
void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
int ff_add_hfyu_left_prediction_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int acc);
int ff_add_hfyu_left_prediction_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int acc);
void ff_llviddsp_init_x86(LLVidDSPContext *c)
{
@ -35,4 +37,12 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
if (EXTERNAL_SSE2(cpu_flags)) {
c->add_int16 = ff_add_int16_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->add_hfyu_left_prediction_int16 = ff_add_hfyu_left_prediction_int16_ssse3;
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->add_hfyu_left_prediction_int16 = ff_add_hfyu_left_prediction_int16_sse4;
}
}