avcodec/x86/lossless_videodsp : add avx2 version for add_left_pred

This commit is contained in:
Martin Vignali 2017-12-02 19:09:58 +01:00
parent cfbcea1cca
commit 4353c35067
2 changed files with 44 additions and 22 deletions

View File

@ -114,40 +114,54 @@ MEDIAN_PRED
add dstq, wq
neg wq
%%.loop:
pshufb xm0, xm5
%if %2
mova m1, [srcq+wq]
%else
movu m1, [srcq+wq]
%endif
mova m2, m1
psllw m1, 8
psllw m2, m1, 8
paddb m1, m2
mova m2, m1
pshufb m1, m3
pshufb m2, m1, m3
paddb m1, m2
pshufb m0, m5
mova m2, m1
pshufb m1, m4
pshufb m2, m1, m4
paddb m1, m2
%if mmsize == 16
mova m2, m1
pshufb m1, m6
%if mmsize >= 16
pshufb m2, m1, m6
paddb m1, m2
%endif
paddb m0, m1
paddb xm0, xm1
%if %1
mova [dstq+wq], m0
mova [dstq+wq], xm0
%else
movq [dstq+wq], m0
movhps [dstq+wq+8], m0
movq [dstq+wq], xm0
movhps [dstq+wq+8], xm0
%endif
%if mmsize == 32
vextracti128 xm2, m1, 1 ; get second lane of the ymm
pshufb xm0, xm5 ; set alls val to last val of the first lane
paddb xm0, xm2
;store val
%if %1
mova [dstq+wq+16], xm0
%else;
movq [dstq+wq+16], xm0
movhps [dstq+wq+16+8], xm0
%endif
%endif
add wq, mmsize
jl %%.loop
%if mmsize == 32
mov eax, [dstq -1]
and eax, 0xff
%else;
mov eax, mmsize-1
sub eax, wd
movd m1, eax
pshufb m0, m1
movd eax, m0
%endif
RET
%endmacro
@ -166,15 +180,15 @@ cglobal add_left_pred, 3,3,7, dst, src, w, left
%macro ADD_LEFT_PRED_UNALIGNED 0
cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
mova m5, [pb_15]
mova m6, [pb_zzzzzzzz77777777]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
pslldq m0, 15
test srcq, 15
mova xm5, [pb_15]
VBROADCASTI128 m6, [pb_zzzzzzzz77777777]
VBROADCASTI128 m4, [pb_zzzz3333zzzzbbbb]
VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
movd xm0, leftm
pslldq xm0, 15
test srcq, mmsize - 1
jnz .src_unaligned
test dstq, 15
test dstq, mmsize - 1
jnz .dst_unaligned
ADD_LEFT_LOOP 1, 1
.dst_unaligned:
@ -186,6 +200,11 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
INIT_XMM ssse3
ADD_LEFT_PRED_UNALIGNED
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_LEFT_PRED_UNALIGNED
%endif
;------------------------------------------------------------------------------
; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
;------------------------------------------------------------------------------

View File

@ -38,6 +38,8 @@ int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
@ -118,5 +120,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->add_bytes = ff_add_bytes_avx2;
c->add_left_pred = ff_add_left_pred_unaligned_avx2;
}
}