mirror of https://git.ffmpeg.org/ffmpeg.git
libavcodec/vp9: ipred_dl_32x32_16 avx2 implementation
vp9_diag_downleft_32x32_8bpp_c: 580.2
vp9_diag_downleft_32x32_8bpp_sse2: 75.6
vp9_diag_downleft_32x32_8bpp_ssse3: 73.7
vp9_diag_downleft_32x32_8bpp_avx: 72.7
vp9_diag_downleft_32x32_10bpp_c: 1101.2
vp9_diag_downleft_32x32_10bpp_sse2: 145.4
vp9_diag_downleft_32x32_10bpp_ssse3: 137.5
vp9_diag_downleft_32x32_10bpp_avx: 134.8
vp9_diag_downleft_32x32_10bpp_avx2: 94.0
vp9_diag_downleft_32x32_12bpp_c: 1108.5
vp9_diag_downleft_32x32_12bpp_sse2: 145.5
vp9_diag_downleft_32x32_12bpp_ssse3: 137.3
vp9_diag_downleft_32x32_12bpp_avx: 135.2
vp9_diag_downleft_32x32_12bpp_avx2: 94.0

~30% faster than avx implementation

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
parent
dc70ea8c19
commit
73d9a9a6af
|
@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2);
|
||||||
decl_ipred_fns(dc_top, 16, mmxext, sse2);
|
decl_ipred_fns(dc_top, 16, mmxext, sse2);
|
||||||
decl_ipred_fns(dc_left, 16, mmxext, sse2);
|
decl_ipred_fns(dc_left, 16, mmxext, sse2);
|
||||||
decl_ipred_fn(dl, 16, 16, avx2);
|
decl_ipred_fn(dl, 16, 16, avx2);
|
||||||
|
decl_ipred_fn(dl, 32, 16, avx2);
|
||||||
|
|
||||||
#define decl_ipred_dir_funcs(type) \
|
#define decl_ipred_dir_funcs(type) \
|
||||||
decl_ipred_fns(type, 16, sse2, sse2); \
|
decl_ipred_fns(type, 16, sse2, sse2); \
|
||||||
|
@ -135,6 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
|
||||||
init_fpel_func(1, 1, 64, avg, _16, avx2);
|
init_fpel_func(1, 1, 64, avg, _16, avx2);
|
||||||
init_fpel_func(0, 1, 128, avg, _16, avx2);
|
init_fpel_func(0, 1, 128, avg, _16, avx2);
|
||||||
init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
|
init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
|
||||||
|
init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* HAVE_YASM */
|
#endif /* HAVE_YASM */
|
||||||
|
|
|
@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
|
||||||
DEFINE_ARGS dst, stride, stride3, cnt
|
DEFINE_ARGS dst, stride, stride3, cnt
|
||||||
mov cntd, 2
|
mov cntd, 2
|
||||||
lea stride3q, [strideq*3]
|
lea stride3q, [strideq*3]
|
||||||
|
|
||||||
.loop:
|
.loop:
|
||||||
mova [dstq+strideq*0], m0
|
mova [dstq+strideq*0], m0
|
||||||
vpalignr m3, m2, m0, 2
|
vpalignr m3, m2, m0, 2
|
||||||
|
@ -884,6 +885,68 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
|
||||||
dec cntd
|
dec cntd
|
||||||
jg .loop
|
jg .loop
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------
; vp9_ipred_dl_32x32_16(dst, stride, l, a)
;
; Diagonal down-left intra prediction for a 32x32 block of 16-bit samples.
;
; In:   dst    = destination block; each row is written as two full-register
;                stores at byte offsets +0 and +32
;       stride = distance between destination rows (scaled as bytes here)
;       l      = declared but never read by this function
;       a      = source row; two full-register loads are read from it
; Out:  32 rows (4 loop iterations x 8 rows). Row 0 is the filtered source
;       row; every following row is the previous one advanced by one sample
;       (2 bytes), with the tail filled by the replicated last sample.
;
; Legend for the letter diagrams: lowercase/digits = raw input samples,
; uppercase = LOWPASS output, '5' = the last input sample used as padding.
;
; NOTE(review): LOWPASS is defined outside this view - presumably the 3-tap
; (left + 2*center + right + 2) >> 2 smoothing filter; confirm against the
; macro. The mova loads/stores assume suitably aligned a/dst - confirm with
; the callers.
; NOTE(review): the "2, 6, 7" counts follow the x86inc cglobal convention
; (args loaded, GPRs, vector regs); only 4 GPR names and m0-m5 are
; referenced below - TODO confirm whether the larger counts are needed.
;-----------------------------------------------------------------------
cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
|
||||||
|
movifnidn aq, amp ; get the 'a' pointer into a register if it is not already
|
||||||
|
mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop
|
||||||
|
mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345
|
||||||
|
vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555
|
||||||
|
vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx
|
||||||
|
vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq
|
||||||
|
vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr
|
||||||
|
LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ
|
||||||
|
vperm2i128 m5, m1, m4, q0201 ; yz01234555555555
|
||||||
|
vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455
|
||||||
|
vpalignr m3, m5, m1, 4 ; stuvwxyz01234555
|
||||||
|
LOWPASS 1, 2, 3 ; RSTUVWXYZ......5
|
||||||
|
vperm2i128 m2, m1, m4, q0201 ; Z......555555555
|
||||||
|
vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY
|
||||||
|
; l and a are no longer needed: rename the registers for the store loop
DEFINE_ARGS dst, stride, stride3, cnt
|
||||||
|
lea stride3q, [strideq*3]
|
||||||
|
mov cntd, 4 ; 4 iterations, 8 rows stored per iteration
|
||||||
|
|
||||||
|
; Loop invariant at entry:
;   m0:m1 = the 32 samples of the next row to store
;   m5    = m0:m1 advanced by one 128-bit lane (8 samples)
;   m2    = m1 advanced by one 128-bit lane, tail filled with padding
; AVX2 vpalignr works on each 128-bit lane separately, so pairing m0 with m5
; (and m1 with m2) makes a per-lane byte shift act as a shift of the whole
; row. The immediates 2,4,...,14 advance the row by 1..7 samples.
.loop:
|
||||||
|
mova [dstq+strideq*0 + 0], m0
|
||||||
|
mova [dstq+strideq*0 +32], m1
|
||||||
|
vpalignr m3, m5, m0, 2 ; row advanced by 1 sample
|
||||||
|
vpalignr m4, m2, m1, 2
|
||||||
|
mova [dstq+strideq*1 + 0], m3
|
||||||
|
mova [dstq+strideq*1 +32], m4
|
||||||
|
vpalignr m3, m5, m0, 4 ; row advanced by 2 samples
|
||||||
|
vpalignr m4, m2, m1, 4
|
||||||
|
mova [dstq+strideq*2 + 0], m3
|
||||||
|
mova [dstq+strideq*2 +32], m4
|
||||||
|
vpalignr m3, m5, m0, 6 ; row advanced by 3 samples
|
||||||
|
vpalignr m4, m2, m1, 6
|
||||||
|
mova [dstq+stride3q*1+ 0], m3
|
||||||
|
mova [dstq+stride3q*1+32], m4
|
||||||
|
lea dstq, [dstq+strideq*4] ; move on to the second group of 4 rows
|
||||||
|
vpalignr m3, m5, m0, 8 ; row advanced by 4 samples
|
||||||
|
vpalignr m4, m2, m1, 8
|
||||||
|
mova [dstq+strideq*0 + 0], m3
|
||||||
|
mova [dstq+strideq*0 +32], m4
|
||||||
|
vpalignr m3, m5, m0, 10 ; row advanced by 5 samples
|
||||||
|
vpalignr m4, m2, m1, 10
|
||||||
|
mova [dstq+strideq*1 + 0], m3
|
||||||
|
mova [dstq+strideq*1 +32], m4
|
||||||
|
vpalignr m3, m5, m0, 12 ; row advanced by 6 samples
|
||||||
|
vpalignr m4, m2, m1, 12
|
||||||
|
mova [dstq+strideq*2+ 0], m3
|
||||||
|
mova [dstq+strideq*2+32], m4
|
||||||
|
vpalignr m3, m5, m0, 14 ; row advanced by 7 samples
|
||||||
|
vpalignr m4, m2, m1, 14
|
||||||
|
mova [dstq+stride3q+ 0], m3
|
||||||
|
mova [dstq+stride3q+ 32], m4
|
||||||
|
vpalignr m3, m5, m0, 16 ; row advanced by 8 samples = first row of next iteration
|
||||||
|
vpalignr m4, m2, m1, 16
|
||||||
|
vperm2i128 m5, m3, m4, q0201 ; rebuild the lane-advanced copy of the new m0:m1
|
||||||
|
vperm2i128 m2, m4, m4, q0101 ; NOTE(review): presumably m4's high lane in both lanes (new tail/padding) - confirm q0101
|
||||||
|
mova m0, m3 ; restore the loop invariant for the next 8 rows
|
||||||
|
mova m1, m4
|
||||||
|
lea dstq, [dstq+strideq*4]
|
||||||
|
dec cntd
|
||||||
|
jg .loop ; repeat while cnt > 0
|
||||||
|
RET
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
|
%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
|
||||||
|
|
Loading…
Reference in New Issue