lavu/x86/lls: add fma3 optimizations for update_lls

This improves speed and, very slightly, accuracy (a fused multiply-add
rounds once instead of twice) on processors that support FMA3.

Sample benchmark (fate flac-16-lpc-cholesky, Haswell):
old:
5993610 decicycles in ff_lpc_calc_coefs,      64 runs,      0 skips
5951528 decicycles in ff_lpc_calc_coefs,     128 runs,      0 skips

new:
5252410 decicycles in ff_lpc_calc_coefs,      64 runs,      0 skips
5232869 decicycles in ff_lpc_calc_coefs,     128 runs,      0 skips

Tested with FATE and --disable-fma3, also examined contents of
lavu/lls-test.

Reviewed-by: James Almer <jamrial@gmail.com>
Reviewed-by: Henrik Gramner <henrik@gramner.com>
Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com>
This commit is contained in:
Ganesh Ajjanagadde 2016-01-13 17:59:26 -05:00
parent d4ce63a1bf
commit 5989add4ab
2 changed files with 61 additions and 2 deletions

View File

@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
.ret:
REP_RET
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
; UPDATE_LLS: emits the YMM version of update_lls.
; The macro takes no arguments. The instruction set is whatever INIT_YMM was
; in effect at the point of expansion; cpuflag(fma3) then selects, at assembly
; time, between the fused multiply-add sequences and the separate
; multiply + ADDPD_MEM sequences.
; NOTE(review): this is a diff view, so lines between hunk headers ("@ ...")
; are not shown; comments below describe only the visible instructions.
%macro UPDATE_LLS 0
; NOTE(review): the C prototypes declare only (ctx, var); count is a named
; register that is filled from the context just below, not a caller argument.
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
; covarq is another name for the ctxq register; it is advanced by
; COVAR_STRIDE multiples further down (presumably the base used by COVAR()).
%define covarq ctxq
; count = ctx->indep_count
mov countd, [ctxq + LLSModel.indep_count]
@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
; Broadcast var[i+2] and var[i+3] (8-byte doubles) to every lane of ymm6/ymm7.
; ymm4/ymm5 and ymm1 are set up in lines not shown here - presumably
; var[i], var[i+1] broadcasts and the var[i..i+3] vector; TODO confirm.
vbroadcastsd ymm6, [varq + iq*8 + 16]
vbroadcastsd ymm7, [varq + iq*8 + 24]
; xmm3 = upper 128 bits of ymm1
vextractf128 xmm3, ymm1, 1
%if cpuflag(fma3)
; FMA path. fmaddpd is used in its 4-operand form (dst, a, b, c), which as
; provided by x86inc computes dst = a*b + c with dst equal to one source.
; ymm0/xmm2 are preloaded from memory so they act as addend and destination.
mova ymm0, COVAR(iq ,0)
mova xmm2, COVAR(iq+2,2)
fmaddpd ymm0, ymm1, ymm4, ymm0
fmaddpd xmm2, xmm3, xmm6, xmm2
; ymm1/xmm3 are no longer needed as inputs after this point, so they take the
; result directly and the addend is read straight from memory.
fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1)
fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
; Write the four accumulated results back.
mova COVAR(iq ,0), ymm0
mova COVAR(iq ,1), ymm1
mova COVAR(iq+2,2), xmm2
mova COVAR(iq+2,3), xmm3
%else
; Non-FMA path: form the products first, then accumulate with ADDPD_MEM
; (defined elsewhere; presumably adds the register into the memory operand).
vmulpd ymm0, ymm1, ymm4
vmulpd ymm1, ymm1, ymm5
vmulpd xmm2, xmm3, xmm6
@ -148,12 +159,26 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
ADDPD_MEM COVAR(iq ,1), ymm1
ADDPD_MEM COVAR(iq+2,2), xmm2
ADDPD_MEM COVAR(iq+2,3), xmm3
%endif ; cpuflag(fma3)
; j = i + 4; skip the 4-wide loop entirely when j > count2 (signed compare).
lea jd, [iq + 4]
cmp jd, count2d
jg .skip4x4
.loop4x4:
; Compute all 16 pairwise products of a 4x4 block
; ymm3 = var[j..j+3]
mova ymm3, [varq + jq*8]
%if cpuflag(fma3)
; Load three of the four accumulators, fuse multiply and add, store back.
; The fourth uses ymm3 itself as destination, so it must come last.
mova ymm0, COVAR(jq, 0)
mova ymm1, COVAR(jq, 1)
mova ymm2, COVAR(jq, 2)
fmaddpd ymm0, ymm3, ymm4, ymm0
fmaddpd ymm1, ymm3, ymm5, ymm1
fmaddpd ymm2, ymm3, ymm6, ymm2
fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
mova COVAR(jq, 0), ymm0
mova COVAR(jq, 1), ymm1
mova COVAR(jq, 2), ymm2
mova COVAR(jq, 3), ymm3
%else
vmulpd ymm0, ymm3, ymm4
vmulpd ymm1, ymm3, ymm5
vmulpd ymm2, ymm3, ymm6
@ -162,6 +187,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
ADDPD_MEM COVAR(jq,1), ymm1
ADDPD_MEM COVAR(jq,2), ymm2
ADDPD_MEM COVAR(jq,3), ymm3
%endif ; cpuflag(fma3)
; j += 4; keep looping while j <= count2.
add jd, 4
cmp jd, count2d
jle .loop4x4
@ -169,6 +195,19 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
; 2-wide tail: only taken when j <= count.
cmp jd, countd
jg .skip2x4
; xmm3 = var[j], var[j+1]
mova xmm3, [varq + jq*8]
%if cpuflag(fma3)
; Same scheme as the 4x4 loop, on 128-bit registers.
mova xmm0, COVAR(jq, 0)
mova xmm1, COVAR(jq, 1)
mova xmm2, COVAR(jq, 2)
fmaddpd xmm0, xmm3, xmm4, xmm0
fmaddpd xmm1, xmm3, xmm5, xmm1
fmaddpd xmm2, xmm3, xmm6, xmm2
fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
mova COVAR(jq, 0), xmm0
mova COVAR(jq, 1), xmm1
mova COVAR(jq, 2), xmm2
mova COVAR(jq, 3), xmm3
%else
vmulpd xmm0, xmm3, xmm4
vmulpd xmm1, xmm3, xmm5
vmulpd xmm2, xmm3, xmm6
@ -177,6 +216,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
ADDPD_MEM COVAR(jq,1), xmm1
ADDPD_MEM COVAR(jq,2), xmm2
ADDPD_MEM COVAR(jq,3), xmm3
%endif ; cpuflag(fma3)
.skip2x4:
; Move on to the next group of four: i += 4, covar pointer forward 4 strides.
add id, 4
add covarq, 4*COVAR_STRIDE
@ -187,14 +227,29 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
; j = i before entering the one-at-a-time loop.
mov jd, id
.loop2x1:
; xmm0 = var[i] duplicated into both lanes
vmovddup xmm0, [varq + iq*8]
%if cpuflag(fma3)
; xmm1 = var[j], var[j+1]; xmm0 = xmm1*xmm0 + COVAR(jq,0), then store back.
mova xmm1, [varq + jq*8]
fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
mova COVAR(jq,0), xmm0
%else
vmulpd xmm0, [varq + jq*8]
ADDPD_MEM COVAR(jq,0), xmm0
%endif ; cpuflag(fma3)
; i += 1, covar pointer forward one stride; repeat while i <= count.
inc id
add covarq, COVAR_STRIDE
cmp id, countd
jle .loop2x1
.ret:
REP_RET
%endmacro ; UPDATE_LLS
; Expand UPDATE_LLS once per instruction set, each guarded by its HAVE_*
; symbol. INIT_YMM sets the cpuflags seen by the %if cpuflag(fma3) tests
; inside the macro, so the avx expansion assembles the multiply + ADDPD_MEM
; branches and the fma3 expansion assembles the fused multiply-add branches.
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
UPDATE_LLS
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
UPDATE_LLS
%endif
INIT_XMM sse2

View File

@ -25,6 +25,7 @@
/* Per-instruction-set variants of update_lls; only declared here. */
void ff_update_lls_sse2(LLSModel *m, const double *var);
void ff_update_lls_avx(LLSModel *m, const double *var);
void ff_update_lls_fma3(LLSModel *m, const double *var);
double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
/* Installs x86 implementations into the model's function pointers based on
 * cpu_flags. NOTE(review): the start of the body is not visible in this
 * diff view; only the AVX and FMA3 selections are shown. */
av_cold void ff_init_lls_x86(LLSModel *m)
@ -38,4 +39,7 @@ av_cold void ff_init_lls_x86(LLSModel *m)
if (EXTERNAL_AVX_FAST(cpu_flags)) {
m->update_lls = ff_update_lls_avx;
}
/* Checked after AVX, so when both conditions hold the FMA3 version is the
 * one left installed. FMA3 is not selected on CPUs flagged AVXSLOW. */
if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
m->update_lls = ff_update_lls_fma3;
}
}