lavu/x86/lls: add fma3 optimizations for update_lls

This improves accuracy (very slightly) and speed on processors that
support FMA3.
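
For the curious: the accuracy gain comes from FMA's single rounding.
A separate multiply and add round twice, while fmaddpd computes
a*b + c with one rounding of the exact result. A minimal C sketch of
the effect (illustration only, not part of the change; it uses the
C99 fma() from <math.h> and assumes the compiler does not contract
a*b + c on its own, e.g. gcc -ffp-contract=off):

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* a*b = 1 + 2^-29 + 2^-60 needs 61 significand bits, so a plain
     * multiply rounds it to 1 + 2^-29 before the add can cancel it. */
    double a = 1.0 + 0x1p-30;
    double b = 1.0 + 0x1p-30;
    double c = -(1.0 + 0x1p-29);

    double separate = a * b + c;    /* two roundings: prints 0x0p+0  */
    double fused    = fma(a, b, c); /* one rounding:  prints 0x1p-60 */

    printf("separate: %a\nfused:    %a\n", separate, fused);
    return 0;
}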

Sample benchmark (fate flac-16-lpc-cholesky, Haswell):
old:
5993610 decicycles in ff_lpc_calc_coefs,      64 runs,      0 skips
5951528 decicycles in ff_lpc_calc_coefs,     128 runs,      0 skips

new:
5252410 decicycles in ff_lpc_calc_coefs,      64 runs,      0 skips
5232869 decicycles in ff_lpc_calc_coefs,     128 runs,      0 skips
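
(The decicycle figures above are the output of FFmpeg's STOP_TIMER
macro from libavutil/timer.h; a rough sketch of that instrumentation
follows. The placement inside ff_lpc_calc_coefs is an assumption for
illustration, and the fragment only builds inside the FFmpeg tree:)

#include "libavutil/timer.h"

/* ... inside the function being measured ... */
{
    START_TIMER
    /* code under test */
    STOP_TIMER("ff_lpc_calc_coefs")
    /* prints: "<N> decicycles in ff_lpc_calc_coefs, <runs> runs, <skips> skips" */
}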

Tested with FATE, both with and without --disable-fma3; also examined
the contents of lavu/lls-test.

Reviewed-by: James Almer <jamrial@gmail.com>
Reviewed-by: Henrik Gramner <henrik@gramner.com>
Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com>
Author: Ganesh Ajjanagadde <gajjanagadde@gmail.com>
Date:   2016-01-13 17:59:26 -05:00
Commit: 5989add4ab (parent: d4ce63a1bf)
 2 files changed, 61 insertions(+), 2 deletions(-)

--- a/libavutil/x86/lls.asm
+++ b/libavutil/x86/lls.asm
@@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
 .ret:
     REP_RET
 
-%if HAVE_AVX_EXTERNAL
-INIT_YMM avx
+%macro UPDATE_LLS 0
 cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
 %define covarq ctxq
     mov     countd, [ctxq + LLSModel.indep_count]
@@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
     vbroadcastsd ymm6, [varq + iq*8 + 16]
     vbroadcastsd ymm7, [varq + iq*8 + 24]
     vextractf128 xmm3, ymm1, 1
+%if cpuflag(fma3)
+    mova         ymm0, COVAR(iq  ,0)
+    mova         xmm2, COVAR(iq+2,2)
+    fmaddpd      ymm0, ymm1, ymm4, ymm0
+    fmaddpd      xmm2, xmm3, xmm6, xmm2
+    fmaddpd      ymm1, ymm5, ymm1, COVAR(iq  ,1)
+    fmaddpd      xmm3, xmm7, xmm3, COVAR(iq+2,3)
+    mova         COVAR(iq  ,0), ymm0
+    mova         COVAR(iq  ,1), ymm1
+    mova         COVAR(iq+2,2), xmm2
+    mova         COVAR(iq+2,3), xmm3
+%else
     vmulpd       ymm0, ymm1, ymm4
     vmulpd       ymm1, ymm1, ymm5
     vmulpd       xmm2, xmm3, xmm6
@@ -148,12 +159,26 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
     ADDPD_MEM COVAR(iq  ,1), ymm1
     ADDPD_MEM COVAR(iq+2,2), xmm2
     ADDPD_MEM COVAR(iq+2,3), xmm3
+%endif ; cpuflag(fma3)
     lea          jd, [iq + 4]
     cmp          jd, count2d
     jg .skip4x4
 .loop4x4:
     ; Compute all 16 pairwise products of a 4x4 block
     mova         ymm3, [varq + jq*8]
+%if cpuflag(fma3)
+    mova         ymm0, COVAR(jq, 0)
+    mova         ymm1, COVAR(jq, 1)
+    mova         ymm2, COVAR(jq, 2)
+    fmaddpd      ymm0, ymm3, ymm4, ymm0
+    fmaddpd      ymm1, ymm3, ymm5, ymm1
+    fmaddpd      ymm2, ymm3, ymm6, ymm2
+    fmaddpd      ymm3, ymm7, ymm3, COVAR(jq,3)
+    mova         COVAR(jq, 0), ymm0
+    mova         COVAR(jq, 1), ymm1
+    mova         COVAR(jq, 2), ymm2
+    mova         COVAR(jq, 3), ymm3
+%else
     vmulpd       ymm0, ymm3, ymm4
     vmulpd       ymm1, ymm3, ymm5
     vmulpd       ymm2, ymm3, ymm6
@@ -162,6 +187,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
     ADDPD_MEM COVAR(jq,1), ymm1
     ADDPD_MEM COVAR(jq,2), ymm2
     ADDPD_MEM COVAR(jq,3), ymm3
+%endif ; cpuflag(fma3)
     add          jd, 4
     cmp          jd, count2d
     jle .loop4x4
@@ -169,6 +195,19 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
     cmp          jd, countd
     jg .skip2x4
     mova         xmm3, [varq + jq*8]
+%if cpuflag(fma3)
+    mova         xmm0, COVAR(jq, 0)
+    mova         xmm1, COVAR(jq, 1)
+    mova         xmm2, COVAR(jq, 2)
+    fmaddpd      xmm0, xmm3, xmm4, xmm0
+    fmaddpd      xmm1, xmm3, xmm5, xmm1
+    fmaddpd      xmm2, xmm3, xmm6, xmm2
+    fmaddpd      xmm3, xmm7, xmm3, COVAR(jq,3)
+    mova         COVAR(jq, 0), xmm0
+    mova         COVAR(jq, 1), xmm1
+    mova         COVAR(jq, 2), xmm2
+    mova         COVAR(jq, 3), xmm3
+%else
     vmulpd       xmm0, xmm3, xmm4
     vmulpd       xmm1, xmm3, xmm5
     vmulpd       xmm2, xmm3, xmm6
@@ -177,6 +216,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
     ADDPD_MEM COVAR(jq,1), xmm1
     ADDPD_MEM COVAR(jq,2), xmm2
     ADDPD_MEM COVAR(jq,3), xmm3
+%endif ; cpuflag(fma3)
 .skip2x4:
     add          id, 4
     add          covarq, 4*COVAR_STRIDE
@@ -187,14 +227,29 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
     mov          jd, id
 .loop2x1:
     vmovddup     xmm0, [varq + iq*8]
+%if cpuflag(fma3)
+    mova         xmm1, [varq + jq*8]
+    fmaddpd      xmm0, xmm1, xmm0, COVAR(jq,0)
+    mova         COVAR(jq,0), xmm0
+%else
     vmulpd       xmm0, [varq + jq*8]
     ADDPD_MEM COVAR(jq,0), xmm0
+%endif ; cpuflag(fma3)
     inc          id
     add          covarq, COVAR_STRIDE
     cmp          id, countd
     jle .loop2x1
 .ret:
     REP_RET
+%endmacro ; UPDATE_LLS
+
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+UPDATE_LLS
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+UPDATE_LLS
 %endif
 
 INIT_XMM sse2
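
For reference, every SIMD variant above performs the same update as
the scalar update_lls() in libavutil/lls.c: it accumulates the outer
product of var into the upper triangle of the covariance matrix. A
simplified standalone sketch (the struct is a stand-in for the
relevant LLSModel fields in lavu/lls.h, with an illustrative size):

/* Stand-in for the relevant LLSModel fields (see libavutil/lls.h). */
typedef struct {
    double covariance[36][36]; /* illustrative size */
    int indep_count;
} Model;

/* Accumulate var * var^T into the upper triangle of the covariance
 * accumulator; the FMA3 path above fuses each multiply-add so each
 * element is rounded once instead of twice. */
static void update_lls_scalar(Model *m, const double *var)
{
    for (int i = 0; i <= m->indep_count; i++)
        for (int j = i; j <= m->indep_count; j++)
            m->covariance[i][j] += var[i] * var[j];
}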

--- a/libavutil/x86/lls_init.c
+++ b/libavutil/x86/lls_init.c
@@ -25,6 +25,7 @@
 void ff_update_lls_sse2(LLSModel *m, const double *var);
 void ff_update_lls_avx(LLSModel *m, const double *var);
+void ff_update_lls_fma3(LLSModel *m, const double *var);
 double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
 
 av_cold void ff_init_lls_x86(LLSModel *m)
 {
@@ -38,4 +39,7 @@ av_cold void ff_init_lls_x86(LLSModel *m)
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         m->update_lls = ff_update_lls_avx;
     }
+    if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
+        m->update_lls = ff_update_lls_fma3;
+    }
 }