mirror of https://git.ffmpeg.org/ffmpeg.git
lavu/x86/lls: add fma3 optimizations for update_lls
This improves accuracy (very slightly) and speed for processors having fma3.

Sample benchmark (fate flac-16-lpc-cholesky, Haswell):
old:
5993610 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips
5951528 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips
new:
5252410 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips
5232869 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips

Tested with FATE and --disable-fma3, also examined contents of lavu/lls-test.

Reviewed-by: James Almer <jamrial@gmail.com>
Reviewed-by: Henrik Gramner <henrik@gramner.com>
Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com>
This commit is contained in:
parent
d4ce63a1bf
commit
5989add4ab
|
@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
|
|||
.ret:
|
||||
REP_RET
|
||||
|
||||
%if HAVE_AVX_EXTERNAL
|
||||
INIT_YMM avx
|
||||
%macro UPDATE_LLS 0
|
||||
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
||||
%define covarq ctxq
|
||||
mov countd, [ctxq + LLSModel.indep_count]
|
||||
|
@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
|||
vbroadcastsd ymm6, [varq + iq*8 + 16]
|
||||
vbroadcastsd ymm7, [varq + iq*8 + 24]
|
||||
vextractf128 xmm3, ymm1, 1
|
||||
%if cpuflag(fma3)
|
||||
mova ymm0, COVAR(iq ,0)
|
||||
mova xmm2, COVAR(iq+2,2)
|
||||
fmaddpd ymm0, ymm1, ymm4, ymm0
|
||||
fmaddpd xmm2, xmm3, xmm6, xmm2
|
||||
fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1)
|
||||
fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
|
||||
mova COVAR(iq ,0), ymm0
|
||||
mova COVAR(iq ,1), ymm1
|
||||
mova COVAR(iq+2,2), xmm2
|
||||
mova COVAR(iq+2,3), xmm3
|
||||
%else
|
||||
vmulpd ymm0, ymm1, ymm4
|
||||
vmulpd ymm1, ymm1, ymm5
|
||||
vmulpd xmm2, xmm3, xmm6
|
||||
|
@ -148,12 +159,26 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
|||
ADDPD_MEM COVAR(iq ,1), ymm1
|
||||
ADDPD_MEM COVAR(iq+2,2), xmm2
|
||||
ADDPD_MEM COVAR(iq+2,3), xmm3
|
||||
%endif ; cpuflag(fma3)
|
||||
lea jd, [iq + 4]
|
||||
cmp jd, count2d
|
||||
jg .skip4x4
|
||||
.loop4x4:
|
||||
; Compute all 16 pairwise products of a 4x4 block
|
||||
mova ymm3, [varq + jq*8]
|
||||
%if cpuflag(fma3)
|
||||
mova ymm0, COVAR(jq, 0)
|
||||
mova ymm1, COVAR(jq, 1)
|
||||
mova ymm2, COVAR(jq, 2)
|
||||
fmaddpd ymm0, ymm3, ymm4, ymm0
|
||||
fmaddpd ymm1, ymm3, ymm5, ymm1
|
||||
fmaddpd ymm2, ymm3, ymm6, ymm2
|
||||
fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
|
||||
mova COVAR(jq, 0), ymm0
|
||||
mova COVAR(jq, 1), ymm1
|
||||
mova COVAR(jq, 2), ymm2
|
||||
mova COVAR(jq, 3), ymm3
|
||||
%else
|
||||
vmulpd ymm0, ymm3, ymm4
|
||||
vmulpd ymm1, ymm3, ymm5
|
||||
vmulpd ymm2, ymm3, ymm6
|
||||
|
@ -162,6 +187,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
|||
ADDPD_MEM COVAR(jq,1), ymm1
|
||||
ADDPD_MEM COVAR(jq,2), ymm2
|
||||
ADDPD_MEM COVAR(jq,3), ymm3
|
||||
%endif ; cpuflag(fma3)
|
||||
add jd, 4
|
||||
cmp jd, count2d
|
||||
jle .loop4x4
|
||||
|
@ -169,6 +195,19 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
|||
cmp jd, countd
|
||||
jg .skip2x4
|
||||
mova xmm3, [varq + jq*8]
|
||||
%if cpuflag(fma3)
|
||||
mova xmm0, COVAR(jq, 0)
|
||||
mova xmm1, COVAR(jq, 1)
|
||||
mova xmm2, COVAR(jq, 2)
|
||||
fmaddpd xmm0, xmm3, xmm4, xmm0
|
||||
fmaddpd xmm1, xmm3, xmm5, xmm1
|
||||
fmaddpd xmm2, xmm3, xmm6, xmm2
|
||||
fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
|
||||
mova COVAR(jq, 0), xmm0
|
||||
mova COVAR(jq, 1), xmm1
|
||||
mova COVAR(jq, 2), xmm2
|
||||
mova COVAR(jq, 3), xmm3
|
||||
%else
|
||||
vmulpd xmm0, xmm3, xmm4
|
||||
vmulpd xmm1, xmm3, xmm5
|
||||
vmulpd xmm2, xmm3, xmm6
|
||||
|
@ -177,6 +216,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
|||
ADDPD_MEM COVAR(jq,1), xmm1
|
||||
ADDPD_MEM COVAR(jq,2), xmm2
|
||||
ADDPD_MEM COVAR(jq,3), xmm3
|
||||
%endif ; cpuflag(fma3)
|
||||
.skip2x4:
|
||||
add id, 4
|
||||
add covarq, 4*COVAR_STRIDE
|
||||
|
@ -187,14 +227,29 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
|||
mov jd, id
|
||||
.loop2x1:
|
||||
vmovddup xmm0, [varq + iq*8]
|
||||
%if cpuflag(fma3)
|
||||
mova xmm1, [varq + jq*8]
|
||||
fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
|
||||
mova COVAR(jq,0), xmm0
|
||||
%else
|
||||
vmulpd xmm0, [varq + jq*8]
|
||||
ADDPD_MEM COVAR(jq,0), xmm0
|
||||
%endif ; cpuflag(fma3)
|
||||
inc id
|
||||
add covarq, COVAR_STRIDE
|
||||
cmp id, countd
|
||||
jle .loop2x1
|
||||
.ret:
|
||||
REP_RET
|
||||
%endmacro ; UPDATE_LLS
|
||||
|
||||
%if HAVE_AVX_EXTERNAL
|
||||
INIT_YMM avx
|
||||
UPDATE_LLS
|
||||
%endif
|
||||
%if HAVE_FMA3_EXTERNAL
|
||||
INIT_YMM fma3
|
||||
UPDATE_LLS
|
||||
%endif
|
||||
|
||||
INIT_XMM sse2
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
void ff_update_lls_sse2(LLSModel *m, const double *var);
|
||||
void ff_update_lls_avx(LLSModel *m, const double *var);
|
||||
void ff_update_lls_fma3(LLSModel *m, const double *var);
|
||||
double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
|
||||
|
||||
av_cold void ff_init_lls_x86(LLSModel *m)
|
||||
|
@ -38,4 +39,7 @@ av_cold void ff_init_lls_x86(LLSModel *m)
|
|||
if (EXTERNAL_AVX_FAST(cpu_flags)) {
|
||||
m->update_lls = ff_update_lls_avx;
|
||||
}
|
||||
if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
|
||||
m->update_lls = ff_update_lls_fma3;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue