mirror of https://git.ffmpeg.org/ffmpeg.git
lavu/x86/lls: add fma3 optimizations for update_lls
This improves accuracy (very slightly) and speed for processors having fma3.

Sample benchmark (fate flac-16-lpc-cholesky, Haswell):
old:
5993610 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips
5951528 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips
new:
5252410 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips
5232869 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips

Tested with FATE and --disable-fma3, also examined contents of lavu/lls-test.

Reviewed-by: James Almer <jamrial@gmail.com>
Reviewed-by: Henrik Gramner <henrik@gramner.com>
Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com>
This commit is contained in:
parent: d4ce63a1bf
commit: 5989add4ab
|
@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
|
||||||
.ret:
|
.ret:
|
||||||
REP_RET
|
REP_RET
|
||||||
|
|
||||||
%if HAVE_AVX_EXTERNAL
|
%macro UPDATE_LLS 0
|
||||||
INIT_YMM avx
|
|
||||||
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
||||||
%define covarq ctxq
|
%define covarq ctxq
|
||||||
mov countd, [ctxq + LLSModel.indep_count]
|
mov countd, [ctxq + LLSModel.indep_count]
|
||||||
|
@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
||||||
vbroadcastsd ymm6, [varq + iq*8 + 16]
|
vbroadcastsd ymm6, [varq + iq*8 + 16]
|
||||||
vbroadcastsd ymm7, [varq + iq*8 + 24]
|
vbroadcastsd ymm7, [varq + iq*8 + 24]
|
||||||
vextractf128 xmm3, ymm1, 1
|
vextractf128 xmm3, ymm1, 1
|
||||||
|
%if cpuflag(fma3)
|
||||||
|
mova ymm0, COVAR(iq ,0)
|
||||||
|
mova xmm2, COVAR(iq+2,2)
|
||||||
|
fmaddpd ymm0, ymm1, ymm4, ymm0
|
||||||
|
fmaddpd xmm2, xmm3, xmm6, xmm2
|
||||||
|
fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1)
|
||||||
|
fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
|
||||||
|
mova COVAR(iq ,0), ymm0
|
||||||
|
mova COVAR(iq ,1), ymm1
|
||||||
|
mova COVAR(iq+2,2), xmm2
|
||||||
|
mova COVAR(iq+2,3), xmm3
|
||||||
|
%else
|
||||||
vmulpd ymm0, ymm1, ymm4
|
vmulpd ymm0, ymm1, ymm4
|
||||||
vmulpd ymm1, ymm1, ymm5
|
vmulpd ymm1, ymm1, ymm5
|
||||||
vmulpd xmm2, xmm3, xmm6
|
vmulpd xmm2, xmm3, xmm6
|
||||||
|
@ -148,12 +159,26 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
||||||
ADDPD_MEM COVAR(iq ,1), ymm1
|
ADDPD_MEM COVAR(iq ,1), ymm1
|
||||||
ADDPD_MEM COVAR(iq+2,2), xmm2
|
ADDPD_MEM COVAR(iq+2,2), xmm2
|
||||||
ADDPD_MEM COVAR(iq+2,3), xmm3
|
ADDPD_MEM COVAR(iq+2,3), xmm3
|
||||||
|
%endif ; cpuflag(fma3)
|
||||||
lea jd, [iq + 4]
|
lea jd, [iq + 4]
|
||||||
cmp jd, count2d
|
cmp jd, count2d
|
||||||
jg .skip4x4
|
jg .skip4x4
|
||||||
.loop4x4:
|
.loop4x4:
|
||||||
; Compute all 16 pairwise products of a 4x4 block
|
; Compute all 16 pairwise products of a 4x4 block
|
||||||
mova ymm3, [varq + jq*8]
|
mova ymm3, [varq + jq*8]
|
||||||
|
%if cpuflag(fma3)
|
||||||
|
mova ymm0, COVAR(jq, 0)
|
||||||
|
mova ymm1, COVAR(jq, 1)
|
||||||
|
mova ymm2, COVAR(jq, 2)
|
||||||
|
fmaddpd ymm0, ymm3, ymm4, ymm0
|
||||||
|
fmaddpd ymm1, ymm3, ymm5, ymm1
|
||||||
|
fmaddpd ymm2, ymm3, ymm6, ymm2
|
||||||
|
fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
|
||||||
|
mova COVAR(jq, 0), ymm0
|
||||||
|
mova COVAR(jq, 1), ymm1
|
||||||
|
mova COVAR(jq, 2), ymm2
|
||||||
|
mova COVAR(jq, 3), ymm3
|
||||||
|
%else
|
||||||
vmulpd ymm0, ymm3, ymm4
|
vmulpd ymm0, ymm3, ymm4
|
||||||
vmulpd ymm1, ymm3, ymm5
|
vmulpd ymm1, ymm3, ymm5
|
||||||
vmulpd ymm2, ymm3, ymm6
|
vmulpd ymm2, ymm3, ymm6
|
||||||
|
@ -162,6 +187,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
||||||
ADDPD_MEM COVAR(jq,1), ymm1
|
ADDPD_MEM COVAR(jq,1), ymm1
|
||||||
ADDPD_MEM COVAR(jq,2), ymm2
|
ADDPD_MEM COVAR(jq,2), ymm2
|
||||||
ADDPD_MEM COVAR(jq,3), ymm3
|
ADDPD_MEM COVAR(jq,3), ymm3
|
||||||
|
%endif ; cpuflag(fma3)
|
||||||
add jd, 4
|
add jd, 4
|
||||||
cmp jd, count2d
|
cmp jd, count2d
|
||||||
jle .loop4x4
|
jle .loop4x4
|
||||||
|
@ -169,6 +195,19 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
||||||
cmp jd, countd
|
cmp jd, countd
|
||||||
jg .skip2x4
|
jg .skip2x4
|
||||||
mova xmm3, [varq + jq*8]
|
mova xmm3, [varq + jq*8]
|
||||||
|
%if cpuflag(fma3)
|
||||||
|
mova xmm0, COVAR(jq, 0)
|
||||||
|
mova xmm1, COVAR(jq, 1)
|
||||||
|
mova xmm2, COVAR(jq, 2)
|
||||||
|
fmaddpd xmm0, xmm3, xmm4, xmm0
|
||||||
|
fmaddpd xmm1, xmm3, xmm5, xmm1
|
||||||
|
fmaddpd xmm2, xmm3, xmm6, xmm2
|
||||||
|
fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
|
||||||
|
mova COVAR(jq, 0), xmm0
|
||||||
|
mova COVAR(jq, 1), xmm1
|
||||||
|
mova COVAR(jq, 2), xmm2
|
||||||
|
mova COVAR(jq, 3), xmm3
|
||||||
|
%else
|
||||||
vmulpd xmm0, xmm3, xmm4
|
vmulpd xmm0, xmm3, xmm4
|
||||||
vmulpd xmm1, xmm3, xmm5
|
vmulpd xmm1, xmm3, xmm5
|
||||||
vmulpd xmm2, xmm3, xmm6
|
vmulpd xmm2, xmm3, xmm6
|
||||||
|
@ -177,6 +216,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
||||||
ADDPD_MEM COVAR(jq,1), xmm1
|
ADDPD_MEM COVAR(jq,1), xmm1
|
||||||
ADDPD_MEM COVAR(jq,2), xmm2
|
ADDPD_MEM COVAR(jq,2), xmm2
|
||||||
ADDPD_MEM COVAR(jq,3), xmm3
|
ADDPD_MEM COVAR(jq,3), xmm3
|
||||||
|
%endif ; cpuflag(fma3)
|
||||||
.skip2x4:
|
.skip2x4:
|
||||||
add id, 4
|
add id, 4
|
||||||
add covarq, 4*COVAR_STRIDE
|
add covarq, 4*COVAR_STRIDE
|
||||||
|
@ -187,14 +227,29 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
||||||
mov jd, id
|
mov jd, id
|
||||||
.loop2x1:
|
.loop2x1:
|
||||||
vmovddup xmm0, [varq + iq*8]
|
vmovddup xmm0, [varq + iq*8]
|
||||||
|
%if cpuflag(fma3)
|
||||||
|
mova xmm1, [varq + jq*8]
|
||||||
|
fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
|
||||||
|
mova COVAR(jq,0), xmm0
|
||||||
|
%else
|
||||||
vmulpd xmm0, [varq + jq*8]
|
vmulpd xmm0, [varq + jq*8]
|
||||||
ADDPD_MEM COVAR(jq,0), xmm0
|
ADDPD_MEM COVAR(jq,0), xmm0
|
||||||
|
%endif ; cpuflag(fma3)
|
||||||
inc id
|
inc id
|
||||||
add covarq, COVAR_STRIDE
|
add covarq, COVAR_STRIDE
|
||||||
cmp id, countd
|
cmp id, countd
|
||||||
jle .loop2x1
|
jle .loop2x1
|
||||||
.ret:
|
.ret:
|
||||||
REP_RET
|
REP_RET
|
||||||
|
%endmacro ; UPDATE_LLS
|
||||||
|
|
||||||
|
%if HAVE_AVX_EXTERNAL
|
||||||
|
INIT_YMM avx
|
||||||
|
UPDATE_LLS
|
||||||
|
%endif
|
||||||
|
%if HAVE_FMA3_EXTERNAL
|
||||||
|
INIT_YMM fma3
|
||||||
|
UPDATE_LLS
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
INIT_XMM sse2
|
INIT_XMM sse2
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
|
|
||||||
void ff_update_lls_sse2(LLSModel *m, const double *var);
|
void ff_update_lls_sse2(LLSModel *m, const double *var);
|
||||||
void ff_update_lls_avx(LLSModel *m, const double *var);
|
void ff_update_lls_avx(LLSModel *m, const double *var);
|
||||||
|
void ff_update_lls_fma3(LLSModel *m, const double *var);
|
||||||
double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
|
double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
|
||||||
|
|
||||||
av_cold void ff_init_lls_x86(LLSModel *m)
|
av_cold void ff_init_lls_x86(LLSModel *m)
|
||||||
|
@ -38,4 +39,7 @@ av_cold void ff_init_lls_x86(LLSModel *m)
|
||||||
if (EXTERNAL_AVX_FAST(cpu_flags)) {
|
if (EXTERNAL_AVX_FAST(cpu_flags)) {
|
||||||
m->update_lls = ff_update_lls_avx;
|
m->update_lls = ff_update_lls_avx;
|
||||||
}
|
}
|
||||||
|
if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
|
||||||
|
m->update_lls = ff_update_lls_fma3;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue