mirror of https://git.ffmpeg.org/ffmpeg.git
sbr_hf_gen_sse: Optimize code a bit more.
Core i7 (Sandy Bridge): 135 to 107 cycles. Core i5 (Arrandale): 162 to 142 cycles. (Thanks to Christophe Gisquet for testing.) Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
7f154bd54f
commit
0110108a7c
|
@ -134,7 +134,6 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
|
||||||
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
|
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
|
||||||
mova m3, m1
|
mova m3, m1
|
||||||
mova m4, m2
|
mova m4, m2
|
||||||
mova m7, [ps_mask]
|
|
||||||
|
|
||||||
; Set pointers
|
; Set pointers
|
||||||
%if ARCH_X86_64 == 0 || WIN64
|
%if ARCH_X86_64 == 0 || WIN64
|
||||||
|
@ -154,30 +153,28 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
|
||||||
shl start, 3 ; offset from num loops
|
shl start, 3 ; offset from num loops
|
||||||
|
|
||||||
mova m0, [X_lowq + start]
|
mova m0, [X_lowq + start]
|
||||||
movlhps m1, m1 ; (a2 a3 a2 a3)
|
shufps m3, m3, q1111
|
||||||
movlhps m2, m2 ; (a0 a1 a0 a1)
|
shufps m4, m4, q1111
|
||||||
shufps m3, m3, q0101 ; (a3 a2 a3 a2)
|
xorps m3, [ps_mask]
|
||||||
shufps m4, m4, q0101 ; (a1 a0 a1 a0)
|
shufps m1, m1, q0000
|
||||||
xorps m3, m7 ; (-a3 a2 -a3 a2)
|
shufps m2, m2, q0000
|
||||||
xorps m4, m7 ; (-a1 a0 -a1 a0)
|
xorps m4, [ps_mask]
|
||||||
.loop2:
|
.loop2:
|
||||||
mova m5, m0
|
movu m7, [X_lowq + start + 8] ; BbCc
|
||||||
mova m6, m0
|
mova m6, m0
|
||||||
shufps m0, m0, q2200 ; {Xl[-2][0],",Xl[-1][0],"}
|
mova m5, m7
|
||||||
shufps m5, m5, q3311 ; {Xl[-2][1],",Xl[-1][1],"}
|
shufps m0, m0, q2301 ; aAbB
|
||||||
mulps m0, m2
|
shufps m7, m7, q2301 ; bBcC
|
||||||
mulps m5, m4
|
mulps m0, m4
|
||||||
mova m7, m6
|
|
||||||
addps m5, m0
|
|
||||||
mova m0, [X_lowq + start + 2*2*4]
|
|
||||||
shufps m6, m0, q0022 ; {Xl[-1][0],",Xl[0][0],"}
|
|
||||||
shufps m7, m0, q1133 ; {Xl[-1][1],",Xl[1][1],"}
|
|
||||||
mulps m6, m1
|
|
||||||
mulps m7, m3
|
mulps m7, m3
|
||||||
addps m5, m6
|
mulps m6, m2
|
||||||
|
mulps m5, m1
|
||||||
addps m7, m0
|
addps m7, m0
|
||||||
addps m5, m7
|
mova m0, [X_lowq + start +16] ; CcDd
|
||||||
mova [X_highq + start], m5
|
addps m7, m0
|
||||||
|
addps m6, m5
|
||||||
|
addps m7, m6
|
||||||
|
mova [X_highq + start], m7
|
||||||
add start, 16
|
add start, 16
|
||||||
jnz .loop2
|
jnz .loop2
|
||||||
RET
|
RET
|
||||||
|
|
Loading…
Reference in New Issue