mirror of https://git.ffmpeg.org/ffmpeg.git
arm/aarch64: vp9lpf: Keep the comparison to E within 8 bit
The theoretical maximum value of E is 193, so we can just saturate the addition to 255.

Before:                       Cortex A7      A8      A9     A53  A53/AArch64
vp9_loop_filter_v_4_8_neon:       143.0   127.7   114.8    88.0    87.7
vp9_loop_filter_v_8_8_neon:       241.0   197.2   173.7   140.0   136.7
vp9_loop_filter_v_16_8_neon:      497.0   419.5   379.7   293.0   275.7
vp9_loop_filter_v_16_16_neon:     965.2   818.7   731.4   579.0   452.0

After:
vp9_loop_filter_v_4_8_neon:       136.0   125.7   112.6    84.0    83.0
vp9_loop_filter_v_8_8_neon:       234.0   195.5   171.5   136.0   133.7
vp9_loop_filter_v_16_8_neon:      490.0   417.5   377.7   289.0   271.0
vp9_loop_filter_v_16_16_neon:     951.2   814.7   732.3   571.0   446.7

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
ed6a891c36
commit
c582cb8537
|
@ -51,13 +51,6 @@
|
||||||
// see the arm version instead.
|
// see the arm version instead.
|
||||||
|
|
||||||
|
|
||||||
.macro uabdl_sz dst1, dst2, in1, in2, sz
|
|
||||||
uabdl \dst1, \in1\().8b, \in2\().8b
|
|
||||||
.ifc \sz, .16b
|
|
||||||
uabdl2 \dst2, \in1\().16b, \in2\().16b
|
|
||||||
.endif
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro add_sz dst1, dst2, in1, in2, in3, in4, sz
|
.macro add_sz dst1, dst2, in1, in2, in3, in4, sz
|
||||||
add \dst1, \in1, \in3
|
add \dst1, \in1, \in3
|
||||||
.ifc \sz, .16b
|
.ifc \sz, .16b
|
||||||
|
@ -86,20 +79,6 @@
|
||||||
.endif
|
.endif
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz
|
|
||||||
cmhs \dst1, \in1, \in3
|
|
||||||
.ifc \sz, .16b
|
|
||||||
cmhs \dst2, \in2, \in4
|
|
||||||
.endif
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro xtn_sz dst, in1, in2, sz
|
|
||||||
xtn \dst\().8b, \in1
|
|
||||||
.ifc \sz, .16b
|
|
||||||
xtn2 \dst\().16b, \in2
|
|
||||||
.endif
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro usubl_sz dst1, dst2, in1, in2, sz
|
.macro usubl_sz dst1, dst2, in1, in2, sz
|
||||||
usubl \dst1, \in1\().8b, \in2\().8b
|
usubl \dst1, \in1\().8b, \in2\().8b
|
||||||
.ifc \sz, .16b
|
.ifc \sz, .16b
|
||||||
|
@ -179,20 +158,20 @@
|
||||||
// tmpq2 == tmp3 + tmp4, etc.
|
// tmpq2 == tmp3 + tmp4, etc.
|
||||||
.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
|
.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
|
||||||
.if \mix == 0
|
.if \mix == 0
|
||||||
dup v0.8h, w2 // E
|
dup v0\sz, w2 // E
|
||||||
dup v1.8h, w2 // E
|
|
||||||
dup v2\sz, w3 // I
|
dup v2\sz, w3 // I
|
||||||
dup v3\sz, w4 // H
|
dup v3\sz, w4 // H
|
||||||
.else
|
.else
|
||||||
dup v0.8h, w2 // E
|
dup v0.8b, w2 // E
|
||||||
dup v2.8b, w3 // I
|
dup v2.8b, w3 // I
|
||||||
dup v3.8b, w4 // H
|
dup v3.8b, w4 // H
|
||||||
|
lsr w5, w2, #8
|
||||||
lsr w6, w3, #8
|
lsr w6, w3, #8
|
||||||
lsr w7, w4, #8
|
lsr w7, w4, #8
|
||||||
ushr v1.8h, v0.8h, #8 // E
|
dup v1.8b, w5 // E
|
||||||
dup v4.8b, w6 // I
|
dup v4.8b, w6 // I
|
||||||
bic v0.8h, #255, lsl 8 // E
|
|
||||||
dup v5.8b, w7 // H
|
dup v5.8b, w7 // H
|
||||||
|
trn1 v0.2d, v0.2d, v1.2d
|
||||||
trn1 v2.2d, v2.2d, v4.2d
|
trn1 v2.2d, v2.2d, v4.2d
|
||||||
trn1 v3.2d, v3.2d, v5.2d
|
trn1 v3.2d, v3.2d, v5.2d
|
||||||
.endif
|
.endif
|
||||||
|
@ -206,16 +185,15 @@
|
||||||
umax v4\sz, v4\sz, v5\sz
|
umax v4\sz, v4\sz, v5\sz
|
||||||
umax v5\sz, v6\sz, v7\sz
|
umax v5\sz, v6\sz, v7\sz
|
||||||
umax \tmp1\sz, \tmp1\sz, \tmp2\sz
|
umax \tmp1\sz, \tmp1\sz, \tmp2\sz
|
||||||
uabdl_sz v6.8h, v7.8h, v23, v24, \sz // abs(p0 - q0)
|
uabd v6\sz, v23\sz, v24\sz // abs(p0 - q0)
|
||||||
umax v4\sz, v4\sz, v5\sz
|
umax v4\sz, v4\sz, v5\sz
|
||||||
add_sz v6.8h, v7.8h, v6.8h, v7.8h, v6.8h, v7.8h, \sz // abs(p0 - q0) * 2
|
uqadd v6\sz, v6\sz, v6\sz // abs(p0 - q0) * 2
|
||||||
uabd v5\sz, v22\sz, v25\sz // abs(p1 - q1)
|
uabd v5\sz, v22\sz, v25\sz // abs(p1 - q1)
|
||||||
umax v4\sz, v4\sz, \tmp1\sz // max(abs(p3 - p2), ..., abs(q2 - q3))
|
umax v4\sz, v4\sz, \tmp1\sz // max(abs(p3 - p2), ..., abs(q2 - q3))
|
||||||
ushr v5\sz, v5\sz, #1
|
ushr v5\sz, v5\sz, #1
|
||||||
cmhs v4\sz, v2\sz, v4\sz // max(abs()) <= I
|
cmhs v4\sz, v2\sz, v4\sz // max(abs()) <= I
|
||||||
uaddw_sz v6.8h, v7.8h, v6.8h, v7.8h, v5, \sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
|
uqadd v6\sz, v6\sz, v5\sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
|
||||||
cmhs_sz v6.8h, v7.8h, v0.8h, v1.8h, v6.8h, v7.8h, \sz
|
cmhs v5\sz, v0\sz, v6\sz
|
||||||
xtn_sz v5, v6.8h, v7.8h, \sz
|
|
||||||
and v4\sz, v4\sz, v5\sz // fm
|
and v4\sz, v4\sz, v5\sz // fm
|
||||||
|
|
||||||
// If no pixels need filtering, just exit as soon as possible
|
// If no pixels need filtering, just exit as soon as possible
|
||||||
|
|
|
@ -51,7 +51,7 @@
|
||||||
@ and d28-d31 as temp registers, or d8-d15.
|
@ and d28-d31 as temp registers, or d8-d15.
|
||||||
@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
|
@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
|
||||||
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
|
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
|
||||||
vdup.u16 q0, r2 @ E
|
vdup.u8 d0, r2 @ E
|
||||||
vdup.u8 d2, r3 @ I
|
vdup.u8 d2, r3 @ I
|
||||||
ldr r3, [sp]
|
ldr r3, [sp]
|
||||||
|
|
||||||
|
@ -64,16 +64,15 @@
|
||||||
vmax.u8 d4, d4, d5
|
vmax.u8 d4, d4, d5
|
||||||
vmax.u8 d5, d6, d7
|
vmax.u8 d5, d6, d7
|
||||||
vmax.u8 \tmp1, \tmp1, \tmp2
|
vmax.u8 \tmp1, \tmp1, \tmp2
|
||||||
vabdl.u8 q3, d23, d24 @ abs(p0 - q0)
|
vabd.u8 d6, d23, d24 @ abs(p0 - q0)
|
||||||
vmax.u8 d4, d4, d5
|
vmax.u8 d4, d4, d5
|
||||||
vadd.u16 q3, q3, q3 @ abs(p0 - q0) * 2
|
vqadd.u8 d6, d6, d6 @ abs(p0 - q0) * 2
|
||||||
vabd.u8 d5, d22, d25 @ abs(p1 - q1)
|
vabd.u8 d5, d22, d25 @ abs(p1 - q1)
|
||||||
vmax.u8 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
|
vmax.u8 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
|
||||||
vshr.u8 d5, d5, #1
|
vshr.u8 d5, d5, #1
|
||||||
vcle.u8 d4, d4, d2 @ max(abs()) <= I
|
vcle.u8 d4, d4, d2 @ max(abs()) <= I
|
||||||
vaddw.u8 q3, q3, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
|
vqadd.u8 d6, d6, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
|
||||||
vcle.u16 q3, q3, q0
|
vcle.u8 d5, d6, d0
|
||||||
vmovn.u16 d5, q3
|
|
||||||
vand d4, d4, d5 @ fm
|
vand d4, d4, d5 @ fm
|
||||||
|
|
||||||
vdup.u8 d3, r3 @ H
|
vdup.u8 d3, r3 @ H
|
||||||
|
|
Loading…
Reference in New Issue