arm/aarch64: vp9lpf: Keep the comparison to E within 8 bit

The theoretical maximum value of E is 193, so we can just
saturate the addition to 255. The saturated 8-bit sum only differs
from the exact sum when the exact sum exceeds 255, and in that case
both are still above E (since E <= 193 < 255), so the result of the
comparison is unchanged.
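
As an aside (not part of the patch), that equivalence is easy to check
exhaustively in scalar C. The sketch below compares the old widened
computation against the new saturating one for every possible pair of
absolute differences; uqadd8() is a made-up scalar stand-in for NEON's
uqadd/vqadd.u8, and the two comparisons agree for any E <= 254, so in
particular for VP9's maximum of 193.

    #include <stdio.h>

    /* Scalar model of NEON uqadd.8b / vqadd.u8: unsigned saturating add. */
    static unsigned uqadd8(unsigned a, unsigned b)
    {
        unsigned s = a + b;
        return s > 255 ? 255 : s;
    }

    int main(void)
    {
        /* d0 = abs(p0 - q0), d1 = abs(p1 - q1), both 8-bit values. */
        for (unsigned E = 0; E <= 254; E++)
            for (unsigned d0 = 0; d0 < 256; d0++)
                for (unsigned d1 = 0; d1 < 256; d1++) {
                    /* Old code: widen to 16 bit, no saturation possible. */
                    unsigned wide   = 2 * d0 + (d1 >> 1);
                    /* New code: stay in 8 bit, saturating at 255. */
                    unsigned narrow = uqadd8(uqadd8(d0, d0), d1 >> 1);
                    if ((wide <= E) != (narrow <= E)) {
                        printf("mismatch: E=%u d0=%u d1=%u\n", E, d0, d1);
                        return 1;
                    }
                }
        puts("8-bit saturating comparison matches for all E <= 254");
        return 0;
    }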

Before:                     Cortex A7      A8      A9     A53  A53/AArch64
vp9_loop_filter_v_4_8_neon:     143.0   127.7   114.8    88.0         87.7
vp9_loop_filter_v_8_8_neon:     241.0   197.2   173.7   140.0        136.7
vp9_loop_filter_v_16_8_neon:    497.0   419.5   379.7   293.0        275.7
vp9_loop_filter_v_16_16_neon:   965.2   818.7   731.4   579.0        452.0
After:
vp9_loop_filter_v_4_8_neon:     136.0   125.7   112.6    84.0         83.0
vp9_loop_filter_v_8_8_neon:     234.0   195.5   171.5   136.0        133.7
vp9_loop_filter_v_16_8_neon:    490.0   417.5   377.7   289.0        271.0
vp9_loop_filter_v_16_16_neon:   951.2   814.7   732.3   571.0        446.7

Signed-off-by: Martin Storsjö <martin@martin.st>
Author: Martin Storsjö <martin@martin.st>
Date:   2017-01-14 20:49:19 +02:00
commit c582cb8537, parent ed6a891c36

2 files changed, 14 insertions(+), 37 deletions(-)

--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -51,13 +51,6 @@
 // see the arm version instead.
-.macro uabdl_sz dst1, dst2, in1, in2, sz
-        uabdl           \dst1,  \in1\().8b,  \in2\().8b
-.ifc \sz, .16b
-        uabdl2          \dst2,  \in1\().16b, \in2\().16b
-.endif
-.endm
-
 .macro add_sz dst1, dst2, in1, in2, in3, in4, sz
         add             \dst1,  \in1,  \in3
 .ifc \sz, .16b
@ -86,20 +79,6 @@
.endif .endif
.endm .endm
.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz
cmhs \dst1, \in1, \in3
.ifc \sz, .16b
cmhs \dst2, \in2, \in4
.endif
.endm
.macro xtn_sz dst, in1, in2, sz
xtn \dst\().8b, \in1
.ifc \sz, .16b
xtn2 \dst\().16b, \in2
.endif
.endm
.macro usubl_sz dst1, dst2, in1, in2, sz .macro usubl_sz dst1, dst2, in1, in2, sz
usubl \dst1, \in1\().8b, \in2\().8b usubl \dst1, \in1\().8b, \in2\().8b
.ifc \sz, .16b .ifc \sz, .16b
@@ -179,20 +158,20 @@
 // tmpq2 == tmp3 + tmp4, etc.
 .macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
 .if \mix == 0
-        dup             v0.8h,  w2                     // E
-        dup             v1.8h,  w2                     // E
+        dup             v0\sz,  w2                     // E
         dup             v2\sz,  w3                     // I
         dup             v3\sz,  w4                     // H
 .else
-        dup             v0.8h,  w2                     // E
+        dup             v0.8b,  w2                     // E
         dup             v2.8b,  w3                     // I
         dup             v3.8b,  w4                     // H
+        lsr             w5,     w2,     #8
         lsr             w6,     w3,     #8
         lsr             w7,     w4,     #8
-        ushr            v1.8h,  v0.8h,  #8             // E
+        dup             v1.8b,  w5                     // E
         dup             v4.8b,  w6                     // I
-        bic             v0.8h,  #255, lsl 8            // E
         dup             v5.8b,  w7                     // H
+        trn1            v0.2d,  v0.2d,  v1.2d
         trn1            v2.2d,  v2.2d,  v4.2d
         trn1            v3.2d,  v3.2d,  v5.2d
 .endif
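
(Annotation, not part of the commit: in the \mix != 0 path above, w2/w3/w4
each pack two 8-bit thresholds, one per 8-pixel half. The new code splits
the packed value with lsr, duplicates each byte across an 8-lane vector,
and merges the halves with trn1 instead of masking a 16-bit vector. A
scalar sketch of the resulting lane layout, with a made-up helper name:)

    #include <stdint.h>
    #include <string.h>

    /* Build the 16-lane threshold vector produced by
     * dup v0.8b / lsr + dup v1.8b / trn1 v0.2d, v0.2d, v1.2d:
     * lanes 0-7 get the low byte, lanes 8-15 the high byte. */
    static void dup_packed_threshold(unsigned packed, uint8_t vec[16])
    {
        uint8_t lo = packed & 0xff;         /* dup v0.8b, w2            */
        uint8_t hi = (packed >> 8) & 0xff;  /* lsr w5, w2, #8           */
                                            /* dup v1.8b, w5            */
        memset(vec,     lo, 8);             /* low half: first value    */
        memset(vec + 8, hi, 8);             /* trn1 v0.2d, v0.2d, v1.2d */
    }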
@@ -206,16 +185,15 @@
         umax            v4\sz,  v4\sz,  v5\sz
         umax            v5\sz,  v6\sz,  v7\sz
         umax            \tmp1\sz, \tmp1\sz, \tmp2\sz
-        uabdl_sz        v6.8h,  v7.8h,  v23, v24, \sz  // abs(p0 - q0)
+        uabd            v6\sz,  v23\sz, v24\sz         // abs(p0 - q0)
         umax            v4\sz,  v4\sz,  v5\sz
-        add_sz          v6.8h,  v7.8h,  v6.8h,  v7.8h,  v6.8h,  v7.8h, \sz // abs(p0 - q0) * 2
+        uqadd           v6\sz,  v6\sz,  v6\sz          // abs(p0 - q0) * 2
         uabd            v5\sz,  v22\sz, v25\sz         // abs(p1 - q1)
         umax            v4\sz,  v4\sz,  \tmp1\sz       // max(abs(p3 - p2), ..., abs(q2 - q3))
         ushr            v5\sz,  v5\sz,  #1
         cmhs            v4\sz,  v2\sz,  v4\sz          // max(abs()) <= I
-        uaddw_sz        v6.8h,  v7.8h,  v6.8h,  v7.8h,  v5, \sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
-        cmhs_sz         v6.8h,  v7.8h,  v0.8h,  v1.8h,  v6.8h,  v7.8h, \sz
-        xtn_sz          v5,     v6.8h,  v7.8h,  \sz
+        uqadd           v6\sz,  v6\sz,  v5\sz          // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+        cmhs            v5\sz,  v0\sz,  v6\sz
         and             v4\sz,  v4\sz,  v5\sz          // fm
 
 // If no pixels need filtering, just exit as soon as possible
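
(Annotation, not part of the commit: per lane, the fm mask computed above
corresponds to the following scalar logic; the helper name and signature
are illustrative, not from the FFmpeg sources.)

    #include <stdlib.h>

    /* fm = (max of the six neighbour diffs <= I) &&
     *      (abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E) */
    static int filter_mask(int p3, int p2, int p1, int p0,
                           int q0, int q1, int q2, int q3,
                           int I, int E)
    {
        int m = abs(p3 - p2);
        if (abs(p2 - p1) > m) m = abs(p2 - p1);
        if (abs(p1 - p0) > m) m = abs(p1 - p0);
        if (abs(q0 - q1) > m) m = abs(q0 - q1);
        if (abs(q1 - q2) > m) m = abs(q1 - q2);
        if (abs(q2 - q3) > m) m = abs(q2 - q3);

        return m <= I &&
               abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
    }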

--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -51,7 +51,7 @@
 @ and d28-d31 as temp registers, or d8-d15.
 @ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
 .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
-        vdup.u16        q0,  r2 @ E
+        vdup.u8         d0,  r2 @ E
         vdup.u8         d2,  r3 @ I
         ldr             r3,  [sp]
@@ -64,16 +64,15 @@
         vmax.u8         d4,  d4,  d5
         vmax.u8         d5,  d6,  d7
         vmax.u8         \tmp1, \tmp1, \tmp2
-        vabdl.u8        q3,  d23, d24   @ abs(p0 - q0)
+        vabd.u8         d6,  d23, d24   @ abs(p0 - q0)
         vmax.u8         d4,  d4,  d5
-        vadd.u16        q3,  q3,  q3    @ abs(p0 - q0) * 2
+        vqadd.u8        d6,  d6,  d6    @ abs(p0 - q0) * 2
         vabd.u8         d5,  d22, d25   @ abs(p1 - q1)
         vmax.u8         d4,  d4,  \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
         vshr.u8         d5,  d5,  #1
         vcle.u8         d4,  d4,  d2    @ max(abs()) <= I
-        vaddw.u8        q3,  q3,  d5    @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
-        vcle.u16        q3,  q3,  q0
-        vmovn.u16       d5,  q3
+        vqadd.u8        d6,  d6,  d5    @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+        vcle.u8         d5,  d6,  d0
         vand            d4,  d4,  d5    @ fm
         vdup.u8         d3,  r3 @ H