aarch64: me_cmp: Switch from uabd to uabal in ff_pix_abs16_xy2_neon

Using absolute-difference-accumulate (uabal/uabal2) does use twice the
number of absolute-difference instructions, since each one only handles
one half (0..7 or 8..15) of the 16 pixels, but it avoids the need for the
uaddl and add instructions, reducing the total number of instructions
by 3.

The accumulate instructions can be interleaved with the rest of the
calculation, to avoid tight dependencies at the end. Unfortunately, this
is marginally slower on Cortex A53, but faster on A72 and A73.

Before:            Cortex A53    A72    A73   Graviton 3
pix_abs_0_3_neon:       175.7  109.2   92.0         41.2
After:
pix_abs_0_3_neon:       179.7   96.7   87.5         41.2

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Martin Storsjö 2022-07-13 00:06:31 +03:00
parent b46de9aba4
commit 68a03f6424

View File

@ -124,6 +124,9 @@ function ff_pix_abs16_xy2_neon, export=1
add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
uabdl v24.8h, v1.8b, v23.8b // absolute difference 0..7, i=0
uabdl2 v23.8h, v1.16b, v23.16b // absolute difference 8..15, i=0
ld1 {v21.16b}, [x5], x3 // load pix3
ld1 {v20.16b}, [x1], x3 // load pix1
@ -137,6 +140,9 @@ function ff_pix_abs16_xy2_neon, export=1
rshrn v28.8b, v28.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v28.16b, v29.8h, #2 // shift right 2 8..15
uabal v24.8h, v16.8b, v26.8b // absolute difference 0..7, i=1
uabal2 v23.8h, v16.16b, v26.16b // absolute difference 8..15, i=1
uaddl v2.8h, v21.8b, v22.8b // pix3 + pix3+1 0..7
uaddl2 v3.8h, v21.16b, v22.16b // pix3 + pix3+1 8..15
add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
@ -144,33 +150,17 @@ function ff_pix_abs16_xy2_neon, export=1
rshrn v30.8b, v30.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v30.16b, v31.8h, #2 // shift right 2 8..15
// Averages are now stored in these registers:
// v23, v16, v28, v30
// pix1 values in these registers:
// v1, v16, v17, v20
// available:
// v4, v5, v7, v18, v19, v24, v25, v27, v29, v31
uabal v24.8h, v17.8b, v28.8b // absolute difference 0..7, i=2
uabal2 v23.8h, v17.16b, v28.16b // absolute difference 8..15, i=2
sub w4, w4, #4 // h -= 4
// Using absolute-difference instructions instead of absolute-difference-accumulate allows
// us to keep the results in 16b vectors instead of widening values with twice the instructions.
// This approach also has fewer data dependencies, allowing better instruction level parallelism.
uabd v4.16b, v1.16b, v23.16b // absolute difference 0..15, i=0
uabd v5.16b, v16.16b, v26.16b // absolute difference 0..15, i=1
uabd v6.16b, v17.16b, v28.16b // absolute difference 0..15, i=2
uabd v7.16b, v20.16b, v30.16b // absolute difference 0..15, i=3
uabal v24.8h, v20.8b, v30.8b // absolute difference 0..7, i=3
uabal2 v23.8h, v20.16b, v30.16b // absolute difference 8..15, i=3
cmp w4, #4 // loop if h >= 4
// Now add up all the values in each vector, v4-v7 with widening adds
uaddl v19.8h, v4.8b, v5.8b
uaddl2 v18.8h, v4.16b, v5.16b
uaddl v4.8h, v6.8b, v7.8b
uaddl2 v5.8h, v6.16b, v7.16b
add v4.8h, v4.8h, v5.8h
add v4.8h, v4.8h, v18.8h
add v4.8h, v4.8h, v19.8h
add v4.8h, v23.8h, v24.8h
uaddlv s4, v4.8h // finish adding up accumulated values
add d0, d0, d4 // add the value to the top level accumulator