lavc/aarch64: Add neon implementation for pix_median_abs8

Provide an optimized implementation of the pix_median_abs8 function.

Performance comparison tests are shown below.
- median_sad_1_c: 277.0
- median_sad_1_neon: 82.0

Benchmarks and tests were run with the checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Hubert Mazur 2022-09-20 13:01:58 +02:00 committed by Martin Storsjö
parent e9a6170213
commit b2732115dd
2 changed files with 65 additions and 0 deletions

View File

@ -57,6 +57,8 @@ int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
/* Assembly routine; installed as c->median_sad[0] in ff_me_cmp_init_aarch64(). */
int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
/* Assembly routine reading 8 pixels per row from pix1 and pix2;
 * installed as c->median_sad[1] in ff_me_cmp_init_aarch64(). */
int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@ -85,6 +87,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->nsse[0] = nsse16_neon_wrapper;
c->median_sad[0] = pix_median_abs16_neon;
c->median_sad[1] = pix_median_abs8_neon;
}
}

View File

@ -1089,3 +1089,65 @@ function vsad_intra8_neon, export=1
ret
endfunc
function pix_median_abs8_neon, export=1
        // int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1,
        //                          const uint8_t *pix2, ptrdiff_t stride, int h)
        //
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h
        //
        // Returns in w0, for an 8 pixel wide block and V(x) = pix1[x] - pix2[x]:
        //   first row:      |V(0)| + sum over j = 1..7 of |V(j) - V(j-1)|
        //   each later row: |V(0) - V(-stride)| + sum over j = 1..7 of
        //                   |V(j) - median(V(j-stride), V(j-1),
        //                                  V(j-stride) + V(j-1) - V(j-stride-1))|
        //
        // Register roles:
        //   v28        V(0..7) of the previous row
        //   v26        previous row rotated by one element: lanes 0..6 hold
        //              V(1..7), lane 7 holds the wrapped-around V(0)
        //   v31        per-lane sums for columns 1..7 (lane k <-> column k+1);
        //              lane 7 is produced by the wrap-around and is discarded
        //   v0         lane 0 sums the column 0 differences between rows;
        //              the remaining lanes are computed but never read
        //   h18/d18    |V(0)| of the first row, later the final result
        //
        // Only v0-v7 and v16-v31 are written, so nothing has to be saved.
        // NOTE(review): the partial sums live in 16-bit lanes; presumably h is
        // small enough (e.g. 8 or 16) that they cannot wrap - confirm with callers.

        ld1             {v2.8b}, [x1], x3
        ld1             {v3.8b}, [x2], x3
        movi            v31.8h, #0

        ext             v0.8b, v2.8b, v2.8b, #1             // pix1 rotated by one pixel
        ext             v1.8b, v3.8b, v3.8b, #1             // pix2 rotated by one pixel
        usubl           v28.8h, v2.8b, v3.8b                // V(j-1) of the first row
        usubl           v26.8h, v0.8b, v1.8b                // V(j) of the first row
        sub             w4, w4, #1                          // we need to make h-1 iterations
        saba            v31.8h, v26.8h, v28.8h              // |V(j) - V(j-1)|
        mov             h18, v28.h[0]
        cmp             w4, #1
        sqabs           h18, h18                            // |V(0)| of the first row
        movi            v0.8h, #0

        b.lt            2f                                  // no rows left after the first one

1:
        ld1             {v6.8b}, [x1], x3                   // pix1 vector for V(j-1)
        ld1             {v7.8b}, [x2], x3                   // pix2 vector for V(j-1)
        subs            w4, w4, #1                          // flags consumed by b.ne below
        ext             v4.8b, v6.8b, v6.8b, #1             // pix1 vector for V(j)
        ext             v5.8b, v7.8b, v7.8b, #1             // pix2 vector for V(j)

        // loop-carried registers: v31, v28, v26, v18, v0
        // scratch registers: v30, v24, v22, v20, v19, v17, v7, v6, v5, v4

        // The median of a, b, c is computed as
        // max(min(a, b), min(max(a, b), c)).
        usubl           v30.8h, v6.8b, v7.8b                // V(j-1)
        usubl           v24.8h, v4.8b, v5.8b                // V(j)
        saba            v0.8h, v30.8h, v28.8h               // lane 0: |V(0) - V(-stride)|
        add             v22.8h, v26.8h, v30.8h
        smin            v20.8h, v26.8h, v30.8h              // min(V(j-stride), V(j-1))
        smax            v19.8h, v26.8h, v30.8h              // max(V(j-stride), V(j-1))
        sub             v22.8h, v22.8h, v28.8h              // V(j-stride) + V(j-1) - V(j-stride-1)
        smin            v17.8h, v19.8h, v22.8h
        mov             v28.16b, v30.16b                    // current row becomes the previous row
        smax            v20.8h, v20.8h, v17.8h              // median values
        saba            v31.8h, v24.8h, v20.8h              // |V(j) - median|
        mov             v26.16b, v24.16b                    // same for the rotated row

        b.ne            1b

2:
        mov             h17, v0.h[0]                        // column 0 sum; rest of v17 is zeroed
        ins             v31.h[7], wzr                       // drop the wrapped-around lane
        add             d18, d18, d17
        uaddlv          s17, v31.8h                         // sum of columns 1..7
        add             d18, d18, d17
        fmov            w0, s18
        ret

endfunc