lavc/aarch64: Provide neon implementation of nsse16

Add vectorized implementation of nsse16 function.

Performance comparison tests are shown below.
- nsse_0_c: 682.2
- nsse_0_neon: 116.5

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Co-authored-by: Martin Storsjö <martin@martin.st>
Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Hubert Mazur 2022-09-08 11:25:07 +02:00 committed by Martin Storsjö
parent 908abe8032
commit 06b98e396a
2 changed files with 137 additions and 0 deletions

View File

@ -49,6 +49,10 @@ int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
ptrdiff_t stride, int h);
int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
ptrdiff_t stride, int h);
int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@ -72,5 +76,16 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsse[0] = vsse16_neon;
c->vsse[4] = vsse_intra16_neon;
c->nsse[0] = nsse16_neon_wrapper;
}
}
/*
 * Adapter that gives nsse16_neon() the (MpegEncContext *, s1, s2, stride, h)
 * signature used for the c->nsse[] entries. The assembly routine takes the
 * weight as a plain integer instead of a context pointer, so the weight is
 * resolved here: c->avctx->nsse_weight when a context is supplied, 8 when
 * c is NULL.
 */
int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                        ptrdiff_t stride, int h)
{
    const int weight = c ? c->avctx->nsse_weight : 8;

    return nsse16_neon(weight, s1, s2, stride, h);
}

View File

@ -847,3 +847,125 @@ function vsse_intra16_neon, export=1
ret
endfunc
function nsse16_neon, export=1
        // int nsse16_neon(int multiplier, const uint8_t *pix1, const uint8_t *pix2,
        //                 ptrdiff_t stride, int h)
        //
        // Returns score1 + |score2| * multiplier, where
        //   score1 = value returned by sse16_neon for the same block, and
        //   score2 = sum over the h-1 pairs of vertically adjacent rows and over
        //            byte lanes 0..14 of
        //              |(a1[x] - b1[x]) - (a1[x+1] - b1[x+1])|
        //            - |(a2[x] - b2[x]) - (a2[x+1] - b2[x+1])|
        //            with a = current row, b = next row, 1/2 = pix1/pix2.
        //
        // x0           multiplier
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h (h == 1 yields score2 == 0; h <= 0 is not handled)
        //
        // Register roles after the call:
        //   w5 = multiplier, w9 = score1
        //   v0/v1 = current pix1 row / same row rotated by one byte
        //   v2/v3 = current pix2 row / same row rotated by one byte
        //   v16/v17 = pix1 accumulators (low/high 8 lanes, 16 bit each)
        //   v18/v19 = pix2 accumulators (low/high 8 lanes, 16 bit each)

        // Non-leaf: spill the arguments and the link register around the call.
        // Frame (0x40 bytes, keeps sp 16-byte aligned):
        //   [sp]       x0      [sp+0x10] x1,x2
        //   [sp+0x20]  x3,x4   [sp+0x30] x30
        str             x0, [sp, #-0x40]!
        stp             x1, x2, [sp, #0x10]
        stp             x3, x4, [sp, #0x20]
        str             x30, [sp, #0x30]
        // x0 still holds the multiplier here.
        // NOTE(review): presumably sse16_neon ignores its first argument - confirm.
        bl              X(sse16_neon)
        ldr             x30, [sp, #0x30]
        mov             w9, w0                                  // here we store score1
        ldr             x5, [sp]                                // multiplier (w0 now holds the sse result)
        ldp             x1, x2, [sp, #0x10]
        ldp             x3, x4, [sp, #0x20]
        add             sp, sp, #0x40

        movi            v16.8h, #0
        movi            v17.8h, #0
        movi            v18.8h, #0
        movi            v19.8h, #0

        // Prime the "current row" registers with row 0 of both blocks.
        ld1             {v0.16b}, [x1], x3
        subs            w4, w4, #1                              // we need to make h-1 iterations
        ld1             {v2.16b}, [x2], x3
        ext             v1.16b, v0.16b, v0.16b, #1              // x1 + 1
        cmp             w4, #2
        ext             v3.16b, v2.16b, v2.16b, #1              // x2 + 1

        // Fewer than two row pairs left: go to the remainder check. It must be
        // the check and not the single-row loop itself, because that loop
        // decrements before testing and would wrap around when w4 == 0 (h == 1).
        b.lt            2f

// make 2 iterations at once
1:
        ld1             {v4.16b}, [x1], x3
        ld1             {v6.16b}, [x2], x3
        ld1             {v20.16b}, [x1], x3
        ext             v5.16b, v4.16b, v4.16b, #1              // x1 + stride + 1
        usubl           v31.8h, v0.8b, v4.8b
        usubl2          v30.8h, v0.16b, v4.16b
        ld1             {v22.16b}, [x2], x3
        usubl           v29.8h, v1.8b, v5.8b
        usubl2          v28.8h, v1.16b, v5.16b
        ext             v7.16b, v6.16b, v6.16b, #1              // x2 + stride + 1
        saba            v16.8h, v31.8h, v29.8h
        ext             v21.16b, v20.16b, v20.16b, #1
        saba            v17.8h, v30.8h, v28.8h
        usubl           v27.8h, v2.8b, v6.8b
        usubl2          v26.8h, v2.16b, v6.16b
        ext             v23.16b, v22.16b, v22.16b, #1
        usubl           v25.8h, v3.8b, v7.8b
        usubl2          v24.8h, v3.16b, v7.16b
        saba            v18.8h, v27.8h, v25.8h
        saba            v19.8h, v26.8h, v24.8h

        // Second row pair: (row n+1, row n+2).
        usubl           v31.8h, v4.8b, v20.8b
        usubl2          v30.8h, v4.16b, v20.16b
        usubl           v29.8h, v5.8b, v21.8b
        usubl2          v28.8h, v5.16b, v21.16b
        saba            v16.8h, v31.8h, v29.8h
        saba            v17.8h, v30.8h, v28.8h
        usubl           v27.8h, v6.8b, v22.8b
        usubl2          v26.8h, v6.16b, v22.16b
        usubl           v25.8h, v7.8b, v23.8b
        usubl2          v24.8h, v7.16b, v23.16b
        saba            v18.8h, v27.8h, v25.8h
        saba            v19.8h, v26.8h, v24.8h
        sub             w4, w4, #2

        // The last loaded rows become the current rows of the next iteration.
        mov             v0.16b, v20.16b
        mov             v1.16b, v21.16b
        cmp             w4, #2
        mov             v2.16b, v22.16b
        mov             v3.16b, v23.16b

        b.ge            1b

// remainder check: skip the single-row loop when no row pair is left
2:
        cbz             w4, 4f

// iterate by one
3:
        ld1             {v4.16b}, [x1], x3
        subs            w4, w4, #1
        ld1             {v6.16b}, [x2], x3
        ext             v5.16b, v4.16b, v4.16b, #1              // x1 + stride + 1
        usubl           v31.8h, v0.8b, v4.8b
        ext             v7.16b, v6.16b, v6.16b, #1              // x2 + stride + 1

        usubl2          v30.8h, v0.16b, v4.16b
        usubl           v29.8h, v1.8b, v5.8b
        usubl2          v28.8h, v1.16b, v5.16b
        saba            v16.8h, v31.8h, v29.8h
        saba            v17.8h, v30.8h, v28.8h
        usubl           v27.8h, v2.8b, v6.8b
        usubl2          v26.8h, v2.16b, v6.16b
        usubl           v25.8h, v3.8b, v7.8b
        usubl2          v24.8h, v3.16b, v7.16b
        saba            v18.8h, v27.8h, v25.8h
        saba            v19.8h, v26.8h, v24.8h

        mov             v0.16b, v4.16b
        mov             v1.16b, v5.16b
        mov             v2.16b, v6.16b
        mov             v3.16b, v7.16b

        cbnz            w4, 3b

// reduce the accumulators to the final score
4:
        sqsub           v17.8h, v17.8h, v19.8h                  // pix1 - pix2, lanes 8..15
        sqsub           v16.8h, v16.8h, v18.8h                  // pix1 - pix2, lanes 0..7
        // Lane 15 was built from the ext wrap-around byte (x + 1 == 16 wraps
        // to byte 0), so it is dropped from the sum.
        ins             v17.h[7], wzr
        sqadd           v16.8h, v16.8h, v17.8h
        saddlv          s16, v16.8h                             // horizontal add -> score2
        sqabs           s16, s16                                // |score2|
        fmov            w0, s16

        mul             w0, w0, w5                              // |score2| * multiplier
        add             w0, w0, w9                              // + score1

        ret
endfunc