lavc/aarch64: Add neon implementation for vsad8_intra

Provide an optimized NEON implementation of the vsad8_intra function.

Performance comparison tests are shown below.
- vsad_5_c: 94.7
- vsad_5_neon: 20.7

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Hubert Mazur 2022-09-20 13:01:57 +02:00 committed by Martin Storsjö
parent 0ee535b1db
commit e9a6170213
2 changed files with 45 additions and 0 deletions

View File

@ -45,6 +45,8 @@ int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
ptrdiff_t stride, int h) ;
int vsad_intra8_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
ptrdiff_t stride, int h) ;
int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
@ -75,6 +77,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsad[0] = vsad16_neon;
c->vsad[4] = vsad_intra16_neon;
c->vsad[5] = vsad_intra8_neon;
c->vsse[0] = vsse16_neon;
c->vsse[4] = vsse_intra16_neon;

View File

@ -1047,3 +1047,45 @@ function pix_median_abs16_neon, export=1
ret
endfunc
function vsad_intra8_neon, export=1
        // Vertical SAD within a single 8-pixel-wide block: for every pair of
        // vertically adjacent rows, sum abs(pix1[x] - pix1[x + stride]) over
        // the 8 columns, and add up the results of all h - 1 row pairs.
        //
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *dummy
        // x3           ptrdiff_t stride
        // w4           int h
        //
        // Returns the score in w0.
        // Clobbers x1, w4, v0-v3, v16, v17 and the condition flags.
        //
        // The score is gathered in eight 16-bit lanes (v16.8h); each row pair
        // adds at most 255 per lane, so the lanes cannot wrap for h <= 258.

        subs            w4, w4, #1              // w4 = h - 1 = row pairs left to process
        movi            v16.8h, #0              // clear the lane accumulators
        b.le            3f                      // h <= 1: nothing to compare, score is 0
                                                // (and no row is read at all)
        ld1             {v0.8b}, [x1], x3       // v0 = row 0, the first "previous" row
        cmp             w4, #3
        b.lt            2f                      // fewer than 3 pairs: single-step loop only

// make 3 iterations at once
1:
        // v = abs( pix1[0] - pix1[0 + stride] )
        // score = sum(v)
        ld1             {v1.8b}, [x1], x3
        sub             w4, w4, #3
        ld1             {v2.8b}, [x1], x3
        uabal           v16.8h, v0.8b, v1.8b    // += |row n     - row n + 1|
        ld1             {v3.8b}, [x1], x3
        uabal           v16.8h, v1.8b, v2.8b    // += |row n + 1 - row n + 2|
        cmp             w4, #3
        mov             v0.8b, v3.8b            // last loaded row is the next "previous" row
        uabal           v16.8h, v2.8b, v3.8b    // += |row n + 2 - row n + 3|
        b.ge            1b
        cbz             w4, 3f                  // no leftover pairs

// iterate by one; invariant: w4 > 0 on entry, v0 holds the previous row
2:
        ld1             {v1.8b}, [x1], x3
        subs            w4, w4, #1
        uabal           v16.8h, v0.8b, v1.8b
        mov             v0.8b, v1.8b            // does not touch the flags set by subs
        b.ne            2b

3:
        uaddlv          s17, v16.8h             // widen and add the eight lanes into 32 bits
        fmov            w0, s17
        ret
endfunc