lavc/aarch64: Provide optimized implementation of vsse8 for arm64.

Performance comparison results are shown below (roughly a 4.4x speedup).
- vsse_1_c: 141.5
- vsse_1_neon: 32.5

Benchmarks and tests were run with the checkasm tool on AWS Graviton 3.
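
For reference, vsse scores how much the row-to-row change of one block
deviates from that of the other, summed as squared differences. Below is a
minimal sketch of the scalar semantics the NEON code has to match, modelled
on the generic C version in libavcodec/me_cmp.c (the helper name vsse8_ref
is only used here):

#include <stddef.h>
#include <stdint.h>

#define SQ(a) ((a) * (a))

static int vsse8_ref(const uint8_t *s1, const uint8_t *s2,
                     ptrdiff_t stride, int h)
{
    int score = 0;

    for (int y = 1; y < h; y++) {          // h - 1 iterations, as in the asm
        for (int x = 0; x < 8; x++)
            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }
    return score;
}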

Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
Author: Grzegorz Bernacki <gjb@semihalf.com>
Date: 2022-10-03 16:10:18 +02:00
Committed by: Martin Storsjö
parent faea56c9c7
commit bad67cb9fd
2 files changed, 75 insertions(+), 0 deletions(-)

libavcodec/aarch64/me_cmp_init_aarch64.c

@@ -71,6 +71,9 @@ int nsse8_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@@ -96,6 +99,8 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsad[5] = vsad_intra8_neon;
c->vsse[0] = vsse16_neon;
c->vsse[1] = vsse8_neon;
c->vsse[4] = vsse_intra16_neon;
c->nsse[0] = nsse16_neon_wrapper;

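For illustration, a sketch of how the dispatched pointer ends up being
called; mecc, avctx, blk1 and blk2 are hypothetical placeholders, while
MECmpContext, ff_me_cmp_init() and the me_cmp_func signature come from
libavcodec/me_cmp.h:

MECmpContext mecc;
ff_me_cmp_init(&mecc, avctx);  // installs the NEON pointers when available
// Index 1 selects the 8x8 variant. The MpegEncContext argument is unused
// by vsse (see "x0 unused" below), so NULL is fine in this sketch.
int score = mecc.vsse[1](NULL, blk1, blk2, stride, 8);
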
libavcodec/aarch64/me_cmp_neon.S

@@ -838,6 +838,76 @@ function vsad16_neon, export=1
ret
endfunc
function vsse8_neon, export=1
// x0 unused
// x1 uint8_t *pix1
// x2 uint8_t *pix2
// x3 ptrdiff_t stride
// w4 int h
ld1 {v0.8b}, [x1], x3 // Load pix1[0], first iteration
ld1 {v1.8b}, [x2], x3 // Load pix2[0], first iteration
sub w4, w4, #1 // we need to make h-1 iterations
movi v16.4s, #0
movi v17.4s, #0
cmp w4, #3 // check if we can make 3 iterations at once
usubl v31.8h, v0.8b, v1.8b // Signed difference of pix1[0] - pix2[0], first iteration
b.lt 2f
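// Unrolled main loop: each pass loads three new rows. v31 carries the
// widened pix1 - pix2 difference of the previous row, so every row is
// loaded and subtracted exactly once.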
1:
// x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
// res = (x) * (x)
ld1 {v0.8b}, [x1], x3 // Load pix1[0 + stride], first iteration
ld1 {v1.8b}, [x2], x3 // Load pix2[0 + stride], first iteration
ld1 {v2.8b}, [x1], x3 // Load pix1[0 + stride], second iteration
ld1 {v3.8b}, [x2], x3 // Load pix2[0 + stride], second iteration
usubl v29.8h, v0.8b, v1.8b // Difference of pix1 - pix2, first iteration
ld1 {v4.8b}, [x1], x3 // Load pix1[0 + stride], third iteration
ld1 {v5.8b}, [x2], x3 // Load pix2[0 + stride], third iteration
sabd v31.8h, v31.8h, v29.8h // Absolute difference of row deltas, first iteration
usubl v27.8h, v2.8b, v3.8b // Difference of pix1 - pix2, second iteration
usubl v25.8h, v4.8b, v5.8b // Difference of pix1 - pix2, third iteration
sabd v29.8h, v29.8h, v27.8h // Absolute difference of row deltas, second iteration
sabd v27.8h, v27.8h, v25.8h // Absolute difference of row deltas, third iteration
umlal v16.4s, v31.4h, v31.4h // Accumulate squares, first iteration (low half)
umlal2 v17.4s, v31.8h, v31.8h // Accumulate squares, first iteration (high half)
mov v31.16b, v25.16b // Carry the third row's difference into the next iteration
umlal v16.4s, v29.4h, v29.4h // Accumulate squares, second iteration
umlal2 v17.4s, v29.8h, v29.8h
sub w4, w4, #3 // h -= 3
umlal v16.4s, v27.4h, v27.4h // Accumulate squares, third iteration
umlal2 v17.4s, v27.8h, v27.8h
cmp w4, #3 // check if we can make 3 more iterations
b.ge 1b
cbz w4, 3f // no rows left, go straight to the reduction
// handle any remaining rows one at a time
2:
ld1 {v0.8b}, [x1], x3 // Load next row of pix1
ld1 {v1.8b}, [x2], x3 // Load next row of pix2
subs w4, w4, #1
usubl v29.8h, v0.8b, v1.8b // Difference of pix1 - pix2
sabd v31.8h, v31.8h, v29.8h // Absolute difference of row deltas
umlal v16.4s, v31.4h, v31.4h // Accumulate squares (low half)
umlal2 v17.4s, v31.8h, v31.8h // Accumulate squares (high half)
mov v31.16b, v29.16b // Carry this row's difference into the next iteration
b.ne 2b
3:
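// Reduction: fold the two partial accumulators together, then do a
// widening horizontal add across the four 32-bit lanes; the final sum
// is returned in w0.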
add v16.4s, v16.4s, v17.4s
uaddlv d17, v16.4s
fmov w0, s17
ret
endfunc
function vsse16_neon, export=1
// x0 unused
// x1 uint8_t *pix1