mirror of https://git.ffmpeg.org/ffmpeg.git
aarch64: hevc: Produce epel_uni_w_hv functions for both neon and i8mm
AWS Graviton 3:
put_hevc_epel_uni_w_hv4_8_c: 191.2
put_hevc_epel_uni_w_hv4_8_neon: 87.7
put_hevc_epel_uni_w_hv4_8_i8mm: 83.2
put_hevc_epel_uni_w_hv6_8_c: 349.5
put_hevc_epel_uni_w_hv6_8_neon: 153.0
put_hevc_epel_uni_w_hv6_8_i8mm: 148.5
put_hevc_epel_uni_w_hv8_8_c: 581.2
put_hevc_epel_uni_w_hv8_8_neon: 166.7
put_hevc_epel_uni_w_hv8_8_i8mm: 163.5
put_hevc_epel_uni_w_hv12_8_c: 1230.0
put_hevc_epel_uni_w_hv12_8_neon: 387.7
put_hevc_epel_uni_w_hv12_8_i8mm: 370.2
put_hevc_epel_uni_w_hv16_8_c: 2003.2
put_hevc_epel_uni_w_hv16_8_neon: 501.5
put_hevc_epel_uni_w_hv16_8_i8mm: 490.2
put_hevc_epel_uni_w_hv24_8_c: 4448.7
put_hevc_epel_uni_w_hv24_8_neon: 1092.2
put_hevc_epel_uni_w_hv24_8_i8mm: 1069.7
put_hevc_epel_uni_w_hv32_8_c: 7817.2
put_hevc_epel_uni_w_hv32_8_neon: 1916.2
put_hevc_epel_uni_w_hv32_8_i8mm: 1829.5
put_hevc_epel_uni_w_hv48_8_c: 16728.2
put_hevc_epel_uni_w_hv48_8_neon: 4263.7
put_hevc_epel_uni_w_hv48_8_i8mm: 4342.7
put_hevc_epel_uni_w_hv64_8_c: 29563.2
put_hevc_epel_uni_w_hv64_8_neon: 7474.2
put_hevc_epel_uni_w_hv64_8_i8mm: 7128.5

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent: d7294199ab
commit: 96e5adda9f
|
@ -3573,10 +3573,8 @@ function hevc_put_hevc_epel_uni_w_hv24_8_end_neon
|
|||
ret
|
||||
endfunc
|
||||
|
||||
#if HAVE_I8MM
|
||||
ENABLE_I8MM
|
||||
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
|
||||
.macro epel_uni_w_hv suffix
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv4_8_\suffix, export=1
|
||||
epel_uni_w_hv_start
|
||||
sxtw x4, w4
|
||||
|
||||
|
@ -3591,14 +3589,14 @@ function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
|
|||
mov x2, x3
|
||||
add x3, x4, #3
|
||||
mov x4, x5
|
||||
bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
|
||||
bl X(ff_hevc_put_hevc_epel_h4_8_\suffix)
|
||||
ldp x4, x6, [sp, #16]
|
||||
ldp x0, x1, [sp, #32]
|
||||
ldr x30, [sp], #48
|
||||
b hevc_put_hevc_epel_uni_w_hv4_8_end_neon
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv6_8_neon_i8mm, export=1
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv6_8_\suffix, export=1
|
||||
epel_uni_w_hv_start
|
||||
sxtw x4, w4
|
||||
|
||||
|
@ -3613,14 +3611,14 @@ function ff_hevc_put_hevc_epel_uni_w_hv6_8_neon_i8mm, export=1
|
|||
mov x2, x3
|
||||
add x3, x4, #3
|
||||
mov x4, x5
|
||||
bl X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
|
||||
bl X(ff_hevc_put_hevc_epel_h6_8_\suffix)
|
||||
ldp x4, x6, [sp, #16]
|
||||
ldp x0, x1, [sp, #32]
|
||||
ldr x30, [sp], #48
|
||||
b hevc_put_hevc_epel_uni_w_hv6_8_end_neon
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv8_8_neon_i8mm, export=1
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv8_8_\suffix, export=1
|
||||
epel_uni_w_hv_start
|
||||
sxtw x4, w4
|
||||
|
||||
|
@ -3635,14 +3633,14 @@ function ff_hevc_put_hevc_epel_uni_w_hv8_8_neon_i8mm, export=1
|
|||
mov x2, x3
|
||||
add x3, x4, #3
|
||||
mov x4, x5
|
||||
bl X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
|
||||
bl X(ff_hevc_put_hevc_epel_h8_8_\suffix)
|
||||
ldp x4, x6, [sp, #16]
|
||||
ldp x0, x1, [sp, #32]
|
||||
ldr x30, [sp], #48
|
||||
b hevc_put_hevc_epel_uni_w_hv8_8_end_neon
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv12_8_neon_i8mm, export=1
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv12_8_\suffix, export=1
|
||||
epel_uni_w_hv_start
|
||||
sxtw x4, w4
|
||||
|
||||
|
@ -3657,14 +3655,14 @@ function ff_hevc_put_hevc_epel_uni_w_hv12_8_neon_i8mm, export=1
|
|||
mov x2, x3
|
||||
add x3, x4, #3
|
||||
mov x4, x5
|
||||
bl X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
|
||||
bl X(ff_hevc_put_hevc_epel_h12_8_\suffix)
|
||||
ldp x4, x6, [sp, #16]
|
||||
ldp x0, x1, [sp, #32]
|
||||
ldr x30, [sp], #48
|
||||
b hevc_put_hevc_epel_uni_w_hv12_8_end_neon
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm, export=1
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv16_8_\suffix, export=1
|
||||
epel_uni_w_hv_start
|
||||
sxtw x4, w4
|
||||
|
||||
|
@ -3679,14 +3677,14 @@ function ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm, export=1
|
|||
mov x2, x3
|
||||
add x3, x4, #3
|
||||
mov x4, x5
|
||||
bl X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
|
||||
bl X(ff_hevc_put_hevc_epel_h16_8_\suffix)
|
||||
ldp x4, x6, [sp, #16]
|
||||
ldp x0, x1, [sp, #32]
|
||||
ldr x30, [sp], #48
|
||||
b hevc_put_hevc_epel_uni_w_hv16_8_end_neon
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv24_8_\suffix, export=1
|
||||
epel_uni_w_hv_start
|
||||
sxtw x4, w4
|
||||
|
||||
|
@ -3701,14 +3699,14 @@ function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
|
|||
mov x2, x3
|
||||
add x3, x4, #3
|
||||
mov x4, x5
|
||||
bl X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
|
||||
bl X(ff_hevc_put_hevc_epel_h24_8_\suffix)
|
||||
ldp x4, x6, [sp, #16]
|
||||
ldp x0, x1, [sp, #32]
|
||||
ldr x30, [sp], #48
|
||||
b hevc_put_hevc_epel_uni_w_hv24_8_end_neon
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv32_8_\suffix, export=1
|
||||
ldp x15, x16, [sp]
|
||||
mov x17, #16
|
||||
stp x15, x16, [sp, #-96]!
|
||||
|
@ -3718,7 +3716,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
|
|||
stp x5, x6, [sp, #64]
|
||||
stp x17, x7, [sp, #80]
|
||||
|
||||
bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
|
||||
bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_\suffix)
|
||||
ldp x0, x30, [sp, #16]
|
||||
ldp x1, x2, [sp, #32]
|
||||
ldp x3, x4, [sp, #48]
|
||||
|
@ -3730,13 +3728,13 @@ function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
|
|||
mov x17, #16
|
||||
stp x15, x16, [sp, #-32]!
|
||||
stp x17, x30, [sp, #16]
|
||||
bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
|
||||
bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_\suffix)
|
||||
ldp x17, x30, [sp, #16]
|
||||
ldp x15, x16, [sp], #32
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv48_8_neon_i8mm, export=1
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv48_8_\suffix, export=1
|
||||
ldp x15, x16, [sp]
|
||||
mov x17, #24
|
||||
stp x15, x16, [sp, #-96]!
|
||||
|
@ -3745,7 +3743,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv48_8_neon_i8mm, export=1
|
|||
stp x3, x4, [sp, #48]
|
||||
stp x5, x6, [sp, #64]
|
||||
stp x17, x7, [sp, #80]
|
||||
bl X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
|
||||
bl X(ff_hevc_put_hevc_epel_uni_w_hv24_8_\suffix)
|
||||
ldp x0, x30, [sp, #16]
|
||||
ldp x1, x2, [sp, #32]
|
||||
ldp x3, x4, [sp, #48]
|
||||
|
@ -3757,13 +3755,13 @@ function ff_hevc_put_hevc_epel_uni_w_hv48_8_neon_i8mm, export=1
|
|||
mov x17, #24
|
||||
stp x15, x16, [sp, #-32]!
|
||||
stp x17, x30, [sp, #16]
|
||||
bl X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
|
||||
bl X(ff_hevc_put_hevc_epel_uni_w_hv24_8_\suffix)
|
||||
ldp x17, x30, [sp, #16]
|
||||
ldp x15, x16, [sp], #32
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1
|
||||
function ff_hevc_put_hevc_epel_uni_w_hv64_8_\suffix, export=1
|
||||
ldp x15, x16, [sp]
|
||||
mov x17, #32
|
||||
stp x15, x16, [sp, #-96]!
|
||||
|
@ -3773,7 +3771,7 @@ function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1
|
|||
stp x5, x6, [sp, #64]
|
||||
stp x17, x7, [sp, #80]
|
||||
|
||||
bl X(ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm)
|
||||
bl X(ff_hevc_put_hevc_epel_uni_w_hv32_8_\suffix)
|
||||
ldp x0, x30, [sp, #16]
|
||||
ldp x1, x2, [sp, #32]
|
||||
ldp x3, x4, [sp, #48]
|
||||
|
@ -3785,11 +3783,19 @@ function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1
|
|||
mov x17, #32
|
||||
stp x15, x16, [sp, #-32]!
|
||||
stp x17, x30, [sp, #16]
|
||||
bl X(ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm)
|
||||
bl X(ff_hevc_put_hevc_epel_uni_w_hv32_8_\suffix)
|
||||
ldp x17, x30, [sp, #16]
|
||||
ldp x15, x16, [sp], #32
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
epel_uni_w_hv neon
|
||||
|
||||
#if HAVE_I8MM
|
||||
ENABLE_I8MM
|
||||
|
||||
epel_uni_w_hv neon_i8mm
|
||||
|
||||
DISABLE_I8MM
|
||||
#endif
|
||||
|
|
|
@ -278,6 +278,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
|
|||
int height, int denom, int wx, int ox,
|
||||
intptr_t mx, intptr_t my, int width), _i8mm);
|
||||
|
||||
NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride,
|
||||
const uint8_t *_src, ptrdiff_t _srcstride,
|
||||
int height, int denom, int wx, int ox,
|
||||
intptr_t mx, intptr_t my, int width),);
|
||||
|
||||
NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride,
|
||||
const uint8_t *_src, ptrdiff_t _srcstride,
|
||||
int height, int denom, int wx, int ox,
|
||||
|
@ -417,6 +422,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
|
|||
|
||||
NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv,);
|
||||
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv,);
|
||||
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv,);
|
||||
|
||||
if (have_i8mm(cpu_flags)) {
|
||||
NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
|
||||
|
|
Loading…
Reference in New Issue