codec/aarch64/hevc: add transform_luma_neon

got 56% speed up (run_count=1000, CPU=Cortex A53)
transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103

Signed-off-by: xufuji456 <839789740@qq.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
xufuji456 2023-04-13 21:34:47 +08:00 committed by Martin Storsjö
parent 4eaaa38d3d
commit bd2f00f665
2 changed files with 50 additions and 0 deletions

View File

@ -842,6 +842,54 @@ tr_32x4 secondpass_10, 20 - 10
idct_32x32 8
idct_32x32 10
.macro tr4_luma_shift r0, r1, r2, r3, shift
saddl v0.4s, \r0, \r2 // c0 = src0 + src2
saddl v1.4s, \r2, \r3 // c1 = src2 + src3
ssubl v2.4s, \r0, \r3 // c2 = src0 - src3
smull v3.4s, \r1, v21.4h // c3 = 74 * src1
saddl v7.4s, \r0, \r3 // src0 + src3
ssubw v7.4s, v7.4s, \r2 // src0 - src2 + src3
mul v7.4s, v7.4s, v18.4s // dst2 = 74 * (src0 - src2 + src3)
mul v5.4s, v0.4s, v19.4s // 29 * c0
mul v6.4s, v1.4s, v20.4s // 55 * c1
add v5.4s, v5.4s, v6.4s // 29 * c0 + 55 * c1
add v5.4s, v5.4s, v3.4s // dst0 = 29 * c0 + 55 * c1 + c3
mul v1.4s, v1.4s, v19.4s // 29 * c1
mul v6.4s, v2.4s, v20.4s // 55 * c2
sub v6.4s, v6.4s, v1.4s // 55 * c2 - 29 * c1
add v6.4s, v6.4s, v3.4s // dst1 = 55 * c2 - 29 * c1 + c3
mul v0.4s, v0.4s, v20.4s // 55 * c0
mul v2.4s, v2.4s, v19.4s // 29 * c2
add v0.4s, v0.4s, v2.4s // 55 * c0 + 29 * c2
sub v0.4s, v0.4s, v3.4s // dst3 = 55 * c0 + 29 * c2 - c3
sqrshrn \r0, v5.4s, \shift
sqrshrn \r1, v6.4s, \shift
sqrshrn \r2, v7.4s, \shift
sqrshrn \r3, v0.4s, \shift
.endm
function ff_hevc_transform_luma_4x4_neon_8, export=1
ld1 {v28.4h-v31.4h}, [x0]
movi v18.4s, #74
movi v19.4s, #29
movi v20.4s, #55
movi v21.4h, #74
tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
st1 {v28.4h-v31.4h}, [x0]
ret
endfunc
// void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
.macro idct_dc size, bitdepth
function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1

View File

@ -78,6 +78,7 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
ptrdiff_t stride_dst, ptrdiff_t stride_src,
const int16_t *sao_offset_val, int sao_left_class,
@ -146,6 +147,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon;
c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
c->sao_band_filter[0] =
c->sao_band_filter[1] =
c->sao_band_filter[2] =