lavc/aarch64: hevc_add_res add 12bit variants

hevc_add_res_4x4_12_c: 46.0
hevc_add_res_4x4_12_neon: 18.7
hevc_add_res_8x8_12_c: 194.7
hevc_add_res_8x8_12_neon: 25.2
hevc_add_res_16x16_12_c: 716.0
hevc_add_res_16x16_12_neon: 69.7
hevc_add_res_32x32_12_c: 3820.7
hevc_add_res_32x32_12_neon: 261.0

Signed-off-by: J. Dekker <jdek@itanimul.li>
This commit is contained in:
J. Dekker 2022-08-16 07:01:53 +02:00
parent 48be6616d0
commit ce2f47318b
2 changed files with 102 additions and 70 deletions

View File

@ -5,7 +5,7 @@
*
* Ported from arm/hevcdsp_idct_neon.S by
* Copyright (c) 2020 Reimar Döffinger
* Copyright (c) 2020 Josh Dekker
* Copyright (c) 2020 J. Dekker
*
* This file is part of FFmpeg.
*
@ -37,11 +37,11 @@ const trans, align=4
.short 31, 22, 13, 4
endconst
.macro clip10 in1, in2, c1, c2
smax \in1, \in1, \c1
smax \in2, \in2, \c1
smin \in1, \in1, \c2
smin \in2, \in2, \c2
// clip2 in1, in2, min, max
// Clamp two vector registers lane-wise (signed compare) to [min, max].
//   in1, in2 : vectors to clamp, modified in place
//   min, max : vectors holding the lower / upper bound in every lane
.macro clip2 in1, in2, min, max
smax \in1, \in1, \min // raise lanes below the lower bound
smax \in2, \in2, \min
smin \in1, \in1, \max // lower lanes above the upper bound
smin \in2, \in2, \max
.endm
function ff_hevc_add_residual_4x4_8_neon, export=1
@ -64,25 +64,6 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
ret
endfunc
// void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs,
//                                       ptrdiff_t stride)
// Adds a 4x4 block of 16-bit residuals to the destination block in place and
// clamps every result to [0, 0x3FF].
//   x0 = dst, x1 = coeffs, x2 = stride (added to the row address, i.e. bytes)
// Uses x12 and v0-v5 as scratch.
function ff_hevc_add_residual_4x4_10_neon, export=1
mov x12, x0 // x12 = read cursor; x0 is kept for the stores
ld1 {v0.8h-v1.8h}, [x1] // all 16 residuals: rows 0-1 in v0, rows 2-3 in v1
ld1 {v2.d}[0], [x12], x2 // dst row 0 (4 samples of 16 bits)
ld1 {v2.d}[1], [x12], x2 // dst row 1
ld1 {v3.d}[0], [x12], x2 // dst row 2
sqadd v0.8h, v0.8h, v2.8h // rows 0-1: residual + dst, signed saturating
ld1 {v3.d}[1], [x12], x2 // dst row 3
movi v4.8h, #0 // lower clamp bound
sqadd v1.8h, v1.8h, v3.8h // rows 2-3: residual + dst, signed saturating
mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF: upper clamp bound
clip10 v0.8h, v1.8h, v4.8h, v5.8h
st1 {v0.d}[0], [x0], x2 // write rows 0..3 back, advancing x0 by stride
st1 {v0.d}[1], [x0], x2
st1 {v1.d}[0], [x0], x2
st1 {v1.d}[1], [x0], x2
ret
endfunc
function ff_hevc_add_residual_8x8_8_neon, export=1
add x12, x0, x2
add x2, x2, x2
@ -103,25 +84,6 @@ function ff_hevc_add_residual_8x8_8_neon, export=1
ret
endfunc
// void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, const int16_t *coeffs,
//                                       ptrdiff_t stride)
// Adds an 8x8 block of 16-bit residuals to the destination block in place and
// clamps every result to [0, 0x3FF]. Two rows are processed per iteration.
//   x0 = dst, x1 = coeffs, x2 = stride (added to the row address, i.e. bytes)
// Uses x3, x12, v0-v5 as scratch; modifies x2 and the condition flags.
function ff_hevc_add_residual_8x8_10_neon, export=1
add x12, x0, x2 // x12 = second row; x0 / x12 walk alternate rows
add x2, x2, x2 // step both row pointers by two rows
mov x3, #8 // rows remaining
movi v4.8h, #0 // lower clamp bound
mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF: upper clamp bound
1: subs x3, x3, #2 // two rows consumed per pass; flags feed the bne below
ld1 {v0.8h-v1.8h}, [x1], #32 // residuals for two rows (2 x 8 halfwords)
ld1 {v2.8h}, [x0] // dst row addressed by x0
sqadd v0.8h, v0.8h, v2.8h // residual + dst, signed saturating
ld1 {v3.8h}, [x12] // dst row addressed by x12
sqadd v1.8h, v1.8h, v3.8h
clip10 v0.8h, v1.8h, v4.8h, v5.8h
st1 {v0.8h}, [x0], x2 // store and advance each pointer by 2 * stride
st1 {v1.8h}, [x12], x2
bne 1b
ret
endfunc
function ff_hevc_add_residual_16x16_8_neon, export=1
mov x3, #16
add x12, x0, x2
@ -148,28 +110,6 @@ function ff_hevc_add_residual_16x16_8_neon, export=1
ret
endfunc
// void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, const int16_t *coeffs,
//                                         ptrdiff_t stride)
// Adds a 16x16 block of 16-bit residuals to the destination block in place and
// clamps every result to [0, 0x3FF]. Two rows are processed per iteration.
//   x0 = dst, x1 = coeffs, x2 = stride (added to the row address, i.e. bytes)
// Uses x3, x12, v0-v3, v16-v21 as scratch; modifies x2 and the condition flags.
function ff_hevc_add_residual_16x16_10_neon, export=1
mov x3, #16 // rows remaining
movi v20.8h, #0 // lower clamp bound
mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF: upper clamp bound
add x12, x0, x2 // x12 = second row; x0 / x12 walk alternate rows
add x2, x2, x2 // step both row pointers by two rows
1: subs x3, x3, #2 // two rows consumed per pass; flags feed the bne below
ld1 {v16.8h-v17.8h}, [x0] // dst row addressed by x0 (16 halfwords)
ld1 {v0.8h-v3.8h}, [x1], #64 // residuals for two rows (2 x 16 halfwords)
sqadd v0.8h, v0.8h, v16.8h // residual + dst, signed saturating
ld1 {v18.8h-v19.8h}, [x12] // dst row addressed by x12
sqadd v1.8h, v1.8h, v17.8h
sqadd v2.8h, v2.8h, v18.8h
sqadd v3.8h, v3.8h, v19.8h
clip10 v0.8h, v1.8h, v20.8h, v21.8h
clip10 v2.8h, v3.8h, v20.8h, v21.8h
st1 {v0.8h-v1.8h}, [x0], x2 // store and advance each pointer by 2 * stride
st1 {v2.8h-v3.8h}, [x12], x2
bne 1b
ret
endfunc
function ff_hevc_add_residual_32x32_8_neon, export=1
add x12, x0, x2
add x2, x2, x2
@ -209,10 +149,88 @@ function ff_hevc_add_residual_32x32_8_neon, export=1
ret
endfunc
function ff_hevc_add_residual_32x32_10_neon, export=1
// add_res bitdepth
// Emits the four exported ff_hevc_add_residual_<size>_<bitdepth>_neon entry
// points (4x4, 8x8, 16x16, 32x32). Each one only fills every halfword lane of
// v21 with the upper clamp bound and then branches to the matching
// hevc_add_residual_<size>_16_neon routine, which reads v21.
//
// mvni writes the bitwise NOT of (imm8 << 8) to each halfword, so
//   bitdepth 10: imm8 = 0xFC -> v21 lanes = 0x03FF
//   bitdepth 12: imm8 = 0xF0 -> v21 lanes = 0x0FFF
.macro add_res bitdepth
function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1
mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 // v21 = upper clamp bound
b hevc_add_residual_4x4_16_neon // branch without link: the callee's ret returns to our caller
endfunc
function ff_hevc_add_residual_8x8_\bitdepth\()_neon, export=1
mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
b hevc_add_residual_8x8_16_neon
endfunc
function ff_hevc_add_residual_16x16_\bitdepth\()_neon, export=1
mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
b hevc_add_residual_16x16_16_neon
endfunc
function ff_hevc_add_residual_32x32_\bitdepth\()_neon, export=1
mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
b hevc_add_residual_32x32_16_neon
endfunc
.endm
// Instantiate the 10-bit and 12-bit entry points.
add_res 10
add_res 12
// Shared 4x4 add-residual routine for samples stored as 16-bit values.
// Reached by a branch from the ff_hevc_add_residual_4x4_<bitdepth>_neon
// wrappers generated by add_res.
//   x0  = dst, x1 = coeffs, x2 = stride (added to the row address, i.e. bytes)
//   v21 = upper clamp bound in every halfword lane (read, not written here)
// Results are clamped to [0, v21]. Uses x12 and v0-v4 as scratch.
function hevc_add_residual_4x4_16_neon, export=0
mov x12, x0 // x12 = read cursor; x0 is kept for the stores
ld1 {v0.8h-v1.8h}, [x1] // all 16 residuals: rows 0-1 in v0, rows 2-3 in v1
ld1 {v2.d}[0], [x12], x2 // dst row 0 (4 samples of 16 bits)
ld1 {v2.d}[1], [x12], x2 // dst row 1
ld1 {v3.d}[0], [x12], x2 // dst row 2
sqadd v0.8h, v0.8h, v2.8h // rows 0-1: residual + dst, signed saturating
ld1 {v3.d}[1], [x12], x2 // dst row 3
movi v4.8h, #0 // lower clamp bound
sqadd v1.8h, v1.8h, v3.8h // rows 2-3: residual + dst, signed saturating
clip2 v0.8h, v1.8h, v4.8h, v21.8h
st1 {v0.d}[0], [x0], x2 // write rows 0..3 back, advancing x0 by stride
st1 {v0.d}[1], [x0], x2
st1 {v1.d}[0], [x0], x2
st1 {v1.d}[1], [x0], x2
ret
endfunc
// Shared 8x8 add-residual routine for samples stored as 16-bit values.
// Reached by a branch from the ff_hevc_add_residual_8x8_<bitdepth>_neon
// wrappers generated by add_res. Two rows are processed per iteration.
//   x0  = dst, x1 = coeffs, x2 = stride (added to the row address, i.e. bytes)
//   v21 = upper clamp bound in every halfword lane (read, not written here)
// Results are clamped to [0, v21].
// Uses x3, x12, v0-v4 as scratch; modifies x2 and the condition flags.
function hevc_add_residual_8x8_16_neon, export=0
add x12, x0, x2 // x12 = second row; x0 / x12 walk alternate rows
add x2, x2, x2 // step both row pointers by two rows
mov x3, #8 // rows remaining
movi v4.8h, #0 // lower clamp bound
1: subs x3, x3, #2 // two rows consumed per pass; flags feed the bne below
ld1 {v0.8h-v1.8h}, [x1], #32 // residuals for two rows (2 x 8 halfwords)
ld1 {v2.8h}, [x0] // dst row addressed by x0
sqadd v0.8h, v0.8h, v2.8h // residual + dst, signed saturating
ld1 {v3.8h}, [x12] // dst row addressed by x12
sqadd v1.8h, v1.8h, v3.8h
clip2 v0.8h, v1.8h, v4.8h, v21.8h
st1 {v0.8h}, [x0], x2 // store and advance each pointer by 2 * stride
st1 {v1.8h}, [x12], x2
bne 1b
ret
endfunc
// Shared 16x16 add-residual routine for samples stored as 16-bit values.
// Reached by a branch from the ff_hevc_add_residual_16x16_<bitdepth>_neon
// wrappers generated by add_res. Two rows are processed per iteration.
//   x0  = dst, x1 = coeffs, x2 = stride (added to the row address, i.e. bytes)
//   v21 = upper clamp bound in every halfword lane (read, not written here)
// Results are clamped to [0, v21].
// Uses x3, x12, v0-v3, v16-v20 as scratch; modifies x2 and the condition flags.
function hevc_add_residual_16x16_16_neon, export=0
mov x3, #16 // rows remaining
movi v20.8h, #0 // lower clamp bound
add x12, x0, x2 // x12 = second row; x0 / x12 walk alternate rows
add x2, x2, x2 // step both row pointers by two rows
1: subs x3, x3, #2 // two rows consumed per pass; flags feed the bne below
ld1 {v16.8h-v17.8h}, [x0] // dst row addressed by x0 (16 halfwords)
ld1 {v0.8h-v3.8h}, [x1], #64 // residuals for two rows (2 x 16 halfwords)
sqadd v0.8h, v0.8h, v16.8h // residual + dst, signed saturating
ld1 {v18.8h-v19.8h}, [x12] // dst row addressed by x12
sqadd v1.8h, v1.8h, v17.8h
sqadd v2.8h, v2.8h, v18.8h
sqadd v3.8h, v3.8h, v19.8h
clip2 v0.8h, v1.8h, v20.8h, v21.8h
clip2 v2.8h, v3.8h, v20.8h, v21.8h
st1 {v0.8h-v1.8h}, [x0], x2 // store and advance each pointer by 2 * stride
st1 {v2.8h-v3.8h}, [x12], x2
bne 1b
ret
endfunc
function hevc_add_residual_32x32_16_neon, export=0
mov x3, #32
movi v20.8h, #0
mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF
1: subs x3, x3, #1
ld1 {v0.8h -v3.8h}, [x1], #64
ld1 {v16.8h-v19.8h}, [x0]
@ -220,8 +238,8 @@ function ff_hevc_add_residual_32x32_10_neon, export=1
sqadd v1.8h, v1.8h, v17.8h
sqadd v2.8h, v2.8h, v18.8h
sqadd v3.8h, v3.8h, v19.8h
clip10 v0.8h, v1.8h, v20.8h, v21.8h
clip10 v2.8h, v3.8h, v20.8h, v21.8h
clip2 v0.8h, v1.8h, v20.8h, v21.8h
clip2 v2.8h, v3.8h, v20.8h, v21.8h
st1 {v0.8h-v3.8h}, [x0], x2
bne 1b
ret

View File

@ -29,18 +29,26 @@ void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
/* Prototypes of the NEON add_residual routines. The function name carries the
 * block size (4x4 .. 32x32) followed by the bit depth (8, 10 or 12). */
void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_4x4_12_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_8x8_12_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_16x16_12_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
@ -100,4 +108,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_neon;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon;
}
if (bit_depth == 12) {
c->add_residual[0] = ff_hevc_add_residual_4x4_12_neon;
c->add_residual[1] = ff_hevc_add_residual_8x8_12_neon;
c->add_residual[2] = ff_hevc_add_residual_16x16_12_neon;
c->add_residual[3] = ff_hevc_add_residual_32x32_12_neon;
}
}