aarch64/vvc: Add dmvr_hv

dmvr_hv_8_12x20_c:                                       8.0 ( 1.00x)
dmvr_hv_8_12x20_neon:                                    1.2 ( 6.62x)
dmvr_hv_8_20x12_c:                                       8.0 ( 1.00x)
dmvr_hv_8_20x12_neon:                                    0.9 ( 8.37x)
dmvr_hv_8_20x20_c:                                      12.9 ( 1.00x)
dmvr_hv_8_20x20_neon:                                    1.7 ( 7.62x)
dmvr_hv_10_12x20_c:                                      7.0 ( 1.00x)
dmvr_hv_10_12x20_neon:                                   1.7 ( 4.09x)
dmvr_hv_10_20x12_c:                                      7.0 ( 1.00x)
dmvr_hv_10_20x12_neon:                                   1.7 ( 4.09x)
dmvr_hv_10_20x20_c:                                     11.2 ( 1.00x)
dmvr_hv_10_20x20_neon:                                   2.7 ( 4.15x)
dmvr_hv_12_12x20_c:                                      6.5 ( 1.00x)
dmvr_hv_12_12x20_neon:                                   1.7 ( 3.79x)
dmvr_hv_12_20x12_c:                                      6.5 ( 1.00x)
dmvr_hv_12_20x12_neon:                                   1.7 ( 3.79x)
dmvr_hv_12_20x20_c:                                     10.2 ( 1.00x)
dmvr_hv_12_20x20_neon:                                   2.2 ( 4.64x)

Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
This commit is contained in:
Zhao Zhili 2024-09-29 20:02:43 +08:00 committed by Nuo Mi
parent 0ba9e8d0d4
commit bcd65ebd8f
2 changed files with 319 additions and 0 deletions

View File

@ -85,6 +85,15 @@ W_AVG_FUN(8)
W_AVG_FUN(10)
W_AVG_FUN(12)
/*
 * Declare the prototype of one NEON DMVR routine. The pasted symbol name is
 * ff_vvc_dmvr_<kind><depth>_neon, e.g. DMVR_FUN(hv_, 8) declares
 * ff_vvc_dmvr_hv_8_neon. The routines themselves are written in assembly.
 */
#define DMVR_FUN(kind, depth)                                                 \
    void ff_vvc_dmvr_ ## kind ## depth ## _neon(int16_t *dst,                 \
        const uint8_t *_src, ptrdiff_t _src_stride, int height,               \
        intptr_t mx, intptr_t my, int width);

/* Horizontal + vertical DMVR filter, one declaration per supported bit depth. */
DMVR_FUN(hv_, 8)
DMVR_FUN(hv_, 10)
DMVR_FUN(hv_, 12)
void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
{
int cpu_flags = av_get_cpu_flags();
@ -157,6 +166,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.avg = ff_vvc_avg_8_neon;
c->inter.w_avg = vvc_w_avg_8;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
@ -198,12 +208,14 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
} else if (bd == 10) {
c->inter.avg = ff_vvc_avg_10_neon;
c->inter.w_avg = vvc_w_avg_10;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
} else if (bd == 12) {
c->inter.avg = ff_vvc_avg_12_neon;
c->inter.w_avg = vvc_w_avg_12;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;

View File

@ -226,3 +226,310 @@ vvc_avg avg, 12
vvc_avg w_avg, 8
vvc_avg w_avg, 10
vvc_avg w_avg, 12
/*
 * void ff_vvc_dmvr_hv_8_neon(int16_t *dst, const uint8_t *_src,
 *                            ptrdiff_t _src_stride, int height,
 *                            intptr_t mx, intptr_t my, int width);
 *
 * 8-bit DMVR filter applied horizontally and then vertically.
 *
 * Each source row is first filtered horizontally with the two taps read from
 * ff_vvc_inter_luma_dmvr_filters[mx]:
 *     h[x] = (fx[0] * src[x] + fx[1] * src[x + 1] + offset1) >> (8 - 6)
 * and kept in one of two row buffers on the stack. Starting with the second
 * source row, the previous and the current filtered rows are combined with
 * the two taps read from ff_vvc_inter_luma_dmvr_filters[my]:
 *     dst[x] = (fy[0] * prev[x] + fy[1] * cur[x] + offset2) >> 4
 * height + 1 source rows are consumed to produce height output rows; dst
 * advances by VVC_MAX_PB_SIZE * 2 bytes per output row.
 *
 * x0: int16_t *dst
 * x1: const uint8_t *_src
 * x2: ptrdiff_t _src_stride
 * w3: int height
 * x4: intptr_t mx
 * x5: intptr_t my
 * w6: int width               (only 8 + 4 and 16 + 4 are handled)
 *
 * Scratch registers used: x6-x15, v0-v7, v16, v17, v30, v31.
 * Stack usage: VVC_MAX_PB_SIZE * 4 bytes, released before returning.
 *
 * The .req aliases declared here stay in effect for the 10/12-bit variant
 * that follows and are released at the end of ff_vvc_dmvr_hv_10_neon.
 */
function ff_vvc_dmvr_hv_8_neon, export=1
        dst             .req x0
        src             .req x1
        src_stride      .req x2
        height          .req w3
        mx              .req x4
        my              .req x5
        width           .req w6
        tmp0            .req x7
        tmp1            .req x8

        // Reserve two row buffers of VVC_MAX_PB_SIZE * 2 bytes each.
        sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)

        movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
        add             x12, x9, mx, lsl #1             // each table entry is two bytes
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        mov             tmp0, sp                        // tmp0: previous filtered row
        add             tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2) // tmp1: current filtered row
        // We know the values are positive
        dup             v0.8h, w10                      // filter_x[0]
        dup             v1.8h, w11                      // filter_x[1]

        add             x12, x9, my, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        sxtw            x6, w6                          // width as a 64-bit value
        movi            v30.8h, #(1 << (8 - 7))         // offset1
        movi            v31.8h, #8                      // offset2
        dup             v2.8h, w10                      // filter_y[0]
        dup             v3.8h, w11                      // filter_y[1]

        // Valid value for width can only be 8 + 4, 16 + 4
        cmp             width, #16
        mov             w10, #0                         // start filter_y or not
        add             height, height, #1              // one extra source row is needed
        // The first pass writes no output, yet dst is still advanced at the
        // end of it, so start one row early.
        sub             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        // src is post-incremented by width bytes per row; keep only the rest.
        sub             src_stride, src_stride, x6
        cset            w15, gt                         // width > 16
1:
        mov             x12, tmp0                       // read cursor, previous row
        mov             x13, tmp1                       // write cursor, current row
        mov             x14, dst                        // write cursor, output row
        cbz             w15, 2f

        // width > 16: first 16 columns
        ldur            q5, [src, #1]                   // src[x + 1]
        ldr             q4, [src], #16                  // src[x]
        uxtl            v7.8h, v5.8b
        uxtl2           v17.8h, v5.16b
        uxtl            v6.8h, v4.8b
        uxtl2           v16.8h, v4.16b
        mul             v6.8h, v6.8h, v0.8h
        mul             v16.8h, v16.8h, v0.8h
        mla             v6.8h, v7.8h, v1.8h
        mla             v16.8h, v17.8h, v1.8h
        add             v6.8h, v6.8h, v30.8h
        add             v16.8h, v16.8h, v30.8h
        ushr            v6.8h, v6.8h, #(8 - 6)
        ushr            v7.8h, v16.8h, #(8 - 6)
        stp             q6, q7, [x13], #32

        cbz             w10, 3f                         // first row: no vertical pass yet

        ldp             q16, q17, [x12], #32
        mul             v16.8h, v16.8h, v2.8h
        mul             v17.8h, v17.8h, v2.8h
        mla             v16.8h, v6.8h, v3.8h
        mla             v17.8h, v7.8h, v3.8h
        add             v16.8h, v16.8h, v31.8h
        add             v17.8h, v17.8h, v31.8h
        ushr            v16.8h, v16.8h, #4
        ushr            v17.8h, v17.8h, #4
        stp             q16, q17, [x14], #32
        b               3f
2:
        // width > 8: first 8 columns
        ldur            d5, [src, #1]                   // src[x + 1]
        ldr             d4, [src], #8                   // src[x]
        uxtl            v7.8h, v5.8b
        uxtl            v6.8h, v4.8b
        mul             v6.8h, v6.8h, v0.8h
        mla             v6.8h, v7.8h, v1.8h
        add             v6.8h, v6.8h, v30.8h
        ushr            v6.8h, v6.8h, #(8 - 6)
        str             q6, [x13], #16

        cbz             w10, 3f                         // first row: no vertical pass yet

        ldr             q16, [x12], #16
        mul             v16.8h, v16.8h, v2.8h
        mla             v16.8h, v6.8h, v3.8h
        add             v16.8h, v16.8h, v31.8h
        ushr            v16.8h, v16.8h, #4
        str             q16, [x14], #16
3:
        // Last 4 columns, shared by both widths. The offset of 1 is not a
        // multiple of the 4-byte access size, so the unscaled ldur form is
        // requested explicitly, as in the wider loads above.
        ldur            s5, [src, #1]                   // src[x + 1]
        ldr             s4, [src], #4                   // src[x]
        uxtl            v7.8h, v5.8b
        uxtl            v6.8h, v4.8b
        mul             v6.4h, v6.4h, v0.4h
        mla             v6.4h, v7.4h, v1.4h
        add             v6.4h, v6.4h, v30.4h
        ushr            v6.4h, v6.4h, #(8 - 6)
        str             d6, [x13], #8

        cbz             w10, 4f                         // first row: no vertical pass yet

        ldr             d16, [x12], #8
        mul             v16.4h, v16.4h, v2.4h
        mla             v16.4h, v6.4h, v3.4h
        add             v16.4h, v16.4h, v31.4h
        ushr            v16.4h, v16.4h, #4
        str             d16, [x14], #8
4:
        subs            height, height, #1
        mov             w10, #1                         // vertical pass enabled from now on
        add             src, src, src_stride
        add             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        // Swap the two row buffers; eor leaves the flags from subs intact.
        eor             tmp0, tmp0, tmp1
        eor             tmp1, tmp0, tmp1
        eor             tmp0, tmp0, tmp1
        b.ne            1b

        add             sp, sp, #(VVC_MAX_PB_SIZE * 4)
        ret
endfunc
/*
 * 12-bit entry point of the horizontal + vertical DMVR filter.
 * Only the bit-depth dependent constants are set up here; execution then
 * continues at local label 0 inside ff_vvc_dmvr_hv_10_neon, which holds the
 * code shared by the 10-bit and 12-bit variants.
 */
function ff_vvc_dmvr_hv_12_neon, export=1
        movi            v30.4s, #(1 << (12 - 7))        // offset1
        movi            v29.4s, #(12 - 6)               // shift1, negated by the shared code
        b               0f
endfunc
/*
 * 10-bit entry point of the horizontal + vertical DMVR filter. Everything
 * from local label 0 onwards is shared with ff_vvc_dmvr_hv_12_neon, which
 * branches here after loading its own v29 / v30 constants.
 *
 * Arguments are the same as for ff_vvc_dmvr_hv_8_neon (x0 dst, x1 _src,
 * x2 _src_stride, w3 height, x4 mx, x5 my, w6 width); the register aliases
 * declared in that function are still in effect and are released at the end
 * of this one. Source samples are 16 bits wide here.
 *
 * On entry to label 0:
 *   v29.4s = shift1  = bit depth - 6
 *   v30.4s = offset1 = 1 << (bit depth - 7)
 *
 * Per row:  h[x]   = (fx[0] * src[x] + fx[1] * src[x + 1] + offset1) >> shift1
 * From the second row on:
 *           dst[x] = (fy[0] * prev[x] + fy[1] * cur[x] + offset2) >> 4
 * Products are accumulated in 32 bits and narrowed with saturation.
 *
 * Scratch registers used: x6-x16, v0-v7, v16-v20, v29-v31.
 * Stack usage: VVC_MAX_PB_SIZE * 4 bytes, released before returning.
 */
function ff_vvc_dmvr_hv_10_neon, export=1
        movi            v29.4s, #(10 - 6)
        movi            v30.4s, #(1 << (10 - 7))        // offset1
0:
        movi            v31.4s, #8                      // offset2
        // ushl with a negative per-lane count performs a right shift.
        neg             v29.4s, v29.4s

        // Reserve two row buffers of VVC_MAX_PB_SIZE * 2 bytes each.
        sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)

        movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
        add             x12, x9, mx, lsl #1             // each table entry is two bytes
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        mov             tmp0, sp                        // tmp0: previous filtered row
        add             tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2) // tmp1: current filtered row
        // We know the values are positive
        dup             v0.8h, w10                      // filter_x[0]
        dup             v1.8h, w11                      // filter_x[1]

        add             x12, x9, my, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        sxtw            x6, w6                          // width as a 64-bit value
        dup             v2.8h, w10                      // filter_y[0]
        dup             v3.8h, w11                      // filter_y[1]

        // Valid value for width can only be 8 + 4, 16 + 4
        cmp             width, #16
        mov             w10, #0                         // start filter_y or not
        add             height, height, #1              // one extra source row is needed
        // The first pass writes no output, yet dst is still advanced at the
        // end of it, so start one row early.
        sub             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        // src is post-incremented by width * 2 bytes per row; keep only the rest.
        sub             src_stride, src_stride, x6, lsl #1
        cset            w15, gt                         // width > 16
1:
        mov             x12, tmp0                       // read cursor, previous row
        mov             x13, tmp1                       // write cursor, current row
        mov             x14, dst                        // write cursor, output row
        cbz             w15, 2f

        // width > 16: first 16 columns
        add             x16, src, #2                    // address of src[x + 1]
        ldp             q6, q16, [src], #32             // src[x]
        ldp             q7, q17, [x16]                  // src[x + 1]
        umull           v4.4s, v6.4h, v0.4h
        umull2          v5.4s, v6.8h, v0.8h
        umull           v18.4s, v16.4h, v0.4h
        umull2          v19.4s, v16.8h, v0.8h
        umlal           v4.4s, v7.4h, v1.4h
        umlal2          v5.4s, v7.8h, v1.8h
        umlal           v18.4s, v17.4h, v1.4h
        umlal2          v19.4s, v17.8h, v1.8h

        add             v4.4s, v4.4s, v30.4s
        add             v5.4s, v5.4s, v30.4s
        add             v18.4s, v18.4s, v30.4s
        add             v19.4s, v19.4s, v30.4s
        ushl            v4.4s, v4.4s, v29.4s
        ushl            v5.4s, v5.4s, v29.4s
        ushl            v18.4s, v18.4s, v29.4s
        ushl            v19.4s, v19.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        uqxtn2          v6.8h, v5.4s
        uqxtn           v7.4h, v18.4s
        uqxtn2          v7.8h, v19.4s
        stp             q6, q7, [x13], #32

        cbz             w10, 3f                         // first row: no vertical pass yet

        ldp             q4, q5, [x12], #32
        umull           v17.4s, v4.4h, v2.4h
        umull2          v18.4s, v4.8h, v2.8h
        umull           v19.4s, v5.4h, v2.4h
        umull2          v20.4s, v5.8h, v2.8h
        umlal           v17.4s, v6.4h, v3.4h
        umlal2          v18.4s, v6.8h, v3.8h
        umlal           v19.4s, v7.4h, v3.4h
        umlal2          v20.4s, v7.8h, v3.8h
        add             v17.4s, v17.4s, v31.4s
        add             v18.4s, v18.4s, v31.4s
        add             v19.4s, v19.4s, v31.4s
        add             v20.4s, v20.4s, v31.4s
        ushr            v17.4s, v17.4s, #4
        ushr            v18.4s, v18.4s, #4
        ushr            v19.4s, v19.4s, #4
        ushr            v20.4s, v20.4s, #4
        uqxtn           v6.4h, v17.4s
        uqxtn2          v6.8h, v18.4s
        uqxtn           v7.4h, v19.4s
        uqxtn2          v7.8h, v20.4s
        stp             q6, q7, [x14], #32
        b               3f
2:
        // width > 8: first 8 columns
        ldur            q7, [src, #2]                   // src[x + 1]
        ldr             q6, [src], #16                  // src[x]
        umull           v4.4s, v6.4h, v0.4h
        umull2          v5.4s, v6.8h, v0.8h
        umlal           v4.4s, v7.4h, v1.4h
        umlal2          v5.4s, v7.8h, v1.8h

        add             v4.4s, v4.4s, v30.4s
        add             v5.4s, v5.4s, v30.4s
        ushl            v4.4s, v4.4s, v29.4s
        ushl            v5.4s, v5.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        uqxtn2          v6.8h, v5.4s
        str             q6, [x13], #16

        cbz             w10, 3f                         // first row: no vertical pass yet

        ldr             q16, [x12], #16
        umull           v17.4s, v16.4h, v2.4h
        umull2          v18.4s, v16.8h, v2.8h
        umlal           v17.4s, v6.4h, v3.4h
        umlal2          v18.4s, v6.8h, v3.8h
        add             v17.4s, v17.4s, v31.4s
        add             v18.4s, v18.4s, v31.4s
        ushr            v17.4s, v17.4s, #4
        ushr            v18.4s, v18.4s, #4
        uqxtn           v16.4h, v17.4s
        uqxtn2          v16.8h, v18.4s
        str             q16, [x14], #16
3:
        // Last 4 columns, shared by both widths. The offset of 2 is not a
        // multiple of the 8-byte access size, so the unscaled ldur form is
        // requested explicitly, as in the 8-column path above.
        ldur            d7, [src, #2]                   // src[x + 1]
        ldr             d6, [src], #8                   // src[x]
        umull           v4.4s, v7.4h, v1.4h
        umlal           v4.4s, v6.4h, v0.4h
        add             v4.4s, v4.4s, v30.4s
        ushl            v4.4s, v4.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        str             d6, [x13], #8

        cbz             w10, 4f                         // first row: no vertical pass yet

        ldr             d16, [x12], #8
        umull           v17.4s, v16.4h, v2.4h
        umlal           v17.4s, v6.4h, v3.4h
        add             v17.4s, v17.4s, v31.4s
        ushr            v17.4s, v17.4s, #4
        uqxtn           v16.4h, v17.4s
        str             d16, [x14], #8
4:
        subs            height, height, #1
        mov             w10, #1                         // vertical pass enabled from now on
        add             src, src, src_stride
        add             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        // Swap the two row buffers; eor leaves the flags from subs intact.
        eor             tmp0, tmp0, tmp1
        eor             tmp1, tmp0, tmp1
        eor             tmp0, tmp0, tmp1
        b.ne            1b

        add             sp, sp, #(VVC_MAX_PB_SIZE * 4)
        ret

        // Release the aliases declared in ff_vvc_dmvr_hv_8_neon.
.unreq dst
.unreq src
.unreq src_stride
.unreq height
.unreq mx
.unreq my
.unreq width
.unreq tmp0
.unreq tmp1
endfunc