lavc/aarch64: h264, add chroma loop filters for 10bit

Benchmarks:                                             A53     A72
h264_h_loop_filter_chroma422_10bpp_c:                  282.7   114.2
h264_h_loop_filter_chroma422_10bpp_neon:               109.5    78.5
h264_h_loop_filter_chroma_10bpp_c:                     165.0    81.5
h264_h_loop_filter_chroma_10bpp_neon:                  120.0    76.7
h264_h_loop_filter_chroma_intra422_10bpp_c:            323.7   124.2
h264_h_loop_filter_chroma_intra422_10bpp_neon:         155.0   102.7
h264_h_loop_filter_chroma_intra_10bpp_c:               121.0    49.5
h264_h_loop_filter_chroma_intra_10bpp_neon:             79.7    53.7
h264_h_loop_filter_chroma_mbaff422_10bpp_c:            188.5    75.0
h264_h_loop_filter_chroma_mbaff422_10bpp_neon:         120.0    75.5
h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c:      116.7    46.0
h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon:    79.7    53.7
h264_h_loop_filter_chroma_mbaff_intra_10bpp_c:          63.0    27.2
h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon:       48.5    34.0
h264_v_loop_filter_chroma_10bpp_c:                     258.7   135.5
h264_v_loop_filter_chroma_10bpp_neon:                   71.2    51.0
h264_v_loop_filter_chroma_intra_10bpp_c:               158.0    70.7
h264_v_loop_filter_chroma_intra_10bpp_neon:             48.7    31.5

Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Mikhail Nitenko 2021-08-20 00:08:00 +03:00 committed by Martin Storsjö
parent 756d2e087a
commit 43ca887bc2
2 changed files with 292 additions and 0 deletions

View File

@ -83,6 +83,29 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[5 * 8]);
/*
 * 10-bit NEON loop-filter entry points.
 *
 * Every function takes the pixel pointer, the line stride and the alpha/beta
 * thresholds; the non-intra variants additionally take a tc0 pointer.
 * These are declarations only: the bodies live in the accompanying NEON
 * assembly file.
 */

/* Luma, normal and intra. */
void ff_h264_v_loop_filter_luma_neon_10(uint8_t *pix, ptrdiff_t stride,
                                        int alpha, int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_neon_10(uint8_t *pix, ptrdiff_t stride,
                                        int alpha, int beta, int8_t *tc0);
void ff_h264_v_loop_filter_luma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta);
void ff_h264_h_loop_filter_luma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta);

/* Chroma, normal (tc0 driven). */
void ff_h264_v_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride,
                                          int alpha, int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride,
                                          int alpha, int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma422_neon_10(uint8_t *pix, ptrdiff_t stride,
                                             int alpha, int beta, int8_t *tc0);

/* Chroma, intra. */
void ff_h264_v_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
                                                int alpha, int beta);
void ff_h264_h_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
                                                int alpha, int beta);
void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
                                                   int alpha, int beta);
void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
                                                      int alpha, int beta);
av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
@ -125,5 +148,19 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
c->h264_idct8_add = ff_h264_idct8_add_neon;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
} else if (have_neon(cpu_flags) && bit_depth == 10) {
/* NEON is available and bit_depth is 10: install the *_neon_10 chroma
 * loop filters. Only chroma loop-filter slots are assigned in this branch. */
/* The vertical filters are assigned without looking at chroma_format_idc. */
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon_10;
c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon_10;
if (chroma_format_idc <= 1) {
/* chroma_format_idc 0 or 1 (presumably monochrome / 4:2:0 -- confirm
 * against the H.264 definition): plain horizontal variants.
 * h264_h_loop_filter_chroma_mbaff is not assigned on this path. */
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon_10;
c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10;
} else {
/* Any other chroma_format_idc: the chroma422 routines fill the regular
 * horizontal slots, while both mbaff slots reuse the non-422 routines. */
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon_10;
c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon_10;
c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10;
c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
}
}
}

View File

@ -819,3 +819,258 @@ endfunc
// Expand the weight_func macro three times, with arguments 16, 8 and 4.
weight_func 16
weight_func 8
weight_func 4
// Common entry sequence of the 10-bit normal (tc0 driven) loop filters.
//
// In:   w2 = alpha, w3 = beta, x4 = pointer to four int8 tc0 values
// Out:  w2 = alpha << 2, w3 = beta << 2
//       w6 = the four tc0 bytes, v24.s[0] = the same four bytes
// Clob: w8, flags
//
// Executes ret on behalf of the enclosing function when there is nothing
// to filter: alpha == 0, beta == 0, or all four tc0 values negative.
.macro  h264_loop_filter_start_10
        cmp             w2,  #0
        ldr             w6,  [x4]
        // alpha != 0: Z = (beta == 0).
        // alpha == 0: flags come from the immediate; #4 sets Z so that the
        // b.eq below leaves in this case as well (with #0 it fell through
        // and did the whole load/compare work just to find an empty mask).
        ccmp            w3,  #0,  #4,  ne
        lsl             w2,  w2,  #2
        mov             v24.s[0], w6
        lsl             w3,  w3,  #2
        and             w8,  w6,  w6,  lsl #16  // plain and: flags stay intact
        b.eq            1f
        ands            w8,  w8,  w8,  lsl #8   // bit 31 = AND of all four tc0 sign bits
        b.ge            2f                      // some tc0 >= 0: go on filtering
1:
        ret
2:
.endm
// Common entry sequence of the 10-bit intra loop filters.
//
// In:   w2 = alpha, w3 = beta
// Out:  v30.8h = alpha << 2 in every lane, v31.8h = beta << 2 in every lane
//       (w2 / w3 keep the shifted values)
// Clob: w4
//
// Executes ret on behalf of the enclosing function when alpha and beta are
// both zero.
.macro  h264_loop_filter_start_intra_10
        orr             w4,  w2,  w3            // zero only if alpha == beta == 0
        cbnz            w4,  1f
        ret
1:
        lsl             w2,  w2,  #2
        dup             v30.8h, w2              // alpha
        lsl             w3,  w3,  #2
        dup             v31.8h, w3              // beta
.endm
// Core of the 10-bit normal chroma deblocking filter, eight 16-bit samples
// per register.
//
// In:   v18 = p1, v16 = p0, v0 = q0, v2 = q1
//       w2 = alpha threshold, w3 = beta threshold (used as passed in)
//       v24 = tc0, one value per byte in bytes 0..3
// Out:  v16 = filtered p0, v0 = filtered q0, both clamped to [0, 1023]
// Clob: v4, v5, v22-v26, v28, v30, v31, x8, x9, flags
//
// Branches forward to local label 9, which the user of the macro has to
// place after the expansion, when no lane passes the alpha/beta tests.
.macro  h264_loop_filter_chroma_10
        dup             v22.8h, w2              // alpha
        dup             v23.8h, w3              // beta
        uxtl            v24.8h, v24.8b          // tc0: bytes -> halfwords

        uabd            v26.8h, v16.8h, v0.8h   // abs(p0 - q0)
        uabd            v28.8h, v18.8h, v16.8h  // abs(p1 - p0)
        uabd            v30.8h, v2.8h,  v0.8h   // abs(q1 - q0)
        cmhi            v26.8h, v22.8h, v26.8h  // < alpha
        cmhi            v28.8h, v23.8h, v28.8h  // < beta
        cmhi            v30.8h, v23.8h, v30.8h  // < beta

        and             v26.16b, v26.16b, v28.16b
        sub             v4.8h,  v0.8h,  v16.8h  // q0 - p0
        and             v26.16b, v26.16b, v30.16b       // v26 = per-lane filter mask
        shl             v4.8h,  v4.8h,  #2      // (q0 - p0) << 2
        mov             x8,  v26.d[0]
        mov             x9,  v26.d[1]
        sli             v24.8h, v24.8h, #8      // duplicate each tc0 byte inside its halfword
        uxtl            v24.8h, v24.8b          // widen again: each tc0 now covers two lanes
        add             v4.8h,  v4.8h,  v18.8h  // + p1
        adds            x8,  x8,  x9            // zero only if the whole mask is zero
        shl             v24.8h, v24.8h, #2      // tc0 << 2

        b.eq            9f                      // no lane to filter

        movi            v31.8h, #3
        uqsub           v24.8h, v24.8h, v31.8h  // tc = ((tc0 - 1) << (BIT_DEPTH - 8)) + 1,
                                                // saturating at 0
        sub             v4.8h,  v4.8h,  v2.8h   // - q1
        srshr           v4.8h,  v4.8h,  #3      // rounding >> 3
        smin            v4.8h,  v4.8h,  v24.8h  // delta = min(delta,  tc)
        neg             v25.8h, v24.8h
        smax            v4.8h,  v4.8h,  v25.8h  // delta = max(delta, -tc)
        and             v4.16b, v4.16b, v26.16b // no change in lanes that failed the tests
        add             v16.8h, v16.8h, v4.8h   // p0 += delta
        sub             v0.8h,  v0.8h,  v4.8h   // q0 -= delta

        mvni            v4.8h,  #0xFC, lsl #8   // 1023 for clipping
        movi            v5.8h,  #0
        smin            v0.8h,  v0.8h,  v4.8h
        smin            v16.8h, v16.8h, v4.8h
        smax            v0.8h,  v0.8h,  v5.8h
        smax            v16.8h, v16.8h, v5.8h
.endm
// void ff_h264_v_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride,
//                                           int alpha, int beta, int8_t *tc0)
//
// x0 = pix, x1 = stride in bytes, w2 = alpha, w3 = beta, x4 = tc0
//
// Filters across the horizontal edge just above the row at pix, eight
// 16-bit samples wide: reads the two rows above pix and the two rows
// starting at pix, writes back the row above pix and the row at pix.
function ff_h264_v_loop_filter_chroma_neon_10, export=1
        h264_loop_filter_start_10               // may return early

        mov             x10, x0                 // x10 -> q0 row (pix)
        sub             x0,  x0,  x1,  lsl #1   // x0  -> p1 row (pix - 2 * stride)
        ld1             {v18.8h}, [x0],  x1     // p1
        ld1             {v16.8h}, [x0],  x1     // p0
        ld1             {v0.8h},  [x10], x1     // q0
        ld1             {v2.8h},  [x10]         // q1

        h264_loop_filter_chroma_10              // jumps to 9 if nothing passes

        sub             x0,  x10, x1,  lsl #1   // x10 = pix + stride, so x0 -> p0 row
        st1             {v16.8h}, [x0], x1      // p0
        st1             {v0.8h},  [x0], x1      // q0
9:
        ret
endfunc
// void ff_h264_h_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride,
//                                           int alpha, int beta, int8_t *tc0)
//
// x0 = pix, x1 = stride in bytes, w2 = alpha, w3 = beta, x4 = tc0
//
// Filters across the vertical edge at pix over eight rows. Each row
// contributes four 16-bit samples, starting two samples left of pix.
//
// h_loop_filter_chroma420_10 is a second entry point used by the 4:2:2
// variant. It expects: x0 = first row address minus 4 bytes, x1 = distance
// between processed rows, w2 / w3 = thresholds as left by
// h264_loop_filter_start_10, v24.s[0] = tc0 bytes, x30 = return address.
function ff_h264_h_loop_filter_chroma_neon_10, export=1
        h264_loop_filter_start_10               // may return early

        sub             x0,  x0,  #4            // access the 2nd left pixel
h_loop_filter_chroma420_10:
        add             x10, x0,  x1,  lsl #2   // x10 -> fifth row
        // Rows 0-3 land in the low halves, rows 4-7 in the high halves.
        ld1             {v18.d}[0], [x0],  x1
        ld1             {v18.d}[1], [x10], x1
        ld1             {v16.d}[0], [x0],  x1
        ld1             {v16.d}[1], [x10], x1
        ld1             {v0.d}[0],  [x0],  x1
        ld1             {v0.d}[1],  [x10], x1
        ld1             {v2.d}[0],  [x0],  x1
        ld1             {v2.d}[1],  [x10], x1

        // Rows -> columns, so that v18/v16/v0/v2 become p1/p0/q0/q1.
        transpose_4x8H  v18, v16, v0,  v2,  v28, v29, v30, v31

        h264_loop_filter_chroma_10              // jumps to 9 if nothing passes

        // Back to row order for the stores.
        transpose_4x8H  v18, v16, v0,  v2,  v28, v29, v30, v31

        sub             x0,  x10, x1,  lsl #3   // x10 ran eight rows past the start: rewind
        st1             {v18.d}[0], [x0], x1
        st1             {v16.d}[0], [x0], x1
        st1             {v0.d}[0],  [x0], x1
        st1             {v2.d}[0],  [x0], x1
        st1             {v18.d}[1], [x0], x1
        st1             {v16.d}[1], [x0], x1
        st1             {v0.d}[1],  [x0], x1
        st1             {v2.d}[1],  [x0], x1
9:
        ret
endfunc
// void ff_h264_h_loop_filter_chroma422_neon_10(uint8_t *pix, ptrdiff_t stride,
//                                              int alpha, int beta, int8_t *tc0)
//
// x0 = pix, x1 = stride in bytes, w2 = alpha, w3 = beta, x4 = tc0
//
// Covers sixteen rows by running the eight-row body at
// h_loop_filter_chroma420_10 twice with a doubled stride: first over the
// even rows, then over the odd rows.
// x5, w6 and x7 have to survive the first pass; the eight-row body does not
// write to them.
function ff_h264_h_loop_filter_chroma422_neon_10, export=1
        h264_loop_filter_start_10               // may return early; w6 = tc0 bytes

        mov             x7,  x30                // bl below overwrites the link register
        add             x5,  x0,  x1            // x5 = pix + stride (row 1)
        lsl             x1,  x1,  #1            // step two rows at a time
        sub             x0,  x0,  #4            // two samples left of the edge
        bl              h_loop_filter_chroma420_10      // rows 0, 2, ..., 14

        sub             x0,  x5,  #4            // restart from row 1
        mov             v24.s[0], w6            // v24 was consumed by the first pass
        mov             x30, x7
        b               h_loop_filter_chroma420_10      // rows 1, 3, ..., 15; returns to our caller
endfunc
// Core of the 10-bit intra chroma deblocking filter, eight 16-bit samples
// per register.
//
// In:   v18 = p1, v16 = p0, v17 = q0, v19 = q1
//       v30.8h = alpha threshold, v31.8h = beta threshold
// Out:  v16 = (2*p1 + p0 + q1 + 2) >> 2 and v17 = (2*q1 + q0 + p1 + 2) >> 2
//       in the lanes that pass the alpha/beta tests, untouched elsewhere
// Clob: v4, v6, v20, v22, v24-v28, x2, x3, flags
//
// Branches forward to local label 9, which the user of the macro has to
// place after the expansion, when no lane passes the tests.
.macro  h264_loop_filter_chroma_intra_10
        uabd            v26.8h, v16.8h, v17.8h  // abs(p0 - q0)
        uabd            v27.8h, v18.8h, v16.8h  // abs(p1 - p0)
        uabd            v28.8h, v19.8h, v17.8h  // abs(q1 - q0)
        cmhi            v26.8h, v30.8h, v26.8h  // < alpha
        cmhi            v27.8h, v31.8h, v27.8h  // < beta
        cmhi            v28.8h, v31.8h, v28.8h  // < beta
        and             v26.16b, v26.16b, v27.16b
        and             v26.16b, v26.16b, v28.16b       // v26 = per-lane filter mask

        shl             v4.8h,  v18.8h, #1      // 2 * p1
        shl             v6.8h,  v19.8h, #1      // 2 * q1

        mov             x2,  v26.d[0]
        mov             x3,  v26.d[1]
        adds            x2,  x2,  x3            // zero only if the whole mask is zero
        b.eq            9f

        add             v20.8h, v16.8h, v19.8h  // p0 + q1
        add             v22.8h, v17.8h, v18.8h  // q0 + p1
        add             v20.8h, v20.8h, v4.8h   // + 2 * p1
        add             v22.8h, v22.8h, v6.8h   // + 2 * q1
        urshr           v24.8h, v20.8h, #2      // rounding >> 2
        urshr           v25.8h, v22.8h, #2
        bit             v16.16b, v24.16b, v26.16b       // take the new p0 where the mask is set
        bit             v17.16b, v25.16b, v26.16b       // take the new q0 where the mask is set
.endm
// void ff_h264_v_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
//                                                 int alpha, int beta)
//
// x0 = pix, x1 = stride in bytes, w2 = alpha, w3 = beta
//
// Intra filter across the horizontal edge just above the row at pix, eight
// 16-bit samples wide: reads the two rows above pix and the two rows
// starting at pix, writes back the row above pix and the row at pix.
function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1
        h264_loop_filter_start_intra_10         // may return early

        mov             x9,  x0                 // x9 -> q0 row (pix)
        sub             x0,  x0,  x1,  lsl #1   // x0 -> p1 row (pix - 2 * stride)
        ld1             {v18.8h}, [x0], x1      // p1
        ld1             {v16.8h}, [x0], x1      // p0
        ld1             {v17.8h}, [x9], x1      // q0
        ld1             {v19.8h}, [x9]          // q1

        h264_loop_filter_chroma_intra_10        // jumps to 9 if nothing passes

        sub             x0,  x9,  x1,  lsl #1   // x9 = pix + stride, so x0 -> p0 row
        st1             {v16.8h}, [x0], x1      // p0
        st1             {v17.8h}, [x0], x1      // q0
9:
        ret
endfunc
// void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix,
//                                                       ptrdiff_t stride,
//                                                       int alpha, int beta)
//
// x0 = pix, x1 = stride in bytes, w2 = alpha, w3 = beta
//
// Intra filter across the vertical edge at pix over four rows.
// NOTE(review): every ld1 below fetches eight halfwords per row, while the
// stores only write lanes 0-3 of v16/v17. The extra samples cannot reach
// memory, but they do take part in the skip test inside the filter macro.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
        h264_loop_filter_start_intra_10         // may return early

        sub             x4,  x0,  #4            // load from two samples left of the edge
        sub             x0,  x0,  #2            // store from one sample left of the edge
        add             x9,  x4,  x1,  lsl #1   // x9 -> third row
        ld1             {v18.8h}, [x4], x1      // row 0
        ld1             {v16.8h}, [x4], x1      // row 1
        ld1             {v17.8h}, [x9], x1      // row 2
        ld1             {v19.8h}, [x9], x1      // row 3

        // Rows -> columns, so that v18/v16/v17/v19 become p1/p0/q0/q1.
        transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra_10        // jumps to 9 if nothing passes

        // One (p0, q0) pair per row.
        st2             {v16.h,v17.h}[0], [x0], x1
        st2             {v16.h,v17.h}[1], [x0], x1
        st2             {v16.h,v17.h}[2], [x0], x1
        st2             {v16.h,v17.h}[3], [x0], x1
9:
        ret
endfunc
// void ff_h264_h_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
//                                                 int alpha, int beta)
//
// x0 = pix, x1 = stride in bytes, w2 = alpha, w3 = beta
//
// Intra filter across the vertical edge at pix over eight rows.
//
// h_loop_filter_chroma420_intra_10 is a second entry point used by the
// 4:2:2 variant. It expects: x4 = first row address minus 4 bytes,
// x0 = first row address minus 2 bytes, x1 = stride, v30 / v31 = thresholds
// as left by h264_loop_filter_start_intra_10, x30 = return address.
// On return x9 sits eight rows past the initial x4.
function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1
        h264_loop_filter_start_intra_10         // may return early

        sub             x4,  x0,  #4            // load from two samples left of the edge
        sub             x0,  x0,  #2            // store from one sample left of the edge
h_loop_filter_chroma420_intra_10:
        add             x9,  x4,  x1,  lsl #2   // x9 -> fifth row
        // Rows 0-3 land in the low halves, rows 4-7 in the high halves.
        ld1             {v18.4h},   [x4], x1
        ld1             {v18.d}[1], [x9], x1
        ld1             {v16.4h},   [x4], x1
        ld1             {v16.d}[1], [x9], x1
        ld1             {v17.4h},   [x4], x1
        ld1             {v17.d}[1], [x9], x1
        ld1             {v19.4h},   [x4], x1
        ld1             {v19.d}[1], [x9], x1

        // Rows -> columns, so that v18/v16/v17/v19 become p1/p0/q0/q1.
        transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra_10        // jumps to 9 if nothing passes

        // One (p0, q0) pair per row.
        st2             {v16.h,v17.h}[0], [x0], x1
        st2             {v16.h,v17.h}[1], [x0], x1
        st2             {v16.h,v17.h}[2], [x0], x1
        st2             {v16.h,v17.h}[3], [x0], x1
        st2             {v16.h,v17.h}[4], [x0], x1
        st2             {v16.h,v17.h}[5], [x0], x1
        st2             {v16.h,v17.h}[6], [x0], x1
        st2             {v16.h,v17.h}[7], [x0], x1
9:
        ret
endfunc
// void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix,
//                                                    ptrdiff_t stride,
//                                                    int alpha, int beta)
//
// x0 = pix, x1 = stride in bytes, w2 = alpha, w3 = beta
//
// Covers sixteen rows by running the eight-row body at
// h_loop_filter_chroma420_intra_10 twice: rows 0-7, then rows 8-15.
// x5 and x7 have to survive the first pass; the eight-row body does not
// write to them.
function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1
        h264_loop_filter_start_intra_10         // may return early

        mov             x7,  x30                // bl below overwrites the link register
        sub             x4,  x0,  #4            // load pointer, rows 0-7
        add             x5,  x0,  x1,  lsl #3   // x5 = pix + 8 * stride
        sub             x0,  x0,  #2            // store pointer, rows 0-7
        bl              h_loop_filter_chroma420_intra_10

        sub             x0,  x5,  #2            // store pointer, rows 8-15
        mov             x4,  x9                 // first pass left x9 at the load address of row 8
        mov             x30, x7
        b               h_loop_filter_chroma420_intra_10        // returns to our caller
endfunc