avcodec/x86/hevc_mc: add qpel_h8_8_avx512icl and qpel_hv8_8_avx512icl

This commit uses the instruction `vpdpbusd` introduced by AVX512 VNNI
to calculate the horizontal filter.

ff_hevc_put_hevc_qpel_h8_8_sse4       1039169
ff_hevc_put_hevc_qpel_h8_8_avx512icl   677153
ff_hevc_put_hevc_qpel_hv8_8_sse4      3603511
ff_hevc_put_hevc_qpel_hv8_8_avx512icl 2995354

Reviewed-by: Henrik Gramner <henrik@gramner.com>
Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
This commit is contained in:
Wu Jianhua 2022-03-11 15:52:09 +08:00 committed by Haihao Xiang
parent fe85afbf8c
commit d4cd8830bd
3 changed files with 144 additions and 0 deletions

View File

@ -87,6 +87,26 @@ QPEL_TABLE 12, 4, w, sse4
QPEL_TABLE 8,16, b, avx2
QPEL_TABLE 10, 8, w, avx2
; Coefficient tables for the AVX512ICL qpel paths. QPEL_TABLE is defined outside
; this excerpt; going by the names referenced in the macros further down, these
; presumably emit hevc_qpel_filters_avx512icl_h_8 (byte entries, read with an
; 8-byte stride per filter) and hevc_qpel_filters_avx512icl_v_8 (dword entries,
; read with a 32-byte stride per filter) -- TODO confirm against QPEL_TABLE.
QPEL_TABLE 8, 1, b, avx512icl_h
QPEL_TABLE 8, 1, d, avx512icl_v
; 64 byte indices, grouped as 16 dword lanes of four consecutive source bytes.
; Bytes  0..31: lane k (k = 0..7) selects source bytes k   .. k+3
; Bytes 32..63: lane k (k = 0..7) selects source bytes k+4 .. k+7
; Each row below holds four lanes (16 bytes).
pb_qpel_shuffle_index: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
                       db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
                       db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
                       db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
SECTION .text
%define MAX_PB_SIZE 64
@ -1670,3 +1690,120 @@ HEVC_PUT_HEVC_QPEL_HV 16, 10
%endif ;AVX2
%endif ; ARCH_X86_64
; Broadcast the two dword tap groups of one horizontal filter.
; %1: suffix of the coefficient table name (hevc_qpel_filters_avx512icl_h_%1)
; %2: name of the register holding the filter selector; it is decremented and
;     multiplied by 8 in place, i.e. it ends up as a byte offset into the table
; %3: index of the vector register that receives the dword at offset 0
; %4: index of the vector register that receives the dword at offset 4
; %5: name of a scratch GPR; written (table address) only when PIC is defined
; Side effect: leaves FILTER defined as the table base expression.
%macro QPEL_FILTER_H 5
%define %%coeffs hevc_qpel_filters_avx512icl_h_%1
%assign %%taps_lo 0
%assign %%taps_hi 4
    dec               %2q
    shl               %2q, 3
%ifndef PIC
    %define FILTER %%coeffs
%else
    lea               %5q, [%%coeffs]
    %define FILTER %5q
%endif
    vpbroadcastd      m%3, [FILTER + %2q + %%taps_lo]
    vpbroadcastd      m%4, [FILTER + %2q + %%taps_hi]
%endmacro
; Broadcast a single dword coefficient of the vertical filter.
; %1: not referenced by this macro
; %2: name of the register holding the byte offset of the selected filter
; %3: index of the destination vector register
; %4: coefficient number within the filter (scaled by 4 bytes)
; %5: table base, either a register or a symbol
%macro QPEL_FILTER_V 5
%assign %%coef_bytes 4*%4
    vpbroadcastd      m%3, [%5 + %2q + %%coef_bytes]
%endmacro
; Load the byte-shuffle indices from pb_qpel_shuffle_index.
; %1: index of the vector register loaded from table offset 0
; %2: index of the vector register loaded from table offset 32
%macro QPEL_LOAD_SHUF 2
%define %%indices pb_qpel_shuffle_index
    movu              m%1, [%%indices]
    movu              m%2, [%%indices + 32]
%endmacro
; Horizontally filter one row of source bytes into dword sums.
; required: m0-m5
;   m0, m1: inputs, second operands of the two vpdpbusd steps (the callers in
;           this file fill them with QPEL_FILTER_H)
;   m2, m3: inputs, byte indices for the two vpermb gathers (the callers in
;           this file fill them with QPEL_LOAD_SHUF)
;   m4, m5: clobbered as scratch
; %1: dst register index (must not be one of 0-5)
; %2: name for src
%macro QPEL_H_LOAD_COMPUTE 2
; clear the accumulator, vpdpbusd adds into its destination
pxor m%1, m%1
; 16 source bytes starting 3 bytes before the current position
movu xm4, [%2q - 3]
; gather the byte quadruples selected by m2 into m5
vpermb m5, m2, m4
; gather the byte quadruples selected by m3, overwriting the raw bytes in m4
vpermb m4, m3, m4
; per dword lane: m%1 += sum of the 4 byte products of m5 with m0
vpdpbusd m%1, m5, m0
; per dword lane: m%1 += sum of the 4 byte products of m4 with m1
vpdpbusd m%1, m4, m1
%endmacro
; Emits hevc_put_hevc_qpel_h%1_%2, the horizontal-only qpel filter.
; %1: block width (part of the function name and of the filter table suffix)
; %2: bit depth (part of the function name)
; Vector registers: m0/m1 filter taps, m2/m3 shuffle indices, m4/m5 scratch,
; m6 dword sums of the current row.
%macro HEVC_PUT_HEVC_QPEL_AVX512ICL 2
; 5 arguments are loaded (dst, src, srcstride, height, mx); tmp is a scratch GPR
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
QPEL_FILTER_H %1, mx, 0, 1, tmp
QPEL_LOAD_SHUF 2, 3
.loop:
QPEL_H_LOAD_COMPUTE 6, src
; narrow the dword sums to words and store them as one output row
vpmovdw [dstq], m6
; LOOP_END is defined outside this excerpt; presumably it advances dst/src to
; the next row and branches back to .loop while rows remain -- TODO confirm
LOOP_END dst, src, srcstride
RET
%endmacro
; Emits hevc_put_hevc_qpel_hv%1_%2: every source row is filtered horizontally
; (QPEL_H_LOAD_COMPUTE), then eight consecutive row results are combined with
; the vertical coefficients, shifted right by 6 and stored as words.
; %1: block width (function name, table suffix, number of vertical
;     coefficients loaded)
; %2: bit depth (function name only)
;
; Vector register roles:
;   m0-m1    horizontal tap groups (QPEL_FILTER_H)
;   m2-m3    byte shuffle indices (QPEL_LOAD_SHUF)
;   m4-m5    scratch of QPEL_H_LOAD_COMPUTE
;   m6-m13   vertical coefficients, one broadcast dword each
;   m14-m20  the seven previously filtered rows
;   m21      the newest filtered row
;   m22-m26  products and partial sums
; The loop body hard-codes m6-m21, which lines up with the %rep counts only
; when %1 == 8.
%macro HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 27, dst, src, srcstride, height, mx, my, tmp
%assign %%shift 6
%assign %%extra 7
    QPEL_FILTER_H     %1, mx, 0, 1, tmp
    QPEL_LOAD_SHUF    2, 3
    lea               tmpq, [srcstrideq*3]
    sub               srcq, tmpq            ; start three rows above the block
    sub               myq, 1
    shl               myq, 5                ; byte offset of the vertical filter (32 bytes each)
; The table name must be defined outside the %ifdef: the non-PIC branch
; references it as well, and would otherwise expand to an undefined symbol.
%define %%table hevc_qpel_filters_avx512icl_v_%1
%ifdef PIC
    lea               tmpq, [%%table]
    %define FILTER tmpq
%else
    %define FILTER %%table
%endif
; vertical coefficients -> m6 and up, one register per coefficient
%assign %%i 6
%assign %%j 0
%rep %1
    QPEL_FILTER_V     %1, my, %%i, %%j, FILTER
    %assign %%i %%i+1
    %assign %%j %%j+1
%endrep
; prime the row window: horizontally filter the first %%extra rows
%rep %%extra
    QPEL_H_LOAD_COMPUTE %%i, src
    add               srcq, srcstrideq
    %assign %%i %%i+1
%endrep
.loop:
    QPEL_H_LOAD_COMPUTE %%i, src            ; newest row
    ; rows 0..3 times coefficients 0..3, summed into m26
    vpmulld           m22, m14, m6
    vpmulld           m23, m15, m7
    vpmulld           m24, m16, m8
    vpmulld           m25, m17, m9
    vpaddd            m26, m22, m23
    vpaddd            m24, m25
    vpaddd            m26, m24
    ; rows 4..7 times coefficients 4..7, total ends up in m22
    vpmulld           m22, m18, m10
    vpmulld           m23, m19, m11
    vpmulld           m24, m20, m12
    vpmulld           m25, m21, m13
    vpaddd            m22, m22, m23
    vpaddd            m24, m25
    vpaddd            m26, m24
    vpaddd            m22, m26
    ; slide the row window down by one row for the next iteration
    mova              m14, m15
    mova              m15, m16
    mova              m16, m17
    mova              m17, m18
    mova              m18, m19
    mova              m19, m20
    mova              m20, m21
    vpsrad            m22, %%shift
    vpmovdw           [dstq], m22           ; narrow dwords to words and store the row
    ; LOOP_END is defined outside this excerpt; presumably it advances dst/src
    ; and branches back to .loop while rows remain -- TODO confirm
    LOOP_END          dst, src, srcstride
    RET
%endmacro
; Instantiate the AVX512ICL qpel functions (block width 8, bit depth 8).
; They are assembled only when ARCH_X86_64 and HAVE_AVX512ICL_EXTERNAL are
; both non-zero. INIT_YMM selects 256-bit vector registers for the m# names.
%if ARCH_X86_64
%if HAVE_AVX512ICL_EXTERNAL
INIT_YMM avx512icl
HEVC_PUT_HEVC_QPEL_AVX512ICL 8, 8
HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8
%endif
%endif

View File

@ -233,6 +233,9 @@ WEIGHTING_PROTOTYPES(8, sse4);
WEIGHTING_PROTOTYPES(10, sse4);
WEIGHTING_PROTOTYPES(12, sse4);
/* AVX512ICL qpel functions implemented in assembly (hevc_mc). The assembly
 * entry points declare only the leading arguments (5 for h, 6 for hv), so the
 * remaining parameters are part of the prototype but are not read there. */
void ff_hevc_put_hevc_qpel_h8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_qpel_hv8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
///////////////////////////////////////////////////////////////////////////////
// TRANSFORM_ADD
///////////////////////////////////////////////////////////////////////////////

View File

@ -878,6 +878,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
}
if (EXTERNAL_AVX512ICL(cpu_flags) && ARCH_X86_64) {
c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;