mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-01-13 10:51:38 +00:00
avcodec/x86/hevc_mc: add qpel_h8_8_avx512icl and qpel_hv8_8_avx512icl
This commit uses the `vpdpbusd` instruction, introduced by AVX512 VNNI, to calculate the horizontal filter. Benchmark figures from the commit message: ff_hevc_put_hevc_qpel_h8_8_sse4: 1039169; ff_hevc_put_hevc_qpel_h8_8_avx512icl: 677153; ff_hevc_put_hevc_qpel_hv8_8_sse4: 3603511; ff_hevc_put_hevc_qpel_hv8_8_avx512icl: 2995354. Reviewed-by: Henrik Gramner <henrik@gramner.com> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
This commit is contained in:
parent
fe85afbf8c
commit
d4cd8830bd
@ -87,6 +87,26 @@ QPEL_TABLE 12, 4, w, sse4
|
||||
QPEL_TABLE 8,16, b, avx2
|
||||
QPEL_TABLE 10, 8, w, avx2
|
||||
|
||||
QPEL_TABLE  8, 1, b, avx512icl_h
QPEL_TABLE  8, 1, d, avx512icl_v

; Byte indices used as the index operand of vpermb in QPEL_H_LOAD_COMPUTE.
; The table is 16 groups of 4 consecutive indices (64 bytes in total);
; QPEL_LOAD_SHUF loads it from byte offsets 0 and 32.
;   offset  0: groups starting at source bytes 0..7
;   offset 32: groups starting at source bytes 4..11
pb_qpel_shuffle_index: db  0,  1,  2,  3,  1,  2,  3,  4
                       db  2,  3,  4,  5,  3,  4,  5,  6
                       db  4,  5,  6,  7,  5,  6,  7,  8
                       db  6,  7,  8,  9,  7,  8,  9, 10
                       db  4,  5,  6,  7,  5,  6,  7,  8
                       db  6,  7,  8,  9,  7,  8,  9, 10
                       db  8,  9, 10, 11,  9, 10, 11, 12
                       db 10, 11, 12, 13, 11, 12, 13, 14
|
||||
|
||||
SECTION .text
|
||||
|
||||
%define MAX_PB_SIZE 64
|
||||
@ -1670,3 +1690,120 @@ HEVC_PUT_HEVC_QPEL_HV 16, 10
|
||||
|
||||
%endif ;AVX2
|
||||
%endif ; ARCH_X86_64
|
||||
|
||||
; Broadcast both 4-byte halves of one row of the horizontal filter table.
;   %1: suffix selecting the table hevc_qpel_filters_avx512icl_h_%1
;   %2: name of the register holding the filter index; it is clobbered
;       (decremented, then multiplied by 8 to form a byte offset)
;   %3: index of the vector register receiving bytes 0-3 of the row
;   %4: index of the vector register receiving bytes 4-7 of the row
;   %5: name of a scratch register; holds the table address when PIC is defined
; Side effect: (re)defines the single-line macro FILTER.
%macro QPEL_FILTER_H 5
%define %%coeffs hevc_qpel_filters_avx512icl_h_%1
%assign %%half 4
%ifdef PIC
    lea              %5q, [%%coeffs]
    %define FILTER %5q
%else
    %define FILTER %%coeffs
%endif
    dec              %2q
    shl              %2q, 3
    vpbroadcastd     m%3, [FILTER + %2q + 0*%%half]
    vpbroadcastd     m%4, [FILTER + %2q + 1*%%half]
%endmacro
|
||||
|
||||
; Broadcast a single dword coefficient of the vertical filter table.
;   %1: unused by this macro
;   %2: name of the register holding the byte offset of the filter row
;   %3: index of the destination vector register
;   %4: coefficient index inside the row (scaled by 4 bytes)
;   %5: table base, either a symbol or a register holding its address
%macro QPEL_FILTER_V 5
    vpbroadcastd     m%3, [%5 + %2q + %4*4]
%endmacro
|
||||
|
||||
; Load the vpermb index table pb_qpel_shuffle_index into two registers.
;   %1: index of the register loaded from byte offset 0 of the table
;   %2: index of the register loaded from byte offset 32 of the table
%macro QPEL_LOAD_SHUF 2
%assign %%upper 32
    movu             m%1, [pb_qpel_shuffle_index]
    movu             m%2, [pb_qpel_shuffle_index + %%upper]
%endmacro
|
||||
|
||||
; Horizontal filter of one source row.
; Loads 16 bytes starting 3 bytes before the src pointer, rearranges them
; with vpermb using the index registers, and accumulates two vpdpbusd dot
; products into a zeroed destination register.
; required: m0-m1 hold the coefficients (as loaded by QPEL_FILTER_H)
;           m2-m3 hold the permutation indices (as loaded by QPEL_LOAD_SHUF)
; clobbers: m4, m5
; %1: dst register index (receives the dword sums)
; %2: name for src
%macro QPEL_H_LOAD_COMPUTE 2
    pxor             m%1, m%1
    movu             xm4, [%2q - 3]
    vpermb            m5, m2, m4       ; bytes selected by the first index half
    vpermb            m4, m3, m4       ; bytes selected by the second index half
    vpdpbusd         m%1, m5, m0
    vpdpbusd         m%1, m4, m1
%endmacro
|
||||
|
||||
; Emit the horizontal-only function hevc_put_hevc_qpel_h<a>_<b>.
;   %1: first number of the function name (presumably the block width);
;       also passed to QPEL_FILTER_H to select the coefficient table
;   %2: second number of the function name (presumably the bit depth)
; Register use: m0-m1 coefficients, m2-m3 permutation indices, m4-m5 scratch,
; m6 the filtered row. LOOP_END is defined outside this excerpt.
%macro HEVC_PUT_HEVC_QPEL_AVX512ICL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
    QPEL_FILTER_H         %1, mx, 0, 1, tmp
    QPEL_LOAD_SHUF         2, 3
.loop:
    QPEL_H_LOAD_COMPUTE    6, src
    vpmovdw           [dstq], m6       ; narrow the dword sums to words and store
    LOOP_END             dst, src, srcstride
    RET
%endmacro
|
||||
|
||||
; Emit the horizontal+vertical function hevc_put_hevc_qpel_hv<a>_<b>.
;   %1: first number of the function name (presumably the block width);
;       selects both coefficient tables and is the repeat count of the
;       vertical coefficient loads
;   %2: second number of the function name (presumably the bit depth)
; Register use:
;   m0-m1   horizontal coefficients      m2-m3   permutation indices
;   m4-m5   scratch of the row filter    m6-m13  vertical coefficients
;   m14-m20 the seven previous filtered rows, oldest first
;   m21     the row filtered in the current iteration
;   m22-m26 temporaries of the vertical sum
; NOTE(review): the loop consumes exactly m6-m13, so the coefficient loads
; only line up when the first macro argument is 8.
; LOOP_END is defined outside this excerpt.
%macro HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 27, dst, src, srcstride, height, mx, my, tmp
%assign %%shift 6
%assign %%extra 7
; The table name must be defined outside the PIC test: the non-PIC branch
; below also expands it, and would otherwise reference an undefined label.
%define %%table hevc_qpel_filters_avx512icl_v_%1
    QPEL_FILTER_H         %1, mx, 0, 1, tmp
    QPEL_LOAD_SHUF         2, 3
    lea                 tmpq, [srcstrideq*3]
    sub                 srcq, tmpq     ; start three rows above the block
    sub                  myq, 1
    shl                  myq, 5        ; 32 bytes per vertical filter row
%ifdef PIC
    lea                 tmpq, [%%table]
    %define FILTER tmpq
%else
    %define FILTER %%table
%endif
%assign %%i 6
%assign %%j 0
%rep %1
    QPEL_FILTER_V         %1, my, %%i, %%j, FILTER
    %assign %%i %%i+1
    %assign %%j %%j+1
%endrep
%rep %%extra
    QPEL_H_LOAD_COMPUTE  %%i, src
    add                 srcq, srcstrideq
    %assign %%i %%i+1
%endrep
.loop:
    QPEL_H_LOAD_COMPUTE  %%i, src
    vpmulld              m22, m14, m6
    vpmulld              m23, m15, m7
    vpmulld              m24, m16, m8
    vpmulld              m25, m17, m9
    vpaddd               m26, m22, m23
    vpaddd               m24, m25
    vpaddd               m26, m24
    vpmulld              m22, m18, m10
    vpmulld              m23, m19, m11
    vpmulld              m24, m20, m12
    vpmulld              m25, m21, m13
    vpaddd               m22, m22, m23
    vpaddd               m24, m25
    vpaddd               m26, m24
    vpaddd               m22, m26
    ; slide the row window down by one
    mova                 m14, m15
    mova                 m15, m16
    mova                 m16, m17
    mova                 m17, m18
    mova                 m18, m19
    mova                 m19, m20
    mova                 m20, m21
    vpsrad               m22, %%shift
    vpmovdw           [dstq], m22      ; narrow the dword sums to words and store
    LOOP_END             dst, src, srcstride

    RET
%endmacro
|
||||
|
||||
; Instantiate the AVX-512 (avx512icl) 8-wide, 8-bit qpel functions.
%if ARCH_X86_64
%if HAVE_AVX512ICL_EXTERNAL

INIT_YMM avx512icl
HEVC_PUT_HEVC_QPEL_AVX512ICL    8, 8
HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8

%endif ; HAVE_AVX512ICL_EXTERNAL
%endif ; ARCH_X86_64
|
||||
|
@ -233,6 +233,9 @@ WEIGHTING_PROTOTYPES(8, sse4);
|
||||
WEIGHTING_PROTOTYPES(10, sse4);
|
||||
WEIGHTING_PROTOTYPES(12, sse4);
|
||||
|
||||
/*
 * avx512icl qpel entry points for 8-wide, 8-bit blocks.
 * The matching cglobal declarations load only the first five (h) and
 * six (hv) arguments.
 */
void ff_hevc_put_hevc_qpel_h8_8_avx512icl(int16_t *dst, uint8_t *_src,
                                          ptrdiff_t _srcstride, int height,
                                          intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_qpel_hv8_8_avx512icl(int16_t *dst, uint8_t *_src,
                                           ptrdiff_t _srcstride, int height,
                                           intptr_t mx, intptr_t my, int width);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// TRANSFORM_ADD
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -878,6 +878,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
||||
|
||||
c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
|
||||
}
|
||||
if (EXTERNAL_AVX512ICL(cpu_flags) && ARCH_X86_64) {
|
||||
c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
|
||||
c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
|
||||
}
|
||||
} else if (bit_depth == 10) {
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
|
||||
|
Loading…
Reference in New Issue
Block a user