avcodec/x86/diracdsp: migrate last remaining MMX function to SSE2

The add_dirac_obmc8_mmx function was the only MMX function left. This
patch migrates it to SSE2.

Here are the checkasm benchmark results:

diracdsp.add_dirac_obmc_8_c:    2299.1 ( 1.00x)
diracdsp.add_dirac_obmc_8_mmx:   237.6 ( 9.68x)
diracdsp.add_dirac_obmc_8_sse2:  109.1 (21.07x)

Signed-off-by: Kyosuke Kawakami <kawakami150708@gmail.com>
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
Kyosuke Kawakami 2024-11-15 03:25:34 +09:00 committed by Ronald S. Bultje
parent 8f683ee417
commit 61aa532e22
2 changed files with 23 additions and 11 deletions

View File

@ -228,7 +228,7 @@ cglobal add_dirac_obmc%1_%2, 5,5,5, dst, src, stride, obmc, yblen
punpckhbw m1, m4
mova m2, [obmcq+i]
mova m3, m2
punpcklbw m2, m4
punpcklbw m2, m4
punpckhbw m3, m4
pmullw m0, m2
pmullw m1, m3
@ -248,9 +248,6 @@ cglobal add_dirac_obmc%1_%2, 5,5,5, dst, src, stride, obmc, yblen
RET
%endm
INIT_MMX
ADD_OBMC 8, mmx
INIT_XMM
PUT_RECT sse2
ADD_RECT sse2
@ -259,6 +256,25 @@ HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2
;-----------------------------------------------------------------------
; void ff_add_dirac_obmc8_sse2(uint16_t *dst, const uint8_t *src, int stride,
;                              const uint8_t *obmc_weight, int yblen)
;
; Per row, for 8 columns: dst[x] += src[x] * obmc_weight[x]
; Each row step advances src by stride bytes, dst by 2*stride bytes
; (dst holds 16-bit elements) and obmc_weight by 32 bytes.
; The loop is bottom-tested, so the body always runs at least once.
;-----------------------------------------------------------------------
cglobal add_dirac_obmc8_sse2, 5,5,4, dst, src, stride, obmc, yblen
    movsxdifnidn strideq, strided           ; stride is a C int: widen it for pointer math
    pxor        m3, m3                      ; m3 = 0, used to zero-extend bytes to words
.loop:
    movh        m1, [obmcq]                 ; 8 weight bytes
    movh        m0, [srcq]                  ; 8 source bytes
    punpcklbw   m1, m3                      ; weights -> 8 x 16-bit
    punpcklbw   m0, m3                      ; source  -> 8 x 16-bit
    pmullw      m1, m0                      ; low 16 bits of src * weight
    movu        m0, [dstq]                  ; current accumulators (unaligned load)
    paddw       m1, m0                      ; dst + src * weight
    movu        [dstq], m1
    add         obmcq, 32                   ; next weight row
    add         srcq, strideq               ; next source row
    lea         dstq, [dstq+2*strideq]      ; next dst row (16-bit elements)
    sub         yblend, 1
    jg          .loop                       ; repeat while rows remain
    RET
INIT_XMM sse4
; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)

View File

@ -24,8 +24,7 @@
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc8_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
@ -94,15 +93,12 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
#if HAVE_X86ASM
int mm_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(mm_flags)) {
c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
}
if (EXTERNAL_SSE2(mm_flags)) {
c->dirac_hpel_filter = dirac_hpel_filter_sse2;
c->add_rect_clamped = ff_add_rect_clamped_sse2;
c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
c->add_dirac_obmc[0] = ff_add_dirac_obmc8_sse2;
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
@ -116,5 +112,5 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
c->dequant_subband[1] = ff_dequant_subband_32_sse4;
c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
}
#endif
#endif // HAVE_X86ASM
}