mirror of https://git.ffmpeg.org/ffmpeg.git
avcodec/x86/diracdsp: migrate last remaining MMX function to SSE2
The add_dirac_obmc8_mmx function was the only MMX function left. This patch migrates it to SSE2.

Here are the checkasm benchmark results:

diracdsp.add_dirac_obmc_8_c:    2299.1 ( 1.00x)
diracdsp.add_dirac_obmc_8_mmx:   237.6 ( 9.68x)
diracdsp.add_dirac_obmc_8_sse2:  109.1 (21.07x)

Signed-off-by: Kyosuke Kawakami <kawakami150708@gmail.com>
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
parent
8f683ee417
commit
61aa532e22
|
@ -228,7 +228,7 @@ cglobal add_dirac_obmc%1_%2, 5,5,5, dst, src, stride, obmc, yblen
|
|||
punpckhbw m1, m4
|
||||
mova m2, [obmcq+i]
|
||||
mova m3, m2
|
||||
punpcklbw m2, m4
|
||||
punpcklbw m2, m4
|
||||
punpckhbw m3, m4
|
||||
pmullw m0, m2
|
||||
pmullw m1, m3
|
||||
|
@ -248,9 +248,6 @@ cglobal add_dirac_obmc%1_%2, 5,5,5, dst, src, stride, obmc, yblen
|
|||
RET
|
||||
%endm
|
||||
|
||||
INIT_MMX
|
||||
ADD_OBMC 8, mmx
|
||||
|
||||
INIT_XMM
|
||||
PUT_RECT sse2
|
||||
ADD_RECT sse2
|
||||
|
@ -259,6 +256,25 @@ HPEL_FILTER sse2
|
|||
ADD_OBMC 32, sse2
|
||||
ADD_OBMC 16, sse2
|
||||
|
||||
;-----------------------------------------------------------------------
; void ff_add_dirac_obmc8_sse2(uint16_t *dst, const uint8_t *src, int stride,
;                              const uint8_t *obmc_weight, int yblen)
;
; For each of yblen rows, computes dst[x] += src[x] * obmc[x] for the
; eight columns x = 0..7, using 16-bit arithmetic (the product keeps only
; its low 16 bits and the addition wraps).
;
; cglobal "5,5,4": 5 arguments, 5 general-purpose registers, 4 vector
; registers (m0-m3).
;   m0 = src pixels widened to words, then the running result
;   m1 = weights widened to words, then the current dst row
;   m3 = constant zero used for the byte -> word widening
;
; The row counter is tested after the body, so the loop always runs at
; least once; yblen is presumably always >= 1 at the call sites
; (NOTE(review): confirm against callers).
;-----------------------------------------------------------------------
cglobal add_dirac_obmc8_sse2, 5,5,4, dst, src, stride, obmc, yblen
|
||||
pxor m3, m3                        ; m3 = 0, interleave partner for zero-extension
|
||||
movsxdifnidn strideq, strided      ; stride is a C int: sign-extend it to pointer width where needed
|
||||
.loop:
|
||||
movh m0, [srcq]                    ; load the low half of m0 (8 source bytes under INIT_XMM)
|
||||
punpcklbw m0, m3                   ; interleave with zero: 8 x u8 -> 8 x u16
|
||||
movh m1, [obmcq]                   ; load the 8 weight bytes for this row
|
||||
punpcklbw m1, m3                   ; widen weights to 8 x u16
|
||||
pmullw m0, m1                      ; m0 = src * weight (low 16 bits of each product)
|
||||
movu m1, [dstq]                    ; unaligned load of the 8 x u16 accumulator row
|
||||
paddw m0, m1                       ; accumulate into the existing dst values
|
||||
movu [dstq], m0                    ; unaligned store back to dst
|
||||
lea srcq, [srcq+strideq]           ; next src row: stride bytes (u8 samples)
|
||||
lea dstq, [dstq+2*strideq]         ; next dst row: stride u16 elements = 2*stride bytes
|
||||
add obmcq, 32                      ; weight rows are 32 bytes apart
|
||||
sub yblend, 1                      ; one row done; sets the flags consumed by jg
|
||||
jg .loop                           ; continue while rows remain (signed count > 0)
|
||||
RET
|
||||
|
||||
INIT_XMM sse4
|
||||
|
||||
; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
|
||||
|
|
|
@ -24,8 +24,7 @@
|
|||
|
||||
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
|
||||
|
||||
void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
|
||||
|
||||
void ff_add_dirac_obmc8_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
|
||||
void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
|
||||
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
|
||||
|
||||
|
@ -94,15 +93,12 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
|
|||
#if HAVE_X86ASM
|
||||
int mm_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_MMX(mm_flags)) {
|
||||
c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSE2(mm_flags)) {
|
||||
c->dirac_hpel_filter = dirac_hpel_filter_sse2;
|
||||
c->add_rect_clamped = ff_add_rect_clamped_sse2;
|
||||
c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
|
||||
|
||||
c->add_dirac_obmc[0] = ff_add_dirac_obmc8_sse2;
|
||||
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
|
||||
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
|
||||
|
||||
|
@ -116,5 +112,5 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
|
|||
c->dequant_subband[1] = ff_dequant_subband_32_sse4;
|
||||
c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
|
||||
}
|
||||
#endif
|
||||
#endif // HAVE_X86ASM
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue