x86/vp3dsp: port put_vp_no_rnd_pixels8_l2_mmx to yasm

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
James Almer 2014-12-20 03:18:57 -03:00 committed by Michael Niedermayer
parent 4ae87554f3
commit 7696e429c7
2 changed files with 50 additions and 63 deletions

View File

@ -36,6 +36,7 @@ vp3_idct_data: times 8 dw 64277
pb_7: times 8 db 0x07
pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81
pb_FE: times 8 db 0xFE
cextern pb_1
cextern pb_3
@ -147,6 +148,49 @@ cglobal vp3_h_loop_filter, 3, 4
STORE_4_WORDS m3
RET
%macro PAVGB_NO_RND 0
    ; Per-byte average without rounding of two register pairs:
    ;   m4 = (m0 & m1) + (((m0 ^ m1) & m6) >> 1)
    ;   m5 = (m2 & m3) + (((m2 ^ m3) & m6) >> 1)
    ; m6 is used as the mask; the caller in this file loads it from pb_FE
    ; (0xFE in every byte), which clears the low bit of each byte so the
    ; 64-bit shift cannot move a bit into the neighbouring byte.
    ; m0 and m2 are preserved; m1 and m3 are overwritten.
    ; The two chains below do not share any written register.

    ; first pair: m0, m1 -> m4
    mova       m4, m0
    pand       m4, m1               ; bits set in both inputs
    pxor       m1, m0               ; bits that differ
    pand       m1, m6
    psrlq      m1, 1                ; half of the differing bits
    paddb      m4, m1

    ; second pair: m2, m3 -> m5
    mova       m5, m2
    pand       m5, m3
    pxor       m3, m2
    pand       m3, m6
    psrlq      m3, 1
    paddb      m5, m3
%endmacro
;------------------------------------------------------------------------------
; put_vp_no_rnd_pixels8_l2(dst, src1, src2, stride, h)
;
; Writes h rows of 8 bytes to dst, each byte being the average of the
; corresponding bytes of src1 and src2 computed by PAVGB_NO_RND (no rounding
; term is added). All three pointers advance by the same stride.
;
; Four rows are processed per iteration and the loop only exits when h
; reaches exactly zero, so h must be a positive multiple of 4.
;
; Registers: m6 = pb_FE mask for PAVGB_NO_RND, m0-m5 = scratch,
;            stride3q = 3 * stride.
;------------------------------------------------------------------------------
INIT_MMX mmx
cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
    mova          m6, [pb_FE]
    lea     stride3q, [strideq+strideq*2]
.loop:
    ; rows 0 and 1
    mova          m0, [src1q]
    mova          m1, [src2q]
    mova          m2, [src1q+strideq]
    mova          m3, [src2q+strideq]
    PAVGB_NO_RND
    mova  [dstq], m4
    mova  [dstq+strideq], m5

    ; rows 2 and 3
    mova          m0, [src1q+strideq*2]
    mova          m1, [src2q+strideq*2]
    mova          m2, [src1q+stride3q]
    mova          m3, [src2q+stride3q]
    PAVGB_NO_RND
    mova  [dstq+strideq*2], m4
    mova  [dstq+stride3q], m5

    ; advance all pointers by four rows; lea leaves the flags untouched
    lea        src1q, [src1q+strideq*4]
    lea        src2q, [src2q+strideq*4]
    lea         dstq, [dstq+strideq*4]
    sub           hd, 4
    jnz .loop
    RET
; from original comments: The Macro does IDct on 4 1-D Dcts
%macro BeginIDCT 0
movq m2, I(3)

View File

@ -23,10 +23,8 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/vp3dsp.h"
#include "config.h"
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
@ -42,76 +40,21 @@ void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
int *bounding_values);
#if HAVE_MMX_INLINE
/* Fill the MMX register regd with 0xFE in every byte: comparing the register
 * with itself for equality sets all bits, and the byte-wise self-add then
 * wraps each 0xFF + 0xFF to 0xFE.
 * Emitted as its own asm statement with no operands or clobbers declared. */
#define MOVQ_BFE(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"paddb %%"#regd", %%"#regd" \n\t" ::)
/* Inline-asm fragment (AT&T operand order) computing two per-byte averages
 * without rounding:
 *   regr = (rega & regb) + (((rega ^ regb) & mm6) >> 1)
 *   regp = (regc & regd) + (((regc ^ regd) & mm6) >> 1)
 * mm6 is used as the mask and is expected to be set up by the caller
 * (the function below uses MOVQ_BFE(mm6)). rega and regc are left intact;
 * regb and regd are overwritten. */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pand "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psrlq $1, "#regd" \n\t" \
"paddb "#regb", "#regr" \n\t" \
"paddb "#regd", "#regp" \n\t"
#if HAVE_6REGS
/**
 * Write h rows of 8 bytes to dst, each byte being the no-rounding average
 * (see PAVGBP_MMX_NO_RND) of the corresponding bytes of a and b.
 * All three buffers are stepped with the same stride.
 *
 * Four rows are handled per iteration and the loop exits only when h reaches
 * exactly zero, so h must be a positive multiple of 4.
 *
 * Asm operands: %0 = h, %1 = a, %2 = b, %3 = dst, %4 = stride, %5 = 3 * stride.
 *
 * NOTE(review): mm0-mm6 are modified but not listed as clobbers, and mm6 is
 * loaded by a separate asm statement (MOVQ_BFE) before the main one.
 */
static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h)
{
// START_TIMER
MOVQ_BFE(mm6);
__asm__ volatile(
"1: \n\t"
/* rows 0 and 1 */
"movq (%1), %%mm0 \n\t"
"movq (%2), %%mm1 \n\t"
"movq (%1,%4), %%mm2 \n\t"
"movq (%2,%4), %%mm3 \n\t"
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t"
"movq %%mm5, (%3,%4) \n\t"
/* rows 2 and 3; both source pointers are advanced right after the loads */
"movq (%1,%4,2), %%mm0 \n\t"
"movq (%2,%4,2), %%mm1 \n\t"
"movq (%1,%5), %%mm2 \n\t"
"movq (%2,%5), %%mm3 \n\t"
"lea (%1,%4,4), %1 \n\t"
"lea (%2,%4,4), %2 \n\t"
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3,%4,2) \n\t"
"movq %%mm5, (%3,%5) \n\t"
"lea (%3,%4,4), %3 \n\t"
/* lea does not touch the flags, so jnz tests the result of subl */
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
:"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
:"memory");
// STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
}
#endif /*HAVE_6REGS */
#endif /* HAVE_MMX_INLINE */
void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
const uint8_t *b, ptrdiff_t stride,
int h);
av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_6REGS && HAVE_MMX_INLINE
c->put_no_rnd_pixels_l2 = put_vp_no_rnd_pixels8_l2_mmx;
#endif /* HAVE_6REGS && HAVE_MMX_INLINE */
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx;
#if ARCH_X86_32
c->idct_put = ff_vp3_idct_put_mmx;
c->idct_add = ff_vp3_idct_add_mmx;
}
#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;