vf_ilpack: remove inline asm

This makes it multiple times slower. However, the output format (packed
YUV) isn't handled efficiently by anything to begin with, and I have no
clue we even have this filter. I guess it's one of these filters which
find some use sometimes, but are not of higher importance, which
justifies removing the faster inline asm.
This commit is contained in:
wm4 2014-04-19 15:47:10 +02:00
parent c4f5dc0d53
commit b69c835945
1 changed files with 4 additions and 277 deletions

View File

@ -20,16 +20,15 @@
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <libavutil/attributes.h>
#include "config.h"
#include "common/msg.h"
#include "common/cpudetect.h"
#include "options/m_option.h"
#include "video/img_format.h"
#include "video/mp_image.h"
#include "vf.h"
#include "libavutil/attributes.h"
typedef void (pack_func_t)(unsigned char *dst, unsigned char *y,
unsigned char *u, unsigned char *v, int w, int us, int vs);
@ -39,7 +38,7 @@ struct vf_priv_s {
pack_func_t *pack[2];
};
static void pack_nn_C(unsigned char *dst, unsigned char *y,
static void pack_nn(unsigned char *dst, unsigned char *y,
unsigned char *u, unsigned char *v, int w,
int av_unused us, int av_unused vs)
{
@ -52,7 +51,7 @@ static void pack_nn_C(unsigned char *dst, unsigned char *y,
}
}
static void pack_li_0_C(unsigned char *dst, unsigned char *y,
static void pack_li_0(unsigned char *dst, unsigned char *y,
unsigned char *u, unsigned char *v, int w, int us, int vs)
{
int j;
@ -65,7 +64,7 @@ static void pack_li_0_C(unsigned char *dst, unsigned char *y,
}
}
static void pack_li_1_C(unsigned char *dst, unsigned char *y,
static void pack_li_1(unsigned char *dst, unsigned char *y,
unsigned char *u, unsigned char *v, int w, int us, int vs)
{
int j;
@ -78,265 +77,6 @@ static void pack_li_1_C(unsigned char *dst, unsigned char *y,
}
}
#if HAVE_MMX
static void pack_nn_MMX(unsigned char *dst, unsigned char *y,
unsigned char *u, unsigned char *v, int w,
int av_unused us, int av_unused vs)
{
__asm__ volatile (""
".align 4 \n\t"
"1: \n\t"
"movq (%0), %%mm1 \n\t"
"movq (%0), %%mm2 \n\t"
"movq (%1), %%mm4 \n\t"
"movq (%2), %%mm6 \n\t"
"punpcklbw %%mm6, %%mm4 \n\t"
"punpcklbw %%mm4, %%mm1 \n\t"
"punpckhbw %%mm4, %%mm2 \n\t"
"add $8, %0 \n\t"
"add $4, %1 \n\t"
"add $4, %2 \n\t"
"movq %%mm1, (%3) \n\t"
"movq %%mm2, 8(%3) \n\t"
"add $16, %3 \n\t"
"decl %4 \n\t"
"jnz 1b \n\t"
"emms \n\t"
:
: "r" (y), "r" (u), "r" (v), "r" (dst), "r" (w/8)
: "memory"
);
pack_nn_C(dst, y, u, v, (w&7), 0, 0);
}
#if HAVE_EBX_AVAILABLE
static void pack_li_0_MMX(unsigned char *dst, unsigned char *y,
unsigned char *u, unsigned char *v, int w, int us, int vs)
{
__asm__ volatile (""
"push %%"REG_BP" \n\t"
#if ARCH_X86_64
"mov %6, %%"REG_BP" \n\t"
#else
"movl 4(%%"REG_d"), %%"REG_BP" \n\t"
"movl (%%"REG_d"), %%"REG_d" \n\t"
#endif
"pxor %%mm0, %%mm0 \n\t"
".align 4 \n\t"
".Lli0: \n\t"
"movq (%%"REG_S"), %%mm1 \n\t"
"movq (%%"REG_S"), %%mm2 \n\t"
"movq (%%"REG_a",%%"REG_d",2), %%mm4 \n\t"
"movq (%%"REG_b",%%"REG_BP",2), %%mm6 \n\t"
"punpcklbw %%mm0, %%mm4 \n\t"
"punpcklbw %%mm0, %%mm6 \n\t"
"movq (%%"REG_a"), %%mm3 \n\t"
"movq (%%"REG_b"), %%mm5 \n\t"
"punpcklbw %%mm0, %%mm3 \n\t"
"punpcklbw %%mm0, %%mm5 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"psrlw $3, %%mm4 \n\t"
"psrlw $3, %%mm6 \n\t"
"packuswb %%mm4, %%mm4 \n\t"
"packuswb %%mm6, %%mm6 \n\t"
"punpcklbw %%mm6, %%mm4 \n\t"
"punpcklbw %%mm4, %%mm1 \n\t"
"punpckhbw %%mm4, %%mm2 \n\t"
"movq %%mm1, (%%"REG_D") \n\t"
"movq %%mm2, 8(%%"REG_D") \n\t"
"movq 8(%%"REG_S"), %%mm1 \n\t"
"movq 8(%%"REG_S"), %%mm2 \n\t"
"movq (%%"REG_a",%%"REG_d",2), %%mm4 \n\t"
"movq (%%"REG_b",%%"REG_BP",2), %%mm6 \n\t"
"punpckhbw %%mm0, %%mm4 \n\t"
"punpckhbw %%mm0, %%mm6 \n\t"
"movq (%%"REG_a"), %%mm3 \n\t"
"movq (%%"REG_b"), %%mm5 \n\t"
"punpckhbw %%mm0, %%mm3 \n\t"
"punpckhbw %%mm0, %%mm5 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"psrlw $3, %%mm4 \n\t"
"psrlw $3, %%mm6 \n\t"
"packuswb %%mm4, %%mm4 \n\t"
"packuswb %%mm6, %%mm6 \n\t"
"punpcklbw %%mm6, %%mm4 \n\t"
"punpcklbw %%mm4, %%mm1 \n\t"
"punpckhbw %%mm4, %%mm2 \n\t"
"add $16, %%"REG_S" \n\t"
"add $8, %%"REG_a" \n\t"
"add $8, %%"REG_b" \n\t"
"movq %%mm1, 16(%%"REG_D") \n\t"
"movq %%mm2, 24(%%"REG_D") \n\t"
"add $32, %%"REG_D" \n\t"
"decl %%ecx \n\t"
"jnz .Lli0 \n\t"
"emms \n\t"
"pop %%"REG_BP" \n\t"
:
: "S" (y), "D" (dst), "a" (u), "b" (v), "c" (w/16),
#if ARCH_X86_64
"d" ((x86_reg)us), "r" ((x86_reg)vs)
#else
"d" (&us)
#endif
: "memory"
);
pack_li_0_C(dst, y, u, v, (w&15), us, vs);
}
static void pack_li_1_MMX(unsigned char *dst, unsigned char *y,
unsigned char *u, unsigned char *v, int w, int us, int vs)
{
__asm__ volatile (""
"push %%"REG_BP" \n\t"
#if ARCH_X86_64
"mov %6, %%"REG_BP" \n\t"
#else
"movl 4(%%"REG_d"), %%"REG_BP" \n\t"
"movl (%%"REG_d"), %%"REG_d" \n\t"
#endif
"pxor %%mm0, %%mm0 \n\t"
".align 4 \n\t"
".Lli1: \n\t"
"movq (%%"REG_S"), %%mm1 \n\t"
"movq (%%"REG_S"), %%mm2 \n\t"
"movq (%%"REG_a",%%"REG_d",2), %%mm4 \n\t"
"movq (%%"REG_b",%%"REG_BP",2), %%mm6 \n\t"
"punpcklbw %%mm0, %%mm4 \n\t"
"punpcklbw %%mm0, %%mm6 \n\t"
"movq (%%"REG_a"), %%mm3 \n\t"
"movq (%%"REG_b"), %%mm5 \n\t"
"punpcklbw %%mm0, %%mm3 \n\t"
"punpcklbw %%mm0, %%mm5 \n\t"
"movq %%mm4, %%mm7 \n\t"
"paddw %%mm4, %%mm4 \n\t"
"paddw %%mm7, %%mm4 \n\t"
"movq %%mm6, %%mm7 \n\t"
"paddw %%mm6, %%mm6 \n\t"
"paddw %%mm7, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"psrlw $3, %%mm4 \n\t"
"psrlw $3, %%mm6 \n\t"
"packuswb %%mm4, %%mm4 \n\t"
"packuswb %%mm6, %%mm6 \n\t"
"punpcklbw %%mm6, %%mm4 \n\t"
"punpcklbw %%mm4, %%mm1 \n\t"
"punpckhbw %%mm4, %%mm2 \n\t"
"movq %%mm1, (%%"REG_D") \n\t"
"movq %%mm2, 8(%%"REG_D") \n\t"
"movq 8(%%"REG_S"), %%mm1 \n\t"
"movq 8(%%"REG_S"), %%mm2 \n\t"
"movq (%%"REG_a",%%"REG_d",2), %%mm4 \n\t"
"movq (%%"REG_b",%%"REG_BP",2), %%mm6 \n\t"
"punpckhbw %%mm0, %%mm4 \n\t"
"punpckhbw %%mm0, %%mm6 \n\t"
"movq (%%"REG_a"), %%mm3 \n\t"
"movq (%%"REG_b"), %%mm5 \n\t"
"punpckhbw %%mm0, %%mm3 \n\t"
"punpckhbw %%mm0, %%mm5 \n\t"
"movq %%mm4, %%mm7 \n\t"
"paddw %%mm4, %%mm4 \n\t"
"paddw %%mm7, %%mm4 \n\t"
"movq %%mm6, %%mm7 \n\t"
"paddw %%mm6, %%mm6 \n\t"
"paddw %%mm7, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm3, %%mm4 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"psrlw $3, %%mm4 \n\t"
"psrlw $3, %%mm6 \n\t"
"packuswb %%mm4, %%mm4 \n\t"
"packuswb %%mm6, %%mm6 \n\t"
"punpcklbw %%mm6, %%mm4 \n\t"
"punpcklbw %%mm4, %%mm1 \n\t"
"punpckhbw %%mm4, %%mm2 \n\t"
"add $16, %%"REG_S" \n\t"
"add $8, %%"REG_a" \n\t"
"add $8, %%"REG_b" \n\t"
"movq %%mm1, 16(%%"REG_D") \n\t"
"movq %%mm2, 24(%%"REG_D") \n\t"
"add $32, %%"REG_D" \n\t"
"decl %%ecx \n\t"
"jnz .Lli1 \n\t"
"emms \n\t"
"pop %%"REG_BP" \n\t"
:
: "S" (y), "D" (dst), "a" (u), "b" (v), "c" (w/16),
#if ARCH_X86_64
"d" ((x86_reg)us), "r" ((x86_reg)vs)
#else
"d" (&us)
#endif
: "memory"
);
pack_li_1_C(dst, y, u, v, (w&15), us, vs);
}
#endif /* HAVE_EBX_AVAILABLE */
#endif
static pack_func_t *pack_nn;
static pack_func_t *pack_li_0;
static pack_func_t *pack_li_1;
static void ilpack(unsigned char *dst, unsigned char *src[3],
int dststride, int srcstride[3], int w, int h, pack_func_t *pack[2])
{
@ -410,19 +150,6 @@ static int vf_open(vf_instance_t *vf)
vf->query_format=query_format;
vf->filter=filter;
pack_nn = pack_nn_C;
pack_li_0 = pack_li_0_C;
pack_li_1 = pack_li_1_C;
#if HAVE_MMX
if(gCpuCaps.hasMMX) {
pack_nn = pack_nn_MMX;
#if HAVE_EBX_AVAILABLE
pack_li_0 = pack_li_0_MMX;
pack_li_1 = pack_li_1_MMX;
#endif
}
#endif
switch(vf->priv->mode) {
case 0:
vf->priv->pack[0] = vf->priv->pack[1] = pack_nn;