Rewrite the bgr24->yuv MMX code; the new code is cleaner, more accurate, and does not throw half the chroma away.


git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@27561 b3059339-0415-0410-9bf9-f77b7e298cf2
This commit is contained in:
michael 2008-09-09 23:30:06 +00:00
parent 593bf6af8e
commit 0c6d92bfda
2 changed files with 147 additions and 196 deletions

View File

@ -237,6 +237,20 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
// pmaddwd coefficient words for the 24-bit packed -> Y MMX routine (bgr24ToY_mmx).
// That routine loads the "Y1" constant into mm5 and multiplies it with source
// bytes 0..3 of a pixel pair, and loads the "Y2" constant into mm6 and multiplies
// it with source bytes 2..5. Bytes 2 and 3 are seen by both loads, so each
// constant has a zero word where the other one supplies the coefficient.
// The three distinct weights are 0x0C88 (3208), 0x4087 (16519) and 0x20DE (8414);
// the result is shifted right by 15 in the asm, so they act as 15-bit fixed point.
// NOTE(review): presumably these equal the BY/GY/RY weights used by the C code
// at a 15-bit shift -- confirm against their definitions.
DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toY1Coeff) = 0x0C88000040870C88ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toY2Coeff) = 0x20DE4087000020DEULL;
// Same layout with the first and third channel weights swapped (RGB byte order).
DECLARE_ALIGNED(8, const uint64_t, ff_rgb24toY1Coeff) = 0x20DE0000408720DEULL;
DECLARE_ALIGNED(8, const uint64_t, ff_rgb24toY2Coeff) = 0x0C88408700000C88ULL;
// Added to each 32-bit sum before the >>15: every dword is (16<<15) + (1<<14),
// i.e. a +16 output offset plus one half for rounding.
DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toYOffset) = 0x0008400000084000ULL;
// Coefficient table for bgr24ToUV_mmx, indexed as [srcFormat == PIX_FMT_RGB24][k].
// Within a row, entries 0 and 1 produce the values stored through dstU and
// entries 2 and 3 produce the values stored through dstV; as with the Y
// constants, the even entry is applied to bytes 0..3 and the odd entry to
// bytes 2..5 of a pixel pair.
DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toUV[2][4]) = {
// row 0: selected when srcFormat is not PIX_FMT_RGB24
{0x38380000DAC83838ULL, 0xECFFDAC80000ECFFULL, 0xF6E40000D0E3F6E4ULL, 0x3838D0E300003838ULL},
// row 1: selected when srcFormat is PIX_FMT_RGB24
{0xECFF0000DAC8ECFFULL, 0x3838DAC800003838ULL, 0x38380000D0E33838ULL, 0xF6E4D0E30000F6E4ULL},
};
// Added to each 32-bit chroma sum before the >>15: every dword is
// (128<<15) + (1<<14), i.e. a +128 output offset plus one half for rounding.
DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toUVOffset)= 0x0040400000404000ULL;
#endif /* defined(ARCH_X86) */
// clipping helper table for C implementations:
@ -2201,7 +2215,8 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
if ((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP)
&& srcFormat!=PIX_FMT_RGB8 && srcFormat!=PIX_FMT_BGR8
&& srcFormat!=PIX_FMT_RGB4 && srcFormat!=PIX_FMT_BGR4
&& srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE)
&& srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE
&& srcFormat!=PIX_FMT_BGR24 && srcFormat!=PIX_FMT_RGB24)
c->chrSrcHSubSample=1;
if (param){

View File

@ -1875,78 +1875,121 @@ static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1
}
}
#ifdef HAVE_MMX
/**
 * Converts packed 3-byte-per-pixel RGB/BGR input to 8-bit luma using MMX.
 *
 * Each loop iteration reads 12 source bytes (4 pixels) and stores 4 output
 * bytes. The loop is bottom-tested, so the body always runs at least once,
 * and it advances in steps of 4 pixels.
 * NOTE(review): when width is not a multiple of 4 (or is 0) the last
 * iteration appears to read/write beyond width pixels -- confirm that the
 * callers' buffers are padded accordingly.
 *
 * @param dst       destination, one byte written per pixel
 * @param src       packed 24-bit source pixels
 * @param width     number of pixels to convert
 * @param srcFormat PIX_FMT_BGR24 selects the ff_bgr24toY* coefficients; any
 *                  other value selects the ff_rgb24toY* coefficients
 */
static inline void bgr24ToY_mmx(uint8_t *dst, uint8_t *src, long width, int srcFormat)
{
// Preload the two coefficient vectors into mm5/mm6. These asm statements have
// no operands, hence the single '%' on register names. The main loop below
// uses mm5/mm6 without reloading them, so it relies on the registers being
// left untouched between the asm statements.
if(srcFormat == PIX_FMT_BGR24){
asm volatile(
"movq "MANGLE(ff_bgr24toY1Coeff)", %mm5 \n\t"
"movq "MANGLE(ff_bgr24toY2Coeff)", %mm6 \n\t"
);
}else{
asm volatile(
"movq "MANGLE(ff_rgb24toY1Coeff)", %mm5 \n\t"
"movq "MANGLE(ff_rgb24toY2Coeff)", %mm6 \n\t"
);
}
asm volatile(
// mm4 = per-dword offset/rounding term added before the final shift
"movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
// REG_a = -width; it counts upwards and the loop ends once it is no longer negative
"mov %2, %%"REG_a" \n\t"
// mm7 = 0, used to zero-extend bytes to words
"pxor %%mm7, %%mm7 \n\t"
"1: \n\t"
PREFETCH" 64(%0) \n\t"
// Overlapping 4-byte loads: bytes 0..3 and 2..5 cover pixels 0 and 1,
// bytes 6..9 and 8..11 cover pixels 2 and 3.
"movd (%0), %%mm0 \n\t"
"movd 2(%0), %%mm1 \n\t"
"movd 6(%0), %%mm2 \n\t"
"movd 8(%0), %%mm3 \n\t"
// advance the source pointer by 4 pixels
"add $12, %0 \n\t"
// zero-extend the loaded bytes to 16-bit words
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
// multiply by the coefficient words and add adjacent products -> two dwords each
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
// combine the partial sums: mm0 = sums for pixels 0,1; mm2 = sums for pixels 2,3
"paddd %%mm1, %%mm0 \n\t"
"paddd %%mm3, %%mm2 \n\t"
// add offset + rounding, then drop the 15 fractional bits
"paddd %%mm4, %%mm0 \n\t"
"paddd %%mm4, %%mm2 \n\t"
"psrad $15, %%mm0 \n\t"
"psrad $15, %%mm2 \n\t"
// narrow 4 dwords -> 4 words -> bytes (unsigned saturation on the final pack)
"packssdw %%mm2, %%mm0 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
// %1 is dst+width and REG_a is negative, so this addresses the current output position
"movd %%mm0, (%1, %%"REG_a") \n\t"
"add $4, %%"REG_a" \n\t"
// loop while the index is still negative
" js 1b \n\t"
: "+r" (src)
: "r" (dst+width), "g" (-width)
: "%"REG_a
);
}
/**
 * Converts packed 3-byte-per-pixel RGB/BGR input to 8-bit U and V using MMX.
 *
 * Each loop iteration reads 12 source bytes (4 pixels) and stores 4 bytes to
 * dstU and 4 bytes to dstV, i.e. one chroma sample per input pixel. The loop
 * is bottom-tested, so the body always runs at least once, and it advances in
 * steps of 4 pixels.
 * NOTE(review): when width is not a multiple of 4 (or is 0) the last
 * iteration appears to read/write beyond width pixels -- confirm that the
 * callers' buffers are padded accordingly.
 *
 * @param dstU      destination for the first chroma plane
 * @param dstV      destination for the second chroma plane
 * @param src       packed 24-bit source pixels
 * @param width     number of pixels to convert
 * @param srcFormat PIX_FMT_RGB24 selects row 1 of ff_bgr24toUV, any other
 *                  value selects row 0
 */
static inline void bgr24ToUV_mmx(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
{
asm volatile(
// %4 is element [0] of the selected coefficient row; 8+%4, 16+%4 and 24+%4
// address elements [1], [2] and [3] of the same row.
// mm6 caches element [3]; the other three are used straight from memory.
"movq 24+%4, %%mm6 \n\t"
// REG_a = -width; it counts upwards and the loop ends once it is no longer negative
"mov %3, %%"REG_a" \n\t"
// mm7 = 0, used to zero-extend bytes to words
"pxor %%mm7, %%mm7 \n\t"
"1: \n\t"
PREFETCH" 64(%0) \n\t"
// pixels 0 and 1: overlapping loads of bytes 0..3 and 2..5
"movd (%0), %%mm0 \n\t"
"movd 2(%0), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
// keep copies so the same words can be multiplied by both coefficient sets
"movq %%mm0, %%mm2 \n\t"
"movq %%mm1, %%mm3 \n\t"
// elements [0],[1] feed the dstU result (mm0), elements [2],[3] the dstV result (mm2)
"pmaddwd %4, %%mm0 \n\t"
"pmaddwd 8+%4, %%mm1 \n\t"
"pmaddwd 16+%4, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
"paddd %%mm1, %%mm0 \n\t"
"paddd %%mm3, %%mm2 \n\t"
// pixels 2 and 3: overlapping loads of bytes 6..9 and 8..11
"movd 6(%0), %%mm1 \n\t"
"movd 8(%0), %%mm3 \n\t"
// advance the source pointer by 4 pixels
"add $12, %0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
// same multiply/accumulate as above: mm1 = dstU sums, mm4 = dstV sums for pixels 2,3
"pmaddwd %4, %%mm1 \n\t"
"pmaddwd 8+%4, %%mm3 \n\t"
"pmaddwd 16+%4, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm5 \n\t"
"paddd %%mm3, %%mm1 \n\t"
"paddd %%mm5, %%mm4 \n\t"
// mm3 is free again; reuse it for the offset/rounding term
"movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
"paddd %%mm3, %%mm0 \n\t"
"paddd %%mm3, %%mm2 \n\t"
"paddd %%mm3, %%mm1 \n\t"
"paddd %%mm3, %%mm4 \n\t"
// drop the 15 fractional bits
"psrad $15, %%mm0 \n\t"
"psrad $15, %%mm2 \n\t"
"psrad $15, %%mm1 \n\t"
"psrad $15, %%mm4 \n\t"
// narrow 4 dwords -> 4 words -> bytes (unsigned saturation on the final pack)
"packssdw %%mm1, %%mm0 \n\t"
"packssdw %%mm4, %%mm2 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm2, %%mm2 \n\t"
// %1 / %2 are dstU+width / dstV+width and REG_a is negative,
// so these address the current output position in each plane
"movd %%mm0, (%1, %%"REG_a") \n\t"
"movd %%mm2, (%2, %%"REG_a") \n\t"
"add $4, %%"REG_a" \n\t"
// loop while the index is still negative
" js 1b \n\t"
: "+r" (src)
// Only element [0] of the chosen row is named as the "m" operand; the asm
// reaches the remaining three elements by adding byte offsets to it.
: "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
: "%"REG_a
);
}
#endif
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
asm volatile(
"mov %2, %%"REG_a" \n\t"
"movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
ASMALIGN(4)
"1: \n\t"
PREFETCH" 64(%0, %%"REG_d") \n\t"
"movd (%0, %%"REG_d"), %%mm0 \n\t"
"movd 3(%0, %%"REG_d"), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movd 6(%0, %%"REG_d"), %%mm2 \n\t"
"movd 9(%0, %%"REG_d"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm1, %%mm0 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"packssdw %%mm2, %%mm0 \n\t"
"psraw $7, %%mm0 \n\t"
"movd 12(%0, %%"REG_d"), %%mm4 \n\t"
"movd 15(%0, %%"REG_d"), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movd 18(%0, %%"REG_d"), %%mm2 \n\t"
"movd 21(%0, %%"REG_d"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm1, %%mm4 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"add $24, %%"REG_d" \n\t"
"packssdw %%mm2, %%mm4 \n\t"
"psraw $7, %%mm4 \n\t"
"packuswb %%mm4, %%mm0 \n\t"
"paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
"movq %%mm0, (%1, %%"REG_a") \n\t"
"add $8, %%"REG_a" \n\t"
" js 1b \n\t"
: : "r" (src+width*3), "r" (dst+width), "g" (-width)
: "%"REG_a, "%"REG_d
);
bgr24ToY_mmx(dst, src, width, PIX_FMT_BGR24);
#else
int i;
for (i=0; i<width; i++)
@ -1963,132 +2006,17 @@ static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
asm volatile(
"mov %3, %%"REG_a" \n\t"
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
"movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
"add %%"REG_d", %%"REG_d" \n\t"
ASMALIGN(4)
"1: \n\t"
PREFETCH" 64(%0, %%"REG_d") \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
"movq (%0, %%"REG_d"), %%mm0 \n\t"
"movq 6(%0, %%"REG_d"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $24, %%mm0 \n\t"
"psrlq $24, %%mm2 \n\t"
PAVGB(%%mm1, %%mm0)
PAVGB(%%mm3, %%mm2)
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
#else
"movd (%0, %%"REG_d"), %%mm0 \n\t"
"movd 3(%0, %%"REG_d"), %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"movd 6(%0, %%"REG_d"), %%mm4 \n\t"
"movd 9(%0, %%"REG_d"), %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"psrlw $1, %%mm0 \n\t"
"psrlw $1, %%mm2 \n\t"
#endif
"movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
"movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm2, %%mm0 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm5, %%mm1 \n\t"
"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
"psraw $7, %%mm0 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
"movq 12(%0, %%"REG_d"), %%mm4 \n\t"
"movq 18(%0, %%"REG_d"), %%mm2 \n\t"
"movq %%mm4, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $24, %%mm4 \n\t"
"psrlq $24, %%mm2 \n\t"
PAVGB(%%mm1, %%mm4)
PAVGB(%%mm3, %%mm2)
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
#else
"movd 12(%0, %%"REG_d"), %%mm4 \n\t"
"movd 15(%0, %%"REG_d"), %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"paddw %%mm2, %%mm4 \n\t"
"movd 18(%0, %%"REG_d"), %%mm5 \n\t"
"movd 21(%0, %%"REG_d"), %%mm2 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"paddw %%mm5, %%mm2 \n\t"
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
"movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
"movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
"pmaddwd %%mm4, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm2, %%mm4 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
"pmaddwd %%mm5, %%mm1 \n\t"
"add $24, %%"REG_d" \n\t"
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
"psraw $7, %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"punpckldq %%mm4, %%mm0 \n\t"
"punpckhdq %%mm4, %%mm1 \n\t"
"packsswb %%mm1, %%mm0 \n\t"
"paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
"movd %%mm0, (%1, %%"REG_a") \n\t"
"punpckhdq %%mm0, %%mm0 \n\t"
"movd %%mm0, (%2, %%"REG_a") \n\t"
"add $4, %%"REG_a" \n\t"
" js 1b \n\t"
: : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
: "%"REG_a, "%"REG_d
);
bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_BGR24);
#else
int i;
for (i=0; i<width; i++)
{
int b= src1[6*i + 0] + src1[6*i + 3];
int g= src1[6*i + 1] + src1[6*i + 4];
int r= src1[6*i + 2] + src1[6*i + 5];
int b= src1[3*i + 0];
int g= src1[3*i + 1];
int r= src1[3*i + 2];
dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
}
#endif /* HAVE_MMX */
assert(src1 == src2);
@ -2201,6 +2129,9 @@ static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
bgr24ToY_mmx(dst, src, width, PIX_FMT_RGB24);
#else
int i;
for (i=0; i<width; i++)
{
@ -2210,21 +2141,26 @@ static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width)
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
}
#endif
}
/*
 * Converts packed RGB24 pixels from src1 into dstU/dstV.
 *
 * With HAVE_MMX the work is delegated to bgr24ToUV_mmx() with PIX_FMT_RGB24;
 * otherwise a scalar loop computes the samples from the RU/GU/BU and RV/GV/BV
 * coefficients.
 *
 * NOTE(review): this listing appears to contain both the pre-change and the
 * post-change lines of the commit (the 6*i loads that sum two neighbouring
 * pixels next to the 3*i single-pixel loads, and two pairs of dstU/dstV
 * stores with different rounding/shift) -- confirm against the real file
 * which set is live.
 */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
int i;
// src2 is only checked for equality with src1; no other use is visible here
assert(src1==src2);
#ifdef HAVE_MMX
bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
for (i=0; i<width; i++)
{
// variant A: sum of two horizontally adjacent pixels (6 bytes per output sample)
int r= src1[6*i + 0] + src1[6*i + 3];
int g= src1[6*i + 1] + src1[6*i + 4];
int b= src1[6*i + 2] + src1[6*i + 5];
// variant B: one pixel per output sample (3 bytes per output sample)
int r= src1[3*i + 0];
int g= src1[3*i + 1];
int b= src1[3*i + 2];
// variant A stores: the extra +1 in the shift compensates for the two-pixel sum
dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1);
dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1);
// variant B stores: 257<<(SHIFT-1) is 128.5 in fixed point (offset 128 plus rounding)
dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
}
#endif
}
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, long width)