sync with mplayer xp

- partial yvu9 support (copy only)
- rgb 15/16 -> 24/32 converters
- int->unsigned changes

Originally committed as revision 6493 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
2025-04-17 20:46:31 +00:00 · 2002-06-22 08:49:45 +00:00 · 2002-06-22 08:49:45 +00:00 · 0d9f3d85f6
commit 0d9f3d85f6
parent d661d18d89
8 changed files with 1128 additions and 292 deletions
--- a/postproc/rgb2rgb.c
+++ b/postproc/rgb2rgb.c
@ -20,6 +20,8 @@
 #define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
 #ifdef CAN_COMPILE_X86_ASM
 /* 64-bit constants used as memory operands by the MMX code; 8-byte aligned. */
 static const uint64_t mmx_null  __attribute__((aligned(8))) = 0x0000000000000000ULL; /* all bits clear */
 static const uint64_t mmx_one   __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL; /* all bits set */
 static const uint64_t mask32b  __attribute__((aligned(8))) = 0x000000FF000000FFULL; /* byte 0 of each 32-bit half */
 static const uint64_t mask32g  __attribute__((aligned(8))) = 0x0000FF000000FF00ULL; /* byte 1 of each 32-bit half */
 static const uint64_t mask32r  __attribute__((aligned(8))) = 0x00FF000000FF0000ULL; /* byte 2 of each 32-bit half */
@ -35,6 +37,11 @@ static const uint64_t mask24hhhh  __attribute__((aligned(8))) = 0xffffffffffff00
 /* Per-16-bit-word masks (the same pattern repeated in all four words). */
 static const uint64_t mask15b  __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
 static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
 static const uint64_t mask15s  __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL; /* 11111111 11100000  everything above the low 5 bits */
 static const uint64_t mask15g  __attribute__((aligned(8))) = 0x03E003E003E003E0ULL; /* 00000011 11100000  bits 5-9 */
 static const uint64_t mask15r  __attribute__((aligned(8))) = 0x7C007C007C007C00ULL; /* 01111100 00000000  bits 10-14 */
 /* The low 5 bits are selected by the same mask in the 15- and 16-bit layouts. */
 #define mask16b mask15b
 static const uint64_t mask16g  __attribute__((aligned(8))) = 0x07E007E007E007E0ULL; /* 00000111 11100000  bits 5-10 */
 static const uint64_t mask16r  __attribute__((aligned(8))) = 0xF800F800F800F800ULL; /* 11111000 00000000  bits 11-15 */
 /* Per-32-bit-half masks: the 16-bit field patterns placed in the low word of each half. */
 static const uint64_t red_16mask  __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
 static const uint64_t green_16mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL;
 static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
@ -137,10 +144,68 @@ void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
 	else if(gCpuCaps.hasMMX)
 		rgb24to32_MMX(src, dst, src_size);
 	else
 		rgb24to32_C(src, dst, src_size);
 #else
 		rgb24to32_C(src, dst, src_size);
 #endif
 		rgb24to32_C(src, dst, src_size);
 }
 /*
  * Runtime dispatcher for rgb15to24: forwards (src, dst, src_size) to exactly
  * one of the rgb15to24_* variants, chosen from the gCpuCaps flags when the
  * x86 asm variants are compiled in, and to the plain C variant otherwise.
  */
 void rgb15to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
 {
 #ifdef CAN_COMPILE_X86_ASM
 	/* candidates are tried fastest first; the first match wins */
 	if(gCpuCaps.hasMMX2)
 	{
 		rgb15to24_MMX2(src, dst, src_size);
 		return;
 	}
 	if(gCpuCaps.has3DNow)
 	{
 		rgb15to24_3DNow(src, dst, src_size);
 		return;
 	}
 	if(gCpuCaps.hasMMX)
 	{
 		rgb15to24_MMX(src, dst, src_size);
 		return;
 	}
 #endif
 	/* no usable SIMD variant (or none compiled in) */
 	rgb15to24_C(src, dst, src_size);
 }
 /*
  * Runtime dispatcher for rgb16to24: forwards (src, dst, src_size) to exactly
  * one of the rgb16to24_* variants, chosen from the gCpuCaps flags when the
  * x86 asm variants are compiled in, and to the plain C variant otherwise.
  */
 void rgb16to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
 {
 #ifdef CAN_COMPILE_X86_ASM
 	/* candidates are tried fastest first; the first match wins */
 	if(gCpuCaps.hasMMX2)
 	{
 		rgb16to24_MMX2(src, dst, src_size);
 		return;
 	}
 	if(gCpuCaps.has3DNow)
 	{
 		rgb16to24_3DNow(src, dst, src_size);
 		return;
 	}
 	if(gCpuCaps.hasMMX)
 	{
 		rgb16to24_MMX(src, dst, src_size);
 		return;
 	}
 #endif
 	/* no usable SIMD variant (or none compiled in) */
 	rgb16to24_C(src, dst, src_size);
 }
 /*
  * Runtime dispatcher for rgb15to32: forwards (src, dst, src_size) to exactly
  * one of the rgb15to32_* variants, chosen from the gCpuCaps flags when the
  * x86 asm variants are compiled in, and to the plain C variant otherwise.
  */
 void rgb15to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
 {
 #ifdef CAN_COMPILE_X86_ASM
 	/* candidates are tried fastest first; the first match wins */
 	if(gCpuCaps.hasMMX2)
 	{
 		rgb15to32_MMX2(src, dst, src_size);
 		return;
 	}
 	if(gCpuCaps.has3DNow)
 	{
 		rgb15to32_3DNow(src, dst, src_size);
 		return;
 	}
 	if(gCpuCaps.hasMMX)
 	{
 		rgb15to32_MMX(src, dst, src_size);
 		return;
 	}
 #endif
 	/* no usable SIMD variant (or none compiled in) */
 	rgb15to32_C(src, dst, src_size);
 }
 /*
  * Runtime dispatcher for rgb16to32: forwards (src, dst, src_size) to exactly
  * one of the rgb16to32_* variants, chosen from the gCpuCaps flags when the
  * x86 asm variants are compiled in, and to the plain C variant otherwise.
  */
 void rgb16to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
 {
 #ifdef CAN_COMPILE_X86_ASM
 	/* candidates are tried fastest first; the first match wins */
 	if(gCpuCaps.hasMMX2)
 	{
 		rgb16to32_MMX2(src, dst, src_size);
 		return;
 	}
 	if(gCpuCaps.has3DNow)
 	{
 		rgb16to32_3DNow(src, dst, src_size);
 		return;
 	}
 	if(gCpuCaps.hasMMX)
 	{
 		rgb16to32_MMX(src, dst, src_size);
 		return;
 	}
 #endif
 	/* no usable SIMD variant (or none compiled in) */
 	rgb16to32_C(src, dst, src_size);
 }
 void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
@ -154,10 +219,8 @@ void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
 	else if(gCpuCaps.hasMMX)
 		rgb32to24_MMX(src, dst, src_size);
 	else
 		rgb32to24_C(src, dst, src_size);
 #else
 		rgb32to24_C(src, dst, src_size);
 #endif
 		rgb32to24_C(src, dst, src_size);
 }
 /*
@ -177,10 +240,8 @@ void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
 	else if(gCpuCaps.hasMMX)
 		rgb15to16_MMX(src, dst, src_size);
 	else
 		rgb15to16_C(src, dst, src_size);
 #else
 		rgb15to16_C(src, dst, src_size);
 #endif
 		rgb15to16_C(src, dst, src_size);
 }
 /**
@ -242,10 +303,8 @@ void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
 	else if(gCpuCaps.hasMMX)
 		rgb32to16_MMX(src, dst, src_size);
 	else
 		rgb32to16_C(src, dst, src_size);
 #else
 		rgb32to16_C(src, dst, src_size);
 #endif
 		rgb32to16_C(src, dst, src_size);
 }
 void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
@ -259,10 +318,8 @@ void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
 	else if(gCpuCaps.hasMMX)
 		rgb32to15_MMX(src, dst, src_size);
 	else
 		rgb32to15_C(src, dst, src_size);
 #else
 		rgb32to15_C(src, dst, src_size);
 #endif
 		rgb32to15_C(src, dst, src_size);
 }
 void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
@ -276,10 +333,8 @@ void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
 	else if(gCpuCaps.hasMMX)
 		rgb24to16_MMX(src, dst, src_size);
 	else
 		rgb24to16_C(src, dst, src_size);
 #else
 		rgb24to16_C(src, dst, src_size);
 #endif
 		rgb24to16_C(src, dst, src_size);
 }
 void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
@ -293,10 +348,8 @@ void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
 	else if(gCpuCaps.hasMMX)
 		rgb24to15_MMX(src, dst, src_size);
 	else
 		rgb24to15_C(src, dst, src_size);
 #else
 		rgb24to15_C(src, dst, src_size);
 #endif
 		rgb24to15_C(src, dst, src_size);
 }
 /**
@ -330,10 +383,8 @@ void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
 	else if(gCpuCaps.hasMMX)
 		rgb32tobgr32_MMX(src, dst, src_size);
 	else
 		rgb32tobgr32_C(src, dst, src_size);
 #else
 		rgb32tobgr32_C(src, dst, src_size);
 #endif
 		rgb32tobgr32_C(src, dst, src_size);
 }
 void rgb24tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size)
@ -347,10 +398,8 @@ void rgb24tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size)
 	else if(gCpuCaps.hasMMX)
 		rgb24tobgr24_MMX(src, dst, src_size);
 	else
 		rgb24tobgr24_C(src, dst, src_size);
 #else
 		rgb24tobgr24_C(src, dst, src_size);
 #endif
 		rgb24tobgr24_C(src, dst, src_size);
 }
 /**
@ -371,10 +420,8 @@ void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, u
 	else if(gCpuCaps.hasMMX)
 		yv12toyuy2_MMX(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
 	else
 		yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
 #else
 		yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
 #endif
 		yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
 }
 /**
@ -394,10 +441,8 @@ void yuv422ptoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc
 	else if(gCpuCaps.hasMMX)
 		yuv422ptoyuy2_MMX(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
 	else
 		yuv422ptoyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
 #else
 		yuv422ptoyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
 #endif
 		yuv422ptoyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
 }
 /**
@ -418,10 +463,8 @@ void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 	else if(gCpuCaps.hasMMX)
 		yuy2toyv12_MMX(src, ydst, udst, vdst, width,  height, lumStride, chromStride, srcStride);
 	else
 		yuy2toyv12_C(src, ydst, udst, vdst, width,  height, lumStride, chromStride, srcStride);
 #else
 		yuy2toyv12_C(src, ydst, udst, vdst, width,  height, lumStride, chromStride, srcStride);
 #endif
 		yuy2toyv12_C(src, ydst, udst, vdst, width,  height, lumStride, chromStride, srcStride);
 }
 /**
@ -488,14 +531,13 @@ void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst
 	else if(gCpuCaps.hasMMX)
 		rgb24toyv12_MMX(src, ydst, udst, vdst, width,  height, lumStride, chromStride, srcStride);
 	else
 		rgb24toyv12_C(src, ydst, udst, vdst, width,  height, lumStride, chromStride, srcStride);
 #else
 		rgb24toyv12_C(src, ydst, udst, vdst, width,  height, lumStride, chromStride, srcStride);
 #endif
 		rgb24toyv12_C(src, ydst, udst, vdst, width,  height, lumStride, chromStride, srcStride);
 }
 void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
-		     int width, int height, int src1Stride, int src2Stride, int dstStride)
+		     unsigned width, unsigned height, unsigned src1Stride,
 		     unsigned src2Stride, unsigned dstStride)
 {
 #ifdef CAN_COMPILE_X86_ASM
 	// ordered per speed fasterst first
@ -506,8 +548,6 @@ void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
 	else if(gCpuCaps.hasMMX)
 		interleaveBytes_MMX(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
 	else
 		interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
 #else
 		interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
 #endif
 		interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
 }
--- a/postproc/rgb2rgb.h
+++ b/postproc/rgb2rgb.h
@ -10,12 +10,16 @@
 #define RGB2RGB_INCLUDED
 extern void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb32to16(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb32to15(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb24to16(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb24to15(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb32to16(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb32to15(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb15to24(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb15to32(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb16to24(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb16to32(const uint8_t *src,uint8_t *dst,unsigned src_size);
 extern void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned src_size);
 extern void rgb24tobgr24(const uint8_t *src, uint8_t *dst, unsigned src_size);
@ -39,7 +43,8 @@ extern void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_
 	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride);
 extern void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
-			    int width, int height, int src1Stride, int src2Stride, int dstStride);
+			    unsigned width, unsigned height, unsigned src1Stride,
 			    unsigned src2Stride, unsigned dstStride);
 #define MODE_RGB  0x1
@ -47,11 +52,11 @@ extern void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
 typedef void (* yuv2rgb_fun) (uint8_t * image, uint8_t * py,
 			      uint8_t * pu, uint8_t * pv,
-			      int h_size, int v_size,
+			      unsigned h_size, unsigned v_size,
-			      int rgb_stride, int y_stride, int uv_stride);
+			      unsigned rgb_stride, unsigned y_stride, unsigned uv_stride);
 extern yuv2rgb_fun yuv2rgb;
-void yuv2rgb_init (int bpp, int mode);
+void yuv2rgb_init (unsigned bpp, int mode);
 #endif
--- a/postproc/rgb2rgb_template.c
+++ b/postproc/rgb2rgb_template.c
@ -8,6 +8,13 @@
 *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
 */
 #include <stddef.h>
 #include <inttypes.h> /* for __WORDSIZE */
 #ifndef __WORDSIZE
 #warning You have misconfigured system and probably will lose performance!
 #endif
 #undef PREFETCH
 #undef MOVNTQ
 #undef EMMS
@ -56,13 +63,13 @@ static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned sr
  const uint8_t *s = src;
  const uint8_t *end;
 #ifdef HAVE_MMX
-  const uint8_t *mm_end;
+  uint8_t *mm_end;
 #endif
  end = s + src_size;
 #ifdef HAVE_MMX
  __asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
  mm_end = end - 23;
  __asm __volatile("movq	%0, %%mm7"::"m"(mask32):"memory");
  mm_end = (uint8_t*)((((unsigned long)end)/24)*24);
  while(s < mm_end)
  {
    __asm __volatile(
@ -107,12 +114,12 @@ static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned sr
  const uint8_t *s = src;
  const uint8_t *end;
 #ifdef HAVE_MMX
-  const uint8_t *mm_end;
+  uint8_t *mm_end;
 #endif
  end = s + src_size;
 #ifdef HAVE_MMX
  __asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
-  mm_end = end - 31;
+  mm_end = (uint8_t*)((((unsigned long)end)/32)*32);
  while(s < mm_end)
  {
    __asm __volatile(
@ -186,15 +193,16 @@ static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned sr
 */
 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
 {
  register const uint8_t* s=src;
  register uint8_t* d=dst;
  register const uint8_t *end;
  uint8_t *mm_end;
  end = s + src_size;
 #ifdef HAVE_MMX
-  register int offs=15-src_size;
+  __asm __volatile(PREFETCH"	%0"::"m"(*s));
-  register const char* s=src-offs;
+  __asm __volatile("movq	%0, %%mm4"::"m"(mask15s));
-  register char* d=dst-offs;
+  mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
-  __asm __volatile(PREFETCH"	%0"::"m"(*(s+offs)));
+  while(s<mm_end)
  __asm __volatile(
 	"movq	%0, %%mm4\n\t"
 	::"m"(mask15s));
  while(offs<0)
  {
 	__asm __volatile(
 		PREFETCH"	32%1\n\t"
@ -208,40 +216,28 @@ static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned sr
 		"paddw	%%mm3, %%mm2\n\t"
 		MOVNTQ"	%%mm0, %0\n\t"
 		MOVNTQ"	%%mm2, 8%0"
-		:"=m"(*(d+offs))
+		:"=m"(*d)
-		:"m"(*(s+offs))
+		:"m"(*s)
 		);
-	offs+=16;
+	d+=16;
 	s+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
 #else
 #if 0
   const uint16_t *s1=( uint16_t * )src;
   uint16_t *d1=( uint16_t * )dst;
   uint16_t *e=((uint8_t *)s1)+src_size;
   while( s1<e ){
     register int x=*( s1++ );
     /* rrrrrggggggbbbbb
        0rrrrrgggggbbbbb
        0111 1111 1110 0000=0x7FE0
        00000000000001 1111=0x001F */
     *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
   }
 #else
 	const unsigned *s1=( unsigned * )src;
 	unsigned *d1=( unsigned * )dst;
 	int i;
 	int size= src_size>>2;
 	for(i=0; i<size; i++)
 	{
 		register int x= s1[i];
 //		d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not allways be true
 		d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
 	}
 #endif
 #endif
    mm_end = (uint8_t*)((((unsigned long)end)/4)*4);
    while(s < mm_end)
    {
 	register unsigned x= *((uint32_t *)s);
 	*((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
 	d+=4;
 	s+=4;
    }
    if(s < end)
    {
 	register unsigned short x= *((uint16_t *)s);
 	*((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
 }
 static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
@ -257,17 +253,20 @@ static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsign
 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
 {
 #ifdef HAVE_MMX
 	const uint8_t *s = src;
-	const uint8_t *end,*mm_end;
+	const uint8_t *end;
 #ifdef HAVE_MMX
 	const uint8_t *mm_end;
 #endif
 	uint16_t *d = (uint16_t *)dst;
 	end = s + src_size;
-	mm_end = end - 15;
+#ifdef HAVE_MMX
 	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
 	__asm __volatile(
 	    "movq	%0, %%mm7\n\t"
 	    "movq	%1, %%mm6\n\t"
 	    ::"m"(red_16mask),"m"(green_16mask));
 	mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
 	while(s < mm_end)
 	{
 	    __asm __volatile(
@ -303,43 +302,35 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned
 		d += 4;
 		s += 16;
 	}
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #endif
 	while(s < end)
 	{
 		const int b= *s++;
 		const int g= *s++;
 		const int r= *s++;
                s++;
 		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 		s++;
 	}
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #else
 	unsigned j,i,num_pixels=src_size/4;
 	uint16_t *d = (uint16_t *)dst;
 	for(i=0,j=0; j<num_pixels; i+=4,j++)
 	{
 		const int b= src[i+0];
 		const int g= src[i+1];
 		const int r= src[i+2];
 		d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 	}
 #endif
 }
 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
 {
 #ifdef HAVE_MMX
 	const uint8_t *s = src;
-	const uint8_t *end,*mm_end;
+	const uint8_t *end;
 #ifdef HAVE_MMX
 	const uint8_t *mm_end;
 #endif
 	uint16_t *d = (uint16_t *)dst;
 	end = s + src_size;
-	mm_end = end - 15;
+#ifdef HAVE_MMX
 	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
 	__asm __volatile(
 	    "movq	%0, %%mm7\n\t"
 	    "movq	%1, %%mm6\n\t"
 	    ::"m"(red_15mask),"m"(green_15mask));
 	mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
 	while(s < mm_end)
 	{
 	    __asm __volatile(
@ -375,43 +366,35 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned
 		d += 4;
 		s += 16;
 	}
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #endif
 	while(s < end)
 	{
 		const int b= *s++;
 		const int g= *s++;
 		const int r= *s++;
 		s++;
 		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 		s++;
 	}
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #else
 	unsigned j,i,num_pixels=src_size/4;
 	uint16_t *d = (uint16_t *)dst;
 	for(i=0,j=0; j<num_pixels; i+=4,j++)
 	{
 		const int b= src[i+0];
 		const int g= src[i+1];
 		const int r= src[i+2];
 		d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 	}
 #endif
 }
 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
 {
 #ifdef HAVE_MMX
 	const uint8_t *s = src;
-	const uint8_t *end,*mm_end;
+	const uint8_t *end;
 #ifdef HAVE_MMX
 	const uint8_t *mm_end;
 #endif
 	uint16_t *d = (uint16_t *)dst;
 	end = s + src_size;
-	mm_end = end - 11;
+#ifdef HAVE_MMX
 	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
 	__asm __volatile(
 	    "movq	%0, %%mm7\n\t"
 	    "movq	%1, %%mm6\n\t"
 	    ::"m"(red_16mask),"m"(green_16mask));
 	mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
 	while(s < mm_end)
 	{
 	    __asm __volatile(
@ -447,6 +430,9 @@ static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned
 		d += 4;
 		s += 12;
 	}
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #endif
 	while(s < end)
 	{
 		const int b= *s++;
@ -454,35 +440,24 @@ static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned
 		const int r= *s++;
 		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 	}
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #else
 	unsigned j,i,num_pixels=src_size/3;
 	uint16_t *d = (uint16_t *)dst;
 	for(i=0,j=0; j<num_pixels; i+=3,j++)
 	{
 		const int b= src[i+0];
 		const int g= src[i+1];
 		const int r= src[i+2];
 		d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 	}
 #endif
 }
 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
 {
 #ifdef HAVE_MMX
 	const uint8_t *s = src;
-	const uint8_t *end,*mm_end;
+	const uint8_t *end;
 #ifdef HAVE_MMX
 	const uint8_t *mm_end;
 #endif
 	uint16_t *d = (uint16_t *)dst;
 	end = s + src_size;
-	mm_end = end -11;
+#ifdef HAVE_MMX
 	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
 	__asm __volatile(
 	    "movq	%0, %%mm7\n\t"
 	    "movq	%1, %%mm6\n\t"
 	    ::"m"(red_15mask),"m"(green_15mask));
 	mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
 	while(s < mm_end)
 	{
 	    __asm __volatile(
@ -518,6 +493,9 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned
 		d += 4;
 		s += 12;
 	}
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #endif
 	while(s < end)
 	{
 		const int b= *s++;
@ -525,25 +503,448 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned
 		const int r= *s++;
 		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 	}
 }
 /*
  This code uses a less accurate approximation: it simply left-shifts the
  input value and fills the low-order bits with zeroes. This method improves
  PNG compression, but it cannot reproduce white exactly, since it does not
  generate an all-ones maximum value; the net effect is to darken the
  image slightly.
  A better method would be "left bit replication":
   4 3 2 1 0
   ---------
   1 1 0 1 1
   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
 */
 /**
  * Expand packed 15-bit pixels (three 5-bit fields per 16-bit word) to
  * 3 bytes per pixel.
  *
  * For every source word the output bytes are, in order:
  *   (bits 0-4) << 3, (bits 5-9) << 3, (bits 10-14) << 3
  * i.e. each field is left-shifted into the top of a byte and the low
  * 3 bits are left as zero.
  *
  * @param src      source pixels, read as 16-bit words
  * @param dst      destination; receives 3 bytes for every source word
  * @param src_size size of the source in bytes (an odd trailing byte is ignored)
  */
 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
 {
 	const uint16_t *end;
 #ifdef HAVE_MMX
 	const uint16_t *mm_end;
 #endif
 	uint8_t *d = (uint8_t *)dst;
 	const uint16_t *s = (const uint16_t *)src;
 	end = s + src_size/2;
 #ifdef HAVE_MMX
 	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
 	/*
 	 * Each pass of the MMX loop consumes 8 pixels (16 source bytes) and
 	 * stores 24 bytes, so only whole groups of 8 pixels may go through it.
 	 * Deriving the limit from the pixel count (not from the address of
 	 * 'end') keeps the loop from running past the end of src and dst.
 	 */
 	mm_end = s + ((src_size/2) & ~7U);
 	while(s < mm_end)
 	{
 	    /* unpack two groups of 4 pixels into 32-bit-per-pixel form:
 	       first group ends up in mm6/mm7, second group in mm0/mm3 */
 	    __asm __volatile(
 		PREFETCH" 32%1\n\t"
 		"movq	%1, %%mm0\n\t"
 		"movq	%1, %%mm1\n\t"
 		"movq	%1, %%mm2\n\t"
 		"pand	%2, %%mm0\n\t"
 		"pand	%3, %%mm1\n\t"
 		"pand	%4, %%mm2\n\t"
 		"psllq	$3, %%mm0\n\t"
 		"psrlq	$2, %%mm1\n\t"
 		"psrlq	$7, %%mm2\n\t"
 		"movq	%%mm0, %%mm3\n\t"
 		"movq	%%mm1, %%mm4\n\t"
 		"movq	%%mm2, %%mm5\n\t"
 		"punpcklwd %5, %%mm0\n\t"
 		"punpcklwd %5, %%mm1\n\t"
 		"punpcklwd %5, %%mm2\n\t"
 		"punpckhwd %5, %%mm3\n\t"
 		"punpckhwd %5, %%mm4\n\t"
 		"punpckhwd %5, %%mm5\n\t"
 		"psllq	$8, %%mm1\n\t"
 		"psllq	$16, %%mm2\n\t"
 		"por	%%mm1, %%mm0\n\t"
 		"por	%%mm2, %%mm0\n\t"
 		"psllq	$8, %%mm4\n\t"
 		"psllq	$16, %%mm5\n\t"
 		"por	%%mm4, %%mm3\n\t"
 		"por	%%mm5, %%mm3\n\t"
 		"movq	%%mm0, %%mm6\n\t"
 		"movq	%%mm3, %%mm7\n\t"
 		"movq	8%1, %%mm0\n\t"
 		"movq	8%1, %%mm1\n\t"
 		"movq	8%1, %%mm2\n\t"
 		"pand	%2, %%mm0\n\t"
 		"pand	%3, %%mm1\n\t"
 		"pand	%4, %%mm2\n\t"
 		"psllq	$3, %%mm0\n\t"
 		"psrlq	$2, %%mm1\n\t"
 		"psrlq	$7, %%mm2\n\t"
 		"movq	%%mm0, %%mm3\n\t"
 		"movq	%%mm1, %%mm4\n\t"
 		"movq	%%mm2, %%mm5\n\t"
 		"punpcklwd %5, %%mm0\n\t"
 		"punpcklwd %5, %%mm1\n\t"
 		"punpcklwd %5, %%mm2\n\t"
 		"punpckhwd %5, %%mm3\n\t"
 		"punpckhwd %5, %%mm4\n\t"
 		"punpckhwd %5, %%mm5\n\t"
 		"psllq	$8, %%mm1\n\t"
 		"psllq	$16, %%mm2\n\t"
 		"por	%%mm1, %%mm0\n\t"
 		"por	%%mm2, %%mm0\n\t"
 		"psllq	$8, %%mm4\n\t"
 		"psllq	$16, %%mm5\n\t"
 		"por	%%mm4, %%mm3\n\t"
 		"por	%%mm5, %%mm3\n\t"
 		:"=m"(*d)
 		:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
 		:"memory");
 	    /* Borrowed 32 to 24 */
 	    /* squeeze the four registers of 32-bit pixels down to 24 output bytes */
 	    __asm __volatile(
 		"movq	%%mm0, %%mm4\n\t"
 		"movq	%%mm3, %%mm5\n\t"
 		"movq	%%mm6, %%mm0\n\t"
 		"movq	%%mm7, %%mm1\n\t"
 		"movq	%%mm4, %%mm6\n\t"
 		"movq	%%mm5, %%mm7\n\t"
 		"movq	%%mm0, %%mm2\n\t"
 		"movq	%%mm1, %%mm3\n\t"
 		"psrlq	$8, %%mm2\n\t"
 		"psrlq	$8, %%mm3\n\t"
 		"psrlq	$8, %%mm6\n\t"
 		"psrlq	$8, %%mm7\n\t"
 		"pand	%2, %%mm0\n\t"
 		"pand	%2, %%mm1\n\t"
 		"pand	%2, %%mm4\n\t"
 		"pand	%2, %%mm5\n\t"
 		"pand	%3, %%mm2\n\t"
 		"pand	%3, %%mm3\n\t"
 		"pand	%3, %%mm6\n\t"
 		"pand	%3, %%mm7\n\t"
 		"por	%%mm2, %%mm0\n\t"
 		"por	%%mm3, %%mm1\n\t"
 		"por	%%mm6, %%mm4\n\t"
 		"por	%%mm7, %%mm5\n\t"
 		"movq	%%mm1, %%mm2\n\t"
 		"movq	%%mm4, %%mm3\n\t"
 		"psllq	$48, %%mm2\n\t"
 		"psllq	$32, %%mm3\n\t"
 		"pand	%4, %%mm2\n\t"
 		"pand	%5, %%mm3\n\t"
 		"por	%%mm2, %%mm0\n\t"
 		"psrlq	$16, %%mm1\n\t"
 		"psrlq	$32, %%mm4\n\t"
 		"psllq	$16, %%mm5\n\t"
 		"por	%%mm3, %%mm1\n\t"
 		"pand	%6, %%mm5\n\t"
 		"por	%%mm5, %%mm4\n\t"
 		MOVNTQ"	%%mm0, %0\n\t"
 		MOVNTQ"	%%mm1, 8%0\n\t"
 		MOVNTQ"	%%mm4, 16%0"
 		:"=m"(*d)
 		:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
 		:"memory");
 		d += 24;
 		s += 8;
 	}
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #endif
 	/* scalar path: the whole buffer without MMX, or the pixels the MMX loop left over */
 	while(s < end)
 	{
 		register uint16_t bgr;
 		bgr = *s++;
 		*d++ = (bgr&0x1F)<<3;
 		*d++ = (bgr&0x3E0)>>2;
 		*d++ = (bgr&0x7C00)>>7;
 	}
 }
 /**
  * Expand packed 16-bit pixels (5-, 6- and 5-bit fields per 16-bit word) to
  * 3 bytes per pixel.
  *
  * For every source word the output bytes are, in order:
  *   (bits 0-4) << 3, (bits 5-10) << 2, (bits 11-15) << 3
  * i.e. each field is left-shifted into the top of a byte and the low
  * bits are left as zero.
  *
  * @param src      source pixels, read as 16-bit words
  * @param dst      destination; receives 3 bytes for every source word
  * @param src_size size of the source in bytes (an odd trailing byte is ignored)
  */
 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
 {
 	const uint16_t *end;
 #ifdef HAVE_MMX
 	const uint16_t *mm_end;
 #endif
 	uint8_t *d = (uint8_t *)dst;
 	const uint16_t *s = (const uint16_t *)src;
 	end = s + src_size/2;
 #ifdef HAVE_MMX
 	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
 	/*
 	 * Each pass of the MMX loop consumes 8 pixels (16 source bytes) and
 	 * stores 24 bytes, so only whole groups of 8 pixels may go through it.
 	 * Deriving the limit from the pixel count (not from the address of
 	 * 'end') keeps the loop from running past the end of src and dst.
 	 */
 	mm_end = s + ((src_size/2) & ~7U);
 	while(s < mm_end)
 	{
 	    /* unpack two groups of 4 pixels into 32-bit-per-pixel form:
 	       first group ends up in mm6/mm7, second group in mm0/mm3 */
 	    __asm __volatile(
 		PREFETCH" 32%1\n\t"
 		"movq	%1, %%mm0\n\t"
 		"movq	%1, %%mm1\n\t"
 		"movq	%1, %%mm2\n\t"
 		"pand	%2, %%mm0\n\t"
 		"pand	%3, %%mm1\n\t"
 		"pand	%4, %%mm2\n\t"
 		"psllq	$3, %%mm0\n\t"
 		"psrlq	$3, %%mm1\n\t"
 		"psrlq	$8, %%mm2\n\t"
 		"movq	%%mm0, %%mm3\n\t"
 		"movq	%%mm1, %%mm4\n\t"
 		"movq	%%mm2, %%mm5\n\t"
 		"punpcklwd %5, %%mm0\n\t"
 		"punpcklwd %5, %%mm1\n\t"
 		"punpcklwd %5, %%mm2\n\t"
 		"punpckhwd %5, %%mm3\n\t"
 		"punpckhwd %5, %%mm4\n\t"
 		"punpckhwd %5, %%mm5\n\t"
 		"psllq	$8, %%mm1\n\t"
 		"psllq	$16, %%mm2\n\t"
 		"por	%%mm1, %%mm0\n\t"
 		"por	%%mm2, %%mm0\n\t"
 		"psllq	$8, %%mm4\n\t"
 		"psllq	$16, %%mm5\n\t"
 		"por	%%mm4, %%mm3\n\t"
 		"por	%%mm5, %%mm3\n\t"
 		"movq	%%mm0, %%mm6\n\t"
 		"movq	%%mm3, %%mm7\n\t"
 		"movq	8%1, %%mm0\n\t"
 		"movq	8%1, %%mm1\n\t"
 		"movq	8%1, %%mm2\n\t"
 		"pand	%2, %%mm0\n\t"
 		"pand	%3, %%mm1\n\t"
 		"pand	%4, %%mm2\n\t"
 		"psllq	$3, %%mm0\n\t"
 		"psrlq	$3, %%mm1\n\t"
 		"psrlq	$8, %%mm2\n\t"
 		"movq	%%mm0, %%mm3\n\t"
 		"movq	%%mm1, %%mm4\n\t"
 		"movq	%%mm2, %%mm5\n\t"
 		"punpcklwd %5, %%mm0\n\t"
 		"punpcklwd %5, %%mm1\n\t"
 		"punpcklwd %5, %%mm2\n\t"
 		"punpckhwd %5, %%mm3\n\t"
 		"punpckhwd %5, %%mm4\n\t"
 		"punpckhwd %5, %%mm5\n\t"
 		"psllq	$8, %%mm1\n\t"
 		"psllq	$16, %%mm2\n\t"
 		"por	%%mm1, %%mm0\n\t"
 		"por	%%mm2, %%mm0\n\t"
 		"psllq	$8, %%mm4\n\t"
 		"psllq	$16, %%mm5\n\t"
 		"por	%%mm4, %%mm3\n\t"
 		"por	%%mm5, %%mm3\n\t"
 		:"=m"(*d)
 		:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
 		:"memory");
 	    /* Borrowed 32 to 24 */
 	    /* squeeze the four registers of 32-bit pixels down to 24 output bytes */
 	    __asm __volatile(
 		"movq	%%mm0, %%mm4\n\t"
 		"movq	%%mm3, %%mm5\n\t"
 		"movq	%%mm6, %%mm0\n\t"
 		"movq	%%mm7, %%mm1\n\t"
 		"movq	%%mm4, %%mm6\n\t"
 		"movq	%%mm5, %%mm7\n\t"
 		"movq	%%mm0, %%mm2\n\t"
 		"movq	%%mm1, %%mm3\n\t"
 		"psrlq	$8, %%mm2\n\t"
 		"psrlq	$8, %%mm3\n\t"
 		"psrlq	$8, %%mm6\n\t"
 		"psrlq	$8, %%mm7\n\t"
 		"pand	%2, %%mm0\n\t"
 		"pand	%2, %%mm1\n\t"
 		"pand	%2, %%mm4\n\t"
 		"pand	%2, %%mm5\n\t"
 		"pand	%3, %%mm2\n\t"
 		"pand	%3, %%mm3\n\t"
 		"pand	%3, %%mm6\n\t"
 		"pand	%3, %%mm7\n\t"
 		"por	%%mm2, %%mm0\n\t"
 		"por	%%mm3, %%mm1\n\t"
 		"por	%%mm6, %%mm4\n\t"
 		"por	%%mm7, %%mm5\n\t"
 		"movq	%%mm1, %%mm2\n\t"
 		"movq	%%mm4, %%mm3\n\t"
 		"psllq	$48, %%mm2\n\t"
 		"psllq	$32, %%mm3\n\t"
 		"pand	%4, %%mm2\n\t"
 		"pand	%5, %%mm3\n\t"
 		"por	%%mm2, %%mm0\n\t"
 		"psrlq	$16, %%mm1\n\t"
 		"psrlq	$32, %%mm4\n\t"
 		"psllq	$16, %%mm5\n\t"
 		"por	%%mm3, %%mm1\n\t"
 		"pand	%6, %%mm5\n\t"
 		"por	%%mm5, %%mm4\n\t"
 		MOVNTQ"	%%mm0, %0\n\t"
 		MOVNTQ"	%%mm1, 8%0\n\t"
 		MOVNTQ"	%%mm4, 16%0"
 		:"=m"(*d)
 		:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
 		:"memory");
 		d += 24;
 		s += 8;
 	}
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #endif
 	/* scalar path: the whole buffer without MMX, or the pixels the MMX loop left over */
 	while(s < end)
 	{
 		register uint16_t bgr;
 		bgr = *s++;
 		*d++ = (bgr&0x1F)<<3;
 		*d++ = (bgr&0x7E0)>>3;
 		*d++ = (bgr&0xF800)>>8;
 	}
 }
 /**
  * Expand packed 15-bit pixels (three 5-bit fields per 16-bit word) to
  * 4 bytes per pixel.
  *
  * For every source word the output bytes are, in order:
  *   (bits 0-4) << 3, (bits 5-9) << 3, (bits 10-14) << 3, 0
  * i.e. each field is left-shifted into the top of a byte, the low 3 bits
  * are left as zero, and the fourth byte is always written as 0.
  *
  * @param src      source pixels, read as 16-bit words
  * @param dst      destination; receives 4 bytes for every source word
  * @param src_size size of the source in bytes (an odd trailing byte is ignored)
  */
 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
 {
 	const uint16_t *end;
 #ifdef HAVE_MMX
 	const uint16_t *mm_end;
 #endif
 	uint8_t *d = (uint8_t *)dst;
 	const uint16_t *s = (const uint16_t *)src;
 	end = s + src_size/2;
 #ifdef HAVE_MMX
 	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
 	/* mm7 stays zero for the whole loop; it supplies the zero words for unpacking */
 	__asm __volatile("pxor	%%mm7,%%mm7\n\t":::"memory");
 	/*
 	 * Each pass of the MMX loop consumes 4 pixels (8 source bytes) and
 	 * stores 16 bytes, so only whole groups of 4 pixels may go through it.
 	 * Deriving the limit from the pixel count (not from the address of
 	 * 'end') keeps the loop from running past the end of src and dst.
 	 */
 	mm_end = s + ((src_size/2) & ~3U);
 	while(s < mm_end)
 	{
 	    __asm __volatile(
 		PREFETCH" 32%1\n\t"
 		"movq	%1, %%mm0\n\t"
 		"movq	%1, %%mm1\n\t"
 		"movq	%1, %%mm2\n\t"
 		"pand	%2, %%mm0\n\t"
 		"pand	%3, %%mm1\n\t"
 		"pand	%4, %%mm2\n\t"
 		"psllq	$3, %%mm0\n\t"
 		"psrlq	$2, %%mm1\n\t"
 		"psrlq	$7, %%mm2\n\t"
 		"movq	%%mm0, %%mm3\n\t"
 		"movq	%%mm1, %%mm4\n\t"
 		"movq	%%mm2, %%mm5\n\t"
 		"punpcklwd %%mm7, %%mm0\n\t"
 		"punpcklwd %%mm7, %%mm1\n\t"
 		"punpcklwd %%mm7, %%mm2\n\t"
 		"punpckhwd %%mm7, %%mm3\n\t"
 		"punpckhwd %%mm7, %%mm4\n\t"
 		"punpckhwd %%mm7, %%mm5\n\t"
 		"psllq	$8, %%mm1\n\t"
 		"psllq	$16, %%mm2\n\t"
 		"por	%%mm1, %%mm0\n\t"
 		"por	%%mm2, %%mm0\n\t"
 		"psllq	$8, %%mm4\n\t"
 		"psllq	$16, %%mm5\n\t"
 		"por	%%mm4, %%mm3\n\t"
 		"por	%%mm5, %%mm3\n\t"
 		MOVNTQ"	%%mm0, %0\n\t"
 		MOVNTQ"	%%mm3, 8%0\n\t"
 		:"=m"(*d)
 		:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
 		:"memory");
 		d += 16;
 		s += 4;
 	}
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #endif
 	/* scalar path: the whole buffer without MMX, or the pixels the MMX loop left over */
 	while(s < end)
 	{
 		register uint16_t bgr;
 		bgr = *s++;
 		*d++ = (bgr&0x1F)<<3;
 		*d++ = (bgr&0x3E0)>>2;
 		*d++ = (bgr&0x7C00)>>7;
 		*d++ = 0;
 	}
 }
 /**
  * Expand packed 16-bit pixels (5-, 6- and 5-bit fields per 16-bit word) to
  * 4 bytes per pixel.
  *
  * For every source word the output bytes are, in order:
  *   (bits 0-4) << 3, (bits 5-10) << 2, (bits 11-15) << 3, 0
  * i.e. each field is left-shifted into the top of a byte, the low bits
  * are left as zero, and the fourth byte is always written as 0.
  *
  * @param src      source pixels, read as 16-bit words
  * @param dst      destination; receives 4 bytes for every source word
  * @param src_size size of the source in bytes (an odd trailing byte is ignored)
  */
 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
 {
 	const uint16_t *end;
 #ifdef HAVE_MMX
 	const uint16_t *mm_end;
 #endif
 	uint8_t *d = (uint8_t *)dst;
 	const uint16_t *s = (const uint16_t *)src;
 	end = s + src_size/2;
 #ifdef HAVE_MMX
 	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
 	/* mm7 stays zero for the whole loop; it supplies the zero words for unpacking */
 	__asm __volatile("pxor	%%mm7,%%mm7\n\t":::"memory");
 	/*
 	 * Each pass of the MMX loop consumes 4 pixels (8 source bytes) and
 	 * stores 16 bytes, so only whole groups of 4 pixels may go through it.
 	 * Deriving the limit from the pixel count (not from the address of
 	 * 'end') keeps the loop from running past the end of src and dst.
 	 */
 	mm_end = s + ((src_size/2) & ~3U);
 	while(s < mm_end)
 	{
 	    __asm __volatile(
 		PREFETCH" 32%1\n\t"
 		"movq	%1, %%mm0\n\t"
 		"movq	%1, %%mm1\n\t"
 		"movq	%1, %%mm2\n\t"
 		"pand	%2, %%mm0\n\t"
 		"pand	%3, %%mm1\n\t"
 		"pand	%4, %%mm2\n\t"
 		"psllq	$3, %%mm0\n\t"
 		"psrlq	$3, %%mm1\n\t"
 		"psrlq	$8, %%mm2\n\t"
 		"movq	%%mm0, %%mm3\n\t"
 		"movq	%%mm1, %%mm4\n\t"
 		"movq	%%mm2, %%mm5\n\t"
 		"punpcklwd %%mm7, %%mm0\n\t"
 		"punpcklwd %%mm7, %%mm1\n\t"
 		"punpcklwd %%mm7, %%mm2\n\t"
 		"punpckhwd %%mm7, %%mm3\n\t"
 		"punpckhwd %%mm7, %%mm4\n\t"
 		"punpckhwd %%mm7, %%mm5\n\t"
 		"psllq	$8, %%mm1\n\t"
 		"psllq	$16, %%mm2\n\t"
 		"por	%%mm1, %%mm0\n\t"
 		"por	%%mm2, %%mm0\n\t"
 		"psllq	$8, %%mm4\n\t"
 		"psllq	$16, %%mm5\n\t"
 		"por	%%mm4, %%mm3\n\t"
 		"por	%%mm5, %%mm3\n\t"
 		MOVNTQ"	%%mm0, %0\n\t"
 		MOVNTQ"	%%mm3, 8%0\n\t"
 		:"=m"(*d)
 		:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
 		:"memory");
 		d += 16;
 		s += 4;
 	}
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #endif
 	/* scalar path: the whole buffer without MMX, or the pixels the MMX loop left over */
 	while(s < end)
 	{
 		register uint16_t bgr;
 		bgr = *s++;
 		*d++ = (bgr&0x1F)<<3;
 		*d++ = (bgr&0x7E0)>>3;
 		*d++ = (bgr&0xF800)>>8;
 		*d++ = 0;
 	}
 }
 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
 {
 #ifdef HAVE_MMX
 /* TODO: unroll this loop */
 	asm volatile (
 		"xorl %%eax, %%eax		\n\t"
 		".balign 16			\n\t"
@ -554,9 +955,9 @@ static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsign
 		"movq %%mm0, %%mm2		\n\t"
 		"pslld $16, %%mm0		\n\t"
 		"psrld $16, %%mm1		\n\t"
-		"pand "MANGLE(mask32r)", %%mm0		\n\t"
+		"pand "MANGLE(mask32r)", %%mm0	\n\t"
-		"pand "MANGLE(mask32g)", %%mm2		\n\t"
+		"pand "MANGLE(mask32g)", %%mm2	\n\t"
-		"pand "MANGLE(mask32b)", %%mm1		\n\t"
+		"pand "MANGLE(mask32b)", %%mm1	\n\t"
 		"por %%mm0, %%mm2		\n\t"
 		"por %%mm1, %%mm2		\n\t"
 		MOVNTQ" %%mm2, (%1, %%eax)	\n\t"
@ -570,8 +971,8 @@ static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsign
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #else
-	int i;
+	unsigned i;
-	int num_pixels= src_size >> 2;
+	unsigned num_pixels = src_size >> 2;
 	for(i=0; i<num_pixels; i++)
 	{
 		dst[4*i + 0] = src[4*i + 2];
@ -583,7 +984,7 @@ static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsign
 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
 {
-	int i;
+	unsigned i;
 #ifdef HAVE_MMX
 	int mmx_size= 23 - src_size;
 	asm volatile (
@ -631,15 +1032,16 @@ static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsign
 	__asm __volatile(EMMS:::"memory");
 	if(mmx_size==23) return; //finihsed, was multiple of 8
 	src+= src_size;
 	dst+= src_size;
-	src_size= 23 - mmx_size;
+	src_size= 23-mmx_size;
 	src-= src_size;
 	dst-= src_size;
 #endif
 	for(i=0; i<src_size; i+=3)
 	{
-		register int x;
+		register uint8_t x;
 		x          = src[i + 2];
 		dst[i + 1] = src[i + 1];
 		dst[i + 2] = src[i + 0];
@ -651,8 +1053,8 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
 	unsigned int width, unsigned int height,
 	unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
 {
-	int y;
+	unsigned y;
-	const int chromWidth= width>>1;
+	const unsigned chromWidth= width>>1;
 	for(y=0; y<height; y++)
 	{
 #ifdef HAVE_MMX
@ -691,14 +1093,33 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
 			: "%eax"
 		);
 #else
 #if __WORDSIZE >= 64
 		int i;
-		for(i=0; i<chromWidth; i++)
+		uint64_t *ldst = (uint64_t *) dst;
-		{
+		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
-			dst[4*i+0] = ysrc[2*i+0];
+		for(i = 0; i < chromWidth; i += 2){
-			dst[4*i+1] = usrc[i];
+			uint64_t k, l;
-			dst[4*i+2] = ysrc[2*i+1];
+			k = yc[0] + (uc[0] << 8) +
-			dst[4*i+3] = vsrc[i];
+			    (yc[1] << 16) + (vc[0] << 24);
 			l = yc[2] + (uc[1] << 8) +
 			    (yc[3] << 16) + (vc[1] << 24);
 			*ldst++ = k + (l << 32);
 			yc += 4;
 			uc += 2;
 			vc += 2;
 		}
 #else
 		int i, *idst = (int32_t *) dst;
 		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
 		for(i = 0; i < chromWidth; i++){
 			*idst++ = yc[0] + (uc[0] << 8) +
 			    (yc[1] << 16) + (vc[0] << 24);
 			yc += 2;
 			uc++;
 			vc++;
 		}
 #endif
 #endif
 		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
 		{
@ -748,8 +1169,8 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
 	unsigned int width, unsigned int height,
 	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
 {
-	int y;
+	unsigned y;
-	const int chromWidth= width>>1;
+	const unsigned chromWidth= width>>1;
 	for(y=0; y<height; y+=2)
 	{
 #ifdef HAVE_MMX
@ -835,7 +1256,7 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
 			: "memory", "%eax"
 		);
 #else
-		int i;
+		unsigned i;
 		for(i=0; i<chromWidth; i++)
 		{
 			ydst[2*i+0] 	= src[4*i+0];
@ -884,8 +1305,8 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
 	unsigned int width, unsigned int height,
 	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
 {
-	int y;
+	unsigned y;
-	const int chromWidth= width>>1;
+	const unsigned chromWidth= width>>1;
 	for(y=0; y<height; y+=2)
 	{
 #ifdef HAVE_MMX
@ -971,7 +1392,7 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
 			: "memory", "%eax"
 		);
 #else
-		int i;
+		unsigned i;
 		for(i=0; i<chromWidth; i++)
 		{
 			udst[i] 	= src[4*i+0];
@ -1010,12 +1431,12 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
 	unsigned int width, unsigned int height,
 	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
 {
-	int y;
+	unsigned y;
-	const int chromWidth= width>>1;
+	const unsigned chromWidth= width>>1;
 #ifdef HAVE_MMX
 	for(y=0; y<height-2; y+=2)
 	{
-		int i;
+		unsigned i;
 		for(i=0; i<2; i++)
 		{
 			asm volatile(
@ -1254,7 +1675,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
 #endif
 	for(; y<height; y+=2)
 	{
-		int i;
+		unsigned i;
 		for(i=0; i<chromWidth; i++)
 		{
 			unsigned int b= src[6*i+0];
@ -1304,12 +1725,13 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
 }
 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
-			    int width, int height, int src1Stride, int src2Stride, int dstStride){
+			    unsigned width, unsigned height, unsigned src1Stride,
-	int h;
+			    unsigned src2Stride, unsigned dstStride){
 	unsigned h;
 	for(h=0; h < height; h++)
 	{
-		int w;
+		unsigned w;
 #ifdef HAVE_MMX
 #ifdef HAVE_SSE2
--- a/postproc/swscale.c
+++ b/postproc/swscale.c
@ -65,6 +65,14 @@ untested special converters
 #include "rgb2rgb.h"
 #include "../libvo/fastmemcpy.h"
 #include "../mp_msg.h"
 /*
  * Shorthand logging macros for the software scaler: each one forwards its
  * printf-style arguments to mp_msg() with the MSGT_SWS module id and a fixed
  * message level (WARN, FATAL, ERR, V, DBG2, INFO).
  * They rely on the GNU named-variadic-macro extension (args... / ##args).
  */
 #define MSG_WARN(args...) mp_msg(MSGT_SWS,MSGL_WARN, ##args )
 #define MSG_FATAL(args...) mp_msg(MSGT_SWS,MSGL_FATAL, ##args )
 #define MSG_ERR(args...) mp_msg(MSGT_SWS,MSGL_ERR, ##args )
 #define MSG_V(args...) mp_msg(MSGT_SWS,MSGL_V, ##args )
 #define MSG_DBG2(args...) mp_msg(MSGT_SWS,MSGL_DBG2, ##args )
 #define MSG_INFO(args...) mp_msg(MSGT_SWS,MSGL_INFO, ##args )
 #undef MOVNTQ
 #undef PAVGB
@ -92,19 +100,26 @@ untested special converters
 #endif
 //FIXME replace this with something faster
-#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
+#define isBGR(x)       ((x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
-#define isYUV(x)       ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
+#define isRGB(x)       ((x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24|| (x)==IMGFMT_RGB16|| (x)==IMGFMT_RGB15)
-#define isHalfChrV(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
+#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_IYUV|| (x)==IMGFMT_YVU9 || (x)==IMGFMT_IF09)
 #define isYUV(x)       (!(isBGR(x) || isRGB(x)))
 #define isHalfChrV(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_IYUV)
 #define isHalfChrH(x)  ((x)==IMGFMT_YUY2 || (x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
-#define isPacked(x)    ((x)==IMGFMT_YUY2 || ((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR || ((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
+#define isPacked(x)    (isYUV(x) && !isPlanarYUV(x))
-#define isGray(x)      ((x)==IMGFMT_Y800)
+#define isGray(x)      ((x)==IMGFMT_Y800) /* Behaviour the same as PACKED but it's PLANAR */
 #define isSupportedIn(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
 			|| (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
 			|| (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
 			|| (x)==IMGFMT_Y800)
 #define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
 			|| (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
-#define isBGR(x)       ((x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
+#define isSupportedUnscaledIn(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 || (x)==IMGFMT_NV12 \
 			|| (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
 			|| (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
 			|| (x)==IMGFMT_Y800)
 #define isSupportedUnscaledOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x) == IMGFMT_YUY2 \
 			|| (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
 #define RGB2YUV_SHIFT 16
 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
@ -751,7 +766,6 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
 		if     (flags&SWS_BICUBIC) filterSize= 4;
 		else if(flags&SWS_X      ) filterSize= 4;
 		else			   filterSize= 2; // SWS_BILINEAR / SWS_AREA 
 //		printf("%d %d %d\n", filterSize, srcW, dstW);
 		filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
 		xDstInSrc= xInc/2 - 0x8000;
@ -780,12 +794,10 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
 					y4 = (    -1.0*d           + 1.0*d*d*d)/6.0;
 				}
 //				printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
 				filter[i*filterSize + 0]= y1;
 				filter[i*filterSize + 1]= y2;
 				filter[i*filterSize + 2]= y3;
 				filter[i*filterSize + 3]= y4;
 //				printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
 			}
 			else
 			{
@ -795,7 +807,6 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
 					double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
 					double coeff= 1.0 - d;
 					if(coeff<0) coeff=0;
 	//				printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
 					filter[i*filterSize + j]= coeff;
 					xx++;
 				}
@ -812,7 +823,6 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
 		else if(flags&SWS_X)	filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
 		else if(flags&SWS_AREA)	filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
 		else /* BILINEAR */	filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
 //		printf("%d %d %d\n", *filterSize, srcW, dstW);
 		filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
 		xDstInSrc= xInc/2 - 0x8000;
@ -849,7 +859,6 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
 					coeff= 1.0 - d;
 					if(coeff<0) coeff=0;
 				}
 //				printf("%1.3f %2.3f %d \n", coeff, d, xDstInSrc);
 				filter[i*filterSize + j]= coeff;
 				xx++;
 			}
@ -940,7 +949,7 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
 	*outFilterSize= filterSize;
 	if(flags&SWS_PRINT_INFO)
-		mp_msg(MSGT_SWS,MSGL_V,"SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
+		MSG_INFO("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
 	/* try to reduce the filter-size (step2 reduce it) */
 	for(i=0; i<dstW; i++)
 	{
@ -1254,6 +1263,32 @@ cpuCaps= gCpuCaps;
 #endif //!RUNTIME_CPUDETECT
 }
 /**
  * Unscaled planar YUV 4:2:0 (YV12 / I420) -> NV12 conversion of one slice.
  *
  * The luma plane is copied unchanged into dstParam[0]; the two source chroma
  * planes are byte-interleaved into dstParam[1].
  *
  * @param c          scaler context; srcFormat and srcW are read
  * @param src        source planes (src[0] luma, src[1]/src[2] chroma)
  * @param srcStride  byte strides of the source planes
  * @param srcSliceY  first luma row of the slice inside the picture
  * @param srcSliceH  number of luma rows in the slice
  * @param dstParam   destination planes (luma, interleaved chroma)
  * @param dstStride  byte strides of the destination planes
  */
 static void PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
 	/* chroma is subsampled by 2 in both directions */
 	const int chromW= c->srcW/2;
 	const int chromH= srcSliceH/2;
 	uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;

 	/* Copy Y plane */
 	/* one big memcpy is only valid for identical, positive strides */
 	if(dstStride[0]==srcStride[0] && srcStride[0]>0)
 		memcpy(dst, src[0], srcSliceH*dstStride[0]);
 	else
 	{
 		int i;
 		uint8_t *srcPtr= src[0];
 		uint8_t *dstPtr= dst;
 		for(i=0; i<srcSliceH; i++)
 		{
 			/* copy only the visible pixels: copying srcStride[0] bytes
 			   would overrun the destination row when dstStride[0] is
 			   smaller than srcStride[0] */
 			memcpy(dstPtr, srcPtr, c->srcW);
 			srcPtr+= srcStride[0];
 			dstPtr+= dstStride[0];
 		}
 	}

 	/* the chroma plane has half as many rows, so the slice starts at srcSliceY/2 */
 	dst = dstParam[1] + dstStride[1]*(srcSliceY/2);
 	/* NOTE(review): the row stride handed to interleaveBytes is dstStride[0]
 	   (kept from the original) while the offset above uses dstStride[1];
 	   confirm both are equal for NV12 destinations. */
 	if(c->srcFormat==IMGFMT_YV12)
 		interleaveBytes( src[1],src[2],dst,chromW,chromH,srcStride[1],srcStride[2],dstStride[0] );
 	else /* I420 & IYUV */
 		interleaveBytes( src[2],src[1],dst,chromW,chromH,srcStride[2],srcStride[1],dstStride[0] );
 }
 /* Wrapper functions for yuv2bgr */
 static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){
@ -1265,6 +1300,16 @@ static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int s
 		yuv2rgb( dst,src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
 }
 /*
  * Unscaled planar YUV -> YUY2 conversion of one slice via yv12toyuy2().
  * For YV12 input the chroma planes are passed as src[1], src[2]; for any
  * other source format (I420 & IYUV) they are passed swapped.
  */
 static void Planar2PackedWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
 	const int isYV12= (c->srcFormat==IMGFMT_YV12);
 	uint8_t *firstChroma=  src[isYV12 ? 1 : 2];
 	uint8_t *secondChroma= src[isYV12 ? 2 : 1];
 	uint8_t *out= dstParam[0] + dstStride[0]*srcSliceY;

 	/* srcStride[1] is used as the chroma stride for both plane orders */
 	yv12toyuy2( src[0],firstChroma,secondChroma,out,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
 }
 static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
@ -1285,6 +1330,46 @@ static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int
 	}     
 }
 /*
  * Unscaled 24 bpp -> 16 bpp packed conversion of one slice via rgb24to16().
  * Output is written starting at row srcSliceY of dst[0].
  */
 static void bgr24to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dst[], int dstStride[]){
 	uint8_t *in= src[0];
 	uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
 	int rowsLeft;
 	int rowBytes;

 	/* strides are in the 3:2 byte ratio: convert the whole slice in one call */
 	if(dstStride[0]*3==srcStride[0]*2)
 	{
 		rgb24to16(in, out, srcSliceH*srcStride[0]);
 		return;
 	}

 	/* otherwise convert row by row, srcW pixels (3 bytes each) at a time */
 	rowBytes= c->srcW*3;
 	for(rowsLeft=srcSliceH; rowsLeft>0; rowsLeft--)
 	{
 		rgb24to16(in, out, rowBytes);
 		in+= srcStride[0];
 		out+= dstStride[0];
 	}
 }
 /*
  * Unscaled 24 bpp -> 15 bpp packed conversion of one slice via rgb24to15().
  * Output is written starting at row srcSliceY of dst[0].
  */
 static void bgr24to15Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dst[], int dstStride[]){
 	uint8_t *in= src[0];
 	uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
 	int rowsLeft;
 	int rowBytes;

 	/* strides are in the 3:2 byte ratio: convert the whole slice in one call */
 	if(dstStride[0]*3==srcStride[0]*2)
 	{
 		rgb24to15(in, out, srcSliceH*srcStride[0]);
 		return;
 	}

 	/* otherwise convert row by row, srcW pixels (3 bytes each) at a time */
 	rowBytes= c->srcW*3;
 	for(rowsLeft=srcSliceH; rowsLeft>0; rowsLeft--)
 	{
 		rgb24to15(in, out, rowBytes);
 		in+= srcStride[0];
 		out+= dstStride[0];
 	}
 }
 static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
@ -1305,6 +1390,46 @@ static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int
 	}     
 }
 /*
  * Unscaled 32 bpp -> 16 bpp packed conversion of one slice via rgb32to16().
  * Output is written starting at row srcSliceY of dst[0].
  */
 static void bgr32to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dst[], int dstStride[]){
 	uint8_t *in= src[0];
 	uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
 	int rowsLeft;
 	int rowBytes;

 	/* strides are in the 4:2 byte ratio: convert the whole slice in one call */
 	if(dstStride[0]*4==srcStride[0]*2)
 	{
 		rgb32to16(in, out, srcSliceH*srcStride[0]);
 		return;
 	}

 	/* otherwise convert row by row, srcW pixels (4 bytes each) at a time */
 	rowBytes= c->srcW<<2;
 	for(rowsLeft=srcSliceH; rowsLeft>0; rowsLeft--)
 	{
 		rgb32to16(in, out, rowBytes);
 		in+= srcStride[0];
 		out+= dstStride[0];
 	}
 }
 /*
  * Unscaled 32 bpp -> 15 bpp packed conversion of one slice via rgb32to15().
  * Output is written starting at row srcSliceY of dst[0].
  */
 static void bgr32to15Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dst[], int dstStride[]){
 	uint8_t *in= src[0];
 	uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
 	int rowsLeft;
 	int rowBytes;

 	/* strides are in the 4:2 byte ratio: convert the whole slice in one call */
 	if(dstStride[0]*4==srcStride[0]*2)
 	{
 		rgb32to15(in, out, srcSliceH*srcStride[0]);
 		return;
 	}

 	/* otherwise convert row by row, srcW pixels (4 bytes each) at a time */
 	rowBytes= c->srcW<<2;
 	for(rowsLeft=srcSliceH; rowsLeft>0; rowsLeft--)
 	{
 		rgb32to15(in, out, rowBytes);
 		in+= srcStride[0];
 		out+= dstStride[0];
 	}
 }
 static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
@ -1325,6 +1450,86 @@ static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int
 	}     
 }
 /*
  * Unscaled 15 bpp -> 24 bpp packed conversion of one slice via rgb15to24().
  * Output is written starting at row srcSliceY of dst[0].
  */
 static void bgr15to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dst[], int dstStride[]){
 	uint8_t *in= src[0];
 	uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
 	int rowsLeft;
 	int rowBytes;

 	/* strides are in the 2:3 byte ratio: convert the whole slice in one call */
 	if(dstStride[0]*2==srcStride[0]*3)
 	{
 		rgb15to24(in, out, srcSliceH*srcStride[0]);
 		return;
 	}

 	/* otherwise convert row by row, srcW pixels (2 bytes each) at a time */
 	rowBytes= c->srcW<<1;
 	for(rowsLeft=srcSliceH; rowsLeft>0; rowsLeft--)
 	{
 		rgb15to24(in, out, rowBytes);
 		in+= srcStride[0];
 		out+= dstStride[0];
 	}
 }
 /*
  * Unscaled 15 bpp -> 32 bpp packed conversion of one slice via rgb15to32().
  * Output is written starting at row srcSliceY of dst[0].
  */
 static void bgr15to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dst[], int dstStride[]){
 	uint8_t *in= src[0];
 	uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
 	int rowsLeft;
 	int rowBytes;

 	/* strides are in the 2:4 byte ratio: convert the whole slice in one call */
 	if(dstStride[0]*2==srcStride[0]*4)
 	{
 		rgb15to32(in, out, srcSliceH*srcStride[0]);
 		return;
 	}

 	/* otherwise convert row by row, srcW pixels (2 bytes each) at a time */
 	rowBytes= c->srcW<<1;
 	for(rowsLeft=srcSliceH; rowsLeft>0; rowsLeft--)
 	{
 		rgb15to32(in, out, rowBytes);
 		in+= srcStride[0];
 		out+= dstStride[0];
 	}
 }
 /*
  * Unscaled 16 bpp -> 24 bpp packed conversion of one slice via rgb16to24().
  * Output is written starting at row srcSliceY of dst[0].
  */
 static void bgr16to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dst[], int dstStride[]){
 	uint8_t *in= src[0];
 	uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
 	int rowsLeft;
 	int rowBytes;

 	/* strides are in the 2:3 byte ratio: convert the whole slice in one call */
 	if(dstStride[0]*2==srcStride[0]*3)
 	{
 		rgb16to24(in, out, srcSliceH*srcStride[0]);
 		return;
 	}

 	/* otherwise convert row by row, srcW pixels (2 bytes each) at a time */
 	rowBytes= c->srcW<<1;
 	for(rowsLeft=srcSliceH; rowsLeft>0; rowsLeft--)
 	{
 		rgb16to24(in, out, rowBytes);
 		in+= srcStride[0];
 		out+= dstStride[0];
 	}
 }
 /*
  * Unscaled 16 bpp -> 32 bpp packed conversion of one slice via rgb16to32().
  * Output is written starting at row srcSliceY of dst[0].
  */
 static void bgr16to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dst[], int dstStride[]){
 	uint8_t *in= src[0];
 	uint8_t *out= dst[0] + dstStride[0]*srcSliceY;
 	int rowsLeft;
 	int rowBytes;

 	/* strides are in the 2:4 byte ratio: convert the whole slice in one call */
 	if(dstStride[0]*2==srcStride[0]*4)
 	{
 		rgb16to32(in, out, srcSliceH*srcStride[0]);
 		return;
 	}

 	/* otherwise convert row by row, srcW pixels (2 bytes each) at a time */
 	rowBytes= c->srcW<<1;
 	for(rowsLeft=srcSliceH; rowsLeft>0; rowsLeft--)
 	{
 		rgb16to32(in, out, rowBytes);
 		in+= srcStride[0];
 		out+= dstStride[0];
 	}
 }
 static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
@ -1346,21 +1551,25 @@ static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[],
 	uint8_t *src[3];
 	uint8_t *dst[3];
-	if(c->srcFormat == IMGFMT_I420){
+	if(isPlanarYUV(c->srcFormat))
 	{
 	    if(c->srcFormat == IMGFMT_I420 || c->srcFormat == IMGFMT_IYUV){
 		src[0]= srcParam[0];
 		src[1]= srcParam[2];
 		src[2]= srcParam[1];
 		srcStride[0]= srcStrideParam[0];
 		srcStride[1]= srcStrideParam[2];
 		srcStride[2]= srcStrideParam[1];
-	}
+	    }
-	else if(c->srcFormat==IMGFMT_YV12){
+	    else
 	    {
 		src[0]= srcParam[0];
 		src[1]= srcParam[1];
 		src[2]= srcParam[2];
 		srcStride[0]= srcStrideParam[0];
 		srcStride[1]= srcStrideParam[1];
 		srcStride[2]= srcStrideParam[2];
 	    }
 	}
 	else if(isPacked(c->srcFormat) || isGray(c->srcFormat)){
 		src[0]= srcParam[0];
@ -1371,7 +1580,7 @@ static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[],
 		srcStride[2]= 0;
 	}
-	if(c->dstFormat == IMGFMT_I420){
+	if(c->dstFormat == IMGFMT_I420 || c->dstFormat == IMGFMT_IYUV){
 		dst[0]= dstParam[0];
 		dst[1]= dstParam[2];
 		dst[2]= dstParam[1];
@ -1411,9 +1620,21 @@ static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[],
 		int plane;
 		for(plane=0; plane<3; plane++)
 		{
-			int length= plane==0 ? c->srcW  : ((c->srcW+1)>>1);
+			int length;
-			int y=      plane==0 ? srcSliceY: ((srcSliceY+1)>>1);
+			int y;
-			int height= plane==0 ? srcSliceH: ((srcSliceH+1)>>1);
+			int height;
 			if(c->srcFormat == IMGFMT_YVU9 || c->srcFormat == IMGFMT_IF09)
 			{
 			    length= plane==0 ? c->srcW  : ((c->srcW+1)>>2);
 			    y=      plane==0 ? srcSliceY: ((srcSliceY+1)>>2);
 			    height= plane==0 ? srcSliceH: ((srcSliceH+1)>>2);
 			}
 			else
 			{
 			    length= plane==0 ? c->srcW  : ((c->srcW+1)>>1);
 			    y=      plane==0 ? srcSliceY: ((srcSliceY+1)>>1);
 			    height= plane==0 ? srcSliceH: ((srcSliceH+1)>>1);
 			}
 			if(dstStride[plane]==srcStride[plane])
 				memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
@ -1433,12 +1654,23 @@ static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[],
 	}
 }
 /*
  * Map alias fourccs onto one canonical value so later code has fewer
  * formats to compare against: IYUV becomes I420 and Y8 becomes Y800.
  * Every other fourcc is returned unchanged.
  */
 static uint32_t remove_dup_fourcc(uint32_t fourcc)
 {
 	if(fourcc == IMGFMT_IYUV)
 		return IMGFMT_I420;
 	if(fourcc == IMGFMT_Y8)
 		return IMGFMT_Y800;
 	return fourcc;
 }
 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
                         SwsFilter *srcFilter, SwsFilter *dstFilter){
 	SwsContext *c;
 	int i;
 	int usesFilter;
 	int simple_copy, unscaled_copy;
 	SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
 #ifdef ARCH_X86
@ -1449,25 +1681,44 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 	if(swScale==NULL) globalInit();
 	/* avoid dupplicate Formats, so we dont need to check to much */
-	if(srcFormat==IMGFMT_IYUV) srcFormat=IMGFMT_I420;
+	srcFormat = remove_dup_fourcc(srcFormat);
-	if(srcFormat==IMGFMT_Y8)   srcFormat=IMGFMT_Y800;
+	dstFormat = remove_dup_fourcc(dstFormat);
-	if(dstFormat==IMGFMT_Y8)   dstFormat=IMGFMT_Y800;
+	/* don't refuse this beauty */
-
+	unscaled_copy = (srcW == dstW && srcH == dstH);
-	if(!isSupportedIn(srcFormat)) 
+	simple_copy = (srcW == dstW && srcH == dstH && srcFormat == dstFormat);
 	if(!simple_copy)
 	{
-		mp_msg(MSGT_SWS,MSGL_ERR,"swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
+	    if(unscaled_copy)
-		return NULL;
+	    {
 		if(!isSupportedUnscaledIn(srcFormat)) 
 		{
 		    MSG_ERR("swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
 		    return NULL;
 		}
 		if(!isSupportedUnscaledOut(dstFormat))
 		{
 		    MSG_ERR("swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
 		    return NULL;
 		}
 	    }
 	    else
 	    {
 		if(!isSupportedIn(srcFormat)) 
 		{
 		    MSG_ERR("swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
 		    return NULL;
 		}
 		if(!isSupportedOut(dstFormat))
 		{
 		    MSG_ERR("swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
 		    return NULL;
 		}
 	    }
 	}
 	if(!isSupportedOut(dstFormat))
 	{
 		 mp_msg(MSGT_SWS,MSGL_ERR,"swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
 		return NULL;
 	}
 	/* sanity check */
 	if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
 	{
-		 mp_msg(MSGT_SWS,MSGL_ERR,"swScaler: %dx%d -> %dx%d is invalid scaling dimension\n", 
+		 MSG_ERR("swScaler: %dx%d -> %dx%d is invalid scaling dimension\n", 
 			srcW, srcH, dstW, dstH);
 		return NULL;
 	}
@ -1501,6 +1752,26 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 	/* unscaled special Cases */
 	if(srcW==dstW && srcH==dstH && !usesFilter)
 	{
 		/* yv12_to_nv12 */
 		if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_NV12)
 		{
 			c->swScale= PlanarToNV12Wrapper;
 			if(flags&SWS_PRINT_INFO)
 				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
 		/* yv12_to_yuy2 */
 		if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_YUY2)
 		{
 			c->swScale= Planar2PackedWrapper;
 			if(flags&SWS_PRINT_INFO)
 				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
 		/* yuv2bgr */
 		if(isPlanarYUV(srcFormat) && isBGR(dstFormat))
 		{
@ -1516,7 +1787,7 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 			c->swScale= planarYuvToBgr;
 			if(flags&SWS_PRINT_INFO)
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n", 
+				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
@ -1527,7 +1798,7 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 			c->swScale= simpleCopy;
 			if(flags&SWS_PRINT_INFO)
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n", 
+				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
@ -1539,7 +1810,31 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 			c->swScale= bgr32to24Wrapper;
 			if(flags&SWS_PRINT_INFO)
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n", 
+				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
 		/* bgr32to16 & rgb32to16*/
 		if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR16)
 		 ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB16))
 		{
 			c->swScale= bgr32to16Wrapper;
 			if(flags&SWS_PRINT_INFO)
 				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
 		/* bgr32to15 & rgb32to15*/
 		if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR15)
 		 ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB15))
 		{
 			c->swScale= bgr32to15Wrapper;
 			if(flags&SWS_PRINT_INFO)
 				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
@ -1551,7 +1846,31 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 			c->swScale= bgr24to32Wrapper;
 			if(flags&SWS_PRINT_INFO)
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n", 
+				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
 		/* bgr24to16 & rgb24to16*/
 		if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR16)
 		 ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB16))
 		{
 			c->swScale= bgr24to16Wrapper;
 			if(flags&SWS_PRINT_INFO)
 				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
 		/* bgr24to15 & rgb24to15*/
 		if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR15)
 		 ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB15))
 		{
 			c->swScale= bgr24to15Wrapper;
 			if(flags&SWS_PRINT_INFO)
 				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
@ -1562,7 +1881,55 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 			c->swScale= bgr15to16Wrapper;
 			if(flags&SWS_PRINT_INFO)
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n", 
+				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
 		/* bgr15to24 */
 		if((srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR24)
 		 ||(srcFormat==IMGFMT_RGB15 && dstFormat==IMGFMT_RGB24))
 		{
 			c->swScale= bgr15to24Wrapper;
 			if(flags&SWS_PRINT_INFO)
 				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
 		/* bgr15to32 */
 		if((srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR32)
 		 ||(srcFormat==IMGFMT_RGB15 && dstFormat==IMGFMT_RGB32))
 		{
 			c->swScale= bgr15to32Wrapper;
 			if(flags&SWS_PRINT_INFO)
 				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
 		/* bgr16to24 */
 		if((srcFormat==IMGFMT_BGR16 && dstFormat==IMGFMT_BGR24)
 		 ||(srcFormat==IMGFMT_RGB16 && dstFormat==IMGFMT_RGB24))
 		{
 			c->swScale= bgr16to24Wrapper;
 			if(flags&SWS_PRINT_INFO)
 				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
 		/* bgr16to32 */
 		if((srcFormat==IMGFMT_BGR16 && dstFormat==IMGFMT_BGR32)
 		 ||(srcFormat==IMGFMT_RGB16 && dstFormat==IMGFMT_RGB32))
 		{
 			c->swScale= bgr16to32Wrapper;
 			if(flags&SWS_PRINT_INFO)
 				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
@ -1573,7 +1940,7 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 			c->swScale= bgr24toyv12Wrapper;
 			if(flags&SWS_PRINT_INFO)
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n", 
+				MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
 					vo_format_name(srcFormat), vo_format_name(dstFormat));
 			return c;
 		}
@ -1585,7 +1952,7 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 		if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
 		{
 			if(flags&SWS_PRINT_INFO)
-				mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
+				MSG_INFO("SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
 		}
 	}
 	else
@ -1723,33 +2090,35 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 		char *dither= "";
 #endif
 		if(flags&SWS_FAST_BILINEAR)
-			mp_msg(MSGT_SWS,MSGL_INFO,"SwScaler: FAST_BILINEAR scaler, ");
+			MSG_INFO("\nSwScaler: FAST_BILINEAR scaler, ");
 		else if(flags&SWS_BILINEAR)
-			mp_msg(MSGT_SWS,MSGL_INFO,"SwScaler: BILINEAR scaler, ");
+			MSG_INFO("\nSwScaler: BILINEAR scaler, ");
 		else if(flags&SWS_BICUBIC)
-			mp_msg(MSGT_SWS,MSGL_INFO,"SwScaler: BICUBIC scaler, ");
+			MSG_INFO("\nSwScaler: BICUBIC scaler, ");
 		else if(flags&SWS_X)
-			mp_msg(MSGT_SWS,MSGL_INFO,"SwScaler: Experimental scaler, ");
+			MSG_INFO("\nSwScaler: Experimental scaler, ");
 		else if(flags&SWS_POINT)
-			mp_msg(MSGT_SWS,MSGL_INFO,"SwScaler: Nearest Neighbor / POINT scaler, ");
+			MSG_INFO("\nSwScaler: Nearest Neighbor / POINT scaler, ");
 		else if(flags&SWS_AREA)
-			mp_msg(MSGT_SWS,MSGL_INFO,"SwScaler: Area Averageing scaler, ");
+			MSG_INFO("\nSwScaler: Area Averageing scaler, ");
 		else
-			mp_msg(MSGT_SWS,MSGL_INFO,"SwScaler: ehh flags invalid?! ");
+			MSG_INFO("\nSwScaler: ehh flags invalid?! ");
-		mp_msg(MSGT_SWS,MSGL_INFO,"%dx%d %s -> %dx%d%s %s ", 
+		if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
-			srcW,srcH, vo_format_name(srcFormat), dstW,dstH,
+			MSG_INFO("from %s to%s %s ", 
-			(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16) ?
+				vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
-			dither : "", vo_format_name(dstFormat));
+		else
 			MSG_INFO("from %s to %s ", 
 				vo_format_name(srcFormat), vo_format_name(dstFormat));
 		if(cpuCaps.hasMMX2)
-			mp_msg(MSGT_SWS,MSGL_INFO,"using MMX2\n");
+			MSG_INFO("using MMX2\n");
 		else if(cpuCaps.has3DNow)
-			mp_msg(MSGT_SWS,MSGL_INFO,"using 3DNOW\n");
+			MSG_INFO("using 3DNOW\n");
 		else if(cpuCaps.hasMMX)
-			mp_msg(MSGT_SWS,MSGL_INFO,"using MMX\n");
+			MSG_INFO("using MMX\n");
 		else
-			mp_msg(MSGT_SWS,MSGL_INFO,"using C\n");
+			MSG_INFO("using C\n");
 	}
 	if((flags & SWS_PRINT_INFO) && verbose)
@ -1757,70 +2126,70 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 		if(cpuCaps.hasMMX)
 		{
 			if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
+				MSG_V("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
 			else
 			{
 				if(c->hLumFilterSize==4)
-					mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
+					MSG_V("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
 				else if(c->hLumFilterSize==8)
-					mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
+					MSG_V("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
 				else
-					mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
+					MSG_V("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
 				if(c->hChrFilterSize==4)
-					mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
+					MSG_V("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
 				else if(c->hChrFilterSize==8)
-					mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
+					MSG_V("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
 				else
-					mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
+					MSG_V("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
 			}
 		}
 		else
 		{
 #ifdef ARCH_X86
-			mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using X86-Asm scaler for horizontal scaling\n");
+			MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n");
 #else
 			if(flags & SWS_FAST_BILINEAR)
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
+				MSG_V("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
 			else
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using C scaler for horizontal scaling\n");
+				MSG_V("SwScaler: using C scaler for horizontal scaling\n");
 #endif
 		}
 		if(isPlanarYUV(dstFormat))
 		{
 			if(c->vLumFilterSize==1)
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
+				MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
 			else
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
+				MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
 		}
 		else
 		{
 			if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
+				MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
 				       "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
 			else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
+				MSG_V("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
 			else
-				mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
+				MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
 		}
 		if(dstFormat==IMGFMT_BGR24)
-			mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using %s YV12->BGR24 Converter\n",
+			MSG_V("SwScaler: using %s YV12->BGR24 Converter\n",
 				cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
 		else if(dstFormat==IMGFMT_BGR32)
-			mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
+			MSG_V("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
 		else if(dstFormat==IMGFMT_BGR16)
-			mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
+			MSG_V("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
 		else if(dstFormat==IMGFMT_BGR15)
-			mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
+			MSG_V("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
-		mp_msg(MSGT_SWS,MSGL_V,"SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
+		MSG_V("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
 	}
 	if((flags & SWS_PRINT_INFO) && verbose>1)
 	{
-		mp_msg(MSGT_SWS,MSGL_DBG2,"SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
+		MSG_DBG2("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
 			c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
-		mp_msg(MSGT_SWS,MSGL_DBG2,"SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
+		MSG_DBG2("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
 			c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
 	}
@ -2039,9 +2408,9 @@ void printVec(SwsVector *a){
 	for(i=0; i<a->length; i++)
 	{
 		int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
-		printf("%1.3f ", a->coeff[i]);
+		MSG_DBG2("%1.3f ", a->coeff[i]);
-		for(;x>0; x--) printf(" ");
+		for(;x>0; x--) MSG_DBG2(" ");
-		printf("|\n");
+		MSG_DBG2("|\n");
 	}
 }
--- a/postproc/swscale_template.c
+++ b/postproc/swscale_template.c
@ -2626,7 +2626,7 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
 		srcStride[1]= srcStrideParam[1];
 		srcStride[2]= srcStrideParam[2];
 	}
-	else if(isPacked(c->srcFormat)){
+	else if(isPacked(c->srcFormat) || isBGR(c->srcFormat) || isRGB(c->srcFormat)){
 		src[0]=
 		src[1]=
 		src[2]= srcParam[0];
--- a/postproc/yuv2rgb.c
+++ b/postproc/yuv2rgb.c
@ -156,7 +156,7 @@ const int32_t Inverse_Table_6_9[8][4] = {
    {117579, 136230, 16907, 35559}  /* SMPTE 240M (1987) */
 };
-static void yuv2rgb_c_init (int bpp, int mode);
+static void yuv2rgb_c_init (unsigned bpp, int mode);
 yuv2rgb_fun yuv2rgb;
@ -166,11 +166,11 @@ static void (* yuv2rgb_c_internal) (uint8_t *, uint8_t *,
 static void yuv2rgb_c (void * dst, uint8_t * py,
 		       uint8_t * pu, uint8_t * pv,
-		       int h_size, int v_size,
+		       unsigned h_size, unsigned v_size,
-		       int rgb_stride, int y_stride, int uv_stride)
+		       unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
 {
    v_size >>= 1;
-    
+
    while (v_size--) {
 	yuv2rgb_c_internal (py, py + y_stride, pu, pv, dst, dst + rgb_stride,
 			    h_size, v_size<<1);
@ -182,7 +182,7 @@ static void yuv2rgb_c (void * dst, uint8_t * py,
    }
 }
-void yuv2rgb_init (int bpp, int mode)
+void yuv2rgb_init (unsigned bpp, int mode)
 {
    yuv2rgb = NULL;
 #ifdef CAN_COMPILE_X86_ASM
@ -676,7 +676,7 @@ static int div_round (int dividend, int divisor)
 	return -((-dividend + (divisor>>1)) / divisor);
 }
-static void yuv2rgb_c_init (int bpp, int mode)
+static void yuv2rgb_c_init (unsigned bpp, int mode)
 {  
    int i;
    uint8_t table_Y[1024];
--- a/postproc/yuv2rgb_mlib.c
+++ b/postproc/yuv2rgb_mlib.c
@ -29,8 +29,8 @@
 static void mlib_YUV2ARGB420_32(uint8_t* image, uint8_t* py, 
 			 uint8_t* pu, uint8_t* pv, 
-			 int h_size, int v_size, 
+			 unsigned h_size, unsigned v_size, 
-			 int rgb_stride, int y_stride, int uv_stride)
+			 unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
 {
  mlib_VideoColorYUV2ARGB420(image, py, pu, pv, h_size,
 			     v_size, rgb_stride, y_stride, uv_stride);
@ -38,8 +38,8 @@ static void mlib_YUV2ARGB420_32(uint8_t* image, uint8_t* py,
 static void mlib_YUV2ABGR420_32(uint8_t* image, uint8_t* py, 
 			 uint8_t* pu, uint8_t* pv, 
-			 int h_size, int v_size, 
+			 unsigned h_size, unsigned v_size, 
-			 int rgb_stride, int y_stride, int uv_stride)
+			 unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
 {
  mlib_VideoColorYUV2ABGR420(image, py, pu, pv, h_size,
 			     v_size, rgb_stride, y_stride, uv_stride);
@ -47,15 +47,15 @@ static void mlib_YUV2ABGR420_32(uint8_t* image, uint8_t* py,
 static void mlib_YUV2RGB420_24(uint8_t* image, uint8_t* py, 
 			 uint8_t* pu, uint8_t* pv, 
-			 int h_size, int v_size, 
+			 unsigned h_size, unsigned v_size, 
-			 int rgb_stride, int y_stride, int uv_stride)
+			 unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
 {
  mlib_VideoColorYUV2RGB420(image, py, pu, pv, h_size,
 			    v_size, rgb_stride, y_stride, uv_stride);
 }
-yuv2rgb_fun yuv2rgb_init_mlib(int bpp, int mode) 
+yuv2rgb_fun yuv2rgb_init_mlib(unsigned bpp, int mode) 
 {  
 	if( bpp == 24 ) 
--- a/postproc/yuv2rgb_template.c
+++ b/postproc/yuv2rgb_template.c
@ -123,8 +123,8 @@
 static inline void RENAME(yuv420_rgb16) (uint8_t * image, uint8_t * py,
 			      uint8_t * pu, uint8_t * pv,
-			      int h_size, int v_size,
+			      unsigned h_size, unsigned v_size,
-			      int rgb_stride, int y_stride, int uv_stride)
+			      unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
 {
    int even = 1;
    int x, y;
@ -228,8 +228,8 @@ YUV2RGB
 static inline void RENAME(yuv420_rgb15) (uint8_t * image, uint8_t * py,
 			      uint8_t * pu, uint8_t * pv,
-			      int h_size, int v_size,
+			      unsigned h_size, unsigned v_size,
-			      int rgb_stride, int y_stride, int uv_stride)
+			      unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
 {
    int even = 1;
    int x, y;
@ -329,8 +329,8 @@ YUV2RGB
 static inline void RENAME(yuv420_rgb24) (uint8_t * image, uint8_t * py,
 			      uint8_t * pu, uint8_t * pv,
-			      int h_size, int v_size,
+			      unsigned h_size, unsigned v_size,
-			      int rgb_stride, int y_stride, int uv_stride)
+			      unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
 {
    int even = 1;
    int x, y;
@ -488,8 +488,8 @@ YUV2RGB
 static inline void RENAME(yuv420_argb32) (uint8_t * image, uint8_t * py,
 			       uint8_t * pu, uint8_t * pv,
-			       int h_size, int v_size,
+			       unsigned h_size, unsigned v_size,
-			       int rgb_stride, int y_stride, int uv_stride)
+			       unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
 {
    int even = 1;
    int x, y;
@ -584,7 +584,7 @@ YUV2RGB
    __asm__ __volatile__ (EMMS);
 }
-yuv2rgb_fun RENAME(yuv2rgb_init) (int bpp, int mode)
+yuv2rgb_fun RENAME(yuv2rgb_init) (unsigned bpp, int mode)
 {
    if (bpp == 15 && mode == MODE_RGB) return RENAME(yuv420_rgb15);
    if (bpp == 16 && mode == MODE_RGB) return RENAME(yuv420_rgb16);