mpv/libvo/fastmemcpy.h


#ifndef __MPLAYER_MEMCPY
#define __MPLAYER_MEMCPY

/*
 This part of the code was taken from Linux-2.4.3 and slightly modified
for the MMX2 instruction set. I did this because Linux uses page-aligned
blocks, but mplayer uses weakly ordered data, and the original sources cannot
speed those copies up. Only using prefetchnta and movntq together has an effect!
If you have questions, please contact me: Nick Kurshev: nickols_k@mail.ru.
*/

// 3dnow memcpy support from kernel 2.4.2
//  by Pontscho/fresh!mindworkz

#if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW )

/*
 * small_memcpy(to, from, n): copy n bytes with "rep movsb".
 * For small memory blocks (<256 bytes) this version is faster.
 *
 * "to" and "from" must be modifiable pointer lvalues: "rep movsb" advances
 * the destination/source index registers and counts the count register
 * down to zero, so all three are passed as read-write ("+") operands.
 * After the macro both pointers point just past the copied bytes.
 * The count is widened to unsigned long so that the whole count register
 * is defined on 64-bit targets as well.
 */
#define small_memcpy(to,from,n)\
do {\
	unsigned long small_memcpy_n_ = (n);\
	__asm__ __volatile__(\
		"rep ; movsb\n"\
		: "+D" (to), "+S" (from), "+c" (small_memcpy_n_)\
		:\
		: "memory");\
} while (0)

/*
 * Copy len bytes from "from" to "to"; the regions must not overlap.
 *
 * Blocks of 512 bytes or more are moved in 64-byte chunks with
 * prefetching and MMX/SSE moves; the remainder (or the whole block when
 * it is shorter than 512 bytes) is copied by small_memcpy().
 *
 * Returns the original value of "to", for every len (including 0).
 *
 * NOTE(review): the MMX/XMM registers written by the asm below are not
 * listed as clobbers; floating-point state is reset with emms/femms after
 * the loop.
 */
inline static void * fast_memcpy(void * to, const void * from, unsigned len)
{
	void *p = to;	/* saved up front: "to" is advanced while copying */
	unsigned i;

	if(len >= 0x200) /* 512-byte blocks */
	{
	  i = len >> 6; /* number of whole 64-byte chunks */
	  len &= 63;    /* bytes left over for the tail copy */

	/* Warm up: request the first five 64-byte chunks of the source. */
	__asm__ __volatile__ (
#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
		"prefetch (%0)\n"
		"prefetch 64(%0)\n"
		"prefetch 128(%0)\n"
		"prefetch 192(%0)\n"
		"prefetch 256(%0)\n"
#else
		"prefetchnta (%0)\n"
		"prefetchnta 64(%0)\n"
		"prefetchnta 128(%0)\n"
		"prefetchnta 192(%0)\n"
		"prefetchnta 256(%0)\n"
#endif
		: : "r" (from) );
	/*
	   This algorithm is most effective when the code sequentially
	   reads and writes blocks that have the size of a cache line.
	   The cache line size is processor-dependent, but it is at
	   least 32 bytes on any processor.
	   It would be better if the number of load/store instructions
	   were a multiple of the number of the processor's decoders,
	   but that is not always possible.
	*/
	for(; i>0; i--)
	{
		/* Prefetch 320 bytes ahead, then move one 64-byte chunk. */
		__asm__ __volatile__ (
#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
		"prefetch 320(%0)\n"
#else
		"prefetchnta 320(%0)\n"
#endif
#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
		"movups (%0), %%xmm0\n"
		"movups 16(%0), %%xmm1\n"
		"movntps %%xmm0, (%1)\n"
		"movntps %%xmm1, 16(%1)\n"
		"movups 32(%0), %%xmm0\n"
		"movups 48(%0), %%xmm1\n"
		"movntps %%xmm0, 32(%1)\n"
		"movntps %%xmm1, 48(%1)\n"
#else /* Only K7 (may be other) */
#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
		"movq (%0), %%mm0\n"
		"movq 8(%0), %%mm1\n"
		"movq 16(%0), %%mm2\n"
		"movq 24(%0), %%mm3\n"
		"movq %%mm0, (%1)\n"
		"movq %%mm1, 8(%1)\n"
		"movq %%mm2, 16(%1)\n"
		"movq %%mm3, 24(%1)\n"
		"movq 32(%0), %%mm0\n"
		"movq 40(%0), %%mm1\n"
		"movq 48(%0), %%mm2\n"
		"movq 56(%0), %%mm3\n"
		"movq %%mm0, 32(%1)\n"
		"movq %%mm1, 40(%1)\n"
		"movq %%mm2, 48(%1)\n"
		"movq %%mm3, 56(%1)\n"
#else
		"movq (%0), %%mm0\n"
		"movq 8(%0), %%mm1\n"
		"movq 16(%0), %%mm2\n"
		"movq 24(%0), %%mm3\n"
		"movntq %%mm0, (%1)\n"
		"movntq %%mm1, 8(%1)\n"
		"movntq %%mm2, 16(%1)\n"
		"movntq %%mm3, 24(%1)\n"
		"movq 32(%0), %%mm0\n"
		"movq 40(%0), %%mm1\n"
		"movq 48(%0), %%mm2\n"
		"movq 56(%0), %%mm3\n"
		"movntq %%mm0, 32(%1)\n"
		"movntq %%mm1, 40(%1)\n"
		"movntq %%mm2, 48(%1)\n"
		"movntq %%mm3, 56(%1)\n"
#endif
#endif
		:: "r" (from), "r" (to) : "memory");
		/* Advance through char pointers: arithmetic on void * is a GNU extension. */
		from = (const char *)from + 64;
		to = (char *)to + 64;
	}
#if defined( HAVE_SSE ) || defined( HAVE_MMX2 ) || !defined( HAVE_3DNOW )
		/* movntq/movntps are weakly ordered; fence them so that later
		   stores cannot become visible before the copied data. */
		__asm__ __volatile__ ("sfence":::"memory");
#endif
		/* Leave MMX state so that x87 code can run again. */
#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
		__asm__ __volatile__ ("femms":::"memory");
#else
		__asm__ __volatile__ ("emms":::"memory");
#endif
	}
	/*
	 *	Now do the tail of the block
	 */
	small_memcpy(to, from, len);
	return p;
}
/* Redirect every memcpy(a,b,c) call that textually follows this definition
   to fast_memcpy(); note that fast_memcpy() takes its length as unsigned. */
#define memcpy(a,b,c) fast_memcpy(a,b,c)

#undef small_memcpy

#endif

#endif
add 3dnow support git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@478 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-16 20:55:27 +00:00
			`#ifndef __MPLAYER_MEMCPY`
			`#define __MPLAYER_MEMCPY`

fast memcpy() using x86 asm or mmxext git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@359 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-11 20:14:49 +00:00			`/*`
			`This part of code was taken by from Linux-2.4.3 and slightly modified`
			`for MMX2 instruction set. I have done it since linux uses page aligned`
			`blocks but mplayer uses weakly ordered data and original sources can not`
P3 fixes... git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@377 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-12 14:40:10 +00:00			`speedup their. Only using prefetchnta and movntq together have effect!`
fast memcpy() using x86 asm or mmxext git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@359 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-11 20:14:49 +00:00			`If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.`
			`*/`
add 3dnow support git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@478 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-16 20:55:27 +00:00
			`// 3dnow memcpy support from kernel 2.4.2`
			`// by Pontscho/fresh!mindworkz`

			`#if defined( HAVE_MMX2 ) \|\| defined( HAVE_3DNOW )`

P3 fixes... git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@377 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-12 14:40:10 +00:00			`/* for small memory blocks (<256 bytes) this version is faster */`
			`#define small_memcpy(to,from,n)\`
			`{\`
			`__asm__ __volatile__(\`
			`"rep ; movsb\n"\`
			`::"D" (to), "S" (from),"c" (n)\`
			`: "memory");\`
fast memcpy() using x86 asm or mmxext git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@359 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-11 20:14:49 +00:00			`}`
P3 fixes... git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@377 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-12 14:40:10 +00:00
__memcpy fix git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@371 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-12 00:09:57 +00:00			`inline static void * fast_memcpy(void * to, const void * from, unsigned len)`
fast memcpy() using x86 asm or mmxext git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@359 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-11 20:14:49 +00:00			`{`
			`void *p;`
			`int i;`

			`if(len >= 0x200) /* 512-byte blocks */`
			`{`
			`p = to;`
			`i = len >> 6; /* len/64 */`
P3 fixes... git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@377 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-12 14:40:10 +00:00			`len&=63;`

fast memcpy() using x86 asm or mmxext git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@359 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-11 20:14:49 +00:00			`__asm__ __volatile__ (`
add 3dnow support git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@478 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-16 20:55:27 +00:00			`#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )`
			`"prefetch (%0)\n"`
			`"prefetch 64(%0)\n"`
			`"prefetch 128(%0)\n"`
			`"prefetch 192(%0)\n"`
			`"prefetch 256(%0)\n"`
			`#else`
- applied SSE patch by Nick Kurshev git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@410 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-14 17:56:44 +00:00			`"prefetchnta (%0)\n"`
			`"prefetchnta 64(%0)\n"`
			`"prefetchnta 128(%0)\n"`
			`"prefetchnta 192(%0)\n"`
			`"prefetchnta 256(%0)\n"`
add 3dnow support git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@478 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-16 20:55:27 +00:00			`#endif`
fast memcpy() using x86 asm or mmxext git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@359 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-11 20:14:49 +00:00			`: : "r" (from) );`
- applied SSE patch by Nick Kurshev git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@410 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-14 17:56:44 +00:00			`/*`
			`This algorithm is top effective when the code consequently`
			`reads and writes blocks which have size of cache line.`
			`Size of cache line is processor-dependent.`
			`It will, however, be a minimum of 32 bytes on any processors.`
			`It would be better to have a number of instructions which`
			`perform reading and writing to be multiple to a number of`
			`processor's decoders, but it's not always possible.`
			`*/`
fast memcpy() using x86 asm or mmxext git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@359 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-11 20:14:49 +00:00			`for(; i>0; i--)`
			`{`
			`__asm__ __volatile__ (`
add 3dnow support git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@478 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-16 20:55:27 +00:00			`#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )`
			`"prefetch 320(%0)\n"`
			`#else`
- applied SSE patch by Nick Kurshev git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@410 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-14 17:56:44 +00:00			`"prefetchnta 320(%0)\n"`
add 3dnow support git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@478 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-16 20:55:27 +00:00			`#endif`
- applied SSE patch by Nick Kurshev git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@410 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-14 17:56:44 +00:00			`#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */`
			`"movups (%0), %%xmm0\n"`
			`"movups 16(%0), %%xmm1\n"`
			`"movntps %%xmm0, (%1)\n"`
			`"movntps %%xmm1, 16(%1)\n"`
			`"movups 32(%0), %%xmm0\n"`
			`"movups 48(%0), %%xmm1\n"`
			`"movntps %%xmm0, 32(%1)\n"`
			`"movntps %%xmm1, 48(%1)\n"`
			`#else /* Only K7 (may be other) */`
add 3dnow support git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@478 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-16 20:55:27 +00:00			`#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )`
			`"movq (%0), %%mm0\n"`
			`"movq 8(%0), %%mm1\n"`
			`"movq 16(%0), %%mm2\n"`
			`"movq 24(%0), %%mm3\n"`
			`"movq %%mm0, (%1)\n"`
			`"movq %%mm1, 8(%1)\n"`
			`"movq %%mm2, 16(%1)\n"`
			`"movq %%mm3, 24(%1)\n"`
			`"movq 32(%0), %%mm0\n"`
			`"movq 40(%0), %%mm1\n"`
			`"movq 48(%0), %%mm2\n"`
			`"movq 56(%0), %%mm3\n"`
			`"movq %%mm0, 32(%1)\n"`
			`"movq %%mm1, 40(%1)\n"`
			`"movq %%mm2, 48(%1)\n"`
			`"movq %%mm3, 56(%1)\n"`
			`#else`
- applied SSE patch by Nick Kurshev git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@410 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-14 17:56:44 +00:00			`"movq (%0), %%mm0\n"`
			`"movq 8(%0), %%mm1\n"`
			`"movq 16(%0), %%mm2\n"`
			`"movq 24(%0), %%mm3\n"`
			`"movntq %%mm0, (%1)\n"`
			`"movntq %%mm1, 8(%1)\n"`
			`"movntq %%mm2, 16(%1)\n"`
			`"movntq %%mm3, 24(%1)\n"`
			`"movq 32(%0), %%mm0\n"`
			`"movq 40(%0), %%mm1\n"`
			`"movq 48(%0), %%mm2\n"`
			`"movq 56(%0), %%mm3\n"`
			`"movntq %%mm0, 32(%1)\n"`
			`"movntq %%mm1, 40(%1)\n"`
			`"movntq %%mm2, 48(%1)\n"`
			`"movntq %%mm3, 56(%1)\n"`
add 3dnow support git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@478 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-16 20:55:27 +00:00			`#endif`
- applied SSE patch by Nick Kurshev git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@410 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-14 17:56:44 +00:00			`#endif`
			`:: "r" (from), "r" (to) : "memory");`
fast memcpy() using x86 asm or mmxext git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@359 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-11 20:14:49 +00:00			`from+=64;`
			`to+=64;`
			`}`
add 3dnow support git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@478 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-16 20:55:27 +00:00			`#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )`
			`__asm__ __volatile__ ("femms":::"memory");`
			`#else`
- applied SSE patch by Nick Kurshev git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@410 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-14 17:56:44 +00:00			`__asm__ __volatile__ ("emms":::"memory");`
add 3dnow support git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@478 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-16 20:55:27 +00:00			`#endif`
fast memcpy() using x86 asm or mmxext git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@359 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-11 20:14:49 +00:00			`}`
			`/*`
			`* Now do the tail of the block`
			`*/`
P3 fixes... git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@377 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-12 14:40:10 +00:00			`small_memcpy(to, from, len);`
fast memcpy() using x86 asm or mmxext git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@359 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-11 20:14:49 +00:00			`return p;`
			`}`
P3 fixes... git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@377 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-12 14:40:10 +00:00			`#define memcpy(a,b,c) fast_memcpy(a,b,c)`
add 3dnow support git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@478 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-16 20:55:27 +00:00
replace "movsl..." to small_memcpy git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@514 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-18 20:44:16 +00:00			`#undef small_memcpy`

fast memcpy() using x86 asm or mmxext git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@359 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-11 20:14:49 +00:00			`#endif`

sorry, for the #endif git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@479 b3059339-0415-0410-9bf9-f77b7e298cf2 2001-04-16 21:08:05 +00:00			`#endif`