
add 3dnow support

git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@478 b3059339-0415-0410-9bf9-f77b7e298cf2
Author: pontscho
Date:   2001-04-16 20:55:27 +00:00
Commit: ebe63858a9
Parent: 22ad1b8599


@@ -1,3 +1,7 @@
+#ifndef __MPLAYER_MEMCPY
+#define __MPLAYER_MEMCPY
 /*
 This part of code was taken by from Linux-2.4.3 and slightly modified
 for MMX2 instruction set. I have done it since linux uses page aligned
@@ -5,7 +9,12 @@ blocks but mplayer uses weakly ordered data and original sources can not
 speedup their. Only using prefetchnta and movntq together have effect!
 If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
 */
-#ifdef HAVE_MMX2
+// 3dnow memcpy support from kernel 2.4.2
+// by Pontscho/fresh!mindworkz
+#if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW )
 /* for small memory blocks (<256 bytes) this version is faster */
 #define small_memcpy(to,from,n)\
 {\
@@ -27,11 +36,19 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 len&=63;
 __asm__ __volatile__ (
+#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
+"prefetch (%0)\n"
+"prefetch 64(%0)\n"
+"prefetch 128(%0)\n"
+"prefetch 192(%0)\n"
+"prefetch 256(%0)\n"
+#else
 "prefetchnta (%0)\n"
 "prefetchnta 64(%0)\n"
 "prefetchnta 128(%0)\n"
 "prefetchnta 192(%0)\n"
 "prefetchnta 256(%0)\n"
+#endif
 : : "r" (from) );
 /*
 This algorithm is top effective when the code consequently
@@ -45,7 +62,11 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 for(; i>0; i--)
 {
 __asm__ __volatile__ (
+#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
+"prefetch 320(%0)\n"
+#else
 "prefetchnta 320(%0)\n"
+#endif
 #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
 "movups (%0), %%xmm0\n"
 "movups 16(%0), %%xmm1\n"
@@ -56,6 +77,24 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 "movntps %%xmm0, 32(%1)\n"
 "movntps %%xmm1, 48(%1)\n"
 #else /* Only K7 (may be other) */
+#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
+"movq (%0), %%mm0\n"
+"movq 8(%0), %%mm1\n"
+"movq 16(%0), %%mm2\n"
+"movq 24(%0), %%mm3\n"
+"movq %%mm0, (%1)\n"
+"movq %%mm1, 8(%1)\n"
+"movq %%mm2, 16(%1)\n"
+"movq %%mm3, 24(%1)\n"
+"movq 32(%0), %%mm0\n"
+"movq 40(%0), %%mm1\n"
+"movq 48(%0), %%mm2\n"
+"movq 56(%0), %%mm3\n"
+"movq %%mm0, 32(%1)\n"
+"movq %%mm1, 40(%1)\n"
+"movq %%mm2, 48(%1)\n"
+"movq %%mm3, 56(%1)\n"
+#else
 "movq (%0), %%mm0\n"
 "movq 8(%0), %%mm1\n"
 "movq 16(%0), %%mm2\n"
@@ -72,20 +111,41 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 "movntq %%mm1, 40(%1)\n"
 "movntq %%mm2, 48(%1)\n"
 "movntq %%mm3, 56(%1)\n"
+#endif
 #endif
 :: "r" (from), "r" (to) : "memory");
 from+=64;
 to+=64;
 }
+#if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
+__asm__ __volatile__ ("femms":::"memory");
+#else
 __asm__ __volatile__ ("emms":::"memory");
+#endif
 }
 /*
 * Now do the tail of the block
 */
+#if 0
 small_memcpy(to, from, len);
+#else
+__asm__ __volatile__ (
+"shrl $1,%%ecx\n"
+"jnc 1f\n"
+"movsb\n"
+"1:\n"
+"shrl $1,%%ecx\n"
+"jnc 2f\n"
+"movsw\n"
+"2:\n"
+"rep ; movsl\n"
+::"D" (to), "S" (from),"c" (len)
+: "memory");
+#endif
 return p;
 }
 #define memcpy(a,b,c) fast_memcpy(a,b,c)
 #endif
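
Aside from the 3DNow!-specific prefetch/femms branches, the hunk at -72,20 +111,41 also disables small_memcpy() via #if 0 and copies the tail of the block with movsb/movsw/rep movsl instead. Below is a minimal plain-C sketch of how that tail-copy asm reads; it is not code from the commit, and the helper name is invented for illustration.

/* Hypothetical C equivalent of the new tail-copy asm (sketch only). */
static void copy_tail_sketch(unsigned char *to, const unsigned char *from,
                             unsigned len)
{
    if (len & 1) {              /* first "shrl $1,%%ecx" shifts out bit 0;
                                   carry set -> "movsb" copies one byte      */
        *to++ = *from++;
    }
    if (len & 2) {              /* second shift drops bit 1;
                                   carry set -> "movsw" copies two bytes     */
        to[0] = from[0];
        to[1] = from[1];
        to += 2;
        from += 2;
    }
    for (len >>= 2; len > 0; len--) {   /* "rep ; movsl": ecx now holds the
                                           count of remaining 32-bit words   */
        to[0] = from[0];
        to[1] = from[1];
        to[2] = from[2];
        to[3] = from[3];
        to += 4;
        from += 4;
    }
}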
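
The closing #define memcpy(a,b,c) fast_memcpy(a,b,c) means callers need no changes: any file that includes this header after the build configuration has its plain memcpy() calls routed to fast_memcpy() whenever one of the SIMD paths is enabled. A rough usage sketch follows; the header and function names are assumptions, not taken from this commit.

#include <string.h>      /* libc memcpy(), used when no SIMD path is enabled */
#include "config.h"      /* assumed to provide HAVE_MMX2 / HAVE_3DNOW        */
#include "fastmemcpy.h"  /* the header patched above; file name assumed      */

/* Invented example: this memcpy() call expands to fast_memcpy() when
   HAVE_MMX2 or HAVE_3DNOW is defined, and stays libc memcpy() otherwise. */
static void copy_scanline(unsigned char *dst, const unsigned char *src,
                          unsigned bytes)
{
    memcpy(dst, src, bytes);
}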