P3 fixes...

git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@377 b3059339-0415-0410-9bf9-f77b7e298cf2
arpi_esp 2001-04-12 14:40:10 +00:00
parent 3dde448fb2
commit 34a4680099
1 changed file with 24 additions and 30 deletions


@@ -2,31 +2,19 @@
  This part of code was taken by from Linux-2.4.3 and slightly modified
 for MMX2 instruction set. I have done it since linux uses page aligned
 blocks but mplayer uses weakly ordered data and original sources can not
-speedup their. Only using prefetch and movntq together have effect!
+speedup their. Only using prefetchnta and movntq together have effect!
 If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
 */
-#ifndef HAVE_MMX2
-//static inline void * __memcpy(void * to, const void * from, unsigned n)
-inline static void * fast_memcpy(void * to, const void * from, unsigned n)
-{
-int d0, d1, d2;
-__asm__ __volatile__(
-"rep ; movsl\n\t"
-"testb $2,%b4\n\t"
-"je 1f\n\t"
-"movsw\n"
-"1:\ttestb $1,%b4\n\t"
-"je 2f\n\t"
-"movsb\n"
-"2:"
-: "=&c" (d0), "=&D" (d1), "=&S" (d2)
-:"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
-: "memory");
-return (to);
+#ifdef HAVE_MMX2
+/* for small memory blocks (<256 bytes) this version is faster */
+#define small_memcpy(to,from,n)\
+{\
+__asm__ __volatile__(\
+"rep ; movsb\n"\
+::"D" (to), "S" (from),"c" (n)\
+: "memory");\
 }
-#else
 //inline static void *__memcpy_mmx2(void *to, const void *from, unsigned len)
 inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 {
 void *p;
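
Note on the hunk above: the new small_memcpy() macro is just a "rep ; movsb" string copy, used where the streaming MMX2 path is not worth its setup cost. A rough portable equivalent, for readers who do not want to parse the inline asm (the name small_memcpy_c and the byte loop are illustrative, not part of the patch):

/* Illustrative stand-in for the small_memcpy() macro above; the real
 * macro emits "rep ; movsb" through inline asm instead of a C loop. */
static inline void small_memcpy_c(void *to, const void *from, unsigned n)
{
    unsigned char *d = to;
    const unsigned char *s = from;
    while (n--)
        *d++ = *s++;
}

Later in the patch, len is masked with len&=63 before the main copy loop, so the tail handed to small_memcpy() is at most 63 bytes.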
@@ -36,12 +24,15 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 {
 p = to;
 i = len >> 6; /* len/64 */
+len&=63;
 __asm__ __volatile__ (
-"1: prefetch (%0)\n" /* This set is 28 bytes */
-" prefetch 64(%0)\n"
-" prefetch 128(%0)\n"
-" prefetch 192(%0)\n"
-" prefetch 256(%0)\n"
+"1: prefetchnta (%0)\n" /* This set is 28 bytes */
+" prefetchnta 64(%0)\n"
+" prefetchnta 128(%0)\n"
+" prefetchnta 192(%0)\n"
+" prefetchnta 256(%0)\n"
+#if 0
 "2: \n"
 ".section .fixup, \"ax\"\n"
 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
@@ -51,13 +42,14 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 " .align 4\n"
 " .long 1b, 3b\n"
 ".previous"
+#endif
 : : "r" (from) );
 for(; i>0; i--)
 {
 __asm__ __volatile__ (
-"1: prefetch 320(%0)\n"
+"1: prefetchnta 320(%0)\n"
 "2: movq (%0), %%mm0\n"
 " movq 8(%0), %%mm1\n"
 " movq 16(%0), %%mm2\n"
@@ -74,6 +66,7 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 " movntq %%mm1, 40(%1)\n"
 " movntq %%mm2, 48(%1)\n"
 " movntq %%mm3, 56(%1)\n"
+#if 0
 ".section .fixup, \"ax\"\n"
 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
 " jmp 2b\n"
@@ -82,6 +75,7 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 " .align 4\n"
 " .long 1b, 3b\n"
 ".previous"
+#endif
 : : "r" (from), "r" (to) : "memory");
 from+=64;
 to+=64;
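
The core of the MMX2 path visible in these hunks is a 64-bytes-per-iteration loop: prefetchnta pulls upcoming source cache lines in with a non-temporal hint, eight movq loads fill mm0-mm7, and eight movntq stores write them back past the cache. The switch from prefetch to prefetchnta is the likely "P3 fix": prefetch is a 3DNow! instruction that Intel CPUs do not implement, while prefetchnta is SSE and available on the Pentium III. The ".fixup"/"__ex_table" fragments are Linux kernel exception-table machinery for recovering from page faults during the copy; they are never consulted in a user-space player, which is presumably why the commit guards them with #if 0. A rough user-space sketch of the same copy loop using intrinsics (the function name stream_copy64 and the closing sfence are illustrative assumptions, not code from the patch):

#include <xmmintrin.h>   /* _mm_prefetch, _mm_stream_pi, _mm_sfence (SSE) */
#include <mmintrin.h>    /* __m64, _mm_empty (MMX) */

/* Illustrative version of the streaming copy loop above; it handles
 * whole 64-byte blocks only and omits the tail copy. */
static void stream_copy64(void *to, const void *from, unsigned len)
{
    __m64 *d = (__m64 *)to;
    const __m64 *s = (const __m64 *)from;
    unsigned i, j;

    for (i = len >> 6; i > 0; i--) {            /* 64 bytes per iteration */
        /* prefetchnta: fetch upcoming source lines with a non-temporal hint */
        _mm_prefetch((const char *)s + 320, _MM_HINT_NTA);
        __m64 t[8];
        for (j = 0; j < 8; j++)                 /* 8 x 8-byte loads (movq in the asm) */
            t[j] = s[j];
        for (j = 0; j < 8; j++)                 /* movntq-style cache-bypassing stores */
            _mm_stream_pi(d + j, t[j]);
        s += 8;
        d += 8;
    }
    _mm_sfence();   /* fence the weakly-ordered non-temporal stores */
    _mm_empty();    /* emms: release the MMX/FPU register state */
}

The patch itself does not add an sfence; it is shown here only as the usual precaution after non-temporal stores on weakly ordered data, which the file's own header comment alludes to.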
@@ -91,10 +85,10 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 /*
 * Now do the tail of the block
 */
-memcpy(to, from, len&63);
+small_memcpy(to, from, len);
 return p;
 }
-#define memcpy(a,b,c) fast_memcpy(a,b,c)
 #endif
+#define memcpy(a,b,c) fast_memcpy(a,b,c)
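
The closing #define memcpy(a,b,c) fast_memcpy(a,b,c) means that any source file which includes this code after the definition has its ordinary memcpy() calls rewritten into calls to fast_memcpy(). A self-contained toy example of how such a macro override behaves (the stub fast_memcpy() and main() below are illustrative only, not part of MPlayer):

#include <stdio.h>
#include <string.h>

/* Stand-in for the real routine so the example compiles on its own;
 * the memcpy() inside still refers to libc, since the override comes later. */
static void *fast_memcpy(void *to, const void *from, unsigned n)
{
    return memcpy(to, from, n);
}

#define memcpy(a,b,c) fast_memcpy(a,b,c)

int main(void)
{
    char src[] = "P3 fixes", dst[sizeof src];
    memcpy(dst, src, sizeof src);   /* expands to fast_memcpy(dst, src, ...) */
    puts(dst);
    return 0;
}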