1
0
mirror of https://github.com/mpv-player/mpv synced 2025-01-11 09:29:29 +00:00

10-20% faster fastmemcpy :) on my P3 at least, but the algorithm is mostly from the "AMD Athlon Processor x86 Code Optimization Guide", so it should be faster on AMD chips too. However, I fear it might be slower for mem->vram copies (someone should check that; I can't) ... there are 2 #defines to fine-tune it (BLOCK_SIZE & CONFUSION_FACTOR)

git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@3078 b3059339-0415-0410-9bf9-f77b7e298cf2
This commit is contained in:
michael 2001-11-22 19:40:38 +00:00
parent 2a6e9d9eda
commit 6c6cc954f5
2 changed files with 232 additions and 12 deletions

View File

@ -1,13 +1,19 @@
#include "../config.h"
#ifdef USE_FASTMEMCPY
/*
/*
aclib - advanced C library ;)
This file contains functions which improve and expand standard C-library
*/
#include <stddef.h>
#define BLOCK_SIZE 4096
#define CONFUSION_FACTOR 0
//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
//#define STATISTICS
#ifndef HAVE_SSE2
/*
P3 processor has only one SSE decoder so can execute only 1 sse insn per
@ -103,7 +109,7 @@ __asm__ __volatile__(\
#ifdef HAVE_SSE
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#define MMREG_SIZE 64 //8
#endif
/* Small defines (for readability only) ;) */
@ -132,7 +138,20 @@ void * fast_memcpy(void * to, const void * from, size_t len)
{
void *retval;
size_t i;
retval = to;
retval = to;
#ifdef STATISTICS
{
static int freq[33];
static int t=0;
int i;
for(i=0; len>(1<<i); i++);
freq[i]++;
t++;
if(1024*1024*1024 % t == 0)
for(i=0; i<32; i++)
printf("freq < %8d %4d\n", 1<<i, freq[i]);
}
#endif
#ifndef HAVE_MMX1
/* PREFETCH has effect even for MOVSB instruction ;) */
__asm__ __volatile__ (
@ -184,7 +203,7 @@ void * fast_memcpy(void * to, const void * from, size_t len)
((const unsigned char *)from)+=64;
((unsigned char *)to)+=64;
}
else
else
/*
Only if SRC is aligned on 16-byte boundary.
It allows to use movaps instead of movups, which required data
@ -207,6 +226,96 @@ void * fast_memcpy(void * to, const void * from, size_t len)
((unsigned char *)to)+=64;
}
#else
// Align destination at BLOCK_SIZE boundary
for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
{
__asm__ __volatile__ (
#ifndef HAVE_MMX1
PREFETCH" 320(%0)\n"
#endif
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"movq 16(%0), %%mm2\n"
"movq 24(%0), %%mm3\n"
"movq 32(%0), %%mm4\n"
"movq 40(%0), %%mm5\n"
"movq 48(%0), %%mm6\n"
"movq 56(%0), %%mm7\n"
MOVNTQ" %%mm0, (%1)\n"
MOVNTQ" %%mm1, 8(%1)\n"
MOVNTQ" %%mm2, 16(%1)\n"
MOVNTQ" %%mm3, 24(%1)\n"
MOVNTQ" %%mm4, 32(%1)\n"
MOVNTQ" %%mm5, 40(%1)\n"
MOVNTQ" %%mm6, 48(%1)\n"
MOVNTQ" %%mm7, 56(%1)\n"
:: "r" (from), "r" (to) : "memory");
((const unsigned char *)from)+=64;
((unsigned char *)to)+=64;
}
// printf(" %d %d\n", (int)from&1023, (int)to&1023);
// Pure Assembly cuz gcc is a bit unpredictable ;)
if(i>=BLOCK_SIZE/64)
asm volatile(
"xorl %%eax, %%eax \n\t"
".balign 16 \n\t"
"1: \n\t"
"movl (%0, %%eax), %%ebx \n\t"
"movl 32(%0, %%eax), %%ebx \n\t"
"movl 64(%0, %%eax), %%ebx \n\t"
"movl 96(%0, %%eax), %%ebx \n\t"
"addl $128, %%eax \n\t"
"cmpl %3, %%eax \n\t"
" jb 1b \n\t"
"xorl %%eax, %%eax \n\t"
".balign 16 \n\t"
"2: \n\t"
"movq (%0, %%eax), %%mm0\n"
"movq 8(%0, %%eax), %%mm1\n"
"movq 16(%0, %%eax), %%mm2\n"
"movq 24(%0, %%eax), %%mm3\n"
"movq 32(%0, %%eax), %%mm4\n"
"movq 40(%0, %%eax), %%mm5\n"
"movq 48(%0, %%eax), %%mm6\n"
"movq 56(%0, %%eax), %%mm7\n"
MOVNTQ" %%mm0, (%1, %%eax)\n"
MOVNTQ" %%mm1, 8(%1, %%eax)\n"
MOVNTQ" %%mm2, 16(%1, %%eax)\n"
MOVNTQ" %%mm3, 24(%1, %%eax)\n"
MOVNTQ" %%mm4, 32(%1, %%eax)\n"
MOVNTQ" %%mm5, 40(%1, %%eax)\n"
MOVNTQ" %%mm6, 48(%1, %%eax)\n"
MOVNTQ" %%mm7, 56(%1, %%eax)\n"
"addl $64, %%eax \n\t"
"cmpl %3, %%eax \n\t"
"jb 2b \n\t"
#if CONFUSION_FACTOR > 0
// a few percent speedup on out of order executing CPUs
"movl %5, %%eax \n\t"
"2: \n\t"
"movl (%0), %%ebx \n\t"
"movl (%0), %%ebx \n\t"
"movl (%0), %%ebx \n\t"
"movl (%0), %%ebx \n\t"
"decl %%eax \n\t"
" jnz 2b \n\t"
#endif
"xorl %%eax, %%eax \n\t"
"addl %3, %0 \n\t"
"addl %3, %1 \n\t"
"subl %4, %2 \n\t"
"cmpl %4, %2 \n\t"
" jae 1b \n\t"
: "+r" (from), "+r" (to), "+r" (i)
: "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
: "%eax", "%ebx"
);
for(; i>0; i--)
{
__asm__ __volatile__ (
@ -233,16 +342,17 @@ void * fast_memcpy(void * to, const void * from, size_t len)
((const unsigned char *)from)+=64;
((unsigned char *)to)+=64;
}
#endif /* Have SSE */
#ifdef HAVE_MMX2
/* since movntq is weakly-ordered, a "sfence"
* is needed to become ordered again. */
__asm__ __volatile__ ("sfence":::"memory");
#endif
#ifndef HAVE_SSE
#ifndef HAVE_SSE
/* EMMS clears the MMX state so the FPU can be used again */
__asm__ __volatile__ (EMMS:::"memory");
#endif
#endif
}
/*
* Now do the tail of the block

View File

@ -1,13 +1,19 @@
#include "../config.h"
#ifdef USE_FASTMEMCPY
/*
/*
aclib - advanced C library ;)
This file contains functions which improve and expand standard C-library
*/
#include <stddef.h>
#define BLOCK_SIZE 4096
#define CONFUSION_FACTOR 0
//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
//#define STATISTICS
#ifndef HAVE_SSE2
/*
P3 processor has only one SSE decoder so can execute only 1 sse insn per
@ -103,7 +109,7 @@ __asm__ __volatile__(\
#ifdef HAVE_SSE
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#define MMREG_SIZE 64 //8
#endif
/* Small defines (for readability only) ;) */
@ -132,7 +138,20 @@ void * fast_memcpy(void * to, const void * from, size_t len)
{
void *retval;
size_t i;
retval = to;
retval = to;
#ifdef STATISTICS
{
static int freq[33];
static int t=0;
int i;
for(i=0; len>(1<<i); i++);
freq[i]++;
t++;
if(1024*1024*1024 % t == 0)
for(i=0; i<32; i++)
printf("freq < %8d %4d\n", 1<<i, freq[i]);
}
#endif
#ifndef HAVE_MMX1
/* PREFETCH has effect even for MOVSB instruction ;) */
__asm__ __volatile__ (
@ -184,7 +203,7 @@ void * fast_memcpy(void * to, const void * from, size_t len)
((const unsigned char *)from)+=64;
((unsigned char *)to)+=64;
}
else
else
/*
Only if SRC is aligned on 16-byte boundary.
It allows to use movaps instead of movups, which required data
@ -207,6 +226,96 @@ void * fast_memcpy(void * to, const void * from, size_t len)
((unsigned char *)to)+=64;
}
#else
// Align destination at BLOCK_SIZE boundary
for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
{
__asm__ __volatile__ (
#ifndef HAVE_MMX1
PREFETCH" 320(%0)\n"
#endif
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"movq 16(%0), %%mm2\n"
"movq 24(%0), %%mm3\n"
"movq 32(%0), %%mm4\n"
"movq 40(%0), %%mm5\n"
"movq 48(%0), %%mm6\n"
"movq 56(%0), %%mm7\n"
MOVNTQ" %%mm0, (%1)\n"
MOVNTQ" %%mm1, 8(%1)\n"
MOVNTQ" %%mm2, 16(%1)\n"
MOVNTQ" %%mm3, 24(%1)\n"
MOVNTQ" %%mm4, 32(%1)\n"
MOVNTQ" %%mm5, 40(%1)\n"
MOVNTQ" %%mm6, 48(%1)\n"
MOVNTQ" %%mm7, 56(%1)\n"
:: "r" (from), "r" (to) : "memory");
((const unsigned char *)from)+=64;
((unsigned char *)to)+=64;
}
// printf(" %d %d\n", (int)from&1023, (int)to&1023);
// Pure Assembly cuz gcc is a bit unpredictable ;)
if(i>=BLOCK_SIZE/64)
asm volatile(
"xorl %%eax, %%eax \n\t"
".balign 16 \n\t"
"1: \n\t"
"movl (%0, %%eax), %%ebx \n\t"
"movl 32(%0, %%eax), %%ebx \n\t"
"movl 64(%0, %%eax), %%ebx \n\t"
"movl 96(%0, %%eax), %%ebx \n\t"
"addl $128, %%eax \n\t"
"cmpl %3, %%eax \n\t"
" jb 1b \n\t"
"xorl %%eax, %%eax \n\t"
".balign 16 \n\t"
"2: \n\t"
"movq (%0, %%eax), %%mm0\n"
"movq 8(%0, %%eax), %%mm1\n"
"movq 16(%0, %%eax), %%mm2\n"
"movq 24(%0, %%eax), %%mm3\n"
"movq 32(%0, %%eax), %%mm4\n"
"movq 40(%0, %%eax), %%mm5\n"
"movq 48(%0, %%eax), %%mm6\n"
"movq 56(%0, %%eax), %%mm7\n"
MOVNTQ" %%mm0, (%1, %%eax)\n"
MOVNTQ" %%mm1, 8(%1, %%eax)\n"
MOVNTQ" %%mm2, 16(%1, %%eax)\n"
MOVNTQ" %%mm3, 24(%1, %%eax)\n"
MOVNTQ" %%mm4, 32(%1, %%eax)\n"
MOVNTQ" %%mm5, 40(%1, %%eax)\n"
MOVNTQ" %%mm6, 48(%1, %%eax)\n"
MOVNTQ" %%mm7, 56(%1, %%eax)\n"
"addl $64, %%eax \n\t"
"cmpl %3, %%eax \n\t"
"jb 2b \n\t"
#if CONFUSION_FACTOR > 0
// a few percent speedup on out of order executing CPUs
"movl %5, %%eax \n\t"
"2: \n\t"
"movl (%0), %%ebx \n\t"
"movl (%0), %%ebx \n\t"
"movl (%0), %%ebx \n\t"
"movl (%0), %%ebx \n\t"
"decl %%eax \n\t"
" jnz 2b \n\t"
#endif
"xorl %%eax, %%eax \n\t"
"addl %3, %0 \n\t"
"addl %3, %1 \n\t"
"subl %4, %2 \n\t"
"cmpl %4, %2 \n\t"
" jae 1b \n\t"
: "+r" (from), "+r" (to), "+r" (i)
: "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
: "%eax", "%ebx"
);
for(; i>0; i--)
{
__asm__ __volatile__ (
@ -233,16 +342,17 @@ void * fast_memcpy(void * to, const void * from, size_t len)
((const unsigned char *)from)+=64;
((unsigned char *)to)+=64;
}
#endif /* Have SSE */
#ifdef HAVE_MMX2
/* since movntq is weakly-ordered, a "sfence"
* is needed to become ordered again. */
__asm__ __volatile__ ("sfence":::"memory");
#endif
#ifndef HAVE_SSE
#ifndef HAVE_SSE
/* EMMS clears the MMX state so the FPU can be used again */
__asm__ __volatile__ (EMMS:::"memory");
#endif
#endif
}
/*
* Now do the tail of the block