10-20% faster fastmemcpy :) on my P3, at least, but the algorithm is mostly from the "AMD Athlon Processor x86 Code Optimization Guide", so it should be faster on AMD chips too. I fear it might be slower for mem->vram copies, though (someone should check that; I can't). There are two #defines to fine-tune it (BLOCK_SIZE & CONFUSION_FACTOR).
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@3078 b3059339-0415-0410-9bf9-f77b7e298cf2
parent 2a6e9d9eda
commit 6c6cc954f5
libvo/aclib.c: 122 changed lines
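The patch below is all inline assembly, so here is a minimal C sketch of the idea it implements, the block-prefetch copy from the AMD guide: first pull each BLOCK_SIZE chunk into cache with a burst of reads, then copy it out while it is hot. block_memcpy and the use of GCC's __builtin_prefetch are illustrative stand-ins, not code from aclib.c; the real patch does the store phase with MOVNTQ non-temporal stores instead of memcpy.

#include <stddef.h>
#include <string.h>

#define BLOCK_SIZE 4096   /* same tunable as in the patch */

/* Hypothetical stand-in for fast_memcpy.  Phase 1 touches one word per
 * 64-byte cache line of the next block so the DRAM reads stream in
 * back-to-back; phase 2 copies the now-cached block (MOVNTQ stores in
 * the real patch, plain memcpy here to stay portable). */
static void *block_memcpy(void *to, const void *from, size_t len)
{
    char *d = to;
    const char *s = from;

    while (len >= BLOCK_SIZE) {
        size_t i;
        for (i = 0; i < BLOCK_SIZE; i += 64)     /* phase 1: prefetch block */
            __builtin_prefetch(s + i, 0, 0);
        memcpy(d, s, BLOCK_SIZE);                /* phase 2: copy block */
        s += BLOCK_SIZE;
        d += BLOCK_SIZE;
        len -= BLOCK_SIZE;
    }
    memcpy(d, s, len);                           /* tail */
    return to;
}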
--- a/libvo/aclib.c
+++ b/libvo/aclib.c
@@ -1,13 +1,19 @@
 #include "../config.h"

 #ifdef USE_FASTMEMCPY
 /*
	aclib - advanced C library ;)
	This file contains functions which improve and expand standard C-library
 */

 #include <stddef.h>

+#define BLOCK_SIZE 4096
+#define CONFUSION_FACTOR 0
+//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
+
+//#define STATISTICS
+
 #ifndef HAVE_SSE2
 /*
	P3 processor has only one SSE decoder so can execute only 1 sse insn per
@@ -103,7 +109,7 @@ __asm__ __volatile__(\
 #ifdef HAVE_SSE
 #define MMREG_SIZE 16
 #else
-#define MMREG_SIZE 8
+#define MMREG_SIZE 64 //8
 #endif

 /* Small defines (for readability only) ;) */
@@ -132,7 +138,20 @@ void * fast_memcpy(void * to, const void * from, size_t len)
 {
	void *retval;
+	size_t i;
	retval = to;
+#ifdef STATISTICS
+	{
+		static int freq[33];
+		static int t=0;
+		int i;
+		for(i=0; len>(1<<i); i++);
+		freq[i]++;
+		t++;
+		if(1024*1024*1024 % t == 0)
+			for(i=0; i<32; i++)
+				printf("freq < %8d %4d\n", 1<<i, freq[i]);
+	}
+#endif
 #ifndef HAVE_MMX1
	/* PREFETCH has effect even for MOVSB instruction ;) */
	__asm__ __volatile__ (
@@ -184,7 +203,7 @@ void * fast_memcpy(void * to, const void * from, size_t len)
		((const unsigned char *)from)+=64;
		((unsigned char *)to)+=64;
	}
	else
	/*
	   Only if SRC is aligned on 16-byte boundary.
	   It allows to use movaps instead of movups, which required data
@@ -207,6 +226,96 @@ void * fast_memcpy(void * to, const void * from, size_t len)
		((unsigned char *)to)+=64;
	}
+#else
+	// Align destination at BLOCK_SIZE boundary
+	for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
+	{
+		__asm__ __volatile__ (
+#ifndef HAVE_MMX1
+		PREFETCH" 320(%0)\n"
+#endif
+		"movq (%0), %%mm0\n"
+		"movq 8(%0), %%mm1\n"
+		"movq 16(%0), %%mm2\n"
+		"movq 24(%0), %%mm3\n"
+		"movq 32(%0), %%mm4\n"
+		"movq 40(%0), %%mm5\n"
+		"movq 48(%0), %%mm6\n"
+		"movq 56(%0), %%mm7\n"
+		MOVNTQ" %%mm0, (%1)\n"
+		MOVNTQ" %%mm1, 8(%1)\n"
+		MOVNTQ" %%mm2, 16(%1)\n"
+		MOVNTQ" %%mm3, 24(%1)\n"
+		MOVNTQ" %%mm4, 32(%1)\n"
+		MOVNTQ" %%mm5, 40(%1)\n"
+		MOVNTQ" %%mm6, 48(%1)\n"
+		MOVNTQ" %%mm7, 56(%1)\n"
+		:: "r" (from), "r" (to) : "memory");
+		((const unsigned char *)from)+=64;
+		((unsigned char *)to)+=64;
+	}
+
+//	printf(" %d %d\n", (int)from&1023, (int)to&1023);
+	// Pure Assembly cuz gcc is a bit unpredictable ;)
+	if(i>=BLOCK_SIZE/64)
+		asm volatile(
+			"xorl %%eax, %%eax	\n\t"
+			".balign 16		\n\t"
+			"1:			\n\t"
+			"movl (%0, %%eax), %%ebx	\n\t"
+			"movl 32(%0, %%eax), %%ebx	\n\t"
+			"movl 64(%0, %%eax), %%ebx	\n\t"
+			"movl 96(%0, %%eax), %%ebx	\n\t"
+			"addl $128, %%eax	\n\t"
+			"cmpl %3, %%eax		\n\t"
+			" jb 1b			\n\t"
+
+			"xorl %%eax, %%eax	\n\t"
+
+			".balign 16		\n\t"
+			"2:			\n\t"
+			"movq (%0, %%eax), %%mm0\n"
+			"movq 8(%0, %%eax), %%mm1\n"
+			"movq 16(%0, %%eax), %%mm2\n"
+			"movq 24(%0, %%eax), %%mm3\n"
+			"movq 32(%0, %%eax), %%mm4\n"
+			"movq 40(%0, %%eax), %%mm5\n"
+			"movq 48(%0, %%eax), %%mm6\n"
+			"movq 56(%0, %%eax), %%mm7\n"
+			MOVNTQ" %%mm0, (%1, %%eax)\n"
+			MOVNTQ" %%mm1, 8(%1, %%eax)\n"
+			MOVNTQ" %%mm2, 16(%1, %%eax)\n"
+			MOVNTQ" %%mm3, 24(%1, %%eax)\n"
+			MOVNTQ" %%mm4, 32(%1, %%eax)\n"
+			MOVNTQ" %%mm5, 40(%1, %%eax)\n"
+			MOVNTQ" %%mm6, 48(%1, %%eax)\n"
+			MOVNTQ" %%mm7, 56(%1, %%eax)\n"
+			"addl $64, %%eax	\n\t"
+			"cmpl %3, %%eax		\n\t"
+			"jb 2b			\n\t"
+
+#if CONFUSION_FACTOR > 0
+			// a few percent speedup on out-of-order executing CPUs
+			"movl %5, %%eax		\n\t"
+			"2:			\n\t"
+			"movl (%0), %%ebx	\n\t"
+			"movl (%0), %%ebx	\n\t"
+			"movl (%0), %%ebx	\n\t"
+			"movl (%0), %%ebx	\n\t"
+			"decl %%eax		\n\t"
+			" jnz 2b		\n\t"
+#endif
+
+			"xorl %%eax, %%eax	\n\t"
+			"addl %3, %0		\n\t"
+			"addl %3, %1		\n\t"
+			"subl %4, %2		\n\t"
+			"cmpl %4, %2		\n\t"
+			" jae 1b		\n\t"
+			: "+r" (from), "+r" (to), "+r" (i)
+			: "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
+			: "%eax", "%ebx"
+		);
+
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
@@ -233,16 +342,17 @@ void * fast_memcpy(void * to, const void * from, size_t len)
		((const unsigned char *)from)+=64;
		((unsigned char *)to)+=64;
	}
+
 #endif /* Have SSE */
 #ifdef HAVE_MMX2
	/* since movntq is weakly-ordered, a "sfence"
	 * is needed to become ordered again. */
	__asm__ __volatile__ ("sfence":::"memory");
 #endif
 #ifndef HAVE_SSE
	/* enables to use FPU */
	__asm__ __volatile__ (EMMS:::"memory");
 #endif
 #endif
 }
 /*
  * Now do the tail of the block
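The commit message asks someone to check the mem->vram case; for plain RAM, a small harness along these lines (hypothetical, not part of the tree) is enough to compare the patched fast_memcpy against libc memcpy. The prototype matches the one shown in the hunk headers above; link against aclib.c built with USE_FASTMEMCPY, and use a length that is many BLOCK_SIZE blocks so the new block loop actually runs.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

void *fast_memcpy(void *to, const void *from, size_t len);  /* from aclib.c */

/* Time 'reps' copies of 'len' bytes and return seconds elapsed. */
static double bench(void *(*cpy)(void *, const void *, size_t),
                    char *dst, const char *src, size_t len, int reps)
{
    struct timespec t0, t1;
    int r;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (r = 0; r < reps; r++)
        cpy(dst, src, len);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

int main(void)
{
    size_t len = 8 * 1024 * 1024;              /* many BLOCK_SIZE blocks */
    char *src = malloc(len), *dst = malloc(len);

    if (!src || !dst)
        return 1;
    memset(src, 0x55, len);
    printf("memcpy:      %.3fs\n", bench(memcpy, dst, src, len, 100));
    printf("fast_memcpy: %.3fs\n", bench(fast_memcpy, dst, src, len, 100));
    if (memcmp(dst, src, len) != 0)            /* sanity-check the last copy */
        printf("MISMATCH!\n");
    free(src);
    free(dst);
    return 0;
}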