mirror of https://github.com/mpv-player/mpv
Added the newest MMX-optimized decore, which speeds up decoding by at least 13% on any CPU.
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@1246 b3059339-0415-0410-9bf9-f77b7e298cf2
This commit is contained in:
parent
bf8a76c063
commit
2ec6762923
|
@ -1,8 +1,10 @@
|
|||
|
||||
include config.mak
|
||||
|
||||
SRCS = sr1.c d_cpu.s decode_i586.s $(OPTIONAL_SRCS)
|
||||
OBJS = sr1.o d_cpu.o decode_i586.o $(OPTIONAL_OBJS)
|
||||
SRCS = sr1.c d_cpu.s decode_i586.s dct64_MMX.s decode_MMX.s tabinit_MMX.s\
|
||||
dct36_3dnow.s dct64_3dnow.s dct36_k7.s dct64_k7.s
|
||||
OBJS = sr1.o d_cpu.o decode_i586.o dct64_MMX.o decode_MMX.o tabinit_MMX.o\
|
||||
dct36_3dnow.o dct64_3dnow.o dct36_k7.o dct64_k7.o
|
||||
# OBJS = $(SRCS:.c,.s=.o)
|
||||
CFLAGS = $(OPTFLAGS) $(EXTRA_INC)
|
||||
|
||||
|
|
|
@ -9,9 +9,12 @@
|
|||
unsigned int _CpuID;
|
||||
unsigned int _i586;
|
||||
unsigned int _3dnow;
|
||||
unsigned int _isse;
|
||||
unsigned int _has_mmx;
|
||||
|
||||
extern unsigned long CpuDetect( void );
|
||||
extern unsigned long ipentium( void );
|
||||
extern unsigned long isse( void );
|
||||
extern unsigned long a3dnow( void );
|
||||
|
||||
#endif
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
.globl CpuDetect
|
||||
.globl ipentium
|
||||
.globl a3dnow
|
||||
.globl isse
|
||||
|
||||
/ ---------------------------------------------------------------------------
|
||||
/ in C: unsigned long CpuDetect( void );
|
||||
|
@ -45,7 +46,9 @@ exit_cpudetect:
|
|||
|
||||
/ ---------------------------------------------------------------------------
|
||||
/ in C: unsigned long ipentium( void );
|
||||
/ return: 0 if the processor is not P5 or above else above 1.
|
||||
/ return: 0 if this processor i386 or i486
|
||||
/ 1 otherwise
|
||||
/ 2 if this cpu supports mmx
|
||||
/ ---------------------------------------------------------------------------
|
||||
ipentium:
|
||||
pushl %ebx
|
||||
|
@ -63,10 +66,15 @@ ipentium:
|
|||
jz no_cpuid
|
||||
movl $1,%eax
|
||||
cpuid
|
||||
shrl $8,%eax
|
||||
cmpl $5,%eax
|
||||
jb no_cpuid
|
||||
movl $1,%eax
|
||||
movl %eax, %ecx
|
||||
xorl %eax, %eax
|
||||
shrl $8,%ecx
|
||||
cmpl $5,%ecx
|
||||
jb exit
|
||||
incl %eax
|
||||
test $0x00800000, %edx
|
||||
jz exit
|
||||
incl %eax
|
||||
jmp exit
|
||||
no_cpuid:
|
||||
xorl %eax,%eax
|
||||
|
@ -113,3 +121,33 @@ exit2:
|
|||
popl %edx
|
||||
popl %ebx
|
||||
ret
|
||||
|
||||
/ ---------------------------------------------------------------------------
|
||||
/ in C: unsigned long isse( void );
|
||||
/ return: 0 if this processor does not support sse
|
||||
/ 1 otherwise
|
||||
/ 2 if this cpu supports sse2 extension
|
||||
/ ---------------------------------------------------------------------------
|
||||
/ isse: reports SSE capability in %eax (0 = none, 1 = SSE, 2 = SSE2).
/ Takes no arguments. %ebx, %edx and %ecx are saved on entry and restored
/ on exit because cpuid overwrites them; %eax carries the result.
isse:
|
||||
pushl %ebx
|
||||
pushl %edx
|
||||
pushl %ecx
|
||||

||||
/ Reuse ipentium as the gate: a zero result means "do not issue cpuid
/ leaf 1 here", and that zero is returned unchanged as "no SSE".
call ipentium
|
||||
testl %eax,%eax
|
||||
jz exit3
|
||||

||||
/ cpuid leaf 1: feature flags are returned in %edx.
movl $1,%eax
|
||||
cpuid
|
||||
/ Start the result at 0 and bump it once per feature bit found.
xorl %eax, %eax
|
||||
/ 0x02000000 = bit 25 of %edx (SSE feature flag).
testl $0x02000000,%edx
|
||||
jz exit3
|
||||
incl %eax
|
||||
/ 0x04000000 = bit 26 of %edx (SSE2 feature flag); only checked when SSE is present.
testl $0x04000000,%edx
|
||||
jz exit3
|
||||
incl %eax
|
||||
/ Common exit: restore registers in reverse push order.
exit3:
|
||||
popl %ecx
|
||||
popl %edx
|
||||
popl %ebx
|
||||
ret
|
||||
|
|
|
@ -193,7 +193,7 @@ static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf)
|
|||
sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \
|
||||
MACRO0(v); }
|
||||
|
||||
register const real *c = nCOS9;
|
||||
register const real *c = COS9;
|
||||
register real *out2 = o2;
|
||||
register real *w = wintab;
|
||||
register real *out1 = o1;
|
||||
|
|
1598
mp3lib/dct64_3dnow.s
1598
mp3lib/dct64_3dnow.s
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
1409
mp3lib/dct64_k7.s
1409
mp3lib/dct64_k7.s
File diff suppressed because it is too large
Load Diff
|
@ -105,6 +105,15 @@ static int synth_1to1_r(real *bandPtr,int channel,unsigned char *out,int *pnt)
|
|||
}
|
||||
#endif
|
||||
|
||||
synth_func_t synth_func;
|
||||
|
||||
/*
 * C entry point for the MMX synthesis routine.
 *
 * Owns the state that must survive between calls (work buffers and the
 * running buffer offset) and forwards everything to the assembler routine
 * synth_1to1_MMX_s.
 *
 * bandPtr, channel and samples are passed through unchanged.
 * Always returns 0; the value left in the return register by the assembler
 * routine is ignored.
 */
int synth_1to1_MMX( real *bandPtr,int channel,short * samples)
|
||||
{
|
||||
/* Persistent work area, handed to the assembler routine as a flat short
   array; that routine addresses it with byte offsets (544, 1088). */
static short buffs[2][2][0x110];
|
||||
/* Persistent buffer offset; passed by address because the assembler
   routine reads it and writes an updated value back. */
static int bo = 1;
|
||||
synth_1to1_MMX_s(bandPtr, channel, samples, (short *) buffs, &bo);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt)
|
||||
{
|
||||
|
@ -117,40 +126,13 @@ static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt)
|
|||
int clip = 0;
|
||||
int bo1;
|
||||
|
||||
#ifdef HAVE_SSE_MP3
|
||||
//if ( _3dnow )
|
||||
if ( synth_func )
|
||||
{
|
||||
int ret;
|
||||
ret=synth_1to1_sse( bandPtr,channel,out+*pnt );
|
||||
ret=(*synth_func)( bandPtr,channel,samples);
|
||||
*pnt+=128;
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_3DNOWEX
|
||||
if ( _3dnow > 1 )
|
||||
{
|
||||
int ret;
|
||||
ret=synth_1to1_3dnowex( bandPtr,channel,out+*pnt );
|
||||
*pnt+=128;
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_3DNOW
|
||||
if ( _3dnow )
|
||||
{
|
||||
int ret;
|
||||
ret=synth_1to1_3dnow( bandPtr,channel,out+*pnt );
|
||||
*pnt+=128;
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
if ( _i586 )
|
||||
{
|
||||
int ret;
|
||||
ret=synth_1to1_pent( bandPtr,channel,out+*pnt );
|
||||
*pnt+=128;
|
||||
return ret;
|
||||
}
|
||||
|
||||
if(!channel) { /* channel=0 */
|
||||
bo--;
|
||||
|
|
|
@ -1,265 +0,0 @@
|
|||
/ synth_1to1_3dnow works the same way as the c version of
|
||||
/ synth_1to1. this assembler code based 'decode-i586.s'
|
||||
/ (by Stefan Bieschewski <stb@acm.org>), two types of changes
|
||||
/ have been made:
|
||||
/ - use {MMX,3DNow!} instruction for reduce cpu
|
||||
/ - remove unused(?) local symbols
|
||||
/
|
||||
/ useful sources of information on optimizing 3DNow! code include:
|
||||
/ AMD 3DNow! Technology Manual (Publication #21928)
|
||||
/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
|
||||
/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
|
||||
/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
|
||||
/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
|
||||
/
|
||||
/ This code was tested only AMD-K6-2 processor Linux systems,
|
||||
/ please tell me:
|
||||
/ - whether this code works on other 3DNow! capable processors
|
||||
/ (ex.IDT-C6-2) or not
|
||||
/ - whether this code works on other OSes or not
|
||||
/
|
||||
/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
|
||||
/ <kim@comtec.co.jp> - after 1.Apr.1998
|
||||
|
||||
/ Enhancements for q-word operation by Michael Hipp
|
||||
|
||||
.bss
|
||||
.comm buffs,4352,4
|
||||
.data
|
||||
.align 4
|
||||
bo:
|
||||
.long 1
|
||||
.text
|
||||
.globl synth_1to1_3dnow
|
||||
synth_1to1_3dnow:
|
||||
subl $12,%esp
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
pushl %ebx
|
||||
movl 32(%esp),%eax
|
||||
movl 40(%esp),%esi
|
||||
movl $0,%edi
|
||||
movl bo,%ebp
|
||||
cmpl %edi,36(%esp)
|
||||
jne .L48
|
||||
decl %ebp
|
||||
andl $15,%ebp
|
||||
movl %ebp,bo
|
||||
movl $buffs,%ecx
|
||||
jmp .L49
|
||||
.L48:
|
||||
addl $2,%esi
|
||||
movl $buffs+2176,%ecx
|
||||
.L49:
|
||||
testl $1,%ebp
|
||||
je .L50
|
||||
movl %ecx,%ebx
|
||||
movl %ebp,16(%esp)
|
||||
pushl %eax
|
||||
movl 20(%esp),%edx
|
||||
leal (%ebx,%edx,4),%eax
|
||||
pushl %eax
|
||||
movl 24(%esp),%eax
|
||||
incl %eax
|
||||
andl $15,%eax
|
||||
leal 1088(,%eax,4),%eax
|
||||
addl %ebx,%eax
|
||||
jmp .L74
|
||||
.L50:
|
||||
leal 1088(%ecx),%ebx
|
||||
leal 1(%ebp),%edx
|
||||
movl %edx,16(%esp)
|
||||
pushl %eax
|
||||
leal 1092(%ecx,%ebp,4),%eax
|
||||
pushl %eax
|
||||
leal (%ecx,%ebp,4),%eax
|
||||
.L74:
|
||||
pushl %eax
|
||||
call dct64_3dnow
|
||||
addl $12,%esp
|
||||
movl 16(%esp),%edx
|
||||
leal 0(,%edx,4),%edx
|
||||
movl $decwin+64,%eax
|
||||
movl %eax,%ecx
|
||||
subl %edx,%ecx
|
||||
movl $16,%ebp
|
||||
|
||||
.L55:
|
||||
movq (%ecx),%mm4
|
||||
movq (%ebx),%mm3
|
||||
movq 8(%ecx),%mm0
|
||||
movq 8(%ebx),%mm1
|
||||
pfmul %mm3,%mm4
|
||||
|
||||
movq 16(%ecx),%mm2
|
||||
pfmul %mm1,%mm0
|
||||
movq 16(%ebx),%mm3
|
||||
pfadd %mm0,%mm4
|
||||
|
||||
movq 24(%ecx),%mm0
|
||||
pfmul %mm2,%mm3
|
||||
movq 24(%ebx),%mm1
|
||||
pfadd %mm3,%mm4
|
||||
|
||||
movq 32(%ecx),%mm2
|
||||
pfmul %mm1,%mm0
|
||||
movq 32(%ebx),%mm3
|
||||
pfadd %mm0,%mm4
|
||||
|
||||
movq 40(%ecx),%mm0
|
||||
pfmul %mm2,%mm3
|
||||
movq 40(%ebx),%mm1
|
||||
pfadd %mm3,%mm4
|
||||
|
||||
movq 48(%ecx),%mm2
|
||||
pfmul %mm1,%mm0
|
||||
movq 48(%ebx),%mm3
|
||||
pfadd %mm0,%mm4
|
||||
|
||||
movq 56(%ecx),%mm0
|
||||
pfmul %mm2,%mm3
|
||||
movq 56(%ebx),%mm1
|
||||
pfadd %mm3,%mm4
|
||||
|
||||
pfmul %mm1,%mm0
|
||||
pfadd %mm0,%mm4
|
||||
|
||||
movq %mm4,%mm0
|
||||
psrlq $32,%mm0
|
||||
pfsub %mm0,%mm4
|
||||
|
||||
pf2id %mm4,%mm4
|
||||
movd %mm4,%eax
|
||||
|
||||
sar $16,%eax
|
||||
movw %ax,(%esi)
|
||||
|
||||
addl $64,%ebx
|
||||
subl $-128,%ecx
|
||||
addl $4,%esi
|
||||
decl %ebp
|
||||
jnz .L55
|
||||
|
||||
/ --- end of loop 1 ---
|
||||
|
||||
movd (%ecx),%mm2
|
||||
movd (%ebx),%mm1
|
||||
pfmul %mm1,%mm2
|
||||
|
||||
movd 8(%ecx),%mm0
|
||||
movd 8(%ebx),%mm1
|
||||
pfmul %mm0,%mm1
|
||||
pfadd %mm1,%mm2
|
||||
|
||||
movd 16(%ecx),%mm0
|
||||
movd 16(%ebx),%mm1
|
||||
pfmul %mm0,%mm1
|
||||
pfadd %mm1,%mm2
|
||||
|
||||
movd 24(%ecx),%mm0
|
||||
movd 24(%ebx),%mm1
|
||||
pfmul %mm0,%mm1
|
||||
pfadd %mm1,%mm2
|
||||
|
||||
movd 32(%ecx),%mm0
|
||||
movd 32(%ebx),%mm1
|
||||
pfmul %mm0,%mm1
|
||||
pfadd %mm1,%mm2
|
||||
|
||||
movd 40(%ecx),%mm0
|
||||
movd 40(%ebx),%mm1
|
||||
pfmul %mm0,%mm1
|
||||
pfadd %mm1,%mm2
|
||||
|
||||
movd 48(%ecx),%mm0
|
||||
movd 48(%ebx),%mm1
|
||||
pfmul %mm0,%mm1
|
||||
pfadd %mm1,%mm2
|
||||
|
||||
movd 56(%ecx),%mm0
|
||||
movd 56(%ebx),%mm1
|
||||
pfmul %mm0,%mm1
|
||||
pfadd %mm1,%mm2
|
||||
|
||||
pf2id %mm2,%mm2
|
||||
movd %mm2,%eax
|
||||
|
||||
sar $16,%eax
|
||||
|
||||
movw %ax,(%esi)
|
||||
|
||||
addl $-64,%ebx
|
||||
addl $4,%esi
|
||||
addl $256,%ecx
|
||||
movl $15,%ebp
|
||||
|
||||
.L68:
|
||||
psubd %mm0,%mm0
|
||||
|
||||
movq (%ebx),%mm1
|
||||
movq (%ecx),%mm2
|
||||
pfmul %mm1,%mm2
|
||||
pfsub %mm2,%mm0
|
||||
|
||||
movq 8(%ebx),%mm3
|
||||
movq 8(%ecx),%mm4
|
||||
pfmul %mm3,%mm4
|
||||
pfsub %mm4,%mm0
|
||||
|
||||
movq 16(%ebx),%mm1
|
||||
movq 16(%ecx),%mm2
|
||||
pfmul %mm1,%mm2
|
||||
pfsub %mm2,%mm0
|
||||
|
||||
movq 24(%ebx),%mm3
|
||||
movq 24(%ecx),%mm4
|
||||
pfmul %mm3,%mm4
|
||||
pfsub %mm4,%mm0
|
||||
|
||||
movq 32(%ebx),%mm1
|
||||
movq 32(%ecx),%mm2
|
||||
pfmul %mm1,%mm2
|
||||
pfsub %mm2,%mm0
|
||||
|
||||
movq 40(%ebx),%mm3
|
||||
movq 40(%ecx),%mm4
|
||||
pfmul %mm3,%mm4
|
||||
pfsub %mm4,%mm0
|
||||
|
||||
movq 48(%ebx),%mm1
|
||||
movq 48(%ecx),%mm2
|
||||
pfmul %mm1,%mm2
|
||||
pfsub %mm2,%mm0
|
||||
|
||||
movq 56(%ebx),%mm3
|
||||
movq 56(%ecx),%mm4
|
||||
pfmul %mm3,%mm4
|
||||
pfsub %mm4,%mm0
|
||||
|
||||
pfacc %mm0,%mm0
|
||||
|
||||
pf2id %mm0,%mm0
|
||||
movd %mm0,%eax
|
||||
|
||||
sar $16,%eax
|
||||
|
||||
movw %ax,(%esi)
|
||||
|
||||
addl $-64,%ebx
|
||||
subl $-128,%ecx
|
||||
addl $4,%esi
|
||||
decl %ebp
|
||||
jnz .L68
|
||||
|
||||
/ --- end of loop 2
|
||||
|
||||
femms
|
||||
|
||||
movl %edi,%eax
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
addl $12,%esp
|
||||
ret
|
|
@ -0,0 +1,117 @@
|
|||
# this code comes under GPL
|
||||
# This code was taken from http://www.mpg123.org
|
||||
# See ChangeLog of mpg123-0.59s-pre.1 for detail
|
||||
# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
|
||||
#
|
||||
# TODO: Partially unroll the loops and remove the MOVW instructions.
|
||||
#
|
||||
|
||||
.text
|
||||
|
||||
.globl synth_1to1_MMX_s
|
||||
|
||||
# synth_1to1_MMX_s: MMX synthesis window routine (cdecl, arguments on the stack).
# After the four register pushes below the arguments are at:
#   20(%esp) bandPtr, 24(%esp) channel, 28(%esp) samples,
#   32(%esp) buffs,   36(%esp) pointer to bo
# Writes 32 16-bit samples, one every 4 bytes, starting at samples + 2*channel bytes.
# %eax is not set to a defined return value before ret.
# NOTE(review): %ebx, %ebp, %esi and %edi are used after the indirect call, so the
# routine behind dct64_MMX_func is assumed to preserve them - confirm in that routine.
synth_1to1_MMX_s:
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
pushl %ebx
|
||||
# ecx = channel, edi = samples
movl 24(%esp),%ecx
|
||||
movl 28(%esp),%edi
|
||||
# ebx = 15: mask that keeps bo in the range 0..15
movl $15,%ebx
|
||||
# edx = address of bo
movl 36(%esp),%edx
|
||||
# edi = samples + 2*channel bytes (first output slot for this channel)
leal (%edi,%ecx,2),%edi
|
||||
# ecx = channel - 1, so jecxz below branches exactly when channel == 1
decl %ecx
|
||||
# esi = buffs, eax = current bo
movl 32(%esp),%esi
|
||||
movl (%edx),%eax
|
||||
jecxz .L1
|
||||
# channel != 1: bo = (bo - 1) & 15, use the buffer half at byte offset 1088,
# and store the new bo back through the pointer
decl %eax
|
||||
andl %ebx,%eax
|
||||
leal 1088(%esi),%esi
|
||||
movl %eax,(%edx)
|
||||
.L1:
|
||||
# edx = esi + 2*bo, ebp = bo
leal (%esi,%eax,2),%edx
|
||||
movl %eax,%ebp
|
||||
incl %eax
|
||||
# third call argument: bandPtr (20(%esp) before this push)
pushl 20(%esp)
|
||||
# eax = (bo + 1) & 15, ecx = esi + 544 + 2*eax
andl %ebx,%eax
|
||||
leal 544(%esi,%eax,2),%ecx
|
||||
# ebx = 16 from here on
incl %ebx
|
||||
# parity of (bo + 1) & 15 decides which pointer goes first and which
# 544-byte half the window loops read from
testl $1, %eax
|
||||
jnz .L2
|
||||
xchgl %edx,%ecx
|
||||
incl %ebp
|
||||
leal 544(%esi),%esi
|
||||
.L2:
|
||||
# leave MMX state before the call
emms
|
||||
pushl %edx
|
||||
pushl %ecx
|
||||
# indirect call through the dct64_MMX_func pointer with (ecx, edx, bandPtr)
call *dct64_MMX_func
|
||||
# drop the three pushed arguments
addl $12,%esp
|
||||
# ecx = 17: iteration count of the first loop
leal 1(%ebx), %ecx
|
||||
# ebx = 16 - ebp
subl %ebp,%ebx
|
||||

||||
# edx = decwins + 2*ebx bytes: starting position in the window table
leal decwins(%ebx,%ebx,1), %edx
|
||||
# First loop (17 outputs): 16-term dot product of 16-bit values at (%edx) and (%esi)
.L3:
|
||||
movq (%edx),%mm0
|
||||
pmaddwd (%esi),%mm0
|
||||
movq 8(%edx),%mm1
|
||||
pmaddwd 8(%esi),%mm1
|
||||
movq 16(%edx),%mm2
|
||||
pmaddwd 16(%esi),%mm2
|
||||
movq 24(%edx),%mm3
|
||||
pmaddwd 24(%esi),%mm3
|
||||
# combine the four partial results into mm0
paddd %mm1,%mm0
|
||||
paddd %mm2,%mm0
|
||||
paddd %mm3,%mm0
|
||||
# add the high dword to the low dword
movq %mm0,%mm1
|
||||
psrlq $32,%mm1
|
||||
paddd %mm1,%mm0
|
||||
# arithmetic shift right by 13, then pack to 16 bits with signed saturation
psrad $13,%mm0
|
||||
packssdw %mm0,%mm0
|
||||
# store the low 16 bits as the output sample
movd %mm0,%eax
|
||||
movw %ax, (%edi)
|
||||

||||
# advance: 32 bytes of buffer, 64 bytes of window, 4 bytes of output
leal 32(%esi),%esi
|
||||
leal 64(%edx),%edx
|
||||
leal 4(%edi),%edi
|
||||
decl %ecx
|
||||
jnz .L3
|
||||

||||

||||
# Second loop (15 outputs): step back 64 bytes, then walk the buffer downwards
subl $64,%esi
|
||||
movl $15,%ecx
|
||||
.L4:
|
||||
movq (%edx),%mm0
|
||||
pmaddwd (%esi),%mm0
|
||||
movq 8(%edx),%mm1
|
||||
pmaddwd 8(%esi),%mm1
|
||||
movq 16(%edx),%mm2
|
||||
pmaddwd 16(%esi),%mm2
|
||||
movq 24(%edx),%mm3
|
||||
pmaddwd 24(%esi),%mm3
|
||||
paddd %mm1,%mm0
|
||||
paddd %mm2,%mm0
|
||||
paddd %mm3,%mm0
|
||||
# horizontal add as in the first loop, but the result is kept in mm1
movq %mm0,%mm1
|
||||
psrlq $32,%mm1
|
||||
paddd %mm0,%mm1
|
||||
psrad $13,%mm1
|
||||
packssdw %mm1,%mm1
|
||||
# mm0 = 0, then mm0 = 0 - mm1 with signed saturation: the sample is negated
psubd %mm0,%mm0
|
||||
psubsw %mm1,%mm0
|
||||
movd %mm0,%eax
|
||||
movw %ax,(%edi)
|
||||

||||
# buffer pointer moves down 32 bytes; window and output move up as before
subl $32,%esi
|
||||
addl $64,%edx
|
||||
leal 4(%edi),%edi
|
||||
decl %ecx
|
||||
jnz .L4
|
||||
# leave MMX state before returning, then restore saved registers
emms
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
ret
|
||||
|
||||
|
|
@ -1,364 +0,0 @@
|
|||
///
|
||||
/// Replacement of synth_1to1() with AMD's 3DNowEx(DSP)! SIMD operations support
|
||||
///
|
||||
/// This code based 'decode_3dnow.s' by Syuuhei Kashiyama
|
||||
/// <squash@mb.kcom.ne.jp>,only some types of changes have been made:
|
||||
///
|
||||
/// - Added new opcode PFNACC
|
||||
/// - decreased number of opcodes (as it was suggested by k7 manual)
|
||||
/// (using memory reference as operand of instructions)
|
||||
/// - added PREFETCHW opcode. It has different semantic on k7 than on k6-2
|
||||
/// and saves 15-25 cpu clocks for athlon.
|
||||
/// - partial unrolling loops for removing slower MOVW insns.
|
||||
/// (Note: probably same operation should be done for decode_3dnow.s)
|
||||
/// - change function name for support 3DNowEx! automatic detect
|
||||
/// - added loops alignment
|
||||
///
|
||||
/// note: because K7 processors are aggressive out-of-order three-way
|
||||
/// superscalar ones, instruction order is not significant for them.
|
||||
///
|
||||
/// Benchmark: measured by mplayer on Duron-700:
|
||||
/// 3dNow! optimized code - 1.4% of cpu usage
|
||||
/// k7 optimized code (without partial loop unrolling) - 1.3% of cpu usage
|
||||
/// k7 optimized code - 1.1% of cpu usage
|
||||
/// Note: K6-2 users have an chance with partial loops unrolling
|
||||
///
|
||||
/// Modified by Nick Kurshev <nickols_k@mail.ru>
|
||||
///
|
||||
/ synth_1to1_3dnow works the same way as the c version of
|
||||
/ synth_1to1. this assembler code based 'decode-i586.s'
|
||||
/ (by Stefan Bieschewski <stb@acm.org>), two types of changes
|
||||
/ have been made:
|
||||
/ - use {MMX,3DNow!} instruction for reduce cpu
|
||||
/ - remove unused(?) local symbols
|
||||
/
|
||||
/ useful sources of information on optimizing 3DNow! code include:
|
||||
/ AMD 3DNow! Technology Manual (Publication #21928)
|
||||
/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
|
||||
/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
|
||||
/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
|
||||
/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
|
||||
/
|
||||
/ This code was tested only AMD-K6-2 processor Linux systems,
|
||||
/ please tell me:
|
||||
/ - whether this code works on other 3DNow! capable processors
|
||||
/ (ex.IDT-C6-2) or not
|
||||
/ - whether this code works on other OSes or not
|
||||
/
|
||||
/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
|
||||
/ <kim@comtec.co.jp> - after 1.Apr.1998
|
||||
|
||||
/ Enhancements for q-word operation by Michael Hipp
|
||||
|
||||
.bss
|
||||
.comm buffs,4352,4
|
||||
.data
|
||||
.align 8
|
||||
null_one: .long 0x0000ffff, 0x0000ffff
|
||||
one_null: .long 0xffff0000, 0xffff0000
|
||||
bo: .long 1
|
||||
.text
|
||||
/* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */
|
||||
.globl synth_1to1_3dnowex
|
||||
synth_1to1_3dnowex:
|
||||
subl $12,%esp
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
pushl %ebx
|
||||
|
||||
movl 32(%esp),%eax
|
||||
movl 40(%esp),%esi
|
||||
movl $0,%edi
|
||||
movl bo,%ebp
|
||||
cmpl %edi,36(%esp)
|
||||
jne .L48
|
||||
decl %ebp
|
||||
andl $15,%ebp
|
||||
movl %ebp,bo
|
||||
movl $buffs,%ecx
|
||||
jmp .L49
|
||||
.L48:
|
||||
addl $2,%esi
|
||||
movl $buffs+2176,%ecx
|
||||
.L49:
|
||||
testl $1,%ebp
|
||||
je .L50
|
||||
movl %ecx,%ebx
|
||||
movl %ebp,16(%esp)
|
||||
pushl %eax
|
||||
movl 20(%esp),%edx
|
||||
leal (%ebx,%edx,4),%eax
|
||||
pushl %eax
|
||||
movl 24(%esp),%eax
|
||||
incl %eax
|
||||
andl $15,%eax
|
||||
leal 1088(,%eax,4),%eax
|
||||
addl %ebx,%eax
|
||||
jmp .L74
|
||||
.L50:
|
||||
leal 1088(%ecx),%ebx
|
||||
leal 1(%ebp),%edx
|
||||
movl %edx,16(%esp)
|
||||
pushl %eax
|
||||
leal 1092(%ecx,%ebp,4),%eax
|
||||
pushl %eax
|
||||
leal (%ecx,%ebp,4),%eax
|
||||
.L74:
|
||||
pushl %eax
|
||||
call dct64_3dnowex
|
||||
movl 16(%esp),%edx
|
||||
leal 0(,%edx,4),%edx
|
||||
movl $decwin+64,%eax
|
||||
movl %eax,%ecx
|
||||
subl %edx,%ecx
|
||||
movl $8,%ebp
|
||||
prefetchw (%esi)
|
||||
.align 16
|
||||
.L55:
|
||||
|
||||
movq (%ecx),%mm0
|
||||
pfmul (%ebx),%mm0
|
||||
movq 128(%ecx),%mm4
|
||||
pfmul 64(%ebx),%mm4
|
||||
|
||||
movq 8(%ecx),%mm1
|
||||
pfmul 8(%ebx),%mm1
|
||||
pfadd %mm1,%mm0
|
||||
movq 136(%ecx),%mm5
|
||||
pfmul 72(%ebx),%mm5
|
||||
pfadd %mm5,%mm4
|
||||
|
||||
movq 16(%ebx),%mm2
|
||||
pfmul 16(%ecx),%mm2
|
||||
pfadd %mm2,%mm0
|
||||
movq 80(%ebx),%mm6
|
||||
pfmul 144(%ecx),%mm6
|
||||
pfadd %mm6,%mm4
|
||||
|
||||
movq 24(%ecx),%mm3
|
||||
pfmul 24(%ebx),%mm3
|
||||
pfadd %mm3,%mm0
|
||||
movq 152(%ecx),%mm7
|
||||
pfmul 88(%ebx),%mm7
|
||||
pfadd %mm7,%mm4
|
||||
|
||||
movq 32(%ebx),%mm1
|
||||
pfmul 32(%ecx),%mm1
|
||||
pfadd %mm1,%mm0
|
||||
movq 96(%ebx),%mm5
|
||||
pfmul 160(%ecx),%mm5
|
||||
pfadd %mm5,%mm4
|
||||
|
||||
movq 40(%ecx),%mm2
|
||||
pfmul 40(%ebx),%mm2
|
||||
pfadd %mm2,%mm0
|
||||
movq 168(%ecx),%mm6
|
||||
pfmul 104(%ebx),%mm6
|
||||
pfadd %mm6,%mm4
|
||||
|
||||
movq 48(%ebx),%mm3
|
||||
pfmul 48(%ecx),%mm3
|
||||
pfadd %mm3,%mm0
|
||||
movq 112(%ebx),%mm7
|
||||
pfmul 176(%ecx),%mm7
|
||||
pfadd %mm7,%mm4
|
||||
|
||||
movq 56(%ecx),%mm1
|
||||
pfmul 56(%ebx),%mm1
|
||||
pfadd %mm1,%mm0
|
||||
movq 184(%ecx),%mm5
|
||||
pfmul 120(%ebx),%mm5
|
||||
pfadd %mm5,%mm4
|
||||
|
||||
pfnacc %mm4, %mm0
|
||||
movq (%esi), %mm1
|
||||
pf2id %mm0, %mm0
|
||||
pand one_null, %mm1
|
||||
psrld $16,%mm0
|
||||
pand null_one, %mm0
|
||||
por %mm0, %mm1
|
||||
movq %mm1,(%esi)
|
||||
|
||||
addl $128,%ebx
|
||||
addl $256,%ecx
|
||||
addl $8,%esi
|
||||
decl %ebp
|
||||
jnz .L55
|
||||
|
||||
/ --- end of loop 1 ---
|
||||
|
||||
prefetchw (%esi) /* prefetching for writing this block and next loop */
|
||||
|
||||
movd (%ecx),%mm0
|
||||
pfmul (%ebx),%mm0
|
||||
|
||||
movd 8(%ebx),%mm1
|
||||
pfmul 8(%ecx),%mm1
|
||||
pfadd %mm1,%mm0
|
||||
|
||||
movd 16(%ebx),%mm2
|
||||
pfmul 16(%ecx),%mm2
|
||||
pfadd %mm2,%mm0
|
||||
|
||||
movd 24(%ebx),%mm3
|
||||
pfmul 24(%ecx),%mm3
|
||||
pfadd %mm3,%mm0
|
||||
|
||||
movd 32(%ebx),%mm4
|
||||
pfmul 32(%ecx),%mm4
|
||||
pfadd %mm4,%mm0
|
||||
|
||||
movd 40(%ebx),%mm5
|
||||
pfmul 40(%ecx),%mm5
|
||||
pfadd %mm5,%mm0
|
||||
|
||||
movd 48(%ebx),%mm6
|
||||
pfmul 48(%ecx),%mm6
|
||||
pfadd %mm6,%mm0
|
||||
|
||||
movd 56(%ebx),%mm7
|
||||
pfmul 56(%ecx),%mm7
|
||||
pfadd %mm7,%mm0
|
||||
|
||||
pf2id %mm0,%mm0
|
||||
movd %mm0,%eax
|
||||
|
||||
sar $16,%eax
|
||||
|
||||
movw %ax,(%esi)
|
||||
|
||||
subl $64,%ebx
|
||||
addl $4,%esi
|
||||
addl $256,%ecx
|
||||
movl $7,%ebp
|
||||
.align 16
|
||||
.L68:
|
||||
pxor %mm0, %mm0
|
||||
pxor %mm4, %mm4
|
||||
|
||||
movq (%ecx),%mm1
|
||||
pfmul (%ebx),%mm1
|
||||
pfsub %mm1,%mm0
|
||||
movq 128(%ecx),%mm5
|
||||
pfmul -64(%ebx),%mm5
|
||||
pfsub %mm5,%mm4
|
||||
|
||||
movq 8(%ecx),%mm2
|
||||
pfmul 8(%ebx),%mm2
|
||||
pfsub %mm2,%mm0
|
||||
movq 136(%ecx),%mm6
|
||||
pfmul -56(%ebx),%mm6
|
||||
pfsub %mm6,%mm4
|
||||
|
||||
movq 16(%ecx),%mm3
|
||||
pfmul 16(%ebx),%mm3
|
||||
pfsub %mm3,%mm0
|
||||
movq 144(%ecx),%mm7
|
||||
pfmul -48(%ebx),%mm7
|
||||
pfsub %mm7,%mm4
|
||||
|
||||
movq 24(%ecx),%mm1
|
||||
pfmul 24(%ebx),%mm1
|
||||
pfsub %mm1,%mm0
|
||||
movq 152(%ecx),%mm5
|
||||
pfmul -40(%ebx),%mm5
|
||||
pfsub %mm5,%mm4
|
||||
|
||||
movq 32(%ecx),%mm2
|
||||
pfmul 32(%ebx),%mm2
|
||||
pfsub %mm2,%mm0
|
||||
movq 160(%ecx),%mm6
|
||||
pfmul -32(%ebx),%mm6
|
||||
pfsub %mm6,%mm4
|
||||
|
||||
movq 40(%ecx),%mm3
|
||||
pfmul 40(%ebx),%mm3
|
||||
pfsub %mm3,%mm0
|
||||
movq 168(%ecx),%mm7
|
||||
pfmul -24(%ebx),%mm7
|
||||
pfsub %mm7,%mm4
|
||||
|
||||
movq 48(%ecx),%mm1
|
||||
pfmul 48(%ebx),%mm1
|
||||
pfsub %mm1,%mm0
|
||||
movq 176(%ecx),%mm5
|
||||
pfmul -16(%ebx),%mm5
|
||||
pfsub %mm5,%mm4
|
||||
|
||||
movq 56(%ecx),%mm2
|
||||
pfmul 56(%ebx),%mm2
|
||||
pfsub %mm2,%mm0
|
||||
movq 184(%ecx),%mm6
|
||||
pfmul -8(%ebx),%mm6
|
||||
pfsub %mm6,%mm4
|
||||
|
||||
pfacc %mm4,%mm0
|
||||
movq (%esi), %mm1
|
||||
pf2id %mm0, %mm0
|
||||
pand one_null, %mm1
|
||||
psrld $16,%mm0
|
||||
pand null_one, %mm0
|
||||
por %mm0, %mm1
|
||||
movq %mm1,(%esi)
|
||||
|
||||
subl $128,%ebx
|
||||
addl $256,%ecx
|
||||
addl $8,%esi
|
||||
decl %ebp
|
||||
jnz .L68
|
||||
|
||||
/ --- end of loop 2
|
||||
|
||||
pxor %mm0, %mm0
|
||||
|
||||
movq (%ecx),%mm1
|
||||
pfmul (%ebx),%mm1
|
||||
pfsub %mm1,%mm0
|
||||
|
||||
movq 8(%ecx),%mm2
|
||||
pfmul 8(%ebx),%mm2
|
||||
pfsub %mm2,%mm0
|
||||
|
||||
movq 16(%ecx),%mm3
|
||||
pfmul 16(%ebx),%mm3
|
||||
pfsub %mm3,%mm0
|
||||
|
||||
movq 24(%ecx),%mm4
|
||||
pfmul 24(%ebx),%mm4
|
||||
pfsub %mm4,%mm0
|
||||
|
||||
movq 32(%ecx),%mm5
|
||||
pfmul 32(%ebx),%mm5
|
||||
pfsub %mm5,%mm0
|
||||
|
||||
movq 40(%ecx),%mm6
|
||||
pfmul 40(%ebx),%mm6
|
||||
pfsub %mm6,%mm0
|
||||
|
||||
movq 48(%ecx),%mm7
|
||||
pfmul 48(%ebx),%mm7
|
||||
pfsub %mm7,%mm0
|
||||
|
||||
movq 56(%ecx),%mm1
|
||||
pfmul 56(%ebx),%mm1
|
||||
pfsub %mm1,%mm0
|
||||
|
||||
pfacc %mm0,%mm0
|
||||
|
||||
pf2id %mm0,%mm0
|
||||
movd %mm0,%eax
|
||||
|
||||
sar $16,%eax
|
||||
|
||||
movw %ax,(%esi)
|
||||
|
||||
femms
|
||||
|
||||
movl %edi,%eax
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
addl $12,%esp
|
||||
ret
|
|
@ -1,201 +0,0 @@
|
|||
///
|
||||
/// Replacement of synth_1to1() with Intel's SSE SIMD operations support
|
||||
///
|
||||
/// This code based 'decode_k7.s' by Nick Kurshev
|
||||
/// <squash@mb.kcom.ne.jp>,only some types of changes have been made:
|
||||
///
|
||||
/// - SSE optimization
|
||||
/// - change function name for support SSE automatic detect
|
||||
///
|
||||
/// Modified by Nick Kurshev <nickols_k@mail.ru>
|
||||
///
|
||||
/ synth_1to1_3dnow works the same way as the c version of
|
||||
/ synth_1to1. this assembler code based 'decode-i586.s'
|
||||
/ (by Stefan Bieschewski <stb@acm.org>), two types of changes
|
||||
/ have been made:
|
||||
/ - use {MMX,3DNow!} instruction for reduce cpu
|
||||
/ - remove unused(?) local symbols
|
||||
/
|
||||
/ useful sources of information on optimizing 3DNow! code include:
|
||||
/ AMD 3DNow! Technology Manual (Publication #21928)
|
||||
/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
|
||||
/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
|
||||
/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
|
||||
/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
|
||||
/
|
||||
/ This code was tested only AMD-K6-2 processor Linux systems,
|
||||
/ please tell me:
|
||||
/ - whether this code works on other 3DNow! capable processors
|
||||
/ (ex.IDT-C6-2) or not
|
||||
/ - whether this code works on other OSes or not
|
||||
/
|
||||
/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
|
||||
/ <kim@comtec.co.jp> - after 1.Apr.1998
|
||||
|
||||
/ Enhancements for q-word operation by Michael Hipp
|
||||
|
||||
.bss
|
||||
.comm buffs,4352,4
|
||||
.data
|
||||
.align 4
|
||||
bo:
|
||||
.long 1
|
||||
.text
|
||||
/* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */
|
||||
.globl synth_1to1_sse
|
||||
synth_1to1_sse:
|
||||
subl $12,%esp
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
pushl %ebx
|
||||
|
||||
movl 32(%esp),%eax
|
||||
movl 40(%esp),%esi
|
||||
movl $0,%edi
|
||||
movl bo,%ebp
|
||||
cmpl %edi,36(%esp)
|
||||
jne .L48
|
||||
decl %ebp
|
||||
andl $15,%ebp
|
||||
movl %ebp,bo
|
||||
movl $buffs,%ecx
|
||||
jmp .L49
|
||||
.L48:
|
||||
addl $2,%esi
|
||||
movl $buffs+2176,%ecx
|
||||
.L49:
|
||||
testl $1,%ebp
|
||||
je .L50
|
||||
movl %ecx,%ebx
|
||||
movl %ebp,16(%esp)
|
||||
pushl %eax
|
||||
movl 20(%esp),%edx
|
||||
leal (%ebx,%edx,4),%eax
|
||||
pushl %eax
|
||||
movl 24(%esp),%eax
|
||||
incl %eax
|
||||
andl $15,%eax
|
||||
leal 1088(,%eax,4),%eax
|
||||
addl %ebx,%eax
|
||||
jmp .L74
|
||||
.L50:
|
||||
leal 1088(%ecx),%ebx
|
||||
leal 1(%ebp),%edx
|
||||
movl %edx,16(%esp)
|
||||
pushl %eax
|
||||
leal 1092(%ecx,%ebp,4),%eax
|
||||
pushl %eax
|
||||
leal (%ecx,%ebp,4),%eax
|
||||
.L74:
|
||||
pushl %eax
|
||||
call dct64
|
||||
addl $12, %esp
|
||||
movl 16(%esp),%edx
|
||||
leal 0(,%edx,4),%edx
|
||||
movl $decwin+64,%eax
|
||||
movl %eax,%ecx
|
||||
subl %edx,%ecx
|
||||
movl $16,%ebp
|
||||
|
||||
.L55:
|
||||
movups (%ecx), %xmm0
|
||||
mulps (%ebx), %xmm0
|
||||
movups 16(%ecx), %xmm1
|
||||
mulps 16(%ebx), %xmm1
|
||||
addps %xmm1, %xmm0
|
||||
movups 32(%ecx), %xmm1
|
||||
mulps 32(%ebx), %xmm1
|
||||
addps %xmm1, %xmm0
|
||||
movups 48(%ecx), %xmm1
|
||||
mulps 48(%ebx), %xmm1
|
||||
addps %xmm1, %xmm0
|
||||
/* pfnacc -> PFNACC mmreg1, mmreg2 performs the following operations: */
|
||||
/* temp = mmreg2 */
|
||||
/* mmreg1[31:0] = mmreg1[31:0] - mmreg1[63:32] */
|
||||
/* mmreg1[63:32]= temp [31:0] - temp[63:32] */
|
||||
/* save difference of mmreg1's low-word and high-word into mmreg1's low-word */
|
||||
/* save difference of mmreg2's low-word and high-word into mmreg1's high-word */
|
||||
movhlps %xmm0, %xmm1
|
||||
addps %xmm1, %xmm0
|
||||
movaps %xmm0, %xmm1
|
||||
shufps $0x55, %xmm1, %xmm1 /* fake of pfnacc. 1|1|1|1 */
|
||||
|
||||
subss %xmm1, %xmm0
|
||||
cvtss2si %xmm0, %eax
|
||||
|
||||
/ sar $16,%eax
|
||||
movw %ax,(%esi)
|
||||
|
||||
addl $64,%ebx
|
||||
subl $-128,%ecx
|
||||
addl $4,%esi
|
||||
decl %ebp
|
||||
jnz .L55
|
||||
|
||||
/ --- end of loop 1 ---
|
||||
|
||||
movups (%ecx), %xmm0
|
||||
mulps (%ebx), %xmm0
|
||||
movups 16(%ecx), %xmm1
|
||||
mulps 16(%ebx), %xmm1
|
||||
addps %xmm1, %xmm0
|
||||
movups 32(%ecx), %xmm1
|
||||
mulps 32(%ebx), %xmm1
|
||||
addps %xmm1, %xmm0
|
||||
movups 48(%ecx), %xmm1
|
||||
mulps 48(%ebx), %xmm1
|
||||
addps %xmm1, %xmm0
|
||||
movhlps %xmm0, %xmm1
|
||||
addss %xmm1, %xmm0
|
||||
cvtss2si %xmm0, %eax
|
||||
|
||||
/ sar $16,%eax
|
||||
|
||||
movw %ax,(%esi)
|
||||
|
||||
addl $-64,%ebx
|
||||
addl $4,%esi
|
||||
addl $256,%ecx
|
||||
movl $15,%ebp
|
||||
|
||||
.L68:
|
||||
xorps %xmm0, %xmm0
|
||||
movups (%ecx), %xmm1
|
||||
mulps (%ebx), %xmm1
|
||||
subps %xmm1, %xmm0
|
||||
movups 16(%ecx), %xmm1
|
||||
mulps 16(%ebx), %xmm1
|
||||
subps %xmm1, %xmm0
|
||||
movups 32(%ecx), %xmm1
|
||||
mulps 32(%ebx), %xmm1
|
||||
subps %xmm1, %xmm0
|
||||
movups 48(%ecx), %xmm1
|
||||
mulps 48(%ebx), %xmm1
|
||||
subps %xmm1, %xmm0
|
||||
movhlps %xmm0, %xmm1
|
||||
subps %xmm1, %xmm0
|
||||
movaps %xmm0, %xmm1
|
||||
shufps $0x55, %xmm1, %xmm1 /* fake of pfacc 1|1|1|1 */
|
||||
addss %xmm1, %xmm0
|
||||
cvtss2si %xmm0, %eax
|
||||
|
||||
/ sar $16,%eax
|
||||
|
||||
movw %ax,(%esi)
|
||||
|
||||
addl $-64,%ebx
|
||||
subl $-128,%ecx
|
||||
addl $4,%esi
|
||||
decl %ebp
|
||||
jnz .L68
|
||||
|
||||
/ --- end of loop 2
|
||||
|
||||
movl %edi,%eax
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
addl $12,%esp
|
||||
ret
|
|
@ -50,8 +50,16 @@ static void init_layer2(void)
|
|||
{
|
||||
double m=mulmul[k];
|
||||
table = muls[k];
|
||||
if(_has_mmx)
|
||||
{
|
||||
for(j=3,i=0;i<63;i++,j--)
|
||||
*table++ = 16384 * m * pow(2.0,(double) j / 3.0);
|
||||
}
|
||||
else
|
||||
for(j=3,i=0;i<63;i++,j--)
|
||||
{
|
||||
*table++ = m * pow(2.0,(double) j / 3.0);
|
||||
}
|
||||
*table++ = 0.0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,9 +22,9 @@ static real win1[4][36];
|
|||
#define GP2MAX (256+118+4)
|
||||
static real gainpow2[GP2MAX];
|
||||
|
||||
static real nCOS9[9];
|
||||
real COS9[9];
|
||||
static real COS6_1,COS6_2;
|
||||
static real tfcos36[9];
|
||||
real tfcos36[9];
|
||||
static real tfcos12[3];
|
||||
#ifdef NEW_DCT9
|
||||
static real cos9[3],cos18[3];
|
||||
|
@ -111,8 +111,12 @@ void init_layer3(int down_sample_sblimit)
|
|||
int i,j,k,l;
|
||||
|
||||
for(i=-256;i<118+4;i++)
|
||||
gainpow2[i+256] = pow((double)2.0,-0.25 * (double) (i+210) );
|
||||
|
||||
{
|
||||
if(_has_mmx)
|
||||
gainpow2[i+256] = 16384.0 * pow((double)2.0,-0.25 * (double) (i+210) );
|
||||
else
|
||||
gainpow2[i+256] = pow((double)2.0,-0.25 * (double) (i+210) );
|
||||
}
|
||||
for(i=0;i<8207;i++)
|
||||
ispow[i] = pow((double)i,(double)4.0/3.0);
|
||||
|
||||
|
@ -139,7 +143,7 @@ void init_layer3(int down_sample_sblimit)
|
|||
}
|
||||
|
||||
for(i=0;i<9;i++)
|
||||
nCOS9[i] = cos( M_PI / 18.0 * (double) i);
|
||||
COS9[i] = cos( M_PI / 18.0 * (double) i);
|
||||
|
||||
for(i=0;i<9;i++)
|
||||
tfcos36[i] = 0.5 / cos ( M_PI * (double) (i*2+1) / 36.0 );
|
||||
|
@ -1533,6 +1537,9 @@ static void III_antialias(real xr[SBLIMIT][SSLIMIT],struct gr_info_s *gr_info)
|
|||
/*
|
||||
* III_hybrid
|
||||
*/
|
||||
|
||||
dct36_func_t dct36_func;
|
||||
|
||||
static void III_hybrid(real fsIn[SBLIMIT][SSLIMIT],real tsOut[SSLIMIT][SBLIMIT],
|
||||
int ch,struct gr_info_s *gr_info)
|
||||
{
|
||||
|
@ -1553,8 +1560,8 @@ static void III_hybrid(real fsIn[SBLIMIT][SSLIMIT],real tsOut[SSLIMIT][SBLIMIT],
|
|||
|
||||
if(gr_info->mixed_block_flag) {
|
||||
sb = 2;
|
||||
dct36(fsIn[0],rawout1,rawout2,win[0],tspnt);
|
||||
dct36(fsIn[1],rawout1+18,rawout2+18,win1[0],tspnt+1);
|
||||
(*dct36_func)(fsIn[0],rawout1,rawout2,win[0],tspnt);
|
||||
(*dct36_func)(fsIn[1],rawout1+18,rawout2+18,win1[0],tspnt+1);
|
||||
rawout1 += 36; rawout2 += 36; tspnt += 2;
|
||||
}
|
||||
|
||||
|
@ -1567,8 +1574,8 @@ static void III_hybrid(real fsIn[SBLIMIT][SSLIMIT],real tsOut[SSLIMIT][SBLIMIT],
|
|||
}
|
||||
else {
|
||||
for (; sb<gr_info->maxb; sb+=2,tspnt+=2,rawout1+=36,rawout2+=36) {
|
||||
dct36(fsIn[sb],rawout1,rawout2,win[bt],tspnt);
|
||||
dct36(fsIn[sb+1],rawout1+18,rawout2+18,win1[bt],tspnt+1);
|
||||
(*dct36_func)(fsIn[sb],rawout1,rawout2,win[bt],tspnt);
|
||||
(*dct36_func)(fsIn[sb+1],rawout1+18,rawout2+18,win1[bt],tspnt+1);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -104,33 +104,22 @@ struct III_sideinfo
|
|||
};
|
||||
|
||||
static long freqs[9];
|
||||
#ifdef HAVE_3DNOW
|
||||
real decwin[2*(512+32)];
|
||||
#else
|
||||
real decwin[512+32];
|
||||
#endif
|
||||
real *pnts[];
|
||||
extern real decwin[(512+32)];
|
||||
extern real *pnts[];
|
||||
|
||||
static int do_layer2(struct frame *fr,int single);
|
||||
static int do_layer3(struct frame *fr,int single);
|
||||
static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt);
|
||||
|
||||
extern int synth_1to1_pent( real *,int,unsigned char * );
|
||||
extern int synth_1to1_pent( real *,int,short * );
|
||||
extern void make_decode_tables_MMX(long scaleval);
|
||||
extern int synth_1to1_MMX( real *,int,short * );
|
||||
extern int synth_1to1_MMX_s(real *, int, short *, short *, int *);
|
||||
extern void dct64(real *a,real *b,real *c);
|
||||
|
||||
#ifdef HAVE_3DNOW
|
||||
extern void dct64_3dnow( real *,real *, real * );
|
||||
extern void dct36_3dnow(real *,real *,real *,real *,real *);
|
||||
extern int synth_1to1_3dnow( real *,int,unsigned char * );
|
||||
#endif
|
||||
#ifdef HAVE_3DNOWEX
|
||||
extern void dct64_3dnowex( real *,real *, real * );
|
||||
extern void dct36_3dnowex(real *,real *,real *,real *,real *);
|
||||
extern int synth_1to1_3dnowex( real *,int,unsigned char * );
|
||||
#endif
|
||||
#ifdef HAVE_SSE_MP3
|
||||
// extern void dct64_3dnow( real *,real *, real * );
|
||||
// extern void dct36_3dnow(real *,real *,real *,real *,real *);
|
||||
extern int synth_1to1_sse( real *,int,unsigned char * );
|
||||
#endif
|
||||
extern void dct36_3dnow(real *,real *,real *,real *,real *);
|
||||
extern void dct36_3dnowex(real *,real *,real *,real *,real *);
|
||||
extern void dct36_sse(real *,real *,real *,real *,real *);
|
||||
|
||||
typedef int (*synth_func_t)( real *,int,short * );
|
||||
typedef void (*dct36_func_t)(real *,real *,real *,real *,real *);
|
||||
|
|
83
mp3lib/sr1.c
83
mp3lib/sr1.c
|
@ -343,6 +343,12 @@ retry1:
|
|||
|
||||
static int tables_done_flag=0;
|
||||
|
||||
/* It's hidden from gcc in assembler */
|
||||
extern void dct64_MMX( void );
|
||||
extern void dct64_MMX_3dnow( void );
|
||||
extern void dct64_MMX_3dnowex( void );
|
||||
void (*dct64_MMX_func)( void );
|
||||
|
||||
// Init decoder tables. Call first, once!
|
||||
#ifdef USE_FAKE_MONO
|
||||
void MP3_Init(int fakemono){
|
||||
|
@ -351,20 +357,41 @@ void MP3_Init(){
|
|||
#endif
|
||||
_CpuID=CpuDetect();
|
||||
_i586=ipentium();
|
||||
#ifdef HAVE_3DNOW
|
||||
#ifndef HAVE_MMX
|
||||
_i586 &= 1;
|
||||
#endif
|
||||
_3dnow=a3dnow();
|
||||
#ifndef HAVE_3DNOW
|
||||
_3dnow = 0;
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_3DNOWEX
|
||||
_3dnow &= 1;
|
||||
#endif
|
||||
_isse=isse();
|
||||
#ifndef HAVE_SSE
|
||||
_isse = 0;
|
||||
#endif
|
||||
#ifndef HAVE_SSE2
|
||||
_isse &= 1;
|
||||
#endif
|
||||
_has_mmx=_i586>1||_3dnow||_isse;
|
||||
printf( "mp3lib: Processor ID: %x\n",_CpuID );
|
||||
printf( "mp3lib: i586 processor %sdetected.\n",(_i586?"":"not ") );
|
||||
#ifdef HAVE_3DNOW
|
||||
printf( "mp3lib: AMD 3dnow! extension %sdetected.\n",(_3dnow?"":"not ") );
|
||||
#endif
|
||||
#ifdef HAVE_3DNOWEX
|
||||
printf( "mp3lib: AMD 3dnow-dsp! extension %sdetected.\n",(_3dnow>1?"":"not ") );
|
||||
#endif
|
||||
if(_i586&&!_3dnow&&!_isse)
|
||||
printf( "mp3lib: Using Pentium%s optimized decore.\n",(_i586>1?"-MMX":""));
|
||||
else
|
||||
if(_isse)
|
||||
/*
|
||||
Note: It's ok, Since K8 will have SSE2 support and will much faster
|
||||
of P4 ;)
|
||||
*/
|
||||
printf( "mp3lib: Using SSE%s! optimized decore.\n",(_isse>1?"2":""));
|
||||
else
|
||||
if(_3dnow)
|
||||
printf( "mp3lib: Using AMD 3dnow%s! optimized decore.\n",(_3dnow>1?"-dsp(k7)":""));
|
||||
|
||||
make_decode_tables(outscale);
|
||||
/* Use it for any MMX cpu */
|
||||
if(_has_mmx) make_decode_tables_MMX(outscale);
|
||||
else make_decode_tables(outscale);
|
||||
#ifdef USE_FAKE_MONO
|
||||
if (fakemono == 1)
|
||||
fr.synth=synth_1to1_l;
|
||||
|
@ -381,6 +408,42 @@ void MP3_Init(){
|
|||
init_layer2();
|
||||
init_layer3(fr.down_sample_sblimit);
|
||||
tables_done_flag=1;
|
||||
|
||||
dct36_func=dct36;
|
||||
if(_isse)
|
||||
{
|
||||
synth_func=synth_1to1_MMX;
|
||||
dct64_MMX_func=dct64_MMX;
|
||||
}
|
||||
else
|
||||
if ( _3dnow > 1 )
|
||||
{
|
||||
synth_func=synth_1to1_MMX;
|
||||
dct36_func=dct36_3dnowex;
|
||||
dct64_MMX_func=dct64_MMX_3dnowex;
|
||||
}
|
||||
else
|
||||
if ( _3dnow )
|
||||
{
|
||||
synth_func=synth_1to1_MMX;
|
||||
dct36_func=dct36_3dnow;
|
||||
dct64_MMX_func=dct64_MMX_3dnow;
|
||||
}
|
||||
else
|
||||
if ( _i586 > 1)
|
||||
{
|
||||
synth_func=synth_1to1_MMX;
|
||||
dct64_MMX_func=dct64_MMX;
|
||||
}
|
||||
else
|
||||
if ( _i586 )
|
||||
{
|
||||
synth_func=synth_1to1_pent;
|
||||
}
|
||||
else
|
||||
{
|
||||
synth_func = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
|
|
|
@ -1,20 +1,7 @@
|
|||
real decwin[(512+32)], cos64[32], cos32[16], cos16[8], cos8[4], cos4[2];
|
||||
real *pnts[]={ cos64,cos32,cos16,cos8,cos4 };
|
||||
|
||||
|
||||
#ifdef HAVE_3DNOW
|
||||
real decwin[2*(512+32)] __attribute__((aligned(8)));
|
||||
real cos64[32] __attribute__((aligned(8)));
|
||||
real cos32[16] __attribute__((aligned(8)));
|
||||
real cos16[8] __attribute__((aligned(8)));
|
||||
real cos8[4] __attribute__((aligned(8)));
|
||||
real cos4[2] __attribute__((aligned(8)));
|
||||
real *pnts[]={ cos64,cos32,cos16,cos8,cos4 };
|
||||
#else
|
||||
real decwin[512+32];
|
||||
real cos64[16],cos32[8],cos16[4],cos8[2],cos4[1];
|
||||
real *pnts[] = { cos64,cos32,cos16,cos8,cos4 };
|
||||
#endif
|
||||
|
||||
long intwinbase[] = {
|
||||
static long intwinbase[] = {
|
||||
0, -1, -1, -1, -1, -1, -1, -2, -2, -2,
|
||||
-2, -3, -3, -4, -4, -5, -5, -6, -7, -7,
|
||||
-8, -9, -10, -11, -13, -14, -16, -17, -19, -21,
|
||||
|
@ -42,7 +29,7 @@ long intwinbase[] = {
|
|||
64019, 65290, 66494, 67629, 68692, 69679, 70590, 71420, 72169, 72835,
|
||||
73415, 73908, 74313, 74630, 74856, 74992, 75038 };
|
||||
|
||||
void make_decode_tables(long scaleval)
|
||||
void make_decode_tables(long scaleval)
|
||||
{
|
||||
int i,j,k,kr,divv;
|
||||
real *table,*costab;
|
||||
|
@ -53,17 +40,13 @@ long intwinbase[] = {
|
|||
kr=0x10>>i; divv=0x40>>i;
|
||||
costab = pnts[i];
|
||||
for(k=0;k<kr;k++) costab[k] = 1.0 / (2.0 * cos(M_PI * ((double) k * 2.0 + 1.0) / (double) divv));
|
||||
#ifdef HAVE_3DNOW
|
||||
if ( _3dnow ) for(k=0;k<kr;k++) costab[k+kr]=-costab[k];
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
table = decwin;
|
||||
scaleval = -scaleval;
|
||||
for(i=0,j=0;i<256;i++,j++,table+=32)
|
||||
{
|
||||
if(table < decwin+512+16)
|
||||
if(table < decwin+512+16)
|
||||
table[16] = table[0] = (double) intwinbase[j] / 65536.0 * (double) scaleval;
|
||||
if(i % 32 == 31)
|
||||
table -= 1023;
|
||||
|
@ -80,14 +63,6 @@ long intwinbase[] = {
|
|||
if(i % 64 == 63)
|
||||
scaleval = - scaleval;
|
||||
}
|
||||
#ifdef HAVE_3DNOW
|
||||
if ( _3dnow )
|
||||
for(i=0;i<512+32;i++)
|
||||
{
|
||||
decwin[512+31-i]*=65536.0; // allows faster clipping in 3dnow code
|
||||
decwin[512+32+i]=decwin[512+31-i];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,161 @@
|
|||
# This code was taken from http://www.mpg123.org
|
||||
# See ChangeLog of mpg123-0.59s-pre.1 for detail
|
||||
# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
|
||||
.bss
|
||||
.align 8
|
||||
.comm decwin,2176,32
|
||||
.align 8
|
||||
.comm decwins,2176,32
|
||||
.data
|
||||
.align 8
|
||||
intwinbase_MMX:
|
||||
.value 0, -1, -1, -1, -1, -1, -1, -2
|
||||
.value -2, -2, -2, -3, -3, -4, -4, -5
|
||||
.value -5, -6, -7, -7, -8, -9, -10, -11
|
||||
.value -13, -14, -16, -17, -19, -21, -24, -26
|
||||
.value -29, -31, -35, -38, -41, -45, -49, -53
|
||||
.value -58, -63, -68, -73, -79, -85, -91, -97
|
||||
.value -104, -111, -117, -125, -132, -139, -147, -154
|
||||
.value -161, -169, -176, -183, -190, -196, -202, -208
|
||||
.value -213, -218, -222, -225, -227, -228, -228, -227
|
||||
.value -224, -221, -215, -208, -200, -189, -177, -163
|
||||
.value -146, -127, -106, -83, -57, -29, 2, 36
|
||||
.value 72, 111, 153, 197, 244, 294, 347, 401
|
||||
.value 459, 519, 581, 645, 711, 779, 848, 919
|
||||
.value 991, 1064, 1137, 1210, 1283, 1356, 1428, 1498
|
||||
.value 1567, 1634, 1698, 1759, 1817, 1870, 1919, 1962
|
||||
.value 2001, 2032, 2057, 2075, 2085, 2087, 2080, 2063
|
||||
.value 2037, 2000, 1952, 1893, 1822, 1739, 1644, 1535
|
||||
.value 1414, 1280, 1131, 970, 794, 605, 402, 185
|
||||
.value -45, -288, -545, -814, -1095, -1388, -1692, -2006
|
||||
.value -2330, -2663, -3004, -3351, -3705, -4063, -4425, -4788
|
||||
.value -5153, -5517, -5879, -6237, -6589, -6935, -7271, -7597
|
||||
.value -7910, -8209, -8491, -8755, -8998, -9219, -9416, -9585
|
||||
.value -9727, -9838, -9916, -9959, -9966, -9935, -9863, -9750
|
||||
.value -9592, -9389, -9139, -8840, -8492, -8092, -7640, -7134
|
||||
.value -6574, -5959, -5288, -4561, -3776, -2935, -2037, -1082
|
||||
.value -70, 998, 2122, 3300, 4533, 5818, 7154, 8540
|
||||
.value 9975, 11455, 12980, 14548, 16155, 17799, 19478, 21189
|
||||
.value 22929, 24694, 26482, 28289, 30112, 31947,-26209,-24360
|
||||
.value -22511,-20664,-18824,-16994,-15179,-13383,-11610, -9863
|
||||
.value -8147, -6466, -4822, -3222, -1667, -162, 1289, 2684
|
||||
.value 4019, 5290, 6494, 7629, 8692, 9679, 10590, 11420
|
||||
.value 12169, 12835, 13415, 13908, 14313, 14630, 14856, 14992
|
||||
.value 15038
|
||||
|
||||
intwindiv:
|
||||
.long 0x47800000 # 65536.0
|
||||
.text
|
||||
.align 32
|
||||
.globl make_decode_tables_MMX
|
||||
make_decode_tables_MMX:
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
pushl %ebx
|
||||
|
||||
xorl %ecx,%ecx
|
||||
xorl %ebx,%ebx
|
||||
movl $32,%esi
|
||||
movl $intwinbase_MMX,%edi
|
||||
negl 16(%esp) # scaleval
|
||||
pushl $2 # intwinbase step
|
||||
.L00:
|
||||
cmpl $528,%ecx
|
||||
jnc .L02
|
||||
movswl (%edi),%eax
|
||||
cmpl $intwinbase_MMX+444,%edi
|
||||
jc .L01
|
||||
addl $60000,%eax
|
||||
.L01:
|
||||
pushl %eax
|
||||
fildl (%esp)
|
||||
fdivs intwindiv
|
||||
fimull 24(%esp)
|
||||
popl %eax
|
||||
fsts decwin(,%ecx,4)
|
||||
fstps decwin+64(,%ecx,4)
|
||||
.L02:
|
||||
leal -1(%esi),%edx
|
||||
and %ebx,%edx
|
||||
cmp $31,%edx
|
||||
jnz .L03
|
||||
addl $-1023,%ecx
|
||||
test %esi,%ebx
|
||||
jz .L03
|
||||
negl 20(%esp)
|
||||
.L03:
|
||||
addl %esi,%ecx
|
||||
addl (%esp),%edi
|
||||
incl %ebx
|
||||
cmpl $intwinbase_MMX,%edi
|
||||
jz .L04
|
||||
cmp $256,%ebx
|
||||
jnz .L00
|
||||
negl (%esp)
|
||||
jmp .L00
|
||||
.L04:
|
||||
popl %eax
|
||||
|
||||
xorl %ecx,%ecx
|
||||
xorl %ebx,%ebx
|
||||
pushl $2
|
||||
.L05:
|
||||
cmpl $528,%ecx
|
||||
jnc .L11
|
||||
movswl (%edi),%eax
|
||||
cmpl $intwinbase_MMX+444,%edi
|
||||
jc .L06
|
||||
addl $60000,%eax
|
||||
.L06:
|
||||
cltd
|
||||
imull 20(%esp)
|
||||
shrdl $17,%edx,%eax
|
||||
cmpl $32767,%eax
|
||||
movl $1055,%edx
|
||||
jle .L07
|
||||
movl $32767,%eax
|
||||
jmp .L08
|
||||
.L07:
|
||||
cmpl $-32767,%eax
|
||||
jge .L08
|
||||
movl $-32767,%eax
|
||||
.L08:
|
||||
cmpl $512,%ecx
|
||||
jnc .L09
|
||||
subl %ecx,%edx
|
||||
movw %ax,decwins(,%edx,2)
|
||||
movw %ax,decwins-32(,%edx,2)
|
||||
.L09:
|
||||
testl $1,%ecx
|
||||
jnz .L10
|
||||
negl %eax
|
||||
.L10:
|
||||
movw %ax,decwins(,%ecx,2)
|
||||
movw %ax,decwins+32(,%ecx,2)
|
||||
.L11:
|
||||
leal -1(%esi),%edx
|
||||
and %ebx,%edx
|
||||
cmp $31,%edx
|
||||
jnz .L12
|
||||
addl $-1023,%ecx
|
||||
test %esi,%ebx
|
||||
jz .L12
|
||||
negl 20(%esp)
|
||||
.L12:
|
||||
addl %esi,%ecx
|
||||
addl (%esp),%edi
|
||||
incl %ebx
|
||||
cmpl $intwinbase_MMX,%edi
|
||||
jz .L13
|
||||
cmp $256,%ebx
|
||||
jnz .L05
|
||||
negl (%esp)
|
||||
jmp .L05
|
||||
.L13:
|
||||
popl %eax
|
||||
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
ret
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
// gcc test.c -I.. -L. -lMP3 -lm -o test2 -O4
|
||||
//gcc test2.c -O2 -I.. -L. ../libvo/aclib.c -lMP3 -lm -o test2
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
|
Loading…
Reference in New Issue