mirror of
https://github.com/mpv-player/mpv
synced 2025-02-17 04:58:06 +00:00
Align output pointer so that we can use movaps instead of movups in dct64_sse;
1.5% faster decode.

git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@23484 b3059339-0415-0410-9bf9-f77b7e298cf2
This commit is contained in:
parent
40c9f981a2
commit
739f79a5ff
@ -5,17 +5,7 @@
|
||||
* and mp3lib/dct64_MMX.c
|
||||
*/
|
||||
|
||||
/* NOTE: The following code is suboptimal! It can be improved (at least) by
|
||||
|
||||
1. Replace all movups by movaps. (Can Parameter c be always aligned on
|
||||
a 16-byte boundary?)
|
||||
|
||||
2. Rewritten using intrinsics. (GCC generally optimizes intrinsics
|
||||
better. However, when __m128 locals are involved, GCC may
|
||||
produce bad code that uses movaps to access a stack not aligned
|
||||
on a 16-byte boundary, which leads to run-time crashes.)
|
||||
|
||||
*/
|
||||
#include <libavutil/mem.h>
|
||||
|
||||
typedef float real;
|
||||
|
||||
@ -32,8 +22,8 @@ static const int nnnn[4] __attribute__((aligned(16))) =
|
||||
|
||||
void dct64_sse(short *out0,short *out1,real *c)
|
||||
{
|
||||
static real __attribute__ ((aligned(16))) b1[0x20];
|
||||
static real __attribute__ ((aligned(16))) b2[0x20];
|
||||
static DECLARE_ALIGNED(16, real, b1[0x20]);
|
||||
static DECLARE_ALIGNED(16, real, b2[0x20]);
|
||||
static real const one = 1.f;
|
||||
|
||||
{
|
||||
@ -45,9 +35,9 @@ void dct64_sse(short *out0,short *out1,real *c)
|
||||
asm(
|
||||
"movaps %2, %%xmm3\n\t"
|
||||
"shufps $27, %%xmm3, %%xmm3\n\t"
|
||||
"movups %3, %%xmm1\n\t"
|
||||
"movaps %3, %%xmm1\n\t"
|
||||
"movaps %%xmm1, %%xmm4\n\t"
|
||||
"movups %4, %%xmm2\n\t"
|
||||
"movaps %4, %%xmm2\n\t"
|
||||
"shufps $27, %%xmm4, %%xmm4\n\t"
|
||||
"movaps %%xmm2, %%xmm0\n\t"
|
||||
"shufps $27, %%xmm0, %%xmm0\n\t"
|
||||
|
@ -131,7 +131,7 @@ static int do_layer1(struct frame *fr,int single)
|
||||
int i,stereo = fr->stereo;
|
||||
unsigned int balloc[2*SBLIMIT];
|
||||
unsigned int scale_index[2][SBLIMIT];
|
||||
real fraction[2][SBLIMIT];
|
||||
DECLARE_ALIGNED(16, real, fraction[2][SBLIMIT]);
|
||||
// int single = fr->single;
|
||||
|
||||
// printf("do_layer1(0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X )\n",
|
||||
|
@ -285,7 +285,7 @@ static int do_layer2(struct frame *fr,int outmode)
|
||||
int clip=0;
|
||||
int i,j;
|
||||
int stereo = fr->stereo;
|
||||
real fraction[2][4][SBLIMIT]; /* pick_table clears unused subbands */
|
||||
DECLARE_ALIGNED(16, real, fraction[2][4][SBLIMIT]); /* pick_table clears unused subbands */
|
||||
unsigned int bit_alloc[64];
|
||||
int scale[192];
|
||||
int single = fr->single;
|
||||
|
@ -1260,8 +1260,8 @@ static int do_layer3(struct frame *fr,int single){
|
||||
|
||||
granules = (fr->lsf) ? 1 : 2;
|
||||
for (gr=0;gr<granules;gr++){
|
||||
static real hybridIn[2][SBLIMIT][SSLIMIT];
|
||||
static real hybridOut[2][SSLIMIT][SBLIMIT];
|
||||
static DECLARE_ALIGNED(16, real, hybridIn[2][SBLIMIT][SSLIMIT]);
|
||||
static DECLARE_ALIGNED(16, real, hybridOut[2][SSLIMIT][SBLIMIT]);
|
||||
|
||||
{ struct gr_info_s *gr_info = &(sideinfo.ch[0].gr[gr]);
|
||||
int part2bits;
|
||||
|
Loading…
Reference in New Issue
Block a user