diff --git a/liba52/srfftp.h b/liba52/srfftp.h index ffb228eaaa..897a508601 100644 --- a/liba52/srfftp.h +++ b/liba52/srfftp.h @@ -32,19 +32,19 @@ #ifndef SRFFTP_H__ #define SRFFTP_H__ -static complex_t delta16[4] = +static complex_t delta16[4] __attribute__((aligned(16))) /* 16-byte alignment requested for this and the following delta tables; NOTE(review): presumably so SIMD code can use aligned 128-bit loads - confirm against the users of these tables */ = { {1.00000000000000, 0.00000000000000}, {0.92387953251129, -0.38268343236509}, {0.70710678118655, -0.70710678118655}, {0.38268343236509, -0.92387953251129}}; -static complex_t delta16_3[4] = +static complex_t delta16_3[4] __attribute__((aligned(16))) = { {1.00000000000000, 0.00000000000000}, {0.38268343236509, -0.92387953251129}, {-0.70710678118655, -0.70710678118655}, {-0.92387953251129, 0.38268343236509}}; -static complex_t delta32[8] = +static complex_t delta32[8] __attribute__((aligned(16))) = { {1.00000000000000, 0.00000000000000}, {0.98078528040323, -0.19509032201613}, {0.92387953251129, -0.38268343236509}, @@ -54,7 +54,7 @@ static complex_t delta32[8] = {0.38268343236509, -0.92387953251129}, {0.19509032201613, -0.98078528040323}}; -static complex_t delta32_3[8] = +static complex_t delta32_3[8] __attribute__((aligned(16))) = { {1.00000000000000, 0.00000000000000}, {0.83146961230255, -0.55557023301960}, {0.38268343236509, -0.92387953251129}, @@ -64,7 +64,7 @@ static complex_t delta32_3[8] = {-0.92387953251129, 0.38268343236509}, {-0.55557023301960, 0.83146961230255}}; -static complex_t delta64[16] = +static complex_t delta64[16] __attribute__((aligned(16))) = { {1.00000000000000, 0.00000000000000}, {0.99518472667220, -0.09801714032956}, {0.98078528040323, -0.19509032201613}, @@ -82,7 +82,7 @@ static complex_t delta64[16] = {0.19509032201613, -0.98078528040323}, {0.09801714032956, -0.99518472667220}}; -static complex_t delta64_3[16] = +static complex_t delta64_3[16] __attribute__((aligned(16))) = { {1.00000000000000, 0.00000000000000}, {0.95694033573221, -0.29028467725446}, {0.83146961230255, -0.55557023301960}, @@ -100,7 +100,7 @@ static complex_t delta64_3[16] = {-0.55557023301960, 
0.83146961230255}, {-0.29028467725446, 0.95694033573221}}; -static complex_t delta128[32] = +static complex_t delta128[32] __attribute__((aligned(16))) = { {1.00000000000000, 0.00000000000000}, {0.99879545620517, -0.04906767432742}, {0.99518472667220, -0.09801714032956}, @@ -134,7 +134,7 @@ static complex_t delta128[32] = {0.09801714032956, -0.99518472667220}, {0.04906767432742, -0.99879545620517}}; -static complex_t delta128_3[32] = +static complex_t delta128_3[32] __attribute__((aligned(16))) = { {1.00000000000000, 0.00000000000000}, {0.98917650996478, -0.14673047445536}, {0.95694033573221, -0.29028467725446}, diff --git a/liba52/srfftp_3dnow.h b/liba52/srfftp_3dnow.h index 8444fdee21..a89f13ae76 100644 --- a/liba52/srfftp_3dnow.h +++ b/liba52/srfftp_3dnow.h @@ -68,11 +68,18 @@ static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, "psrlq $32, "##mm_base"\n\t"\ "punpckldq "##mm_hlp","##mm_base"\n\t" #endif +#ifdef HAVE_3DNOWEX /* PFNACC_MM(mm_base,mm_hlp): asm text that leaves (low dword - high dword) of mm_base in the low dword of mm_base; this variant uses the pfnacc instruction and does not touch mm_hlp. NOTE(review): the ## between a string literal and a macro argument does not form a valid token and newer GCC rejects it - confirm the supported compilers */ +#define PFNACC_MM(mm_base,mm_hlp) "pfnacc "##mm_base","##mm_base"\n\t" +#else /* fallback without pfnacc: mm_hlp is overwritten as scratch (both dwords end up holding the high dword of mm_base), then subtracted from mm_base, so the low dword becomes low - high and the high dword becomes high - high */ +#define PFNACC_MM(mm_base,mm_hlp)\ + "movq "##mm_base","##mm_hlp"\n\t"\ + "psrlq $32,"##mm_hlp"\n\t"\ + "punpckldq "##mm_hlp","##mm_hlp"\n\t"\ + "pfsub "##mm_hlp","##mm_base"\n\t" +#endif #define TRANSZERO_3DNOW(A0,A4,A8,A12) \ { \ - __asm__ __volatile__("femms":::"memory");\ - TRANS_FILL_MM6_MM7_3DNOW()\ __asm__ __volatile__(\ "movq %4, %%mm0\n\t" /* mm0 = wTB[0]*/\ "movq %5, %%mm1\n\t" /* mm1 = wTB[k*2]*/ \ @@ -98,13 +105,10 @@ static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, :"=m"(A0), "=m"(A8), "=m"(A4), "=m"(A12)\ :"m"(wTB[0]), "m"(wTB[k*2]), "0"(A0), "2"(A4)\ :"memory");\ - __asm__ __volatile__("femms":::"memory");\ } #define TRANSHALF_16_3DNOW(A2,A6,A10,A14)\ {\ - __asm__ __volatile__("femms":::"memory");\ - TRANS_FILL_MM6_MM7_3DNOW()\ __asm__ __volatile__(\ "movq %4, %%mm0\n\t"/*u.re = wTB[2].im + wTB[2].re;*/\ "movq %%mm0, %%mm1\n\t"\ @@ -142,7 +146,56 @@ static complex_t HSQRT2_3DNOW __attribute__ 
((aligned (8))) = { 0.707106781188, :"=m"(A2), "=m"(A10), "=m"(A6), "=m"(A14)\ :"m"(wTB[2]), "m"(wTB[6]), "0"(A2), "2"(A6), "m"(HSQRT2_3DNOW)\ :"memory");\ - __asm__ __volatile__("femms":::"memory");\ +} + +#define TRANS_3DNOW(A1,A5,A9,A13,WT,WB,D,D3)\ +{ /* Butterfly on A1/A5/A9/A13 built from WT, WB, D and D3, written as two asm statements. NOTE(review): mm6 and mm7 are read but never loaded here, so the expansion site must have set them up (presumably via TRANS_FILL_MM6_MM7_3DNOW - confirm); the first asm statement hands its results to the second in mm4/mm5, and neither statement lists any mm register as a clobber */ \ + __asm__ __volatile__(\ + "movq %1, %%mm4\n\t"\ + "movq %%mm4, %%mm5\n\t"\ + "punpckldq %%mm4, %%mm4\n\t"/*mm4 = D.re | D.re */\ + "punpckhdq %%mm5, %%mm5\n\t"/*mm5 = D.im | D.im */\ + "movq %0, %%mm0\n\t"\ + "pfmul %%mm0, %%mm4\n\t"/* mm4 =u.re | u.im */\ + "pfmul %%mm0, %%mm5\n\t"/* mm5 = a.re | a.im */\ + PSWAP_MM("%%mm5","%%mm3")\ + "pfmul %%mm7, %%mm5\n\t"\ + "pfadd %%mm5, %%mm4\n\t"/* mm4 = u*/\ + "movq %3, %%mm1\n\t"\ + "movq %2, %%mm0\n\t"\ + PSWAP_MM("%%mm1","%%mm3")\ + "movq %%mm0, %%mm2\n\t"\ + "pfmul %%mm1, %%mm0\n\t"/* mm0 = a*/\ + "pfmul %3, %%mm2\n\t"/* mm2 = v*/\ + PFNACC_MM("%%mm2","%%mm3")\ + "pfacc %%mm0, %%mm0\n\t"\ + "punpckldq %%mm0,%%mm2\n\t"/*mm2 = v.re | a.re*/\ + "movq %%mm2, %%mm3\n\t"\ + "pfmul %%mm7, %%mm3\n\t"\ + "movq %%mm4, %%mm5\n\t"\ + "pfmul %%mm6, %%mm5\n\t"\ + "pfadd %%mm3, %%mm5\n\t"\ + PSWAP_MM("%%mm5","%%mm3")/* mm5 = v*/\ + "pfadd %%mm2, %%mm4\n\t"\ + :\ + :"m"(WT), "m"(D), "m"(WB), "m"(D3)\ + :"memory");\ + __asm__ __volatile__(\ + "movq %4, %%mm0\n\t"/* a1 = A1*/\ + "movq %%mm0, %%mm1\n\t"\ + "pfadd %%mm4, %%mm0\n\t"/*A1 = a1 + u*/\ + "pfsub %%mm4, %%mm1\n\t"/*A9 = a1 - u*/\ + "movq %%mm0, %0\n\t"\ + "movq %%mm1, %1\n\t"\ + "movq %5, %%mm2\n\t"/* a1 = A5*/\ + "movq %%mm2, %%mm3\n\t"\ + "pfsub %%mm5, %%mm2\n\t"/*A5 = a1 - v*/\ + "pfadd %%mm5, %%mm3\n\t"/*A13 = a1 + v*/\ + "movq %%mm2, %2\n\t"\ + "movq %%mm3, %3"\ + :"=m"(A1), "=m"(A9), "=m"(A5), "=m"(A13)\ + :"0"(A1), "2"(A5), "m"(u), "m"(v)/* NOTE(review): the u and v operands (%6, %7) are never referenced by the template - the values come from mm4/mm5 - yet u and v must exist in the expanding scope */\ + :"memory");\ } #endif