diff --git a/libmpeg2/idct_altivec.c b/libmpeg2/idct_altivec.c index ff9eea7dd9..e9fc28bc4b 100644 --- a/libmpeg2/idct_altivec.c +++ b/libmpeg2/idct_altivec.c @@ -1,6 +1,6 @@ /* * idct_altivec.c - * Copyright (C) 2000-2002 Michel Lespinasse + * Copyright (C) 2000-2003 Michel Lespinasse * Copyright (C) 1999-2000 Aaron Holtzman * * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. @@ -21,513 +21,57 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#ifndef __ALTIVEC__ - #include "config.h" #ifdef ARCH_PPC +#ifdef HAVE_ALTIVEC_H +#include +#endif #include #include "mpeg2.h" #include "mpeg2_internal.h" #include "attributes.h" -static const int16_t constants[5][8] ATTR_ALIGN(16) = { - {23170, 13573, 6518, 21895, -23170, -21895, 32, 31}, - {16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725}, - {22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521}, - {21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692}, - {19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722} -}; +typedef vector signed char vector_s8_t; +typedef vector unsigned char vector_u8_t; +typedef vector signed short vector_s16_t; +typedef vector unsigned short vector_u16_t; +typedef vector signed int vector_s32_t; +typedef vector unsigned int vector_u32_t; -/* - * The asm code is generated with: - * - * gcc-2.95 -fvec -D__ALTIVEC__ -O9 -fomit-frame-pointer -mregnames -S - * idct_altivec.c - * - * awk '{args=""; len=split ($2, arg, ","); - * for (i=1; i<=len; i++) { a=arg[i]; if (i> 3) | ((j & 7) << 3); - j = mpeg2_scan_alt[i]; - mpeg2_scan_alt[i] = (j >> 3) | ((j & 7) << 3); - } -} - -#endif /* ARCH_PPC */ - -#else /* __ALTIVEC__ */ - -#define vector_s16_t vector signed short -#define vector_u16_t vector unsigned short -#define vector_s8_t vector signed char -#define vector_u8_t vector unsigned char -#define vector_s32_t vector signed int -#define vector_u32_t vector unsigned int +static const vector_s16_t constants ATTR_ALIGN(16) = + VEC_S16 (23170, 13573, 6518, 21895, -23170, -21895, 32, 31); +static const vector_s16_t constants_1 ATTR_ALIGN(16) = + VEC_S16 (16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725); +static const vector_s16_t constants_2 ATTR_ALIGN(16) = + VEC_S16 (22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521); +static const vector_s16_t constants_3 ATTR_ALIGN(16) = + VEC_S16 (21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692); +static const vector_s16_t constants_4 ATTR_ALIGN(16) = + VEC_S16 (19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722); #define IDCT_HALF \ /* 1st stage */ \ @@ -571,25 +115,25 @@ void mpeg2_idct_altivec_init (void) vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \ vector_u16_t shift; \ \ - c4 = vec_splat (constants[0], 0); \ - a0 = vec_splat (constants[0], 1); \ - a1 = vec_splat (constants[0], 2); \ - a2 = vec_splat (constants[0], 3); \ - mc4 = vec_splat (constants[0], 4); \ - ma2 = vec_splat (constants[0], 5); \ - bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \ + c4 = vec_splat (constants, 0); \ + a0 = vec_splat (constants, 1); \ + a1 = vec_splat (constants, 2); \ + a2 = vec_splat (constants, 3); \ + mc4 = vec_splat (constants, 4); \ + ma2 = vec_splat (constants, 5); \ + bias = (vector_s16_t)vec_splat ((vector_s32_t)constants, 3); \ \ zero = vec_splat_s16 (0); \ shift = vec_splat_u16 (4); \ \ - vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \ - vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \ - vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \ - vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \ - vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \ - vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \ - vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \ - vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \ + vx0 = vec_mradds (vec_sl (block[0], shift), constants_1, zero); \ + vx1 = vec_mradds (vec_sl (block[1], shift), constants_2, zero); \ + vx2 = vec_mradds (vec_sl (block[2], shift), constants_3, zero); \ + vx3 = vec_mradds (vec_sl (block[3], shift), constants_4, zero); \ + vx4 = vec_mradds (vec_sl (block[4], shift), constants_1, zero); \ + vx5 = vec_mradds (vec_sl (block[5], shift), constants_4, zero); \ + vx6 = vec_mradds (vec_sl (block[6], shift), constants_3, zero); \ + vx7 = vec_mradds (vec_sl (block[7], shift), constants_2, zero); \ \ IDCT_HALF \ \ @@ -632,17 +176,10 @@ void mpeg2_idct_altivec_init (void) vx6 = vec_sra (vy6, shift); \ vx7 = vec_sra (vy7, shift); -static const vector_s16_t constants[5] = { - (vector_s16_t)(23170, 13573, 6518, 21895, -23170, -21895, 32, 31), - (vector_s16_t)(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725), - (vector_s16_t)(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521), - (vector_s16_t)(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692), - (vector_s16_t)(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722) -}; - -void mpeg2_idct_copy_altivec (vector_s16_t * const block, unsigned char * dest, +void mpeg2_idct_copy_altivec (int16_t * const _block, uint8_t * dest, const int stride) { + vector_s16_t * const block = (vector_s16_t *)_block; vector_u8_t tmp; IDCT @@ -661,12 +198,14 @@ void mpeg2_idct_copy_altivec (vector_s16_t * const block, unsigned char * dest, COPY (dest, vx6) dest += stride; COPY (dest, vx7) - memset (block, 0, 64 * sizeof (signed short)); + block[0] = block[1] = block[2] = block[3] = zero; + block[4] = block[5] = block[6] = block[7] = zero; } -void mpeg2_idct_add_altivec (const int last, vector_s16_t * const block, - unsigned char * dest, const int stride) +void mpeg2_idct_add_altivec (const int last, int16_t * const _block, + uint8_t * dest, const int stride) { + vector_s16_t * const block = (vector_s16_t *)_block; vector_u8_t tmp; vector_s16_t tmp2, tmp3; vector_u8_t perm0; @@ -699,7 +238,23 @@ void mpeg2_idct_add_altivec (const int last, vector_s16_t * const block, ADD (dest, vx6, perm0) dest += stride; ADD (dest, vx7, perm1) - memset (block, 0, 64 * sizeof (signed short)); + block[0] = block[1] = block[2] = block[3] = zero; + block[4] = block[5] = block[6] = block[7] = zero; } -#endif /* __ALTIVEC__ */ +void mpeg2_idct_altivec_init (void) +{ + extern uint8_t mpeg2_scan_norm[64]; + extern uint8_t mpeg2_scan_alt[64]; + int i, j; + + /* the altivec idct uses a transposed input, so we patch scan tables */ + for (i = 0; i < 64; i++) { + j = mpeg2_scan_norm[i]; + mpeg2_scan_norm[i] = (j >> 3) | ((j & 7) << 3); + j = mpeg2_scan_alt[i]; + mpeg2_scan_alt[i] = (j >> 3) | ((j & 7) << 3); + } +} + +#endif