rv34: NEON optimised dc only inverse transform

30-50% faster than the C implementation, 0.5% overall speedup on
bourne.rmvb.
This commit is contained in:
Janne Grunau 2012-01-12 17:21:48 +01:00
parent 136ee32da3
commit e1e369049e
2 changed files with 36 additions and 2 deletions

View File

@ -26,8 +26,13 @@
/* Inverse-transform entry points installed by ff_rv34dsp_init_neon() below. */
void ff_rv34_inv_transform_neon(DCTELEM *block);
void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
/* DC-only counterparts of the two transforms above. */
void ff_rv34_inv_transform_dc_neon(DCTELEM *block);
void ff_rv34_inv_transform_noround_dc_neon(DCTELEM *block);
/**
 * Install the NEON inverse-transform routines into an RV34 DSP context.
 *
 * Slot 0 of each table receives the regular routine, slot 1 the
 * "noround" routine.
 *
 * @param c   context whose rv34_inv_transform_tab and
 *            rv34_inv_transform_dc_tab entries are overwritten
 * @param dsp not used by this function
 */
void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
{
    /* Full inverse transforms. */
    c->rv34_inv_transform_tab[0]    = ff_rv34_inv_transform_neon;
    c->rv34_inv_transform_tab[1]    = ff_rv34_inv_transform_noround_neon;

    /* DC-only inverse transforms. */
    c->rv34_inv_transform_dc_tab[0] = ff_rv34_inv_transform_dc_neon;
    c->rv34_inv_transform_dc_tab[1] = ff_rv34_inv_transform_noround_dc_neon;
}

View File

@ -107,3 +107,32 @@ function ff_rv34_inv_transform_noround_neon, export=1
vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1
bx lr
endfunc
/*
 * void ff_rv34_inv_transform_dc_neon(DCTELEM *block)
 * NEON counterpart of rv34_inv_transform_dc_c.
 *
 * In:       r0 = block
 * Effect:   dc = (169 * block[0] + 512) >> 10, narrowed to 16 bits;
 *           four 16-bit copies of dc are written at each of the byte
 *           offsets 0, 16, 32 and 48 from block.
 * Clobbers: r0, r1, d0, d4, q3, d28
 */
function ff_rv34_inv_transform_dc_neon, export=1
        mov             r1,  #16                @ byte step between stores
        vmov.i16        d4,  #169               @ 13^2 in every lane
        vld1.16         {d28[]}, [r0,:16]       @ block[0] replicated to all lanes
        vmull.s16       q3,  d28, d4            @ 32-bit products 169 * block[0]
        vrshrn.s32      d0,  q3,  #10           @ rounding >> 10, narrow to 16 bit
.rept 4
        vst1.16         {d0}, [r0,:64], r1      @ 4 x dc, then r0 += 16
.endr
        bx              lr
endfunc
/*
 * void ff_rv34_inv_transform_noround_dc_neon(DCTELEM *block)
 * NEON counterpart of rv34_inv_transform_dc_noround_c.
 *
 * In:       r0 = block
 * Effect:   dc = (507 * block[0]) >> 11 without rounding, narrowed to
 *           16 bits; four 16-bit copies of dc are written at each of the
 *           byte offsets 0, 16, 32 and 48 from block.
 * Clobbers: r0, r1, d0, d4, q3, d28
 */
function ff_rv34_inv_transform_noround_dc_neon, export=1
        vmov.i16        d4,  #251               @ low bits of the multiplier
        mov             r1,  #16                @ byte step between stores
        vld1.16         {d28[]}, [r0,:16]       @ block[0] replicated to all lanes
        vorr.s16        d4,  #256               @ 251 | 256 = 507 = 13^2 * 3
        vmull.s16       q3,  d28, d4            @ 32-bit products 507 * block[0]
        vshrn.s32       d0,  q3,  #11           @ truncating >> 11, narrow to 16 bit
.rept 4
        vst1.64         {d0}, [r0,:64], r1      @ 4 x dc, then r0 += 16
.endr
        bx              lr
endfunc
endfunc