Separate format conversion DSP functions from DSPContext.

This will be beneficial for use with the audio conversion API without
requiring it to depend on all of dsputil.

Signed-off-by: Mans Rullgard <mans@mansr.com>
(cherry picked from commit c73d99e672)
This commit is contained in:
Justin Ruggles 2011-01-30 15:06:46 +00:00 committed by Michael Niedermayer
parent a35d782d28
commit fe2ff6d247
32 changed files with 1204 additions and 882 deletions

View File

@ -12,6 +12,7 @@ OBJS = allcodecs.o \
bitstream_filter.o \
dsputil.o \
faanidct.o \
fmtconvert.o \
imgconvert.o \
jrevdct.o \
opt.o \

View File

@ -35,6 +35,7 @@
#include "fft.h"
#include "mpeg4audio.h"
#include "sbr.h"
#include "fmtconvert.h"
#include <stdint.h>
@ -268,6 +269,7 @@ typedef struct {
FFTContext mdct;
FFTContext mdct_small;
DSPContext dsp;
FmtConvertContext fmt_conv;
int random_state;
/** @} */

View File

@ -85,6 +85,7 @@
#include "get_bits.h"
#include "dsputil.h"
#include "fft.h"
#include "fmtconvert.h"
#include "lpc.h"
#include "aac.h"
@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx)
ff_aac_sbr_init();
dsputil_init(&ac->dsp, avctx);
ff_fmt_convert_init(&ac->fmt_conv, avctx);
ac->random_state = 0x1f2e3d4c;
@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
*data_size = data_size_tmp;
if (samples)
ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
if (ac->output_configured)
ac->output_configured = OC_LOCKED;

View File

@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
ff_kbd_window_init(s->window, 5.0, 256);
dsputil_init(&s->dsp, avctx);
ff_fmt_convert_init(&s->fmt_conv, avctx);
av_lfg_init(&s->dith_state, 0);
/* set scale value for float to int16 conversion */
@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
} else {
gain *= s->dynamic_range[0];
}
s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
}
/* apply spectral extension to high frequency bins */
@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
err = 1;
}
s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
out_samples += 256 * s->out_channels;
}
*data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t);

View File

@ -55,6 +55,7 @@
#include "get_bits.h"
#include "dsputil.h"
#include "fft.h"
#include "fmtconvert.h"
/* override ac3.h to include coupling channel */
#undef AC3_MAX_CHANNELS
@ -190,6 +191,7 @@ typedef struct {
///@defgroup opt optimization
DSPContext dsp; ///< for optimization
FmtConvertContext fmt_conv; ///< optimized conversion functions
float mul_bias; ///< scaling for float_to_int16 conversion
///@}

View File

@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS += arm/dsputil_init_arm.o \
arm/dsputil_arm.o \
arm/fft_init_arm.o \
arm/fmtconvert_init_arm.o \
arm/jrevdct_arm.o \
arm/mpegvideo_arm.o \
arm/simple_idct_arm.o \
@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \
arm/dsputil_armv6.o \
arm/simple_idct_armv6.o \
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \
OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \
arm/dsputil_init_vfp.o \
$(VFP-OBJS-yes)
OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
arm/mpegvideo_iwmmxt.o \
@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \
OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \
arm/dsputil_neon.o \
arm/fmtconvert_neon.o \
arm/int_neon.o \
arm/mpegvideo_neon.o \
arm/simple_idct_neon.o \

View File

@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
int len);
void ff_butterflies_float_neon(float *v1, float *v2, int len);
float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
float mul, int len);
void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
int len);
void ff_float_to_int16_neon(int16_t *, const float *, long);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
c->butterflies_float = ff_butterflies_float_neon;
c->scalarproduct_float = ff_scalarproduct_float_neon;
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
c->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
c->vector_fmul_add = ff_vector_fmul_add_neon;
c->vector_clipf = ff_vector_clipf_neon;
@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->float_to_int16 = ff_float_to_int16_neon;
c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
}
if (CONFIG_VORBIS_DECODER)
c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;

View File

@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
const float *src1, int len);
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
/**
 * Install the ARM VFP routines into a DSPContext.
 *
 * @param c      context whose function pointers are overwritten
 * @param avctx  not read by this function
 */
void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx)
{
#if HAVE_ARMV6
    /* this entry is only installed when HAVE_ARMV6 is set */
    c->float_to_int16 = ff_float_to_int16_vfp;
#endif
    c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
    c->vector_fmul = ff_vector_fmul_vfp;
}

View File

@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1
bx lr
endfunc
/*
 * void ff_float_to_int16_neon(int16_t *dst, const float *src, long len)
 *   r0 = dst, r1 = src, r2 = len
 *
 * Converts len floats to int16. Each float is converted to signed 16.16
 * fixed point (vcvt ... #16) and the upper 16 bits of the result are kept
 * by the narrowing shift (vshrn ... #16).
 *
 * The first 8 floats are loaded before len is tested and every path works
 * on groups of 8 samples, so len is expected to be a positive multiple
 * of 8. All loads and stores carry a :128 alignment qualifier, so src and
 * dst must be 16-byte aligned.
 */
function ff_float_to_int16_neon, export=1
subs r2, r2, #8
vld1.64 {d0-d1}, [r1,:128]!
vcvt.s32.f32 q8, q0, #16
vld1.64 {d2-d3}, [r1,:128]!
vcvt.s32.f32 q9, q1, #16
/* len == 8: only the 8 samples already converted remain to be stored */
beq 3f
/* ip = (len - 8) rounded down to a multiple of 16 = samples for loop 1 */
bics ip, r2, #15
beq 2f
/* main loop: 16 samples per iteration; q8/q9 always hold 8 converted
   samples that have not been stored yet */
1: subs ip, ip, #16
vshrn.s32 d4, q8, #16
vld1.64 {d0-d1}, [r1,:128]!
vcvt.s32.f32 q0, q0, #16
vshrn.s32 d5, q9, #16
vld1.64 {d2-d3}, [r1,:128]!
vcvt.s32.f32 q1, q1, #16
vshrn.s32 d6, q0, #16
vst1.64 {d4-d5}, [r0,:128]!
vshrn.s32 d7, q1, #16
vld1.64 {d16-d17},[r1,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r1,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.64 {d6-d7}, [r0,:128]!
bne 1b
/* (len - 8) & 15: zero means only the pending 8 samples are left */
ands r2, r2, #15
beq 3f
/* tail: load 8 more samples and store them together with the pending 8 */
2: vld1.64 {d0-d1}, [r1,:128]!
vshrn.s32 d4, q8, #16
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r1,:128]!
vshrn.s32 d5, q9, #16
vcvt.s32.f32 q1, q1, #16
vshrn.s32 d6, q0, #16
vst1.64 {d4-d5}, [r0,:128]!
vshrn.s32 d7, q1, #16
vst1.64 {d6-d7}, [r0,:128]!
bx lr
/* tail: store the pending 8 samples held in q8/q9 */
3: vshrn.s32 d4, q8, #16
vshrn.s32 d5, q9, #16
vst1.64 {d4-d5}, [r0,:128]!
bx lr
endfunc
/*
 * void ff_float_to_int16_interleave_neon(int16_t *dst, const float **src,
 *                                        long len, int channels)
 *   r0 = dst, r1 = src (array of per-channel pointers), r2 = len,
 *   r3 = channels
 *
 * Converts planar float channels to interleaved int16. Samples are
 * converted to signed 16.16 fixed point (vcvt ... #16) and only the upper
 * halfword of each 32-bit result is written out.
 *
 *   channels < 2 : tail-calls ff_float_to_int16_neon on src[0].
 *   channels == 2: dedicated path; vsri.32 merges the upper halfwords of
 *                  both channels into one 32-bit word per sample.
 *   channels > 2 : dst is filled in several passes over the len samples,
 *                  4 channels at a time, then 2, then 1; each pass writes
 *                  with a stride of 2*channels bytes (held in ip).
 *
 * Every path loads samples in groups of 8 (the first group before len is
 * tested), so len is expected to be a positive multiple of 8. Source
 * loads use a :128 alignment qualifier, so every channel buffer must be
 * 16-byte aligned; the stereo path also stores to dst with :128.
 */
function ff_float_to_int16_interleave_neon, export=1
cmp r3, #2
ldrlt r1, [r1]
blt ff_float_to_int16_neon
bne 4f
/* channels == 2: r3 = src[0], r1 = src[1] */
ldr r3, [r1]
ldr r1, [r1, #4]
subs r2, r2, #8
vld1.64 {d0-d1}, [r3,:128]!
vcvt.s32.f32 q8, q0, #16
vld1.64 {d2-d3}, [r3,:128]!
vcvt.s32.f32 q9, q1, #16
vld1.64 {d20-d21},[r1,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r1,:128]!
vcvt.s32.f32 q11, q11, #16
beq 3f
bics ip, r2, #15
beq 2f
/* stereo main loop: 16 samples per channel per iteration */
1: subs ip, ip, #16
vld1.64 {d0-d1}, [r3,:128]!
vcvt.s32.f32 q0, q0, #16
vsri.32 q10, q8, #16
vld1.64 {d2-d3}, [r3,:128]!
vcvt.s32.f32 q1, q1, #16
vld1.64 {d24-d25},[r1,:128]!
vcvt.s32.f32 q12, q12, #16
vld1.64 {d26-d27},[r1,:128]!
vsri.32 q11, q9, #16
vst1.64 {d20-d21},[r0,:128]!
vcvt.s32.f32 q13, q13, #16
vst1.64 {d22-d23},[r0,:128]!
vsri.32 q12, q0, #16
vld1.64 {d16-d17},[r3,:128]!
vsri.32 q13, q1, #16
vst1.64 {d24-d25},[r0,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r3,:128]!
vcvt.s32.f32 q9, q9, #16
vld1.64 {d20-d21},[r1,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r1,:128]!
vcvt.s32.f32 q11, q11, #16
vst1.64 {d26-d27},[r0,:128]!
bne 1b
ands r2, r2, #15
beq 3f
/* stereo tail: 8 more samples per channel plus the pending 8 */
2: vsri.32 q10, q8, #16
vld1.64 {d0-d1}, [r3,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r3,:128]!
vcvt.s32.f32 q1, q1, #16
vld1.64 {d24-d25},[r1,:128]!
vcvt.s32.f32 q12, q12, #16
vsri.32 q11, q9, #16
vld1.64 {d26-d27},[r1,:128]!
vcvt.s32.f32 q13, q13, #16
vst1.64 {d20-d21},[r0,:128]!
vsri.32 q12, q0, #16
vst1.64 {d22-d23},[r0,:128]!
vsri.32 q13, q1, #16
vst1.64 {d24-d27},[r0,:128]!
bx lr
/* stereo tail: only the pending 8 samples per channel remain */
3: vsri.32 q10, q8, #16
vsri.32 q11, q9, #16
vst1.64 {d20-d23},[r0,:128]!
bx lr
/* more than 2 channels: ip = 2*channels = output stride in bytes */
4: push {r4-r8,lr}
cmp r3, #4
lsl ip, r3, #1
blt 4f
@ 4 channels
5: ldmia r1!, {r4-r7}
mov lr, r2
mov r8, r0
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vld1.64 {d20-d21},[r6,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r7,:128]!
vcvt.s32.f32 q11, q11, #16
/* 8 samples of each of the 4 channels per iteration; every vst1.64
   writes one sample of all 4 channels (8 bytes) and steps by ip */
6: subs lr, lr, #8
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vsri.32 q9, q8, #16
vld1.64 {d2-d3}, [r5,:128]!
vcvt.s32.f32 q1, q1, #16
vsri.32 q11, q10, #16
vld1.64 {d4-d5}, [r6,:128]!
vcvt.s32.f32 q2, q2, #16
vzip.32 d18, d22
vld1.64 {d6-d7}, [r7,:128]!
vcvt.s32.f32 q3, q3, #16
vzip.32 d19, d23
vst1.64 {d18}, [r8], ip
vsri.32 q1, q0, #16
vst1.64 {d22}, [r8], ip
vsri.32 q3, q2, #16
vst1.64 {d19}, [r8], ip
vzip.32 d2, d6
vst1.64 {d23}, [r8], ip
vzip.32 d3, d7
beq 7f
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vst1.64 {d2}, [r8], ip
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.64 {d6}, [r8], ip
vld1.64 {d20-d21},[r6,:128]!
vcvt.s32.f32 q10, q10, #16
vst1.64 {d3}, [r8], ip
vld1.64 {d22-d23},[r7,:128]!
vcvt.s32.f32 q11, q11, #16
vst1.64 {d7}, [r8], ip
b 6b
7: vst1.64 {d2}, [r8], ip
vst1.64 {d6}, [r8], ip
vst1.64 {d3}, [r8], ip
vst1.64 {d7}, [r8], ip
/* 4 channels done: return if none remain, otherwise move dst past the
   4 int16 slots just filled and pick the next pass */
subs r3, r3, #4
popeq {r4-r8,pc}
cmp r3, #4
add r0, r0, #8
bge 5b
@ 2 channels
4: cmp r3, #2
blt 4f
ldmia r1!, {r4-r5}
mov lr, r2
mov r8, r0
/* if len is an odd multiple of 8, one group of 8 samples is handled
   before entering the 16-sample loop at 6 */
tst lr, #8
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vld1.64 {d20-d21},[r4,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r5,:128]!
vcvt.s32.f32 q11, q11, #16
beq 6f
subs lr, lr, #8
beq 7f
vsri.32 d18, d16, #16
vsri.32 d19, d17, #16
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vst1.32 {d18[0]}, [r8], ip
vsri.32 d22, d20, #16
vst1.32 {d18[1]}, [r8], ip
vsri.32 d23, d21, #16
vst1.32 {d19[0]}, [r8], ip
vst1.32 {d19[1]}, [r8], ip
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.32 {d22[0]}, [r8], ip
vst1.32 {d22[1]}, [r8], ip
vld1.64 {d20-d21},[r4,:128]!
vcvt.s32.f32 q10, q10, #16
vst1.32 {d23[0]}, [r8], ip
vst1.32 {d23[1]}, [r8], ip
vld1.64 {d22-d23},[r5,:128]!
vcvt.s32.f32 q11, q11, #16
/* 16 samples of both channels per iteration; every vst1.32 writes one
   sample of the channel pair (4 bytes) and steps by ip */
6: subs lr, lr, #16
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vsri.32 d18, d16, #16
vld1.64 {d2-d3}, [r5,:128]!
vcvt.s32.f32 q1, q1, #16
vsri.32 d19, d17, #16
vld1.64 {d4-d5}, [r4,:128]!
vcvt.s32.f32 q2, q2, #16
vld1.64 {d6-d7}, [r5,:128]!
vcvt.s32.f32 q3, q3, #16
vst1.32 {d18[0]}, [r8], ip
vsri.32 d22, d20, #16
vst1.32 {d18[1]}, [r8], ip
vsri.32 d23, d21, #16
vst1.32 {d19[0]}, [r8], ip
vsri.32 d2, d0, #16
vst1.32 {d19[1]}, [r8], ip
vsri.32 d3, d1, #16
vst1.32 {d22[0]}, [r8], ip
vsri.32 d6, d4, #16
vst1.32 {d22[1]}, [r8], ip
vsri.32 d7, d5, #16
vst1.32 {d23[0]}, [r8], ip
vst1.32 {d23[1]}, [r8], ip
beq 6f
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vst1.32 {d2[0]}, [r8], ip
vst1.32 {d2[1]}, [r8], ip
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.32 {d3[0]}, [r8], ip
vst1.32 {d3[1]}, [r8], ip
vld1.64 {d20-d21},[r4,:128]!
vcvt.s32.f32 q10, q10, #16
vst1.32 {d6[0]}, [r8], ip
vst1.32 {d6[1]}, [r8], ip
vld1.64 {d22-d23},[r5,:128]!
vcvt.s32.f32 q11, q11, #16
vst1.32 {d7[0]}, [r8], ip
vst1.32 {d7[1]}, [r8], ip
bgt 6b
6: vst1.32 {d2[0]}, [r8], ip
vst1.32 {d2[1]}, [r8], ip
vst1.32 {d3[0]}, [r8], ip
vst1.32 {d3[1]}, [r8], ip
vst1.32 {d6[0]}, [r8], ip
vst1.32 {d6[1]}, [r8], ip
vst1.32 {d7[0]}, [r8], ip
vst1.32 {d7[1]}, [r8], ip
b 8f
7: vsri.32 d18, d16, #16
vsri.32 d19, d17, #16
vst1.32 {d18[0]}, [r8], ip
vsri.32 d22, d20, #16
vst1.32 {d18[1]}, [r8], ip
vsri.32 d23, d21, #16
vst1.32 {d19[0]}, [r8], ip
vst1.32 {d19[1]}, [r8], ip
vst1.32 {d22[0]}, [r8], ip
vst1.32 {d22[1]}, [r8], ip
vst1.32 {d23[0]}, [r8], ip
vst1.32 {d23[1]}, [r8], ip
/* 2 channels done: return if none remain, otherwise move dst past the
   2 int16 slots just filled and fall through to the 1-channel pass */
8: subs r3, r3, #2
add r0, r0, #4
popeq {r4-r8,pc}
@ 1 channel
4: ldr r4, [r1],#4
tst r2, #8
mov lr, r2
mov r5, r0
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r4,:128]!
vcvt.s32.f32 q1, q1, #16
bne 8f
/* the odd 16-bit lanes stored below are the upper halfwords of the
   32-bit conversion results; each store steps by ip */
6: subs lr, lr, #16
vld1.64 {d4-d5}, [r4,:128]!
vcvt.s32.f32 q2, q2, #16
vld1.64 {d6-d7}, [r4,:128]!
vcvt.s32.f32 q3, q3, #16
vst1.16 {d0[1]}, [r5,:16], ip
vst1.16 {d0[3]}, [r5,:16], ip
vst1.16 {d1[1]}, [r5,:16], ip
vst1.16 {d1[3]}, [r5,:16], ip
vst1.16 {d2[1]}, [r5,:16], ip
vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip
beq 7f
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r4,:128]!
vcvt.s32.f32 q1, q1, #16
7: vst1.16 {d4[1]}, [r5,:16], ip
vst1.16 {d4[3]}, [r5,:16], ip
vst1.16 {d5[1]}, [r5,:16], ip
vst1.16 {d5[3]}, [r5,:16], ip
vst1.16 {d6[1]}, [r5,:16], ip
vst1.16 {d6[3]}, [r5,:16], ip
vst1.16 {d7[1]}, [r5,:16], ip
vst1.16 {d7[3]}, [r5,:16], ip
bgt 6b
pop {r4-r8,pc}
/* len is an odd multiple of 8: store the first 8 samples on their own,
   then continue with the 16-sample loop if any samples remain */
8: subs lr, lr, #8
vst1.16 {d0[1]}, [r5,:16], ip
vst1.16 {d0[3]}, [r5,:16], ip
vst1.16 {d1[1]}, [r5,:16], ip
vst1.16 {d1[3]}, [r5,:16], ip
vst1.16 {d2[1]}, [r5,:16], ip
vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip
popeq {r4-r8,pc}
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r4,:128]!
vcvt.s32.f32 q1, q1, #16
b 6b
endfunc
function ff_vector_fmul_neon, export=1
subs r3, r3, #8
vld1.64 {d0-d3}, [r1,:128]!
@ -1050,34 +713,6 @@ NOVFP vmov.32 r0, d0[0]
bx lr
endfunc
/*
 * void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
 *                                         float mul, int len)
 *   r0 = dst, r1 = src
 *
 * Computes dst[i] = (float)src[i] * mul, 8 elements per loop iteration.
 *
 * Lines prefixed VFP take mul from d0[0] and len from r2; lines prefixed
 * NOVFP take mul from r2 and len from r3. The VFP/NOVFP prefixes are
 * macros that are not defined in the visible code (presumably they select
 * how floating-point arguments are passed -- confirm at their definition).
 *
 * Eight ints are loaded before the loop and the loop only exits when the
 * counter reaches exactly zero, so len must be a positive multiple of 8.
 * Loads and stores use a :128 alignment qualifier, so src and dst must be
 * 16-byte aligned.
 */
function ff_int32_to_float_fmul_scalar_neon, export=1
/* broadcast mul to all four lanes of q0 */
VFP vdup.32 q0, d0[0]
VFP len .req r2
NOVFP vdup.32 q0, r2
NOVFP len .req r3
vld1.32 {q1},[r1,:128]!
vcvt.f32.s32 q3, q1
vld1.32 {q2},[r1,:128]!
vcvt.f32.s32 q8, q2
/* q3/q8 hold 8 converted inputs; scale them, then (unless they were the
   last 8) convert the next 8 before storing the scaled results */
1: subs len, len, #8
pld [r1, #16]
vmul.f32 q9, q3, q0
vmul.f32 q10, q8, q0
beq 2f
vld1.32 {q1},[r1,:128]!
vcvt.f32.s32 q3, q1
vld1.32 {q2},[r1,:128]!
vcvt.f32.s32 q8, q2
vst1.32 {q9}, [r0,:128]!
vst1.32 {q10},[r0,:128]!
b 1b
/* final 8 results */
2: vst1.32 {q9}, [r0,:128]!
vst1.32 {q10},[r0,:128]!
bx lr
.unreq len
endfunc
function ff_vector_fmul_reverse_neon, export=1
add r2, r2, r3, lsl #2
sub r2, r2, #32

View File

@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1
vpop {d8-d15}
bx lr
endfunc
#if HAVE_ARMV6
/**
 * ARM VFP optimized float to int16 conversion.
 * Assume that len is a positive number and is multiple of 8, destination
 * buffer is at least 4 bytes aligned (8 bytes alignment is better for
 * performance), little endian byte sex
 *
 * Register use: r0 = dst, r1 = src, r2 = len (samples still to convert).
 * Each loop iteration converts 8 floats (s16-s23 -> s0-s7), saturates the
 * integer results to 16 bits with ssat, packs them pairwise with pkhbt and
 * stores the 8 int16 values with a single stmia. The load and conversion
 * of the next 8 floats are conditional on "gt", i.e. they only execute
 * while samples remain, so nothing is read past src[len - 1].
 *
 * Note: the C prototype for this function declares len as long; the
 * signature comment below shows int.
 */
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
function ff_float_to_int16_vfp, export=1
push {r4-r8,lr}
/* d8-d11 alias s16-s23, which serve as the load buffer below */
vpush {d8-d11}
vldmia r1!, {s16-s23}
vcvt.s32.f32 s0, s16
vcvt.s32.f32 s1, s17
vcvt.s32.f32 s2, s18
vcvt.s32.f32 s3, s19
vcvt.s32.f32 s4, s20
vcvt.s32.f32 s5, s21
vcvt.s32.f32 s6, s22
vcvt.s32.f32 s7, s23
1:
/* flags set here drive every "gt" instruction until the end of the loop */
subs r2, r2, #8
vmov r3, r4, s0, s1
vmov r5, r6, s2, s3
vmov r7, r8, s4, s5
vmov ip, lr, s6, s7
vldmiagt r1!, {s16-s23}
ssat r4, #16, r4
ssat r3, #16, r3
ssat r6, #16, r6
ssat r5, #16, r5
pkhbt r3, r3, r4, lsl #16
pkhbt r4, r5, r6, lsl #16
vcvtgt.s32.f32 s0, s16
vcvtgt.s32.f32 s1, s17
vcvtgt.s32.f32 s2, s18
vcvtgt.s32.f32 s3, s19
vcvtgt.s32.f32 s4, s20
vcvtgt.s32.f32 s5, s21
vcvtgt.s32.f32 s6, s22
vcvtgt.s32.f32 s7, s23
ssat r8, #16, r8
ssat r7, #16, r7
ssat lr, #16, lr
ssat ip, #16, ip
pkhbt r5, r7, r8, lsl #16
pkhbt r6, ip, lr, lsl #16
/* r3-r6 now hold 8 packed int16 samples (16 bytes) */
stmia r0!, {r3-r6}
bgt 1b
vpop {d8-d11}
pop {r4-r8,pc}
endfunc
#endif

View File

@ -0,0 +1,48 @@
/*
* ARM optimized Format Conversion Utils
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavcodec/avcodec.h"
#include "libavcodec/fmtconvert.h"
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
float mul, int len);
void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
/**
 * Install the ARM-specific conversion routines into a FmtConvertContext.
 *
 * The VFP float_to_int16 is installed first (when both HAVE_ARMVFP and
 * HAVE_ARMV6 are set); the NEON block runs afterwards, so its
 * float_to_int16 replaces the VFP one when it is installed too.
 *
 * @param c      context whose function pointers are overwritten
 * @param avctx  codec context; only avctx->flags is read, and only when
 *               HAVE_NEON is set
 */
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
{
    if (HAVE_ARMVFP && HAVE_ARMV6)
        c->float_to_int16 = ff_float_to_int16_vfp;

    if (HAVE_NEON) {
        const int bitexact = avctx->flags & CODEC_FLAG_BITEXACT;

        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;

        /* the NEON float->int16 routines are skipped in bit-exact mode */
        if (!bitexact) {
            c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
            c->float_to_int16            = ff_float_to_int16_neon;
        }
    }
}

View File

@ -0,0 +1,391 @@
/*
* ARM NEON optimised Format Conversion Utils
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "asm.S"
preserve8
.text
/*
 * void ff_float_to_int16_neon(int16_t *dst, const float *src, long len)
 *   r0 = dst, r1 = src, r2 = len
 *
 * Converts len floats to int16. Each float is converted to signed 16.16
 * fixed point (vcvt ... #16) and the upper 16 bits of the result are kept
 * by the narrowing shift (vshrn ... #16).
 *
 * The first 8 floats are loaded before len is tested and every path works
 * on groups of 8 samples, so len is expected to be a positive multiple
 * of 8. All loads and stores carry a :128 alignment qualifier, so src and
 * dst must be 16-byte aligned.
 */
function ff_float_to_int16_neon, export=1
subs r2, r2, #8
vld1.64 {d0-d1}, [r1,:128]!
vcvt.s32.f32 q8, q0, #16
vld1.64 {d2-d3}, [r1,:128]!
vcvt.s32.f32 q9, q1, #16
/* len == 8: only the 8 samples already converted remain to be stored */
beq 3f
/* ip = (len - 8) rounded down to a multiple of 16 = samples for loop 1 */
bics ip, r2, #15
beq 2f
/* main loop: 16 samples per iteration; q8/q9 always hold 8 converted
   samples that have not been stored yet */
1: subs ip, ip, #16
vshrn.s32 d4, q8, #16
vld1.64 {d0-d1}, [r1,:128]!
vcvt.s32.f32 q0, q0, #16
vshrn.s32 d5, q9, #16
vld1.64 {d2-d3}, [r1,:128]!
vcvt.s32.f32 q1, q1, #16
vshrn.s32 d6, q0, #16
vst1.64 {d4-d5}, [r0,:128]!
vshrn.s32 d7, q1, #16
vld1.64 {d16-d17},[r1,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r1,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.64 {d6-d7}, [r0,:128]!
bne 1b
/* (len - 8) & 15: zero means only the pending 8 samples are left */
ands r2, r2, #15
beq 3f
/* tail: load 8 more samples and store them together with the pending 8 */
2: vld1.64 {d0-d1}, [r1,:128]!
vshrn.s32 d4, q8, #16
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r1,:128]!
vshrn.s32 d5, q9, #16
vcvt.s32.f32 q1, q1, #16
vshrn.s32 d6, q0, #16
vst1.64 {d4-d5}, [r0,:128]!
vshrn.s32 d7, q1, #16
vst1.64 {d6-d7}, [r0,:128]!
bx lr
/* tail: store the pending 8 samples held in q8/q9 */
3: vshrn.s32 d4, q8, #16
vshrn.s32 d5, q9, #16
vst1.64 {d4-d5}, [r0,:128]!
bx lr
endfunc
/*
 * void ff_float_to_int16_interleave_neon(int16_t *dst, const float **src,
 *                                        long len, int channels)
 *   r0 = dst, r1 = src (array of per-channel pointers), r2 = len,
 *   r3 = channels
 *
 * Converts planar float channels to interleaved int16. Samples are
 * converted to signed 16.16 fixed point (vcvt ... #16) and only the upper
 * halfword of each 32-bit result is written out.
 *
 *   channels < 2 : tail-calls ff_float_to_int16_neon on src[0].
 *   channels == 2: dedicated path; vsri.32 merges the upper halfwords of
 *                  both channels into one 32-bit word per sample.
 *   channels > 2 : dst is filled in several passes over the len samples,
 *                  4 channels at a time, then 2, then 1; each pass writes
 *                  with a stride of 2*channels bytes (held in ip).
 *
 * Every path loads samples in groups of 8 (the first group before len is
 * tested), so len is expected to be a positive multiple of 8. Source
 * loads use a :128 alignment qualifier, so every channel buffer must be
 * 16-byte aligned; the stereo path also stores to dst with :128.
 */
function ff_float_to_int16_interleave_neon, export=1
cmp r3, #2
ldrlt r1, [r1]
blt ff_float_to_int16_neon
bne 4f
/* channels == 2: r3 = src[0], r1 = src[1] */
ldr r3, [r1]
ldr r1, [r1, #4]
subs r2, r2, #8
vld1.64 {d0-d1}, [r3,:128]!
vcvt.s32.f32 q8, q0, #16
vld1.64 {d2-d3}, [r3,:128]!
vcvt.s32.f32 q9, q1, #16
vld1.64 {d20-d21},[r1,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r1,:128]!
vcvt.s32.f32 q11, q11, #16
beq 3f
bics ip, r2, #15
beq 2f
/* stereo main loop: 16 samples per channel per iteration */
1: subs ip, ip, #16
vld1.64 {d0-d1}, [r3,:128]!
vcvt.s32.f32 q0, q0, #16
vsri.32 q10, q8, #16
vld1.64 {d2-d3}, [r3,:128]!
vcvt.s32.f32 q1, q1, #16
vld1.64 {d24-d25},[r1,:128]!
vcvt.s32.f32 q12, q12, #16
vld1.64 {d26-d27},[r1,:128]!
vsri.32 q11, q9, #16
vst1.64 {d20-d21},[r0,:128]!
vcvt.s32.f32 q13, q13, #16
vst1.64 {d22-d23},[r0,:128]!
vsri.32 q12, q0, #16
vld1.64 {d16-d17},[r3,:128]!
vsri.32 q13, q1, #16
vst1.64 {d24-d25},[r0,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r3,:128]!
vcvt.s32.f32 q9, q9, #16
vld1.64 {d20-d21},[r1,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r1,:128]!
vcvt.s32.f32 q11, q11, #16
vst1.64 {d26-d27},[r0,:128]!
bne 1b
ands r2, r2, #15
beq 3f
/* stereo tail: 8 more samples per channel plus the pending 8 */
2: vsri.32 q10, q8, #16
vld1.64 {d0-d1}, [r3,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r3,:128]!
vcvt.s32.f32 q1, q1, #16
vld1.64 {d24-d25},[r1,:128]!
vcvt.s32.f32 q12, q12, #16
vsri.32 q11, q9, #16
vld1.64 {d26-d27},[r1,:128]!
vcvt.s32.f32 q13, q13, #16
vst1.64 {d20-d21},[r0,:128]!
vsri.32 q12, q0, #16
vst1.64 {d22-d23},[r0,:128]!
vsri.32 q13, q1, #16
vst1.64 {d24-d27},[r0,:128]!
bx lr
/* stereo tail: only the pending 8 samples per channel remain */
3: vsri.32 q10, q8, #16
vsri.32 q11, q9, #16
vst1.64 {d20-d23},[r0,:128]!
bx lr
/* more than 2 channels: ip = 2*channels = output stride in bytes */
4: push {r4-r8,lr}
cmp r3, #4
lsl ip, r3, #1
blt 4f
@ 4 channels
5: ldmia r1!, {r4-r7}
mov lr, r2
mov r8, r0
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vld1.64 {d20-d21},[r6,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r7,:128]!
vcvt.s32.f32 q11, q11, #16
/* 8 samples of each of the 4 channels per iteration; every vst1.64
   writes one sample of all 4 channels (8 bytes) and steps by ip */
6: subs lr, lr, #8
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vsri.32 q9, q8, #16
vld1.64 {d2-d3}, [r5,:128]!
vcvt.s32.f32 q1, q1, #16
vsri.32 q11, q10, #16
vld1.64 {d4-d5}, [r6,:128]!
vcvt.s32.f32 q2, q2, #16
vzip.32 d18, d22
vld1.64 {d6-d7}, [r7,:128]!
vcvt.s32.f32 q3, q3, #16
vzip.32 d19, d23
vst1.64 {d18}, [r8], ip
vsri.32 q1, q0, #16
vst1.64 {d22}, [r8], ip
vsri.32 q3, q2, #16
vst1.64 {d19}, [r8], ip
vzip.32 d2, d6
vst1.64 {d23}, [r8], ip
vzip.32 d3, d7
beq 7f
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vst1.64 {d2}, [r8], ip
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.64 {d6}, [r8], ip
vld1.64 {d20-d21},[r6,:128]!
vcvt.s32.f32 q10, q10, #16
vst1.64 {d3}, [r8], ip
vld1.64 {d22-d23},[r7,:128]!
vcvt.s32.f32 q11, q11, #16
vst1.64 {d7}, [r8], ip
b 6b
7: vst1.64 {d2}, [r8], ip
vst1.64 {d6}, [r8], ip
vst1.64 {d3}, [r8], ip
vst1.64 {d7}, [r8], ip
/* 4 channels done: return if none remain, otherwise move dst past the
   4 int16 slots just filled and pick the next pass */
subs r3, r3, #4
popeq {r4-r8,pc}
cmp r3, #4
add r0, r0, #8
bge 5b
@ 2 channels
4: cmp r3, #2
blt 4f
ldmia r1!, {r4-r5}
mov lr, r2
mov r8, r0
/* if len is an odd multiple of 8, one group of 8 samples is handled
   before entering the 16-sample loop at 6 */
tst lr, #8
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vld1.64 {d20-d21},[r4,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r5,:128]!
vcvt.s32.f32 q11, q11, #16
beq 6f
subs lr, lr, #8
beq 7f
vsri.32 d18, d16, #16
vsri.32 d19, d17, #16
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vst1.32 {d18[0]}, [r8], ip
vsri.32 d22, d20, #16
vst1.32 {d18[1]}, [r8], ip
vsri.32 d23, d21, #16
vst1.32 {d19[0]}, [r8], ip
vst1.32 {d19[1]}, [r8], ip
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.32 {d22[0]}, [r8], ip
vst1.32 {d22[1]}, [r8], ip
vld1.64 {d20-d21},[r4,:128]!
vcvt.s32.f32 q10, q10, #16
vst1.32 {d23[0]}, [r8], ip
vst1.32 {d23[1]}, [r8], ip
vld1.64 {d22-d23},[r5,:128]!
vcvt.s32.f32 q11, q11, #16
/* 16 samples of both channels per iteration; every vst1.32 writes one
   sample of the channel pair (4 bytes) and steps by ip */
6: subs lr, lr, #16
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vsri.32 d18, d16, #16
vld1.64 {d2-d3}, [r5,:128]!
vcvt.s32.f32 q1, q1, #16
vsri.32 d19, d17, #16
vld1.64 {d4-d5}, [r4,:128]!
vcvt.s32.f32 q2, q2, #16
vld1.64 {d6-d7}, [r5,:128]!
vcvt.s32.f32 q3, q3, #16
vst1.32 {d18[0]}, [r8], ip
vsri.32 d22, d20, #16
vst1.32 {d18[1]}, [r8], ip
vsri.32 d23, d21, #16
vst1.32 {d19[0]}, [r8], ip
vsri.32 d2, d0, #16
vst1.32 {d19[1]}, [r8], ip
vsri.32 d3, d1, #16
vst1.32 {d22[0]}, [r8], ip
vsri.32 d6, d4, #16
vst1.32 {d22[1]}, [r8], ip
vsri.32 d7, d5, #16
vst1.32 {d23[0]}, [r8], ip
vst1.32 {d23[1]}, [r8], ip
beq 6f
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vst1.32 {d2[0]}, [r8], ip
vst1.32 {d2[1]}, [r8], ip
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.32 {d3[0]}, [r8], ip
vst1.32 {d3[1]}, [r8], ip
vld1.64 {d20-d21},[r4,:128]!
vcvt.s32.f32 q10, q10, #16
vst1.32 {d6[0]}, [r8], ip
vst1.32 {d6[1]}, [r8], ip
vld1.64 {d22-d23},[r5,:128]!
vcvt.s32.f32 q11, q11, #16
vst1.32 {d7[0]}, [r8], ip
vst1.32 {d7[1]}, [r8], ip
bgt 6b
6: vst1.32 {d2[0]}, [r8], ip
vst1.32 {d2[1]}, [r8], ip
vst1.32 {d3[0]}, [r8], ip
vst1.32 {d3[1]}, [r8], ip
vst1.32 {d6[0]}, [r8], ip
vst1.32 {d6[1]}, [r8], ip
vst1.32 {d7[0]}, [r8], ip
vst1.32 {d7[1]}, [r8], ip
b 8f
7: vsri.32 d18, d16, #16
vsri.32 d19, d17, #16
vst1.32 {d18[0]}, [r8], ip
vsri.32 d22, d20, #16
vst1.32 {d18[1]}, [r8], ip
vsri.32 d23, d21, #16
vst1.32 {d19[0]}, [r8], ip
vst1.32 {d19[1]}, [r8], ip
vst1.32 {d22[0]}, [r8], ip
vst1.32 {d22[1]}, [r8], ip
vst1.32 {d23[0]}, [r8], ip
vst1.32 {d23[1]}, [r8], ip
/* 2 channels done: return if none remain, otherwise move dst past the
   2 int16 slots just filled and fall through to the 1-channel pass */
8: subs r3, r3, #2
add r0, r0, #4
popeq {r4-r8,pc}
@ 1 channel
4: ldr r4, [r1],#4
tst r2, #8
mov lr, r2
mov r5, r0
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r4,:128]!
vcvt.s32.f32 q1, q1, #16
bne 8f
/* the odd 16-bit lanes stored below are the upper halfwords of the
   32-bit conversion results; each store steps by ip */
6: subs lr, lr, #16
vld1.64 {d4-d5}, [r4,:128]!
vcvt.s32.f32 q2, q2, #16
vld1.64 {d6-d7}, [r4,:128]!
vcvt.s32.f32 q3, q3, #16
vst1.16 {d0[1]}, [r5,:16], ip
vst1.16 {d0[3]}, [r5,:16], ip
vst1.16 {d1[1]}, [r5,:16], ip
vst1.16 {d1[3]}, [r5,:16], ip
vst1.16 {d2[1]}, [r5,:16], ip
vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip
beq 7f
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r4,:128]!
vcvt.s32.f32 q1, q1, #16
7: vst1.16 {d4[1]}, [r5,:16], ip
vst1.16 {d4[3]}, [r5,:16], ip
vst1.16 {d5[1]}, [r5,:16], ip
vst1.16 {d5[3]}, [r5,:16], ip
vst1.16 {d6[1]}, [r5,:16], ip
vst1.16 {d6[3]}, [r5,:16], ip
vst1.16 {d7[1]}, [r5,:16], ip
vst1.16 {d7[3]}, [r5,:16], ip
bgt 6b
pop {r4-r8,pc}
/* len is an odd multiple of 8: store the first 8 samples on their own,
   then continue with the 16-sample loop if any samples remain */
8: subs lr, lr, #8
vst1.16 {d0[1]}, [r5,:16], ip
vst1.16 {d0[3]}, [r5,:16], ip
vst1.16 {d1[1]}, [r5,:16], ip
vst1.16 {d1[3]}, [r5,:16], ip
vst1.16 {d2[1]}, [r5,:16], ip
vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip
popeq {r4-r8,pc}
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r4,:128]!
vcvt.s32.f32 q1, q1, #16
b 6b
endfunc
/*
 * void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
 *                                         float mul, int len)
 *   r0 = dst, r1 = src
 *
 * Computes dst[i] = (float)src[i] * mul, 8 elements per loop iteration.
 *
 * Lines prefixed VFP take mul from d0[0] and len from r2; lines prefixed
 * NOVFP take mul from r2 and len from r3. The VFP/NOVFP prefixes are
 * macros that are not defined in the visible code (presumably they select
 * how floating-point arguments are passed -- confirm at their definition).
 *
 * Eight ints are loaded before the loop and the loop only exits when the
 * counter reaches exactly zero, so len must be a positive multiple of 8.
 * Loads and stores use a :128 alignment qualifier, so src and dst must be
 * 16-byte aligned.
 */
function ff_int32_to_float_fmul_scalar_neon, export=1
/* broadcast mul to all four lanes of q0 */
VFP vdup.32 q0, d0[0]
VFP len .req r2
NOVFP vdup.32 q0, r2
NOVFP len .req r3
vld1.32 {q1},[r1,:128]!
vcvt.f32.s32 q3, q1
vld1.32 {q2},[r1,:128]!
vcvt.f32.s32 q8, q2
/* q3/q8 hold 8 converted inputs; scale them, then (unless they were the
   last 8) convert the next 8 before storing the scaled results */
1: subs len, len, #8
pld [r1, #16]
vmul.f32 q9, q3, q0
vmul.f32 q10, q8, q0
beq 2f
vld1.32 {q1},[r1,:128]!
vcvt.f32.s32 q3, q1
vld1.32 {q2},[r1,:128]!
vcvt.f32.s32 q8, q2
vst1.32 {q9}, [r0,:128]!
vst1.32 {q10},[r0,:128]!
b 1b
/* final 8 results */
2: vst1.32 {q9}, [r0,:128]!
vst1.32 {q10},[r0,:128]!
bx lr
.unreq len
endfunc

View File

@ -0,0 +1,77 @@
/*
* Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "asm.S"
.syntax unified
/**
 * ARM VFP optimized float to int16 conversion.
 * Assume that len is a positive number and is multiple of 8, destination
 * buffer is at least 4 bytes aligned (8 bytes alignment is better for
 * performance), little endian byte sex
 *
 * Register use: r0 = dst, r1 = src, r2 = len (samples still to convert).
 * Each loop iteration converts 8 floats (s16-s23 -> s0-s7), saturates the
 * integer results to 16 bits with ssat, packs them pairwise with pkhbt and
 * stores the 8 int16 values with a single stmia. The load and conversion
 * of the next 8 floats are conditional on "gt", i.e. they only execute
 * while samples remain, so nothing is read past src[len - 1].
 *
 * Note: the C prototype for this function declares len as long; the
 * signature comment below shows int.
 */
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
function ff_float_to_int16_vfp, export=1
push {r4-r8,lr}
/* d8-d11 alias s16-s23, which serve as the load buffer below */
vpush {d8-d11}
vldmia r1!, {s16-s23}
vcvt.s32.f32 s0, s16
vcvt.s32.f32 s1, s17
vcvt.s32.f32 s2, s18
vcvt.s32.f32 s3, s19
vcvt.s32.f32 s4, s20
vcvt.s32.f32 s5, s21
vcvt.s32.f32 s6, s22
vcvt.s32.f32 s7, s23
1:
/* flags set here drive every "gt" instruction until the end of the loop */
subs r2, r2, #8
vmov r3, r4, s0, s1
vmov r5, r6, s2, s3
vmov r7, r8, s4, s5
vmov ip, lr, s6, s7
vldmiagt r1!, {s16-s23}
ssat r4, #16, r4
ssat r3, #16, r3
ssat r6, #16, r6
ssat r5, #16, r5
pkhbt r3, r3, r4, lsl #16
pkhbt r4, r5, r6, lsl #16
vcvtgt.s32.f32 s0, s16
vcvtgt.s32.f32 s1, s17
vcvtgt.s32.f32 s2, s18
vcvtgt.s32.f32 s3, s19
vcvtgt.s32.f32 s4, s20
vcvtgt.s32.f32 s5, s21
vcvtgt.s32.f32 s6, s22
vcvtgt.s32.f32 s7, s23
ssat r8, #16, r8
ssat r7, #16, r7
ssat lr, #16, lr
ssat ip, #16, ip
pkhbt r5, r7, r8, lsl #16
pkhbt r6, ip, lr, lsl #16
/* r3-r6 now hold 8 packed int16 samples (16 bytes) */
stmia r0!, {r3-r6}
bgt 1b
vpop {d8-d11}
pop {r4-r8,pc}
endfunc

View File

@ -33,6 +33,7 @@
#include "get_bits.h"
#include "dsputil.h"
#include "fft.h"
#include "fmtconvert.h"
extern const uint16_t ff_wma_critical_freqs[25];
@ -43,6 +44,7 @@ typedef struct {
AVCodecContext *avctx;
GetBitContext gb;
DSPContext dsp;
FmtConvertContext fmt_conv;
int first;
int channels;
int frame_len; ///< transform size (samples)
@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
s->avctx = avctx;
dsputil_init(&s->dsp, avctx);
ff_fmt_convert_init(&s->fmt_conv, avctx);
/* determine frame length */
if (avctx->sample_rate < 22050) {
@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct)
ff_rdft_calc(&s->trans.rdft, coeffs);
}
s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels);
s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr,
s->frame_len, s->channels);
if (!s->first) {
int count = s->overlap_len * s->channels;

View File

@ -40,6 +40,7 @@
#include "dca.h"
#include "synth_filter.h"
#include "dcadsp.h"
#include "fmtconvert.h"
//#define TRACE
@ -347,6 +348,7 @@ typedef struct {
FFTContext imdct;
SynthFilterContext synth;
DCADSPContext dcadsp;
FmtConvertContext fmt_conv;
} DCAContext;
static const uint16_t dca_vlc_offs[] = {
@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index)
block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel);
}
s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l],
s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l],
block, rscale, 8);
}
@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx,
}
}
s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
samples += 256 * channels;
}
@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx)
ff_mdct_init(&s->imdct, 6, 1, 1.0);
ff_synth_filter_init(&s->synth);
ff_dcadsp_init(&s->dcadsp);
ff_fmt_convert_init(&s->fmt_conv, avctx);
for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++)
s->samples_chanptr[i] = s->samples + i * 256;

View File

@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len)
return p;
}
/**
 * Convert an array of ints to floats, multiplying each by a scalar.
 *
 * @param dst  output array, at least len elements
 * @param src  input array, at least len elements
 * @param mul  factor applied to every converted element
 * @param len  number of elements; nothing is written when len <= 0
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int remaining;

    /* walk both arrays in step, counting the remaining elements down */
    for (remaining = len; remaining > 0; remaining--) {
        *dst = *src * mul;
        dst++;
        src++;
    }
}
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
uint32_t maxi, uint32_t maxisign)
{
@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i
}
}
/**
 * Convert a single float sample to an integer for int16 output.
 *
 * The value is rounded with lrintf() and the result is passed through
 * av_clip_int16() (defined elsewhere; presumably it clamps to the int16
 * range -- confirm at its definition).
 *
 * @param src  pointer to the sample to convert
 * @return     the value returned by av_clip_int16()
 */
static av_always_inline int float_to_int16_one(const float *src){
    const long rounded = lrintf(*src);

    return av_clip_int16(rounded);
}
/**
 * Convert len float samples to int16_t, one sample at a time.
 *
 * Every element goes through float_to_int16_one(). The index has the same
 * type as len (long), so it cannot overflow before reaching len.
 *
 * @param dst  destination array, at least len elements
 * @param src  source array, at least len elements
 * @param len  number of samples to convert; nothing is written if len <= 0
 */
static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    long i;

    for (i = 0; i < len; i++)
        dst[i] = float_to_int16_one(src + i);
}
/**
 * Convert planar float channels to interleaved int16_t samples.
 *
 * Output sample i of channel c is written to dst[i * channels + c]. Every
 * element goes through float_to_int16_one(). Sample and output indices are
 * long, matching len, so neither i nor the output index (which grows up to
 * len * channels) is limited to the range of int.
 *
 * @param dst       destination array, at least len * channels elements
 * @param src       array of `channels` pointers, each to at least len floats
 * @param len       number of samples per channel
 * @param channels  number of channels; nothing is written if it is <= 0
 */
static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    long i, j;
    int c;

    if (channels == 2) {
        /* stereo: write both channels of a sample in one pass */
        for (i = 0; i < len; i++) {
            dst[2*i]     = float_to_int16_one(src[0] + i);
            dst[2*i + 1] = float_to_int16_one(src[1] + i);
        }
    } else {
        /* general case: one pass per channel, stepping dst by `channels` */
        for (c = 0; c < channels; c++)
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = float_to_int16_one(src[c] + i);
    }
}
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
int res = 0;
@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->vector_fmul_reverse = vector_fmul_reverse_c;
c->vector_fmul_add = vector_fmul_add_c;
c->vector_fmul_window = vector_fmul_window_c;
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
c->vector_clipf = vector_clipf_c;
c->float_to_int16 = ff_float_to_int16_c;
c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
c->scalarproduct_int16 = scalarproduct_int16_c;
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
c->scalarproduct_float = scalarproduct_float_c;

View File

@ -392,7 +392,6 @@ typedef struct DSPContext {
/* assume len is a multiple of 4, and arrays are 16-byte aligned */
void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
/* assume len is a multiple of 8, and arrays are 16-byte aligned */
void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
/**
* Multiply a vector of floats by a scalar float. Source and
@ -445,10 +444,6 @@ typedef struct DSPContext {
*/
void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
/* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
void (*float_to_int16)(int16_t *dst, const float *src, long len);
void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
/* (I)DCT */
void (*fdct)(DCTELEM *block/* align 16*/);
void (*fdct248)(DCTELEM *block/* align 16*/);

68
libavcodec/fmtconvert.c Normal file
View File

@ -0,0 +1,68 @@
/*
* Format Conversion Utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "avcodec.h"
#include "fmtconvert.h"
/**
 * Plain C implementation of int32_to_float_fmul_scalar.
 *
 * Each of the len integers in src is converted to float, multiplied by mul
 * and written to the matching slot of dst. Nothing is written when len <= 0.
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int n = 0;

    while (n < len) {
        dst[n] = mul * src[n];
        n++;
    }
}
/* Round the float at *src with lrintf() and clip the result to the int16_t range. */
static av_always_inline int float_to_int16_one(const float *src){
    const long rounded = lrintf(*src);

    return av_clip_int16(rounded);
}
/**
 * Plain C implementation of float_to_int16.
 *
 * Converts len floats from src to int16_t in dst, one sample at a time,
 * using float_to_int16_one() (round, then clip to the int16_t range).
 *
 * @param dst destination array of int16_t
 * @param src source array of float
 * @param len number of elements to convert
 */
static void float_to_int16_c(int16_t *dst, const float *src, long len)
{
    long i; /* same type as len, so the index cannot overflow before reaching it */

    for (i = 0; i < len; i++)
        dst[i] = float_to_int16_one(src + i);
}
/**
 * Plain C implementation of float_to_int16_interleave.
 *
 * Converts len samples from each of the channels float arrays in src and
 * writes them interleaved into dst, i.e. sample i of channel c ends up at
 * dst[i * channels + c]. Stereo has its own loop; every other channel count
 * goes through the generic per-channel loop.
 *
 * @param dst      destination array of interleaved int16_t
 * @param src      array of per-channel float arrays
 * @param len      number of samples to convert per channel
 * @param channels number of channels
 */
static void float_to_int16_interleave_c(int16_t *dst, const float **src,
                                        long len, int channels)
{
    /* long like len: the output index j grows up to len * channels and
     * must not wrap around in a narrower type */
    long i, j;
    int c;

    if (channels == 2) {
        for (i = 0; i < len; i++) {
            dst[2*i]     = float_to_int16_one(src[0] + i);
            dst[2*i + 1] = float_to_int16_one(src[1] + i);
        }
    } else {
        for (c = 0; c < channels; c++)
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = float_to_int16_one(src[c] + i);
    }
}
/**
 * Initialize a FmtConvertContext.
 *
 * All three function pointers are first set to the C implementations in this
 * file; afterwards the architecture-specific init functions are called for
 * the architectures enabled at build time (ARCH_ARM, ARCH_PPC, HAVE_MMX).
 */
av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
{
    /* portable defaults */
    c->float_to_int16_interleave  = float_to_int16_interleave_c;
    c->float_to_int16             = float_to_int16_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;

    /* per-architecture hooks, kept in the original ARM, PPC, x86 order */
    if (ARCH_ARM) {
        ff_fmt_convert_init_arm(c, avctx);
    }
    if (ARCH_PPC) {
        ff_fmt_convert_init_ppc(c, avctx);
    }
    if (HAVE_MMX) {
        ff_fmt_convert_init_x86(c, avctx);
    }
}

79
libavcodec/fmtconvert.h Normal file
View File

@ -0,0 +1,79 @@
/*
* Format Conversion Utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_FMTCONVERT_H
#define AVCODEC_FMTCONVERT_H
#include "avcodec.h"
/**
* Table of sample format conversion functions.
* The pointers are filled in by ff_fmt_convert_init().
*/
typedef struct FmtConvertContext {
/**
* Convert an array of int32_t to float and multiply by a float value.
* @param dst destination array of float.
* constraints: 16-byte aligned
* @param src source array of int32_t.
* constraints: 16-byte aligned
* @param mul value each converted element is multiplied by
* @param len number of elements to convert.
* constraints: multiple of 8
*/
void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
/**
* Convert an array of float to an array of int16_t.
*
* Convert floats in the range [-32768.0,32767.0] to ints
* without rescaling.
*
* @param dst destination array of int16_t.
* constraints: 16-byte aligned
* @param src source array of float.
* constraints: 16-byte aligned
* @param len number of elements to convert.
* constraints: multiple of 8
*/
void (*float_to_int16)(int16_t *dst, const float *src, long len);
/**
* Convert multiple arrays of float to an interleaved array of int16_t.
*
* Convert floats in the range [-32768.0,32767.0] to ints
* without rescaling.
*
* @param dst destination array of interleaved int16_t.
* constraints: 16-byte aligned
* @param src source array of float arrays, one for each channel.
* constraints: 16-byte aligned
* @param len number of elements to convert.
* constraints: multiple of 8
* @param channels number of channels
*/
void (*float_to_int16_interleave)(int16_t *dst, const float **src,
long len, int channels);
} FmtConvertContext;
/**
* Initialize c: the C implementations are installed first, then the
* architecture-specific init functions enabled at build time are called.
*
* @param c     context to fill in
* @param avctx codec context, passed on to the architecture-specific inits
*/
void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
/* Architecture-specific initializers called from ff_fmt_convert_init(). */
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
#endif /* AVCODEC_FMTCONVERT_H */

View File

@ -38,6 +38,7 @@
#include "avcodec.h"
#include "dsputil.h"
#include "fft.h"
#include "fmtconvert.h"
#define ALT_BITSTREAM_READER_LE
#include "get_bits.h"
@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext {
float scale_bias;
DSPContext dsp;
FFTContext imdct_ctx;
FmtConvertContext fmt_conv;
DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
} NellyMoserDecodeContext;
@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) {
ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);
dsputil_init(&s->dsp, avctx);
ff_fmt_convert_init(&s->fmt_conv, avctx);
s->scale_bias = 1.0/(1*8);
@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx,
for (i=0 ; i<blocks ; i++) {
nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf);
s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
*data_size += NELLY_SAMPLES*sizeof(int16_t);
}

View File

@ -21,6 +21,7 @@ ALTIVEC-OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o \
OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \
ppc/fdct_altivec.o \
ppc/float_altivec.o \
ppc/fmtconvert_altivec.o \
ppc/gmc_altivec.o \
ppc/idct_altivec.o \
ppc/int_altivec.o \

View File

@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa
}
}
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
{
union {
vector float v;
float s[4];
} mul_u;
int i;
vector float src1, src2, dst1, dst2, mul_v, zero;
zero = (vector float)vec_splat_u32(0);
mul_u.s[0] = mul;
mul_v = vec_splat(mul_u.v, 0);
for(i=0; i<len; i+=8) {
src1 = vec_ctf(vec_ld(0, src+i), 0);
src2 = vec_ctf(vec_ld(16, src+i), 0);
dst1 = vec_madd(src1, mul_v, zero);
dst2 = vec_madd(src2, mul_v, zero);
vec_st(dst1, 0, dst+i);
vec_st(dst2, 16, dst+i);
}
}
static vector signed short
float_to_int16_one_altivec(const float *src)
{
vector float s0 = vec_ld(0, src);
vector float s1 = vec_ld(16, src);
vector signed int t0 = vec_cts(s0, 0);
vector signed int t1 = vec_cts(s1, 0);
return vec_packs(t0,t1);
}
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
int i;
vector signed short d0, d1, d;
vector unsigned char align;
if(((long)dst)&15) //FIXME
for(i=0; i<len-7; i+=8) {
d0 = vec_ld(0, dst+i);
d = float_to_int16_one_altivec(src+i);
d1 = vec_ld(15, dst+i);
d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
align = vec_lvsr(0, dst+i);
d0 = vec_perm(d1, d, align);
d1 = vec_perm(d, d1, align);
vec_st(d0, 0, dst+i);
vec_st(d1,15, dst+i);
}
else
for(i=0; i<len-7; i+=8) {
d = float_to_int16_one_altivec(src+i);
vec_st(d, 0, dst+i);
}
}
static void
float_to_int16_interleave_altivec(int16_t *dst, const float **src,
long len, int channels)
{
int i;
vector signed short d0, d1, d2, c0, c1, t0, t1;
vector unsigned char align;
if(channels == 1)
float_to_int16_altivec(dst, src[0], len);
else
if (channels == 2) {
if(((long)dst)&15)
for(i=0; i<len-7; i+=8) {
d0 = vec_ld(0, dst + i);
t0 = float_to_int16_one_altivec(src[0] + i);
d1 = vec_ld(31, dst + i);
t1 = float_to_int16_one_altivec(src[1] + i);
c0 = vec_mergeh(t0, t1);
c1 = vec_mergel(t0, t1);
d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
align = vec_lvsr(0, dst + i);
d0 = vec_perm(d2, c0, align);
d1 = vec_perm(c0, c1, align);
vec_st(d0, 0, dst + i);
d0 = vec_perm(c1, d2, align);
vec_st(d1, 15, dst + i);
vec_st(d0, 31, dst + i);
dst+=8;
}
else
for(i=0; i<len-7; i+=8) {
t0 = float_to_int16_one_altivec(src[0] + i);
t1 = float_to_int16_one_altivec(src[1] + i);
d0 = vec_mergeh(t0, t1);
d1 = vec_mergel(t0, t1);
vec_st(d0, 0, dst + i);
vec_st(d1, 16, dst + i);
dst+=8;
}
} else {
DECLARE_ALIGNED(16, int16_t, tmp)[len];
int c, j;
for (c = 0; c < channels; c++) {
float_to_int16_altivec(tmp, src[c], len);
for (i = 0, j = c; i < len; i++, j+=channels) {
dst[j] = tmp[i];
}
}
}
}
void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
c->vector_fmul = vector_fmul_altivec;
c->vector_fmul_reverse = vector_fmul_reverse_altivec;
c->vector_fmul_add = vector_fmul_add_altivec;
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->vector_fmul_window = vector_fmul_window_altivec;
c->float_to_int16 = float_to_int16_altivec;
c->float_to_int16_interleave = float_to_int16_interleave_altivec;
}
}

View File

@ -0,0 +1,142 @@
/*
* Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/fmtconvert.h"
#include "dsputil_altivec.h"
#include "util_altivec.h"
/**
* AltiVec implementation of int32_to_float_fmul_scalar:
* dst[i] = (float)src[i] * mul.
*
* Eight elements (two vectors) are handled per loop iteration with
* vec_ld/vec_st, so src and dst are expected to be 16-byte aligned and len
* a multiple of 8 (the loop always processes whole groups of eight).
*/
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
{
/* union used to move the scalar mul into a vector register */
union {
vector float v;
float s[4];
} mul_u;
int i;
vector float src1, src2, dst1, dst2, mul_v, zero;
/* zero addend: vec_madd(a, b, zero) then acts as a plain multiply */
zero = (vector float)vec_splat_u32(0);
/* broadcast mul to all four lanes */
mul_u.s[0] = mul;
mul_v = vec_splat(mul_u.v, 0);
for(i=0; i<len; i+=8) {
/* load 2x4 ints and convert them to float (scale factor 0) */
src1 = vec_ctf(vec_ld(0, src+i), 0);
src2 = vec_ctf(vec_ld(16, src+i), 0);
dst1 = vec_madd(src1, mul_v, zero);
dst2 = vec_madd(src2, mul_v, zero);
vec_st(dst1, 0, dst+i);
vec_st(dst2, 16, dst+i);
}
}
/**
* Convert 8 consecutive floats starting at src into one vector of 8 int16.
*
* Two float vectors are loaded with vec_ld (src is expected to be 16-byte
* aligned), converted to 32-bit integers with vec_cts and packed to 16 bit
* with saturation by vec_packs.
* NOTE(review): vec_cts presumably rounds differently from lrintf() in the
* C version, which would explain why the init function does not install the
* callers when CODEC_FLAG_BITEXACT is set -- confirm.
*/
static vector signed short
float_to_int16_one_altivec(const float *src)
{
vector float s0 = vec_ld(0, src);
vector float s1 = vec_ld(16, src);
vector signed int t0 = vec_cts(s0, 0);
vector signed int t1 = vec_cts(s1, 0);
return vec_packs(t0,t1);
}
/**
* AltiVec implementation of float_to_int16.
*
* Converts the samples in groups of 8; the loop bound i < len-7 means that
* only complete groups are converted, so a tail of len % 8 samples is left
* untouched. Two code paths are selected by the alignment of dst.
*/
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
int i;
vector signed short d0, d1, d;
vector unsigned char align;
/* dst not 16-byte aligned: read-modify-write the two aligned vectors that
* the 16 output bytes straddle */
if(((long)dst)&15) //FIXME
for(i=0; i<len-7; i+=8) {
d0 = vec_ld(0, dst+i);
d = float_to_int16_one_altivec(src+i);
d1 = vec_ld(15, dst+i);
/* keep the existing destination bytes around the output window */
d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
align = vec_lvsr(0, dst+i);
/* merge the new samples into both aligned vectors */
d0 = vec_perm(d1, d, align);
d1 = vec_perm(d, d1, align);
vec_st(d0, 0, dst+i);
vec_st(d1,15, dst+i);
}
else
/* aligned dst: store each converted vector directly */
for(i=0; i<len-7; i+=8) {
d = float_to_int16_one_altivec(src+i);
vec_st(d, 0, dst+i);
}
}
/**
* AltiVec implementation of float_to_int16_interleave.
*
* - channels == 1: forwards to float_to_int16_altivec().
* - channels == 2: converts 8 samples of each channel per iteration and
*   interleaves them with vec_mergeh/vec_mergel, with separate paths for
*   aligned and unaligned dst.
* - otherwise: converts each channel into a temporary buffer and scatters
*   it into dst with a stride of channels.
*/
static void
float_to_int16_interleave_altivec(int16_t *dst, const float **src,
long len, int channels)
{
int i;
vector signed short d0, d1, d2, c0, c1, t0, t1;
vector unsigned char align;
if(channels == 1)
float_to_int16_altivec(dst, src[0], len);
else
if (channels == 2) {
/* unaligned dst: the 32 output bytes span three aligned vectors, which
* are merged with the existing destination bytes before being stored */
if(((long)dst)&15)
for(i=0; i<len-7; i+=8) {
d0 = vec_ld(0, dst + i);
t0 = float_to_int16_one_altivec(src[0] + i);
d1 = vec_ld(31, dst + i);
t1 = float_to_int16_one_altivec(src[1] + i);
/* interleave: c0 = first 4 sample pairs, c1 = last 4 sample pairs */
c0 = vec_mergeh(t0, t1);
c1 = vec_mergel(t0, t1);
d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
align = vec_lvsr(0, dst + i);
d0 = vec_perm(d2, c0, align);
d1 = vec_perm(c0, c1, align);
vec_st(d0, 0, dst + i);
d0 = vec_perm(c1, d2, align);
vec_st(d1, 15, dst + i);
vec_st(d0, 31, dst + i);
/* i advances by 8 input samples but 16 int16 are written per
* iteration, so dst is bumped by the remaining 8 */
dst+=8;
}
else
for(i=0; i<len-7; i+=8) {
t0 = float_to_int16_one_altivec(src[0] + i);
t1 = float_to_int16_one_altivec(src[1] + i);
d0 = vec_mergeh(t0, t1);
d1 = vec_mergel(t0, t1);
vec_st(d0, 0, dst + i);
vec_st(d1, 16, dst + i);
/* same as above: dst+i must advance by 16 int16 per iteration */
dst+=8;
}
} else {
/* variable-length stack buffer holding one converted channel */
DECLARE_ALIGNED(16, int16_t, tmp)[len];
int c, j;
for (c = 0; c < channels; c++) {
float_to_int16_altivec(tmp, src[c], len);
/* scatter channel c into its interleaved positions */
for (i = 0, j = c; i < len; i++, j+=channels) {
dst[j] = tmp[i];
}
}
}
}
/**
 * Install the AltiVec format conversion functions into c.
 *
 * The int32 -> float conversion is always replaced; the float -> int16
 * conversions are only replaced when CODEC_FLAG_BITEXACT is not set in
 * avctx->flags.
 *
 * NOTE(review): no runtime check for AltiVec support is visible in this
 * function -- confirm that the caller or the build guarantees it.
 */
void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx)
{
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;

    /* bit-exact output requested: keep the float -> int16 functions as is */
    if (avctx->flags & CODEC_FLAG_BITEXACT)
        return;

    c->float_to_int16            = float_to_int16_altivec;
    c->float_to_int16_interleave = float_to_int16_interleave_altivec;
}

View File

@ -31,6 +31,7 @@
#include "get_bits.h"
#include "dsputil.h"
#include "fft.h"
#include "fmtconvert.h"
#include "vorbis.h"
#include "xiph.h"
@ -127,6 +128,7 @@ typedef struct vorbis_context_s {
AVCodecContext *avccontext;
GetBitContext gb;
DSPContext dsp;
FmtConvertContext fmt_conv;
FFTContext mdct[2];
uint_fast8_t first_frame;
@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext)
vc->avccontext = avccontext;
dsputil_init(&vc->dsp, avccontext);
ff_fmt_convert_init(&vc->fmt_conv, avccontext);
vc->scale_bias = 32768.0f;
@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext,
len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i];
}
vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels);
vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len,
vc->audio_channels);
*data_size = len * 2 * vc->audio_channels;
return buf_size ;

View File

@ -126,6 +126,7 @@ int ff_wma_init(AVCodecContext *avctx, int flags2)
s->block_align = avctx->block_align;
dsputil_init(&s->dsp, avctx);
ff_fmt_convert_init(&s->fmt_conv, avctx);
if (avctx->codec->id == CODEC_ID_WMAV1) {
s->version = 1;

View File

@ -26,6 +26,7 @@
#include "put_bits.h"
#include "dsputil.h"
#include "fft.h"
#include "fmtconvert.h"
/* size of blocks */
#define BLOCK_MIN_BITS 7
@ -134,6 +135,7 @@ typedef struct WMACodecContext {
float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
DSPContext dsp;
FmtConvertContext fmt_conv;
#ifdef TRACE
int frame_count;

View File

@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples)
incr = s->nb_channels;
for (ch = 0; ch < MAX_CHANNELS; ch++)
output[ch] = s->frame_out[ch];
s->dsp.float_to_int16_interleave(samples, output, n, incr);
s->fmt_conv.float_to_int16_interleave(samples, output, n, incr);
for (ch = 0; ch < incr; ch++) {
/* prepare for next block */
memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float));

View File

@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o
MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \
x86/deinterlace.o \
x86/fmtconvert.o \
x86/h264_chromamc.o \
$(YASM-OBJS-yes)
@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT) += x86/fft.o
OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \
x86/dsputil_mmx.o \
x86/fdct_mmx.o \
x86/fmtconvert_mmx.o \
x86/idct_mmx_xvid.o \
x86/idct_sse2_xvid.o \
x86/motion_est_mmx.o \

View File

@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s
}
#endif /* HAVE_6REGS */
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
{
x86_reg i = -4*len;
__asm__ volatile(
"movss %3, %%xmm4 \n"
"shufps $0, %%xmm4, %%xmm4 \n"
"1: \n"
"cvtpi2ps (%2,%0), %%xmm0 \n"
"cvtpi2ps 8(%2,%0), %%xmm1 \n"
"cvtpi2ps 16(%2,%0), %%xmm2 \n"
"cvtpi2ps 24(%2,%0), %%xmm3 \n"
"movlhps %%xmm1, %%xmm0 \n"
"movlhps %%xmm3, %%xmm2 \n"
"mulps %%xmm4, %%xmm0 \n"
"mulps %%xmm4, %%xmm2 \n"
"movaps %%xmm0, (%1,%0) \n"
"movaps %%xmm2, 16(%1,%0) \n"
"add $32, %0 \n"
"jl 1b \n"
:"+r"(i)
:"r"(dst+len), "r"(src+len), "m"(mul)
);
}
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
{
x86_reg i = -4*len;
__asm__ volatile(
"movss %3, %%xmm4 \n"
"shufps $0, %%xmm4, %%xmm4 \n"
"1: \n"
"cvtdq2ps (%2,%0), %%xmm0 \n"
"cvtdq2ps 16(%2,%0), %%xmm1 \n"
"mulps %%xmm4, %%xmm0 \n"
"mulps %%xmm4, %%xmm1 \n"
"movaps %%xmm0, (%1,%0) \n"
"movaps %%xmm1, 16(%1,%0) \n"
"add $32, %0 \n"
"jl 1b \n"
:"+r"(i)
:"r"(dst+len), "r"(src+len), "m"(mul)
);
}
static void vector_clipf_sse(float *dst, const float *src, float min, float max,
int len)
{
@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max,
);
}
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
x86_reg reglen = len;
// not bit-exact: pf2id uses different rounding than C and SSE
__asm__ volatile(
"add %0 , %0 \n\t"
"lea (%2,%0,2) , %2 \n\t"
"add %0 , %1 \n\t"
"neg %0 \n\t"
"1: \n\t"
"pf2id (%2,%0,2) , %%mm0 \n\t"
"pf2id 8(%2,%0,2) , %%mm1 \n\t"
"pf2id 16(%2,%0,2) , %%mm2 \n\t"
"pf2id 24(%2,%0,2) , %%mm3 \n\t"
"packssdw %%mm1 , %%mm0 \n\t"
"packssdw %%mm3 , %%mm2 \n\t"
"movq %%mm0 , (%1,%0) \n\t"
"movq %%mm2 , 8(%1,%0) \n\t"
"add $16 , %0 \n\t"
" js 1b \n\t"
"femms \n\t"
:"+r"(reglen), "+r"(dst), "+r"(src)
);
}
static void float_to_int16_sse(int16_t *dst, const float *src, long len){
x86_reg reglen = len;
__asm__ volatile(
"add %0 , %0 \n\t"
"lea (%2,%0,2) , %2 \n\t"
"add %0 , %1 \n\t"
"neg %0 \n\t"
"1: \n\t"
"cvtps2pi (%2,%0,2) , %%mm0 \n\t"
"cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
"cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
"cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
"packssdw %%mm1 , %%mm0 \n\t"
"packssdw %%mm3 , %%mm2 \n\t"
"movq %%mm0 , (%1,%0) \n\t"
"movq %%mm2 , 8(%1,%0) \n\t"
"add $16 , %0 \n\t"
" js 1b \n\t"
"emms \n\t"
:"+r"(reglen), "+r"(dst), "+r"(src)
);
}
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
x86_reg reglen = len;
__asm__ volatile(
"add %0 , %0 \n\t"
"lea (%2,%0,2) , %2 \n\t"
"add %0 , %1 \n\t"
"neg %0 \n\t"
"1: \n\t"
"cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
"cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
"packssdw %%xmm1 , %%xmm0 \n\t"
"movdqa %%xmm0 , (%1,%0) \n\t"
"add $16 , %0 \n\t"
" js 1b \n\t"
:"+r"(reglen), "+r"(dst), "+r"(src)
);
}
void ff_vp3_idct_mmx(int16_t *input_data);
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data);
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
#if !HAVE_YASM
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#endif
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
DECLARE_ALIGNED(16, int16_t, tmp)[len];\
int i,j,c;\
for(c=0; c<channels; c++){\
float_to_int16_##cpu(tmp, src[c], len);\
for(i=0, j=c; i<len; i++, j+=channels)\
dst[j] = tmp[i];\
}\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
if(channels==1)\
float_to_int16_##cpu(dst, src[0], len);\
else if(channels==2){\
x86_reg reglen = len; \
const float *src0 = src[0];\
const float *src1 = src[1];\
__asm__ volatile(\
"shl $2, %0 \n"\
"add %0, %1 \n"\
"add %0, %2 \n"\
"add %0, %3 \n"\
"neg %0 \n"\
body\
:"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
);\
}else if(channels==6){\
ff_float_to_int16_interleave6_##cpu(dst, src, len);\
}else\
float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
FLOAT_TO_INT16_INTERLEAVE(3dnow,
"1: \n"
"pf2id (%2,%0), %%mm0 \n"
"pf2id 8(%2,%0), %%mm1 \n"
"pf2id (%3,%0), %%mm2 \n"
"pf2id 8(%3,%0), %%mm3 \n"
"packssdw %%mm1, %%mm0 \n"
"packssdw %%mm3, %%mm2 \n"
"movq %%mm0, %%mm1 \n"
"punpcklwd %%mm2, %%mm0 \n"
"punpckhwd %%mm2, %%mm1 \n"
"movq %%mm0, (%1,%0)\n"
"movq %%mm1, 8(%1,%0)\n"
"add $16, %0 \n"
"js 1b \n"
"femms \n"
)
FLOAT_TO_INT16_INTERLEAVE(sse,
"1: \n"
"cvtps2pi (%2,%0), %%mm0 \n"
"cvtps2pi 8(%2,%0), %%mm1 \n"
"cvtps2pi (%3,%0), %%mm2 \n"
"cvtps2pi 8(%3,%0), %%mm3 \n"
"packssdw %%mm1, %%mm0 \n"
"packssdw %%mm3, %%mm2 \n"
"movq %%mm0, %%mm1 \n"
"punpcklwd %%mm2, %%mm0 \n"
"punpckhwd %%mm2, %%mm1 \n"
"movq %%mm0, (%1,%0)\n"
"movq %%mm1, 8(%1,%0)\n"
"add $16, %0 \n"
"js 1b \n"
"emms \n"
)
FLOAT_TO_INT16_INTERLEAVE(sse2,
"1: \n"
"cvtps2dq (%2,%0), %%xmm0 \n"
"cvtps2dq (%3,%0), %%xmm1 \n"
"packssdw %%xmm1, %%xmm0 \n"
"movhlps %%xmm0, %%xmm1 \n"
"punpcklwd %%xmm1, %%xmm0 \n"
"movdqa %%xmm0, (%1,%0) \n"
"add $16, %0 \n"
"js 1b \n"
)
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
if(channels==6)
ff_float_to_int16_interleave6_3dn2(dst, src, len);
else
float_to_int16_interleave_3dnow(dst, src, len, channels);
}
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
if(mm_flags & AV_CPU_FLAG_3DNOW){
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->float_to_int16 = float_to_int16_3dnow;
c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
}
}
if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
}
}
if(mm_flags & AV_CPU_FLAG_MMX2){
#if HAVE_YASM
@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_sse;
#endif
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
c->vector_clipf = vector_clipf_sse;
c->float_to_int16 = float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse;
#if HAVE_YASM
c->scalarproduct_float = ff_scalarproduct_float_sse;
#endif
@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
if(mm_flags & AV_CPU_FLAG_3DNOW)
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
if(mm_flags & AV_CPU_FLAG_SSE2){
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
c->float_to_int16 = float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;

View File

@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
section .text align=16
%macro PSWAPD_SSE 2
pshufw %1, %2, 0x4e
%endmacro
%macro PSWAPD_3DN1 2
movq %1, %2
psrlq %1, 32
punpckldq %1, %2
%endmacro
%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
%define lend r10d
mov lend, r2d
%else
%define lend dword r2m
%endif
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
.loop:
cvtps2pi mm0, [srcq]
cvtps2pi mm1, [srcq+src1q]
cvtps2pi mm2, [srcq+src2q]
cvtps2pi mm3, [srcq+src3q]
cvtps2pi mm4, [srcq+src4q]
cvtps2pi mm5, [srcq+src5q]
packssdw mm0, mm3
packssdw mm1, mm4
packssdw mm2, mm5
pswapd mm3, mm0
punpcklwd mm0, mm1
punpckhwd mm1, mm2
punpcklwd mm2, mm3
pswapd mm3, mm0
punpckldq mm0, mm2
punpckhdq mm2, mm1
punpckldq mm1, mm3
movq [dstq ], mm0
movq [dstq+16], mm2
movq [dstq+ 8], mm1
add srcq, 8
add dstq, 24
sub lend, 2
jg .loop
emms
RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
%define cvtps2pi pf2id
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi
%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift

View File

@ -0,0 +1,91 @@
;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86inc.asm"
section .text align=16
; PSWAPD_SSE dst, src
; Emulates pswapd with pshufw: shuffle immediate 0x4e selects words 2,3,0,1,
; i.e. dst receives src with its two 32-bit halves exchanged.
%macro PSWAPD_SSE 2
pshufw %1, %2, 0x4e
%endmacro
; PSWAPD_3DN1 dst, src
; Emulates pswapd with plain MMX instructions: dst receives src with its
; two 32-bit halves exchanged.
%macro PSWAPD_3DN1 2
movq %1, %2
; move the high dword of src into the low dword of dst
psrlq %1, 32
; dst = { low: old high dword of src, high: low dword of src }
punpckldq %1, %2
%endmacro
; FLOAT_TO_INT16_INTERLEAVE6 suffix
; Emits float_to_int16_interleave6_<suffix>: converts six float channels to
; interleaved int16. The conversion instruction (cvtps2pi) and pswapd can be
; redefined by the includer before instantiating the macro.
; Each loop iteration consumes 2 samples per channel (8 bytes of each source)
; and writes 12 int16 (24 bytes) to dst.
%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
; len is the third argument: copied to r10d on x86-64, otherwise used
; directly from its argument memory slot
%ifdef ARCH_X86_64
%define lend r10d
mov lend, r2d
%else
%define lend dword r2m
%endif
; load the six channel pointers from the src array
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
; turn channels 1..5 into offsets relative to channel 0 so that a single
; advancing pointer (srcq) addresses all six channels
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
.loop:
; convert 2 floats of every channel to 2 int32
cvtps2pi mm0, [srcq]
cvtps2pi mm1, [srcq+src1q]
cvtps2pi mm2, [srcq+src2q]
cvtps2pi mm3, [srcq+src3q]
cvtps2pi mm4, [srcq+src4q]
cvtps2pi mm5, [srcq+src5q]
; saturate to int16, pairing channels 0/3, 1/4 and 2/5
packssdw mm0, mm3
packssdw mm1, mm4
packssdw mm2, mm5
; shuffle the words into interleaved channel order
pswapd mm3, mm0
punpcklwd mm0, mm1
punpckhwd mm1, mm2
punpcklwd mm2, mm3
pswapd mm3, mm0
punpckldq mm0, mm2
punpckhdq mm2, mm1
punpckldq mm1, mm3
movq [dstq ], mm0
movq [dstq+16], mm2
movq [dstq+ 8], mm1
add srcq, 8
add dstq, 24
; two samples per channel done; loop while samples remain
sub lend, 2
jg .loop
emms
RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
; sse variant: cvtps2pi for the conversion, pswapd emulated with pshufw
%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
; 3dnow variant: pf2id for the conversion, pswapd emulated with MMX shifts
%define cvtps2pi pf2id
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow
; 3dn2 variant: pf2id still in effect, pswapd is the real instruction again
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi

View File

@ -0,0 +1,266 @@
/*
* Format Conversion Utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/fmtconvert.h"
/**
* SSE implementation of int32_to_float_fmul_scalar: dst[k] = src[k] * mul.
*
* i starts at the negative byte length (-4*len) and the asm receives the end
* pointers dst+len and src+len, so (%1,%0) and (%2,%0) address the current
* position; the loop runs until i is no longer negative. Each iteration
* converts 8 ints (32 bytes), two at a time with cvtpi2ps, and stores the
* results with movaps, which requires a 16-byte aligned destination.
*/
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
{
x86_reg i = -4*len;
__asm__ volatile(
/* broadcast mul to all four lanes of xmm4 */
"movss %3, %%xmm4 \n"
"shufps $0, %%xmm4, %%xmm4 \n"
"1: \n"
"cvtpi2ps (%2,%0), %%xmm0 \n"
"cvtpi2ps 8(%2,%0), %%xmm1 \n"
"cvtpi2ps 16(%2,%0), %%xmm2 \n"
"cvtpi2ps 24(%2,%0), %%xmm3 \n"
/* combine the two-float halves into full 4-float vectors */
"movlhps %%xmm1, %%xmm0 \n"
"movlhps %%xmm3, %%xmm2 \n"
"mulps %%xmm4, %%xmm0 \n"
"mulps %%xmm4, %%xmm2 \n"
"movaps %%xmm0, (%1,%0) \n"
"movaps %%xmm2, 16(%1,%0) \n"
"add $32, %0 \n"
"jl 1b \n"
:"+r"(i)
:"r"(dst+len), "r"(src+len), "m"(mul)
);
}
/**
* SSE2 implementation of int32_to_float_fmul_scalar: dst[k] = src[k] * mul.
*
* Same loop structure as the SSE version (negative byte index counting up to
* zero from the end pointers, 8 elements per iteration), but converts four
* ints at once with cvtdq2ps. Results are stored with movaps, which requires
* a 16-byte aligned destination.
*/
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
{
x86_reg i = -4*len;
__asm__ volatile(
/* broadcast mul to all four lanes of xmm4 */
"movss %3, %%xmm4 \n"
"shufps $0, %%xmm4, %%xmm4 \n"
"1: \n"
"cvtdq2ps (%2,%0), %%xmm0 \n"
"cvtdq2ps 16(%2,%0), %%xmm1 \n"
"mulps %%xmm4, %%xmm0 \n"
"mulps %%xmm4, %%xmm1 \n"
"movaps %%xmm0, (%1,%0) \n"
"movaps %%xmm1, 16(%1,%0) \n"
"add $32, %0 \n"
"jl 1b \n"
:"+r"(i)
:"r"(dst+len), "r"(src+len), "m"(mul)
);
}
/**
* 3DNow! implementation of float_to_int16.
*
* The prologue turns reglen into -2*len (the negative output size in bytes)
* and moves dst and src to their ends, so (%1,%0) indexes the int16 output
* and (%2,%0,2) the float input. Each iteration converts 8 samples and the
* loop runs while the index is still negative. femms leaves MMX state at
* the end.
*/
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
x86_reg reglen = len;
// not bit-exact: pf2id uses different rounding than C and SSE
__asm__ volatile(
"add %0 , %0 \n\t"
"lea (%2,%0,2) , %2 \n\t"
"add %0 , %1 \n\t"
"neg %0 \n\t"
"1: \n\t"
"pf2id (%2,%0,2) , %%mm0 \n\t"
"pf2id 8(%2,%0,2) , %%mm1 \n\t"
"pf2id 16(%2,%0,2) , %%mm2 \n\t"
"pf2id 24(%2,%0,2) , %%mm3 \n\t"
/* pack 32-bit results to int16 with signed saturation */
"packssdw %%mm1 , %%mm0 \n\t"
"packssdw %%mm3 , %%mm2 \n\t"
"movq %%mm0 , (%1,%0) \n\t"
"movq %%mm2 , 8(%1,%0) \n\t"
"add $16 , %0 \n\t"
" js 1b \n\t"
"femms \n\t"
:"+r"(reglen), "+r"(dst), "+r"(src)
);
}
/**
 * Convert floats to signed 16-bit integers with saturation (SSE + MMX).
 *
 * One loop iteration converts 8 samples and the loop body runs before the
 * end-of-buffer test, so len must be a positive multiple of 8.
 *
 * @param dst output samples
 * @param src input floats
 * @param len number of samples to convert
 */
static void float_to_int16_sse(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        /* Setup: %0 becomes -2*len (negative byte offset into dst), %1 and
         * %2 are advanced to the end of dst and src. src is addressed with
         * scale 2 because a float is twice the size of an int16_t. */
        "add %0 , %0 \n\t"
        "lea (%2,%0,2) , %2 \n\t"
        "add %0 , %1 \n\t"
        "neg %0 \n\t"
        "1: \n\t"
        /* two floats -> two 32-bit integers per cvtps2pi */
        "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
        "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
        "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
        "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
        /* narrow to 16 bits with signed saturation */
        "packssdw %%mm1 , %%mm0 \n\t"
        "packssdw %%mm3 , %%mm2 \n\t"
        "movq %%mm0 , (%1,%0) \n\t"
        "movq %%mm2 , 8(%1,%0) \n\t"
        "add $16 , %0 \n\t"
        " js 1b \n\t"
        /* leave MMX state so that x87 code can run afterwards */
        "emms \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
        :
        /* dst is written and src is read through register operands only,
         * so the compiler has to be told that memory is accessed */
        :"memory"
    );
}
/**
 * Convert floats to signed 16-bit integers with saturation (SSE2).
 *
 * One loop iteration converts 8 samples and the loop body runs before the
 * end-of-buffer test, so len must be a positive multiple of 8.
 * cvtps2dq is used with a memory operand and the result is stored with
 * movdqa, so both src and dst must be 16-byte aligned.
 *
 * @param dst output samples
 * @param src input floats
 * @param len number of samples to convert
 */
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        /* Setup: %0 becomes -2*len (negative byte offset into dst), %1 and
         * %2 are advanced to the end of dst and src. src is addressed with
         * scale 2 because a float is twice the size of an int16_t. */
        "add %0 , %0 \n\t"
        "lea (%2,%0,2) , %2 \n\t"
        "add %0 , %1 \n\t"
        "neg %0 \n\t"
        "1: \n\t"
        /* four floats -> four 32-bit integers per cvtps2dq */
        "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
        "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
        /* narrow to 16 bits with signed saturation */
        "packssdw %%xmm1 , %%xmm0 \n\t"
        "movdqa %%xmm0 , (%1,%0) \n\t"
        "add $16 , %0 \n\t"
        " js 1b \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
        :
        /* dst is written and src is read through register operands only,
         * so the compiler has to be told that memory is accessed */
        :"memory"
    );
}
/*
 * Dedicated 6-channel interleaving converters, one per instruction set.
 * They are defined outside this file.
 * NOTE(review): presumably they live in the yasm sources, given the
 * HAVE_YASM fallback below -- confirm.
 */
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
/*
 * Without yasm, route the 6-channel names to the generic any-channel-count
 * converters generated by FLOAT_TO_INT16_INTERLEAVE, with the channel count
 * fixed to 6. The 3dn2 name falls back to the plain 3dnow routine.
 */
#if !HAVE_YASM
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#endif
/* there is no separate SSE2 6-channel routine; the SSE one is reused */
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
/**
 * Generate the interleaving float -> int16 converters for one CPU flavour.
 *
 * For a given cpu suffix this defines:
 *  - float_to_int16_interleave_misc_<cpu>(): generic path for any channel
 *    count. Each planar channel is converted with float_to_int16_<cpu>()
 *    into a temporary buffer and then scattered into dst with a stride of
 *    `channels`. The temporary is a variable-length array of len samples.
 *  - float_to_int16_interleave_<cpu>(): dispatcher. Mono is converted
 *    directly, stereo uses the inline asm loop passed as `body`, six
 *    channels use ff_float_to_int16_interleave6_<cpu>, and everything else
 *    takes the generic path.
 *
 * In both functions len is the number of samples per channel.
 *
 * `body` is the stereo asm loop. When it starts, %0 holds -4*len (a negative
 * byte offset that must be counted up to zero), %1 points to the end of dst
 * and %2 / %3 point to the end of the first / second input channel.
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    /* scratch buffer holding one converted channel */\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        /* sample i of channel c lands at dst[i*channels + c] */\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            /* 4*len bytes is both the size of one input channel and the */\
            /* size of the interleaved stereo output */\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
            :\
            /* dst is written and both inputs are read through register */\
            /* operands only, so the compiler must be told about memory */\
            :"memory"\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
/*
 * 3DNow! instantiation. The stereo loop converts 4 samples per channel in
 * each iteration and writes 8 interleaved int16 values (16 bytes).
 */
FLOAT_TO_INT16_INTERLEAVE(3dnow,
"1: \n"
/* %2 is the first channel, %3 the second; two floats per pf2id */
"pf2id (%2,%0), %%mm0 \n"
"pf2id 8(%2,%0), %%mm1 \n"
"pf2id (%3,%0), %%mm2 \n"
"pf2id 8(%3,%0), %%mm3 \n"
/* narrow to int16 with saturation: mm0 = first channel, mm2 = second */
"packssdw %%mm1, %%mm0 \n"
"packssdw %%mm3, %%mm2 \n"
/* interleave the 16-bit words of both channels */
"movq %%mm0, %%mm1 \n"
"punpcklwd %%mm2, %%mm0 \n"
"punpckhwd %%mm2, %%mm1 \n"
"movq %%mm0, (%1,%0)\n"
"movq %%mm1, 8(%1,%0)\n"
"add $16, %0 \n"
"js 1b \n"
/* leave MMX state */
"femms \n"
)
/*
 * SSE instantiation (conversion via cvtps2pi into MMX registers). The stereo
 * loop converts 4 samples per channel in each iteration and writes 8
 * interleaved int16 values (16 bytes).
 */
FLOAT_TO_INT16_INTERLEAVE(sse,
"1: \n"
/* %2 is the first channel, %3 the second; two floats per cvtps2pi */
"cvtps2pi (%2,%0), %%mm0 \n"
"cvtps2pi 8(%2,%0), %%mm1 \n"
"cvtps2pi (%3,%0), %%mm2 \n"
"cvtps2pi 8(%3,%0), %%mm3 \n"
/* narrow to int16 with saturation: mm0 = first channel, mm2 = second */
"packssdw %%mm1, %%mm0 \n"
"packssdw %%mm3, %%mm2 \n"
/* interleave the 16-bit words of both channels */
"movq %%mm0, %%mm1 \n"
"punpcklwd %%mm2, %%mm0 \n"
"punpckhwd %%mm2, %%mm1 \n"
"movq %%mm0, (%1,%0)\n"
"movq %%mm1, 8(%1,%0)\n"
"add $16, %0 \n"
"js 1b \n"
/* leave MMX state */
"emms \n"
)
/*
 * SSE2 instantiation. The stereo loop converts 4 samples per channel in each
 * iteration and writes 8 interleaved int16 values (16 bytes) with an aligned
 * store.
 */
FLOAT_TO_INT16_INTERLEAVE(sse2,
"1: \n"
/* %2 is the first channel, %3 the second; four floats per cvtps2dq */
"cvtps2dq (%2,%0), %%xmm0 \n"
"cvtps2dq (%3,%0), %%xmm1 \n"
/* xmm0 = four first-channel words (low half), four second-channel words (high half) */
"packssdw %%xmm1, %%xmm0 \n"
/* bring the second-channel words down into the low half of xmm1 ... */
"movhlps %%xmm0, %%xmm1 \n"
/* ... and interleave them with the first-channel words */
"punpcklwd %%xmm1, %%xmm0 \n"
"movdqa %%xmm0, (%1,%0) \n"
"add $16, %0 \n"
"js 1b \n"
)
/**
 * Interleaving float -> int16 conversion for CPUs with extended 3DNow!.
 *
 * Only the 6-channel layout has a dedicated 3dn2 routine; every other
 * channel count is handed to the plain 3DNow! converter.
 */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if(channels!=6){
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }
    ff_float_to_int16_interleave6_3dn2(dst, src, len);
}
/**
 * Install the x86 implementations of the format conversion functions.
 *
 * The checks run from the oldest to the newest instruction set, so a later
 * match overwrites the pointers set by an earlier one. The 3DNow! variants
 * are skipped when CODEC_FLAG_BITEXACT is set in avctx->flags.
 *
 * @param c     context whose function pointers are updated
 * @param avctx codec context; only its flags are read
 */
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
    const int cpu_flags = av_get_cpu_flags();

    /* nothing is installed unless the CPU reports MMX */
    if (!(cpu_flags & AV_CPU_FLAG_MMX))
        return;

    if ((cpu_flags & AV_CPU_FLAG_3DNOW) &&
        !(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->float_to_int16            = float_to_int16_3dnow;
        c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
    }

    if ((cpu_flags & AV_CPU_FLAG_3DNOWEXT) &&
        !(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
    }

    if (cpu_flags & AV_CPU_FLAG_SSE) {
        c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
        c->float_to_int16             = float_to_int16_sse;
        c->float_to_int16_interleave  = float_to_int16_interleave_sse;
    }

    if (cpu_flags & AV_CPU_FLAG_SSE2) {
        c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
        c->float_to_int16             = float_to_int16_sse2;
        c->float_to_int16_interleave  = float_to_int16_interleave_sse2;
    }
}