mirror of https://git.ffmpeg.org/ffmpeg.git
Separate format conversion DSP functions from DSPContext.
This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com>
This commit is contained in:
parent
770c410fbb
commit
c73d99e672
|
@ -12,6 +12,7 @@ OBJS = allcodecs.o \
|
||||||
bitstream_filter.o \
|
bitstream_filter.o \
|
||||||
dsputil.o \
|
dsputil.o \
|
||||||
faanidct.o \
|
faanidct.o \
|
||||||
|
fmtconvert.o \
|
||||||
imgconvert.o \
|
imgconvert.o \
|
||||||
jrevdct.o \
|
jrevdct.o \
|
||||||
opt.o \
|
opt.o \
|
||||||
|
|
|
@ -35,6 +35,7 @@
|
||||||
#include "fft.h"
|
#include "fft.h"
|
||||||
#include "mpeg4audio.h"
|
#include "mpeg4audio.h"
|
||||||
#include "sbr.h"
|
#include "sbr.h"
|
||||||
|
#include "fmtconvert.h"
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
@ -268,6 +269,7 @@ typedef struct {
|
||||||
FFTContext mdct;
|
FFTContext mdct;
|
||||||
FFTContext mdct_small;
|
FFTContext mdct_small;
|
||||||
DSPContext dsp;
|
DSPContext dsp;
|
||||||
|
FmtConvertContext fmt_conv;
|
||||||
int random_state;
|
int random_state;
|
||||||
/** @} */
|
/** @} */
|
||||||
|
|
||||||
|
|
|
@ -85,6 +85,7 @@
|
||||||
#include "get_bits.h"
|
#include "get_bits.h"
|
||||||
#include "dsputil.h"
|
#include "dsputil.h"
|
||||||
#include "fft.h"
|
#include "fft.h"
|
||||||
|
#include "fmtconvert.h"
|
||||||
#include "lpc.h"
|
#include "lpc.h"
|
||||||
|
|
||||||
#include "aac.h"
|
#include "aac.h"
|
||||||
|
@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx)
|
||||||
ff_aac_sbr_init();
|
ff_aac_sbr_init();
|
||||||
|
|
||||||
dsputil_init(&ac->dsp, avctx);
|
dsputil_init(&ac->dsp, avctx);
|
||||||
|
ff_fmt_convert_init(&ac->fmt_conv, avctx);
|
||||||
|
|
||||||
ac->random_state = 0x1f2e3d4c;
|
ac->random_state = 0x1f2e3d4c;
|
||||||
|
|
||||||
|
@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
|
||||||
*data_size = data_size_tmp;
|
*data_size = data_size_tmp;
|
||||||
|
|
||||||
if (samples)
|
if (samples)
|
||||||
ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
|
ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
|
||||||
|
|
||||||
if (ac->output_configured)
|
if (ac->output_configured)
|
||||||
ac->output_configured = OC_LOCKED;
|
ac->output_configured = OC_LOCKED;
|
||||||
|
|
|
@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
|
||||||
ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
|
ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
|
||||||
ff_kbd_window_init(s->window, 5.0, 256);
|
ff_kbd_window_init(s->window, 5.0, 256);
|
||||||
dsputil_init(&s->dsp, avctx);
|
dsputil_init(&s->dsp, avctx);
|
||||||
|
ff_fmt_convert_init(&s->fmt_conv, avctx);
|
||||||
av_lfg_init(&s->dith_state, 0);
|
av_lfg_init(&s->dith_state, 0);
|
||||||
|
|
||||||
/* set scale value for float to int16 conversion */
|
/* set scale value for float to int16 conversion */
|
||||||
|
@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
|
||||||
} else {
|
} else {
|
||||||
gain *= s->dynamic_range[0];
|
gain *= s->dynamic_range[0];
|
||||||
}
|
}
|
||||||
s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
|
s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* apply spectral extension to high frequency bins */
|
/* apply spectral extension to high frequency bins */
|
||||||
|
@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
|
||||||
av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
|
av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
|
||||||
err = 1;
|
err = 1;
|
||||||
}
|
}
|
||||||
s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
|
s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
|
||||||
out_samples += 256 * s->out_channels;
|
out_samples += 256 * s->out_channels;
|
||||||
}
|
}
|
||||||
*data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t);
|
*data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t);
|
||||||
|
|
|
@ -55,6 +55,7 @@
|
||||||
#include "get_bits.h"
|
#include "get_bits.h"
|
||||||
#include "dsputil.h"
|
#include "dsputil.h"
|
||||||
#include "fft.h"
|
#include "fft.h"
|
||||||
|
#include "fmtconvert.h"
|
||||||
|
|
||||||
/* override ac3.h to include coupling channel */
|
/* override ac3.h to include coupling channel */
|
||||||
#undef AC3_MAX_CHANNELS
|
#undef AC3_MAX_CHANNELS
|
||||||
|
@ -190,6 +191,7 @@ typedef struct {
|
||||||
|
|
||||||
///@defgroup opt optimization
|
///@defgroup opt optimization
|
||||||
DSPContext dsp; ///< for optimization
|
DSPContext dsp; ///< for optimization
|
||||||
|
FmtConvertContext fmt_conv; ///< optimized conversion functions
|
||||||
float mul_bias; ///< scaling for float_to_int16 conversion
|
float mul_bias; ///< scaling for float_to_int16 conversion
|
||||||
///@}
|
///@}
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
|
||||||
OBJS += arm/dsputil_init_arm.o \
|
OBJS += arm/dsputil_init_arm.o \
|
||||||
arm/dsputil_arm.o \
|
arm/dsputil_arm.o \
|
||||||
arm/fft_init_arm.o \
|
arm/fft_init_arm.o \
|
||||||
|
arm/fmtconvert_init_arm.o \
|
||||||
arm/jrevdct_arm.o \
|
arm/jrevdct_arm.o \
|
||||||
arm/mpegvideo_arm.o \
|
arm/mpegvideo_arm.o \
|
||||||
arm/simple_idct_arm.o \
|
arm/simple_idct_arm.o \
|
||||||
|
@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \
|
||||||
arm/dsputil_armv6.o \
|
arm/dsputil_armv6.o \
|
||||||
arm/simple_idct_armv6.o \
|
arm/simple_idct_armv6.o \
|
||||||
|
|
||||||
|
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \
|
||||||
|
|
||||||
OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \
|
OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \
|
||||||
arm/dsputil_init_vfp.o \
|
arm/dsputil_init_vfp.o \
|
||||||
|
$(VFP-OBJS-yes)
|
||||||
|
|
||||||
OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
|
OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
|
||||||
arm/mpegvideo_iwmmxt.o \
|
arm/mpegvideo_iwmmxt.o \
|
||||||
|
@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \
|
||||||
|
|
||||||
OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \
|
OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \
|
||||||
arm/dsputil_neon.o \
|
arm/dsputil_neon.o \
|
||||||
|
arm/fmtconvert_neon.o \
|
||||||
arm/int_neon.o \
|
arm/int_neon.o \
|
||||||
arm/mpegvideo_neon.o \
|
arm/mpegvideo_neon.o \
|
||||||
arm/simple_idct_neon.o \
|
arm/simple_idct_neon.o \
|
||||||
|
|
|
@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
|
||||||
int len);
|
int len);
|
||||||
void ff_butterflies_float_neon(float *v1, float *v2, int len);
|
void ff_butterflies_float_neon(float *v1, float *v2, int len);
|
||||||
float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
|
float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
|
||||||
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
|
|
||||||
float mul, int len);
|
|
||||||
void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
|
void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
|
||||||
const float *src1, int len);
|
const float *src1, int len);
|
||||||
void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
|
void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
|
||||||
|
@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
|
||||||
|
|
||||||
void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
|
void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
|
||||||
int len);
|
int len);
|
||||||
void ff_float_to_int16_neon(int16_t *, const float *, long);
|
|
||||||
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
|
|
||||||
|
|
||||||
void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
|
void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
|
||||||
|
|
||||||
|
@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
|
||||||
c->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
|
c->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
|
||||||
c->butterflies_float = ff_butterflies_float_neon;
|
c->butterflies_float = ff_butterflies_float_neon;
|
||||||
c->scalarproduct_float = ff_scalarproduct_float_neon;
|
c->scalarproduct_float = ff_scalarproduct_float_neon;
|
||||||
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
|
|
||||||
c->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
|
c->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
|
||||||
c->vector_fmul_add = ff_vector_fmul_add_neon;
|
c->vector_fmul_add = ff_vector_fmul_add_neon;
|
||||||
c->vector_clipf = ff_vector_clipf_neon;
|
c->vector_clipf = ff_vector_clipf_neon;
|
||||||
|
@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
|
||||||
c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
|
c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
|
||||||
c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
|
c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
|
||||||
|
|
||||||
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
|
||||||
c->float_to_int16 = ff_float_to_int16_neon;
|
|
||||||
c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (CONFIG_VORBIS_DECODER)
|
if (CONFIG_VORBIS_DECODER)
|
||||||
c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
|
c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
|
||||||
|
|
||||||
|
|
|
@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0,
|
||||||
const float *src1, int len);
|
const float *src1, int len);
|
||||||
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
|
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
|
||||||
const float *src1, int len);
|
const float *src1, int len);
|
||||||
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
|
|
||||||
|
|
||||||
void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx)
|
void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx)
|
||||||
{
|
{
|
||||||
c->vector_fmul = ff_vector_fmul_vfp;
|
c->vector_fmul = ff_vector_fmul_vfp;
|
||||||
c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
|
c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
|
||||||
#if HAVE_ARMV6
|
|
||||||
c->float_to_int16 = ff_float_to_int16_vfp;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1
|
||||||
bx lr
|
bx lr
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
function ff_float_to_int16_neon, export=1
|
|
||||||
subs r2, r2, #8
|
|
||||||
vld1.64 {d0-d1}, [r1,:128]!
|
|
||||||
vcvt.s32.f32 q8, q0, #16
|
|
||||||
vld1.64 {d2-d3}, [r1,:128]!
|
|
||||||
vcvt.s32.f32 q9, q1, #16
|
|
||||||
beq 3f
|
|
||||||
bics ip, r2, #15
|
|
||||||
beq 2f
|
|
||||||
1: subs ip, ip, #16
|
|
||||||
vshrn.s32 d4, q8, #16
|
|
||||||
vld1.64 {d0-d1}, [r1,:128]!
|
|
||||||
vcvt.s32.f32 q0, q0, #16
|
|
||||||
vshrn.s32 d5, q9, #16
|
|
||||||
vld1.64 {d2-d3}, [r1,:128]!
|
|
||||||
vcvt.s32.f32 q1, q1, #16
|
|
||||||
vshrn.s32 d6, q0, #16
|
|
||||||
vst1.64 {d4-d5}, [r0,:128]!
|
|
||||||
vshrn.s32 d7, q1, #16
|
|
||||||
vld1.64 {d16-d17},[r1,:128]!
|
|
||||||
vcvt.s32.f32 q8, q8, #16
|
|
||||||
vld1.64 {d18-d19},[r1,:128]!
|
|
||||||
vcvt.s32.f32 q9, q9, #16
|
|
||||||
vst1.64 {d6-d7}, [r0,:128]!
|
|
||||||
bne 1b
|
|
||||||
ands r2, r2, #15
|
|
||||||
beq 3f
|
|
||||||
2: vld1.64 {d0-d1}, [r1,:128]!
|
|
||||||
vshrn.s32 d4, q8, #16
|
|
||||||
vcvt.s32.f32 q0, q0, #16
|
|
||||||
vld1.64 {d2-d3}, [r1,:128]!
|
|
||||||
vshrn.s32 d5, q9, #16
|
|
||||||
vcvt.s32.f32 q1, q1, #16
|
|
||||||
vshrn.s32 d6, q0, #16
|
|
||||||
vst1.64 {d4-d5}, [r0,:128]!
|
|
||||||
vshrn.s32 d7, q1, #16
|
|
||||||
vst1.64 {d6-d7}, [r0,:128]!
|
|
||||||
bx lr
|
|
||||||
3: vshrn.s32 d4, q8, #16
|
|
||||||
vshrn.s32 d5, q9, #16
|
|
||||||
vst1.64 {d4-d5}, [r0,:128]!
|
|
||||||
bx lr
|
|
||||||
endfunc
|
|
||||||
|
|
||||||
function ff_float_to_int16_interleave_neon, export=1
|
|
||||||
cmp r3, #2
|
|
||||||
ldrlt r1, [r1]
|
|
||||||
blt ff_float_to_int16_neon
|
|
||||||
bne 4f
|
|
||||||
|
|
||||||
ldr r3, [r1]
|
|
||||||
ldr r1, [r1, #4]
|
|
||||||
|
|
||||||
subs r2, r2, #8
|
|
||||||
vld1.64 {d0-d1}, [r3,:128]!
|
|
||||||
vcvt.s32.f32 q8, q0, #16
|
|
||||||
vld1.64 {d2-d3}, [r3,:128]!
|
|
||||||
vcvt.s32.f32 q9, q1, #16
|
|
||||||
vld1.64 {d20-d21},[r1,:128]!
|
|
||||||
vcvt.s32.f32 q10, q10, #16
|
|
||||||
vld1.64 {d22-d23},[r1,:128]!
|
|
||||||
vcvt.s32.f32 q11, q11, #16
|
|
||||||
beq 3f
|
|
||||||
bics ip, r2, #15
|
|
||||||
beq 2f
|
|
||||||
1: subs ip, ip, #16
|
|
||||||
vld1.64 {d0-d1}, [r3,:128]!
|
|
||||||
vcvt.s32.f32 q0, q0, #16
|
|
||||||
vsri.32 q10, q8, #16
|
|
||||||
vld1.64 {d2-d3}, [r3,:128]!
|
|
||||||
vcvt.s32.f32 q1, q1, #16
|
|
||||||
vld1.64 {d24-d25},[r1,:128]!
|
|
||||||
vcvt.s32.f32 q12, q12, #16
|
|
||||||
vld1.64 {d26-d27},[r1,:128]!
|
|
||||||
vsri.32 q11, q9, #16
|
|
||||||
vst1.64 {d20-d21},[r0,:128]!
|
|
||||||
vcvt.s32.f32 q13, q13, #16
|
|
||||||
vst1.64 {d22-d23},[r0,:128]!
|
|
||||||
vsri.32 q12, q0, #16
|
|
||||||
vld1.64 {d16-d17},[r3,:128]!
|
|
||||||
vsri.32 q13, q1, #16
|
|
||||||
vst1.64 {d24-d25},[r0,:128]!
|
|
||||||
vcvt.s32.f32 q8, q8, #16
|
|
||||||
vld1.64 {d18-d19},[r3,:128]!
|
|
||||||
vcvt.s32.f32 q9, q9, #16
|
|
||||||
vld1.64 {d20-d21},[r1,:128]!
|
|
||||||
vcvt.s32.f32 q10, q10, #16
|
|
||||||
vld1.64 {d22-d23},[r1,:128]!
|
|
||||||
vcvt.s32.f32 q11, q11, #16
|
|
||||||
vst1.64 {d26-d27},[r0,:128]!
|
|
||||||
bne 1b
|
|
||||||
ands r2, r2, #15
|
|
||||||
beq 3f
|
|
||||||
2: vsri.32 q10, q8, #16
|
|
||||||
vld1.64 {d0-d1}, [r3,:128]!
|
|
||||||
vcvt.s32.f32 q0, q0, #16
|
|
||||||
vld1.64 {d2-d3}, [r3,:128]!
|
|
||||||
vcvt.s32.f32 q1, q1, #16
|
|
||||||
vld1.64 {d24-d25},[r1,:128]!
|
|
||||||
vcvt.s32.f32 q12, q12, #16
|
|
||||||
vsri.32 q11, q9, #16
|
|
||||||
vld1.64 {d26-d27},[r1,:128]!
|
|
||||||
vcvt.s32.f32 q13, q13, #16
|
|
||||||
vst1.64 {d20-d21},[r0,:128]!
|
|
||||||
vsri.32 q12, q0, #16
|
|
||||||
vst1.64 {d22-d23},[r0,:128]!
|
|
||||||
vsri.32 q13, q1, #16
|
|
||||||
vst1.64 {d24-d27},[r0,:128]!
|
|
||||||
bx lr
|
|
||||||
3: vsri.32 q10, q8, #16
|
|
||||||
vsri.32 q11, q9, #16
|
|
||||||
vst1.64 {d20-d23},[r0,:128]!
|
|
||||||
bx lr
|
|
||||||
|
|
||||||
4: push {r4-r8,lr}
|
|
||||||
cmp r3, #4
|
|
||||||
lsl ip, r3, #1
|
|
||||||
blt 4f
|
|
||||||
|
|
||||||
@ 4 channels
|
|
||||||
5: ldmia r1!, {r4-r7}
|
|
||||||
mov lr, r2
|
|
||||||
mov r8, r0
|
|
||||||
vld1.64 {d16-d17},[r4,:128]!
|
|
||||||
vcvt.s32.f32 q8, q8, #16
|
|
||||||
vld1.64 {d18-d19},[r5,:128]!
|
|
||||||
vcvt.s32.f32 q9, q9, #16
|
|
||||||
vld1.64 {d20-d21},[r6,:128]!
|
|
||||||
vcvt.s32.f32 q10, q10, #16
|
|
||||||
vld1.64 {d22-d23},[r7,:128]!
|
|
||||||
vcvt.s32.f32 q11, q11, #16
|
|
||||||
6: subs lr, lr, #8
|
|
||||||
vld1.64 {d0-d1}, [r4,:128]!
|
|
||||||
vcvt.s32.f32 q0, q0, #16
|
|
||||||
vsri.32 q9, q8, #16
|
|
||||||
vld1.64 {d2-d3}, [r5,:128]!
|
|
||||||
vcvt.s32.f32 q1, q1, #16
|
|
||||||
vsri.32 q11, q10, #16
|
|
||||||
vld1.64 {d4-d5}, [r6,:128]!
|
|
||||||
vcvt.s32.f32 q2, q2, #16
|
|
||||||
vzip.32 d18, d22
|
|
||||||
vld1.64 {d6-d7}, [r7,:128]!
|
|
||||||
vcvt.s32.f32 q3, q3, #16
|
|
||||||
vzip.32 d19, d23
|
|
||||||
vst1.64 {d18}, [r8], ip
|
|
||||||
vsri.32 q1, q0, #16
|
|
||||||
vst1.64 {d22}, [r8], ip
|
|
||||||
vsri.32 q3, q2, #16
|
|
||||||
vst1.64 {d19}, [r8], ip
|
|
||||||
vzip.32 d2, d6
|
|
||||||
vst1.64 {d23}, [r8], ip
|
|
||||||
vzip.32 d3, d7
|
|
||||||
beq 7f
|
|
||||||
vld1.64 {d16-d17},[r4,:128]!
|
|
||||||
vcvt.s32.f32 q8, q8, #16
|
|
||||||
vst1.64 {d2}, [r8], ip
|
|
||||||
vld1.64 {d18-d19},[r5,:128]!
|
|
||||||
vcvt.s32.f32 q9, q9, #16
|
|
||||||
vst1.64 {d6}, [r8], ip
|
|
||||||
vld1.64 {d20-d21},[r6,:128]!
|
|
||||||
vcvt.s32.f32 q10, q10, #16
|
|
||||||
vst1.64 {d3}, [r8], ip
|
|
||||||
vld1.64 {d22-d23},[r7,:128]!
|
|
||||||
vcvt.s32.f32 q11, q11, #16
|
|
||||||
vst1.64 {d7}, [r8], ip
|
|
||||||
b 6b
|
|
||||||
7: vst1.64 {d2}, [r8], ip
|
|
||||||
vst1.64 {d6}, [r8], ip
|
|
||||||
vst1.64 {d3}, [r8], ip
|
|
||||||
vst1.64 {d7}, [r8], ip
|
|
||||||
subs r3, r3, #4
|
|
||||||
popeq {r4-r8,pc}
|
|
||||||
cmp r3, #4
|
|
||||||
add r0, r0, #8
|
|
||||||
bge 5b
|
|
||||||
|
|
||||||
@ 2 channels
|
|
||||||
4: cmp r3, #2
|
|
||||||
blt 4f
|
|
||||||
ldmia r1!, {r4-r5}
|
|
||||||
mov lr, r2
|
|
||||||
mov r8, r0
|
|
||||||
tst lr, #8
|
|
||||||
vld1.64 {d16-d17},[r4,:128]!
|
|
||||||
vcvt.s32.f32 q8, q8, #16
|
|
||||||
vld1.64 {d18-d19},[r5,:128]!
|
|
||||||
vcvt.s32.f32 q9, q9, #16
|
|
||||||
vld1.64 {d20-d21},[r4,:128]!
|
|
||||||
vcvt.s32.f32 q10, q10, #16
|
|
||||||
vld1.64 {d22-d23},[r5,:128]!
|
|
||||||
vcvt.s32.f32 q11, q11, #16
|
|
||||||
beq 6f
|
|
||||||
subs lr, lr, #8
|
|
||||||
beq 7f
|
|
||||||
vsri.32 d18, d16, #16
|
|
||||||
vsri.32 d19, d17, #16
|
|
||||||
vld1.64 {d16-d17},[r4,:128]!
|
|
||||||
vcvt.s32.f32 q8, q8, #16
|
|
||||||
vst1.32 {d18[0]}, [r8], ip
|
|
||||||
vsri.32 d22, d20, #16
|
|
||||||
vst1.32 {d18[1]}, [r8], ip
|
|
||||||
vsri.32 d23, d21, #16
|
|
||||||
vst1.32 {d19[0]}, [r8], ip
|
|
||||||
vst1.32 {d19[1]}, [r8], ip
|
|
||||||
vld1.64 {d18-d19},[r5,:128]!
|
|
||||||
vcvt.s32.f32 q9, q9, #16
|
|
||||||
vst1.32 {d22[0]}, [r8], ip
|
|
||||||
vst1.32 {d22[1]}, [r8], ip
|
|
||||||
vld1.64 {d20-d21},[r4,:128]!
|
|
||||||
vcvt.s32.f32 q10, q10, #16
|
|
||||||
vst1.32 {d23[0]}, [r8], ip
|
|
||||||
vst1.32 {d23[1]}, [r8], ip
|
|
||||||
vld1.64 {d22-d23},[r5,:128]!
|
|
||||||
vcvt.s32.f32 q11, q11, #16
|
|
||||||
6: subs lr, lr, #16
|
|
||||||
vld1.64 {d0-d1}, [r4,:128]!
|
|
||||||
vcvt.s32.f32 q0, q0, #16
|
|
||||||
vsri.32 d18, d16, #16
|
|
||||||
vld1.64 {d2-d3}, [r5,:128]!
|
|
||||||
vcvt.s32.f32 q1, q1, #16
|
|
||||||
vsri.32 d19, d17, #16
|
|
||||||
vld1.64 {d4-d5}, [r4,:128]!
|
|
||||||
vcvt.s32.f32 q2, q2, #16
|
|
||||||
vld1.64 {d6-d7}, [r5,:128]!
|
|
||||||
vcvt.s32.f32 q3, q3, #16
|
|
||||||
vst1.32 {d18[0]}, [r8], ip
|
|
||||||
vsri.32 d22, d20, #16
|
|
||||||
vst1.32 {d18[1]}, [r8], ip
|
|
||||||
vsri.32 d23, d21, #16
|
|
||||||
vst1.32 {d19[0]}, [r8], ip
|
|
||||||
vsri.32 d2, d0, #16
|
|
||||||
vst1.32 {d19[1]}, [r8], ip
|
|
||||||
vsri.32 d3, d1, #16
|
|
||||||
vst1.32 {d22[0]}, [r8], ip
|
|
||||||
vsri.32 d6, d4, #16
|
|
||||||
vst1.32 {d22[1]}, [r8], ip
|
|
||||||
vsri.32 d7, d5, #16
|
|
||||||
vst1.32 {d23[0]}, [r8], ip
|
|
||||||
vst1.32 {d23[1]}, [r8], ip
|
|
||||||
beq 6f
|
|
||||||
vld1.64 {d16-d17},[r4,:128]!
|
|
||||||
vcvt.s32.f32 q8, q8, #16
|
|
||||||
vst1.32 {d2[0]}, [r8], ip
|
|
||||||
vst1.32 {d2[1]}, [r8], ip
|
|
||||||
vld1.64 {d18-d19},[r5,:128]!
|
|
||||||
vcvt.s32.f32 q9, q9, #16
|
|
||||||
vst1.32 {d3[0]}, [r8], ip
|
|
||||||
vst1.32 {d3[1]}, [r8], ip
|
|
||||||
vld1.64 {d20-d21},[r4,:128]!
|
|
||||||
vcvt.s32.f32 q10, q10, #16
|
|
||||||
vst1.32 {d6[0]}, [r8], ip
|
|
||||||
vst1.32 {d6[1]}, [r8], ip
|
|
||||||
vld1.64 {d22-d23},[r5,:128]!
|
|
||||||
vcvt.s32.f32 q11, q11, #16
|
|
||||||
vst1.32 {d7[0]}, [r8], ip
|
|
||||||
vst1.32 {d7[1]}, [r8], ip
|
|
||||||
bgt 6b
|
|
||||||
6: vst1.32 {d2[0]}, [r8], ip
|
|
||||||
vst1.32 {d2[1]}, [r8], ip
|
|
||||||
vst1.32 {d3[0]}, [r8], ip
|
|
||||||
vst1.32 {d3[1]}, [r8], ip
|
|
||||||
vst1.32 {d6[0]}, [r8], ip
|
|
||||||
vst1.32 {d6[1]}, [r8], ip
|
|
||||||
vst1.32 {d7[0]}, [r8], ip
|
|
||||||
vst1.32 {d7[1]}, [r8], ip
|
|
||||||
b 8f
|
|
||||||
7: vsri.32 d18, d16, #16
|
|
||||||
vsri.32 d19, d17, #16
|
|
||||||
vst1.32 {d18[0]}, [r8], ip
|
|
||||||
vsri.32 d22, d20, #16
|
|
||||||
vst1.32 {d18[1]}, [r8], ip
|
|
||||||
vsri.32 d23, d21, #16
|
|
||||||
vst1.32 {d19[0]}, [r8], ip
|
|
||||||
vst1.32 {d19[1]}, [r8], ip
|
|
||||||
vst1.32 {d22[0]}, [r8], ip
|
|
||||||
vst1.32 {d22[1]}, [r8], ip
|
|
||||||
vst1.32 {d23[0]}, [r8], ip
|
|
||||||
vst1.32 {d23[1]}, [r8], ip
|
|
||||||
8: subs r3, r3, #2
|
|
||||||
add r0, r0, #4
|
|
||||||
popeq {r4-r8,pc}
|
|
||||||
|
|
||||||
@ 1 channel
|
|
||||||
4: ldr r4, [r1],#4
|
|
||||||
tst r2, #8
|
|
||||||
mov lr, r2
|
|
||||||
mov r5, r0
|
|
||||||
vld1.64 {d0-d1}, [r4,:128]!
|
|
||||||
vcvt.s32.f32 q0, q0, #16
|
|
||||||
vld1.64 {d2-d3}, [r4,:128]!
|
|
||||||
vcvt.s32.f32 q1, q1, #16
|
|
||||||
bne 8f
|
|
||||||
6: subs lr, lr, #16
|
|
||||||
vld1.64 {d4-d5}, [r4,:128]!
|
|
||||||
vcvt.s32.f32 q2, q2, #16
|
|
||||||
vld1.64 {d6-d7}, [r4,:128]!
|
|
||||||
vcvt.s32.f32 q3, q3, #16
|
|
||||||
vst1.16 {d0[1]}, [r5,:16], ip
|
|
||||||
vst1.16 {d0[3]}, [r5,:16], ip
|
|
||||||
vst1.16 {d1[1]}, [r5,:16], ip
|
|
||||||
vst1.16 {d1[3]}, [r5,:16], ip
|
|
||||||
vst1.16 {d2[1]}, [r5,:16], ip
|
|
||||||
vst1.16 {d2[3]}, [r5,:16], ip
|
|
||||||
vst1.16 {d3[1]}, [r5,:16], ip
|
|
||||||
vst1.16 {d3[3]}, [r5,:16], ip
|
|
||||||
beq 7f
|
|
||||||
vld1.64 {d0-d1}, [r4,:128]!
|
|
||||||
vcvt.s32.f32 q0, q0, #16
|
|
||||||
vld1.64 {d2-d3}, [r4,:128]!
|
|
||||||
vcvt.s32.f32 q1, q1, #16
|
|
||||||
7: vst1.16 {d4[1]}, [r5,:16], ip
|
|
||||||
vst1.16 {d4[3]}, [r5,:16], ip
|
|
||||||
vst1.16 {d5[1]}, [r5,:16], ip
|
|
||||||
vst1.16 {d5[3]}, [r5,:16], ip
|
|
||||||
vst1.16 {d6[1]}, [r5,:16], ip
|
|
||||||
vst1.16 {d6[3]}, [r5,:16], ip
|
|
||||||
vst1.16 {d7[1]}, [r5,:16], ip
|
|
||||||
vst1.16 {d7[3]}, [r5,:16], ip
|
|
||||||
bgt 6b
|
|
||||||
pop {r4-r8,pc}
|
|
||||||
8: subs lr, lr, #8
|
|
||||||
vst1.16 {d0[1]}, [r5,:16], ip
|
|
||||||
vst1.16 {d0[3]}, [r5,:16], ip
|
|
||||||
vst1.16 {d1[1]}, [r5,:16], ip
|
|
||||||
vst1.16 {d1[3]}, [r5,:16], ip
|
|
||||||
vst1.16 {d2[1]}, [r5,:16], ip
|
|
||||||
vst1.16 {d2[3]}, [r5,:16], ip
|
|
||||||
vst1.16 {d3[1]}, [r5,:16], ip
|
|
||||||
vst1.16 {d3[3]}, [r5,:16], ip
|
|
||||||
popeq {r4-r8,pc}
|
|
||||||
vld1.64 {d0-d1}, [r4,:128]!
|
|
||||||
vcvt.s32.f32 q0, q0, #16
|
|
||||||
vld1.64 {d2-d3}, [r4,:128]!
|
|
||||||
vcvt.s32.f32 q1, q1, #16
|
|
||||||
b 6b
|
|
||||||
endfunc
|
|
||||||
|
|
||||||
function ff_vector_fmul_neon, export=1
|
function ff_vector_fmul_neon, export=1
|
||||||
subs r3, r3, #8
|
subs r3, r3, #8
|
||||||
vld1.64 {d0-d3}, [r1,:128]!
|
vld1.64 {d0-d3}, [r1,:128]!
|
||||||
|
@ -1050,34 +713,6 @@ NOVFP vmov.32 r0, d0[0]
|
||||||
bx lr
|
bx lr
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
function ff_int32_to_float_fmul_scalar_neon, export=1
|
|
||||||
VFP vdup.32 q0, d0[0]
|
|
||||||
VFP len .req r2
|
|
||||||
NOVFP vdup.32 q0, r2
|
|
||||||
NOVFP len .req r3
|
|
||||||
|
|
||||||
vld1.32 {q1},[r1,:128]!
|
|
||||||
vcvt.f32.s32 q3, q1
|
|
||||||
vld1.32 {q2},[r1,:128]!
|
|
||||||
vcvt.f32.s32 q8, q2
|
|
||||||
1: subs len, len, #8
|
|
||||||
pld [r1, #16]
|
|
||||||
vmul.f32 q9, q3, q0
|
|
||||||
vmul.f32 q10, q8, q0
|
|
||||||
beq 2f
|
|
||||||
vld1.32 {q1},[r1,:128]!
|
|
||||||
vcvt.f32.s32 q3, q1
|
|
||||||
vld1.32 {q2},[r1,:128]!
|
|
||||||
vcvt.f32.s32 q8, q2
|
|
||||||
vst1.32 {q9}, [r0,:128]!
|
|
||||||
vst1.32 {q10},[r0,:128]!
|
|
||||||
b 1b
|
|
||||||
2: vst1.32 {q9}, [r0,:128]!
|
|
||||||
vst1.32 {q10},[r0,:128]!
|
|
||||||
bx lr
|
|
||||||
.unreq len
|
|
||||||
endfunc
|
|
||||||
|
|
||||||
function ff_vector_fmul_reverse_neon, export=1
|
function ff_vector_fmul_reverse_neon, export=1
|
||||||
add r2, r2, r3, lsl #2
|
add r2, r2, r3, lsl #2
|
||||||
sub r2, r2, #32
|
sub r2, r2, #32
|
||||||
|
|
|
@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1
|
||||||
vpop {d8-d15}
|
vpop {d8-d15}
|
||||||
bx lr
|
bx lr
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
#if HAVE_ARMV6
|
|
||||||
/**
|
|
||||||
* ARM VFP optimized float to int16 conversion.
|
|
||||||
* Assume that len is a positive number and is multiple of 8, destination
|
|
||||||
* buffer is at least 4 bytes aligned (8 bytes alignment is better for
|
|
||||||
* performance), little endian byte sex
|
|
||||||
*/
|
|
||||||
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len)
|
|
||||||
function ff_float_to_int16_vfp, export=1
|
|
||||||
push {r4-r8,lr}
|
|
||||||
vpush {d8-d11}
|
|
||||||
vldmia r1!, {s16-s23}
|
|
||||||
vcvt.s32.f32 s0, s16
|
|
||||||
vcvt.s32.f32 s1, s17
|
|
||||||
vcvt.s32.f32 s2, s18
|
|
||||||
vcvt.s32.f32 s3, s19
|
|
||||||
vcvt.s32.f32 s4, s20
|
|
||||||
vcvt.s32.f32 s5, s21
|
|
||||||
vcvt.s32.f32 s6, s22
|
|
||||||
vcvt.s32.f32 s7, s23
|
|
||||||
1:
|
|
||||||
subs r2, r2, #8
|
|
||||||
vmov r3, r4, s0, s1
|
|
||||||
vmov r5, r6, s2, s3
|
|
||||||
vmov r7, r8, s4, s5
|
|
||||||
vmov ip, lr, s6, s7
|
|
||||||
vldmiagt r1!, {s16-s23}
|
|
||||||
ssat r4, #16, r4
|
|
||||||
ssat r3, #16, r3
|
|
||||||
ssat r6, #16, r6
|
|
||||||
ssat r5, #16, r5
|
|
||||||
pkhbt r3, r3, r4, lsl #16
|
|
||||||
pkhbt r4, r5, r6, lsl #16
|
|
||||||
vcvtgt.s32.f32 s0, s16
|
|
||||||
vcvtgt.s32.f32 s1, s17
|
|
||||||
vcvtgt.s32.f32 s2, s18
|
|
||||||
vcvtgt.s32.f32 s3, s19
|
|
||||||
vcvtgt.s32.f32 s4, s20
|
|
||||||
vcvtgt.s32.f32 s5, s21
|
|
||||||
vcvtgt.s32.f32 s6, s22
|
|
||||||
vcvtgt.s32.f32 s7, s23
|
|
||||||
ssat r8, #16, r8
|
|
||||||
ssat r7, #16, r7
|
|
||||||
ssat lr, #16, lr
|
|
||||||
ssat ip, #16, ip
|
|
||||||
pkhbt r5, r7, r8, lsl #16
|
|
||||||
pkhbt r6, ip, lr, lsl #16
|
|
||||||
stmia r0!, {r3-r6}
|
|
||||||
bgt 1b
|
|
||||||
|
|
||||||
vpop {d8-d11}
|
|
||||||
pop {r4-r8,pc}
|
|
||||||
endfunc
|
|
||||||
#endif
|
|
||||||
|
|
|
@ -0,0 +1,48 @@
|
||||||
|
/*
|
||||||
|
* ARM optimized Format Conversion Utils
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "libavcodec/avcodec.h"
|
||||||
|
#include "libavcodec/fmtconvert.h"
|
||||||
|
|
||||||
|
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
|
||||||
|
float mul, int len);
|
||||||
|
|
||||||
|
void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
|
||||||
|
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
|
||||||
|
|
||||||
|
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
|
||||||
|
|
||||||
|
/**
 * Install the ARM-specific format conversion routines into a
 * FmtConvertContext.
 *
 * Only function pointers for which an ARM implementation is enabled at
 * build time are overwritten; all other members of the context are left
 * untouched by this function.
 *
 * @param c     context whose function pointers are updated
 * @param avctx codec context; only avctx->flags is read here, to test
 *              CODEC_FLAG_BITEXACT
 */
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
|
||||||
|
{
|
||||||
|
/* The VFP converter is used only when both VFP and ARMv6 are enabled.
 * Unlike the NEON converters below, it is installed regardless of
 * CODEC_FLAG_BITEXACT. */
if (HAVE_ARMVFP && HAVE_ARMV6) {
|
||||||
|
c->float_to_int16 = ff_float_to_int16_vfp;
|
||||||
|
}
|
||||||
|

||||||
|
/* NEON is handled last, so its float_to_int16 replaces the VFP pointer
 * set above when both are enabled. */
if (HAVE_NEON) {
|
||||||
|
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
|
||||||
|

||||||
|
/* The NEON float -> int16 converters are skipped when bit-exact output
 * is requested.
 * NOTE(review): presumably their rounding differs from the C reference
 * -- confirm. */
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
||||||
|
c->float_to_int16 = ff_float_to_int16_neon;
|
||||||
|
c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,391 @@
|
||||||
|
/*
|
||||||
|
* ARM NEON optimised Format Conversion Utils
|
||||||
|
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "config.h"
|
||||||
|
#include "asm.S"
|
||||||
|
|
||||||
|
preserve8
|
||||||
|
.text
|
||||||
|
|
||||||
|
/*
 * void ff_float_to_int16_neon(int16_t *dst, const float *src, long len)
 *
 * Convert floats read through r1 into int16 samples written through r0;
 * r2 carries the sample count (r0 = dst, r1 = src, r2 = len as used below).
 *
 * Every vld1/vst1 carries a :128 alignment qualifier, so both pointers
 * must be 16-byte aligned.
 *
 * NOTE(review): 8 samples are always loaded up front and the remainder is
 * handled 16 or 8 at a time, so this appears to assume len is a positive
 * multiple of 8 -- confirm against callers.
 *
 * Conversion scheme: vcvt.s32.f32 ..., #16 produces 32-bit fixed point
 * with 16 fractional bits; vshrn.s32 ..., #16 then keeps the upper 16 bits
 * of each lane as the int16 result.
 */
function ff_float_to_int16_neon, export=1
|
||||||
|
/* r2 = len - 8; the resulting flags are tested by "beq 3f" below */
subs r2, r2, #8
|
||||||
|
/* load and convert the first 8 samples into q8/q9 */
vld1.64 {d0-d1}, [r1,:128]!
|
||||||
|
vcvt.s32.f32 q8, q0, #16
|
||||||
|
vld1.64 {d2-d3}, [r1,:128]!
|
||||||
|
vcvt.s32.f32 q9, q1, #16
|
||||||
|
/* len == 8: only the pending q8/q9 need to be narrowed and stored */
beq 3f
|
||||||
|
/* ip = remaining count rounded down to a multiple of 16 */
bics ip, r2, #15
|
||||||
|
/* fewer than 16 remaining: skip the main loop */
beq 2f
|
||||||
|
/* Main loop, 16 samples per iteration. On entry q8/q9 hold 8 converted
 * but not yet stored samples; narrowing and storing them is interleaved
 * with loading and converting the following samples. */
1: subs ip, ip, #16
|
||||||
|
vshrn.s32 d4, q8, #16
|
||||||
|
vld1.64 {d0-d1}, [r1,:128]!
|
||||||
|
vcvt.s32.f32 q0, q0, #16
|
||||||
|
vshrn.s32 d5, q9, #16
|
||||||
|
vld1.64 {d2-d3}, [r1,:128]!
|
||||||
|
vcvt.s32.f32 q1, q1, #16
|
||||||
|
vshrn.s32 d6, q0, #16
|
||||||
|
vst1.64 {d4-d5}, [r0,:128]!
|
||||||
|
vshrn.s32 d7, q1, #16
|
||||||
|
/* preload and convert the next pending 8 samples into q8/q9 */
vld1.64 {d16-d17},[r1,:128]!
|
||||||
|
vcvt.s32.f32 q8, q8, #16
|
||||||
|
vld1.64 {d18-d19},[r1,:128]!
|
||||||
|
vcvt.s32.f32 q9, q9, #16
|
||||||
|
vst1.64 {d6-d7}, [r0,:128]!
|
||||||
|
/* loop while ip (set by the subs at label 1) is non-zero */
bne 1b
|
||||||
|
/* r2 still holds len - 8 here; keep what the 16-sample loop left over */
ands r2, r2, #15
|
||||||
|
/* nothing further to load: flush the pending 8 samples */
beq 3f
|
||||||
|
/* Tail: 8 samples pending in q8/q9 plus 8 more to load; stores 16. */
2: vld1.64 {d0-d1}, [r1,:128]!
|
||||||
|
vshrn.s32 d4, q8, #16
|
||||||
|
vcvt.s32.f32 q0, q0, #16
|
||||||
|
vld1.64 {d2-d3}, [r1,:128]!
|
||||||
|
vshrn.s32 d5, q9, #16
|
||||||
|
vcvt.s32.f32 q1, q1, #16
|
||||||
|
vshrn.s32 d6, q0, #16
|
||||||
|
vst1.64 {d4-d5}, [r0,:128]!
|
||||||
|
vshrn.s32 d7, q1, #16
|
||||||
|
vst1.64 {d6-d7}, [r0,:128]!
|
||||||
|
bx lr
|
||||||
|
/* Final flush: narrow and store the 8 samples pending in q8/q9. */
3: vshrn.s32 d4, q8, #16
|
||||||
|
vshrn.s32 d5, q9, #16
|
||||||
|
vst1.64 {d4-d5}, [r0,:128]!
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
function ff_float_to_int16_interleave_neon, export=1
|
||||||
|
cmp r3, #2
|
||||||
|
ldrlt r1, [r1]
|
||||||
|
blt ff_float_to_int16_neon
|
||||||
|
bne 4f
|
||||||
|
|
||||||
|
ldr r3, [r1]
|
||||||
|
ldr r1, [r1, #4]
|
||||||
|
|
||||||
|
subs r2, r2, #8
|
||||||
|
vld1.64 {d0-d1}, [r3,:128]!
|
||||||
|
vcvt.s32.f32 q8, q0, #16
|
||||||
|
vld1.64 {d2-d3}, [r3,:128]!
|
||||||
|
vcvt.s32.f32 q9, q1, #16
|
||||||
|
vld1.64 {d20-d21},[r1,:128]!
|
||||||
|
vcvt.s32.f32 q10, q10, #16
|
||||||
|
vld1.64 {d22-d23},[r1,:128]!
|
||||||
|
vcvt.s32.f32 q11, q11, #16
|
||||||
|
beq 3f
|
||||||
|
bics ip, r2, #15
|
||||||
|
beq 2f
|
||||||
|
1: subs ip, ip, #16
|
||||||
|
vld1.64 {d0-d1}, [r3,:128]!
|
||||||
|
vcvt.s32.f32 q0, q0, #16
|
||||||
|
vsri.32 q10, q8, #16
|
||||||
|
vld1.64 {d2-d3}, [r3,:128]!
|
||||||
|
vcvt.s32.f32 q1, q1, #16
|
||||||
|
vld1.64 {d24-d25},[r1,:128]!
|
||||||
|
vcvt.s32.f32 q12, q12, #16
|
||||||
|
vld1.64 {d26-d27},[r1,:128]!
|
||||||
|
vsri.32 q11, q9, #16
|
||||||
|
vst1.64 {d20-d21},[r0,:128]!
|
||||||
|
vcvt.s32.f32 q13, q13, #16
|
||||||
|
vst1.64 {d22-d23},[r0,:128]!
|
||||||
|
vsri.32 q12, q0, #16
|
||||||
|
vld1.64 {d16-d17},[r3,:128]!
|
||||||
|
vsri.32 q13, q1, #16
|
||||||
|
vst1.64 {d24-d25},[r0,:128]!
|
||||||
|
vcvt.s32.f32 q8, q8, #16
|
||||||
|
vld1.64 {d18-d19},[r3,:128]!
|
||||||
|
vcvt.s32.f32 q9, q9, #16
|
||||||
|
vld1.64 {d20-d21},[r1,:128]!
|
||||||
|
vcvt.s32.f32 q10, q10, #16
|
||||||
|
vld1.64 {d22-d23},[r1,:128]!
|
||||||
|
vcvt.s32.f32 q11, q11, #16
|
||||||
|
vst1.64 {d26-d27},[r0,:128]!
|
||||||
|
bne 1b
|
||||||
|
ands r2, r2, #15
|
||||||
|
beq 3f
|
||||||
|
2: vsri.32 q10, q8, #16
|
||||||
|
vld1.64 {d0-d1}, [r3,:128]!
|
||||||
|
vcvt.s32.f32 q0, q0, #16
|
||||||
|
vld1.64 {d2-d3}, [r3,:128]!
|
||||||
|
vcvt.s32.f32 q1, q1, #16
|
||||||
|
vld1.64 {d24-d25},[r1,:128]!
|
||||||
|
vcvt.s32.f32 q12, q12, #16
|
||||||
|
vsri.32 q11, q9, #16
|
||||||
|
vld1.64 {d26-d27},[r1,:128]!
|
||||||
|
vcvt.s32.f32 q13, q13, #16
|
||||||
|
vst1.64 {d20-d21},[r0,:128]!
|
||||||
|
vsri.32 q12, q0, #16
|
||||||
|
vst1.64 {d22-d23},[r0,:128]!
|
||||||
|
vsri.32 q13, q1, #16
|
||||||
|
vst1.64 {d24-d27},[r0,:128]!
|
||||||
|
bx lr
|
||||||
|
3: vsri.32 q10, q8, #16
|
||||||
|
vsri.32 q11, q9, #16
|
||||||
|
vst1.64 {d20-d23},[r0,:128]!
|
||||||
|
bx lr
|
||||||
|
|
||||||
|
4: push {r4-r8,lr}
|
||||||
|
cmp r3, #4
|
||||||
|
lsl ip, r3, #1
|
||||||
|
blt 4f
|
||||||
|
|
||||||
|
@ 4 channels
|
||||||
|
5: ldmia r1!, {r4-r7}
|
||||||
|
mov lr, r2
|
||||||
|
mov r8, r0
|
||||||
|
vld1.64 {d16-d17},[r4,:128]!
|
||||||
|
vcvt.s32.f32 q8, q8, #16
|
||||||
|
vld1.64 {d18-d19},[r5,:128]!
|
||||||
|
vcvt.s32.f32 q9, q9, #16
|
||||||
|
vld1.64 {d20-d21},[r6,:128]!
|
||||||
|
vcvt.s32.f32 q10, q10, #16
|
||||||
|
vld1.64 {d22-d23},[r7,:128]!
|
||||||
|
vcvt.s32.f32 q11, q11, #16
|
||||||
|
6: subs lr, lr, #8
|
||||||
|
vld1.64 {d0-d1}, [r4,:128]!
|
||||||
|
vcvt.s32.f32 q0, q0, #16
|
||||||
|
vsri.32 q9, q8, #16
|
||||||
|
vld1.64 {d2-d3}, [r5,:128]!
|
||||||
|
vcvt.s32.f32 q1, q1, #16
|
||||||
|
vsri.32 q11, q10, #16
|
||||||
|
vld1.64 {d4-d5}, [r6,:128]!
|
||||||
|
vcvt.s32.f32 q2, q2, #16
|
||||||
|
vzip.32 d18, d22
|
||||||
|
vld1.64 {d6-d7}, [r7,:128]!
|
||||||
|
vcvt.s32.f32 q3, q3, #16
|
||||||
|
vzip.32 d19, d23
|
||||||
|
vst1.64 {d18}, [r8], ip
|
||||||
|
vsri.32 q1, q0, #16
|
||||||
|
vst1.64 {d22}, [r8], ip
|
||||||
|
vsri.32 q3, q2, #16
|
||||||
|
vst1.64 {d19}, [r8], ip
|
||||||
|
vzip.32 d2, d6
|
||||||
|
vst1.64 {d23}, [r8], ip
|
||||||
|
vzip.32 d3, d7
|
||||||
|
beq 7f
|
||||||
|
vld1.64 {d16-d17},[r4,:128]!
|
||||||
|
vcvt.s32.f32 q8, q8, #16
|
||||||
|
vst1.64 {d2}, [r8], ip
|
||||||
|
vld1.64 {d18-d19},[r5,:128]!
|
||||||
|
vcvt.s32.f32 q9, q9, #16
|
||||||
|
vst1.64 {d6}, [r8], ip
|
||||||
|
vld1.64 {d20-d21},[r6,:128]!
|
||||||
|
vcvt.s32.f32 q10, q10, #16
|
||||||
|
vst1.64 {d3}, [r8], ip
|
||||||
|
vld1.64 {d22-d23},[r7,:128]!
|
||||||
|
vcvt.s32.f32 q11, q11, #16
|
||||||
|
vst1.64 {d7}, [r8], ip
|
||||||
|
b 6b
|
||||||
|
7: vst1.64 {d2}, [r8], ip
|
||||||
|
vst1.64 {d6}, [r8], ip
|
||||||
|
vst1.64 {d3}, [r8], ip
|
||||||
|
vst1.64 {d7}, [r8], ip
|
||||||
|
subs r3, r3, #4
|
||||||
|
popeq {r4-r8,pc}
|
||||||
|
cmp r3, #4
|
||||||
|
add r0, r0, #8
|
||||||
|
bge 5b
|
||||||
|
|
||||||
|
@ 2 channels
|
||||||
|
4: cmp r3, #2
|
||||||
|
blt 4f
|
||||||
|
ldmia r1!, {r4-r5}
|
||||||
|
mov lr, r2
|
||||||
|
mov r8, r0
|
||||||
|
tst lr, #8
|
||||||
|
vld1.64 {d16-d17},[r4,:128]!
|
||||||
|
vcvt.s32.f32 q8, q8, #16
|
||||||
|
vld1.64 {d18-d19},[r5,:128]!
|
||||||
|
vcvt.s32.f32 q9, q9, #16
|
||||||
|
vld1.64 {d20-d21},[r4,:128]!
|
||||||
|
vcvt.s32.f32 q10, q10, #16
|
||||||
|
vld1.64 {d22-d23},[r5,:128]!
|
||||||
|
vcvt.s32.f32 q11, q11, #16
|
||||||
|
beq 6f
|
||||||
|
subs lr, lr, #8
|
||||||
|
beq 7f
|
||||||
|
vsri.32 d18, d16, #16
|
||||||
|
vsri.32 d19, d17, #16
|
||||||
|
vld1.64 {d16-d17},[r4,:128]!
|
||||||
|
vcvt.s32.f32 q8, q8, #16
|
||||||
|
vst1.32 {d18[0]}, [r8], ip
|
||||||
|
vsri.32 d22, d20, #16
|
||||||
|
vst1.32 {d18[1]}, [r8], ip
|
||||||
|
vsri.32 d23, d21, #16
|
||||||
|
vst1.32 {d19[0]}, [r8], ip
|
||||||
|
vst1.32 {d19[1]}, [r8], ip
|
||||||
|
vld1.64 {d18-d19},[r5,:128]!
|
||||||
|
vcvt.s32.f32 q9, q9, #16
|
||||||
|
vst1.32 {d22[0]}, [r8], ip
|
||||||
|
vst1.32 {d22[1]}, [r8], ip
|
||||||
|
vld1.64 {d20-d21},[r4,:128]!
|
||||||
|
vcvt.s32.f32 q10, q10, #16
|
||||||
|
vst1.32 {d23[0]}, [r8], ip
|
||||||
|
vst1.32 {d23[1]}, [r8], ip
|
||||||
|
vld1.64 {d22-d23},[r5,:128]!
|
||||||
|
vcvt.s32.f32 q11, q11, #16
|
||||||
|
6: subs lr, lr, #16
|
||||||
|
vld1.64 {d0-d1}, [r4,:128]!
|
||||||
|
vcvt.s32.f32 q0, q0, #16
|
||||||
|
vsri.32 d18, d16, #16
|
||||||
|
vld1.64 {d2-d3}, [r5,:128]!
|
||||||
|
vcvt.s32.f32 q1, q1, #16
|
||||||
|
vsri.32 d19, d17, #16
|
||||||
|
vld1.64 {d4-d5}, [r4,:128]!
|
||||||
|
vcvt.s32.f32 q2, q2, #16
|
||||||
|
vld1.64 {d6-d7}, [r5,:128]!
|
||||||
|
vcvt.s32.f32 q3, q3, #16
|
||||||
|
vst1.32 {d18[0]}, [r8], ip
|
||||||
|
vsri.32 d22, d20, #16
|
||||||
|
vst1.32 {d18[1]}, [r8], ip
|
||||||
|
vsri.32 d23, d21, #16
|
||||||
|
vst1.32 {d19[0]}, [r8], ip
|
||||||
|
vsri.32 d2, d0, #16
|
||||||
|
vst1.32 {d19[1]}, [r8], ip
|
||||||
|
vsri.32 d3, d1, #16
|
||||||
|
vst1.32 {d22[0]}, [r8], ip
|
||||||
|
vsri.32 d6, d4, #16
|
||||||
|
vst1.32 {d22[1]}, [r8], ip
|
||||||
|
vsri.32 d7, d5, #16
|
||||||
|
vst1.32 {d23[0]}, [r8], ip
|
||||||
|
vst1.32 {d23[1]}, [r8], ip
|
||||||
|
beq 6f
|
||||||
|
vld1.64 {d16-d17},[r4,:128]!
|
||||||
|
vcvt.s32.f32 q8, q8, #16
|
||||||
|
vst1.32 {d2[0]}, [r8], ip
|
||||||
|
vst1.32 {d2[1]}, [r8], ip
|
||||||
|
vld1.64 {d18-d19},[r5,:128]!
|
||||||
|
vcvt.s32.f32 q9, q9, #16
|
||||||
|
vst1.32 {d3[0]}, [r8], ip
|
||||||
|
vst1.32 {d3[1]}, [r8], ip
|
||||||
|
vld1.64 {d20-d21},[r4,:128]!
|
||||||
|
vcvt.s32.f32 q10, q10, #16
|
||||||
|
vst1.32 {d6[0]}, [r8], ip
|
||||||
|
vst1.32 {d6[1]}, [r8], ip
|
||||||
|
vld1.64 {d22-d23},[r5,:128]!
|
||||||
|
vcvt.s32.f32 q11, q11, #16
|
||||||
|
vst1.32 {d7[0]}, [r8], ip
|
||||||
|
vst1.32 {d7[1]}, [r8], ip
|
||||||
|
bgt 6b
|
||||||
|
6: vst1.32 {d2[0]}, [r8], ip
|
||||||
|
vst1.32 {d2[1]}, [r8], ip
|
||||||
|
vst1.32 {d3[0]}, [r8], ip
|
||||||
|
vst1.32 {d3[1]}, [r8], ip
|
||||||
|
vst1.32 {d6[0]}, [r8], ip
|
||||||
|
vst1.32 {d6[1]}, [r8], ip
|
||||||
|
vst1.32 {d7[0]}, [r8], ip
|
||||||
|
vst1.32 {d7[1]}, [r8], ip
|
||||||
|
b 8f
|
||||||
|
7: vsri.32 d18, d16, #16
|
||||||
|
vsri.32 d19, d17, #16
|
||||||
|
vst1.32 {d18[0]}, [r8], ip
|
||||||
|
vsri.32 d22, d20, #16
|
||||||
|
vst1.32 {d18[1]}, [r8], ip
|
||||||
|
vsri.32 d23, d21, #16
|
||||||
|
vst1.32 {d19[0]}, [r8], ip
|
||||||
|
vst1.32 {d19[1]}, [r8], ip
|
||||||
|
vst1.32 {d22[0]}, [r8], ip
|
||||||
|
vst1.32 {d22[1]}, [r8], ip
|
||||||
|
vst1.32 {d23[0]}, [r8], ip
|
||||||
|
vst1.32 {d23[1]}, [r8], ip
|
||||||
|
8: subs r3, r3, #2
|
||||||
|
add r0, r0, #4
|
||||||
|
popeq {r4-r8,pc}
|
||||||
|
|
||||||
|
@ 1 channel
|
||||||
|
4: ldr r4, [r1],#4
|
||||||
|
tst r2, #8
|
||||||
|
mov lr, r2
|
||||||
|
mov r5, r0
|
||||||
|
vld1.64 {d0-d1}, [r4,:128]!
|
||||||
|
vcvt.s32.f32 q0, q0, #16
|
||||||
|
vld1.64 {d2-d3}, [r4,:128]!
|
||||||
|
vcvt.s32.f32 q1, q1, #16
|
||||||
|
bne 8f
|
||||||
|
6: subs lr, lr, #16
|
||||||
|
vld1.64 {d4-d5}, [r4,:128]!
|
||||||
|
vcvt.s32.f32 q2, q2, #16
|
||||||
|
vld1.64 {d6-d7}, [r4,:128]!
|
||||||
|
vcvt.s32.f32 q3, q3, #16
|
||||||
|
vst1.16 {d0[1]}, [r5,:16], ip
|
||||||
|
vst1.16 {d0[3]}, [r5,:16], ip
|
||||||
|
vst1.16 {d1[1]}, [r5,:16], ip
|
||||||
|
vst1.16 {d1[3]}, [r5,:16], ip
|
||||||
|
vst1.16 {d2[1]}, [r5,:16], ip
|
||||||
|
vst1.16 {d2[3]}, [r5,:16], ip
|
||||||
|
vst1.16 {d3[1]}, [r5,:16], ip
|
||||||
|
vst1.16 {d3[3]}, [r5,:16], ip
|
||||||
|
beq 7f
|
||||||
|
vld1.64 {d0-d1}, [r4,:128]!
|
||||||
|
vcvt.s32.f32 q0, q0, #16
|
||||||
|
vld1.64 {d2-d3}, [r4,:128]!
|
||||||
|
vcvt.s32.f32 q1, q1, #16
|
||||||
|
7: vst1.16 {d4[1]}, [r5,:16], ip
|
||||||
|
vst1.16 {d4[3]}, [r5,:16], ip
|
||||||
|
vst1.16 {d5[1]}, [r5,:16], ip
|
||||||
|
vst1.16 {d5[3]}, [r5,:16], ip
|
||||||
|
vst1.16 {d6[1]}, [r5,:16], ip
|
||||||
|
vst1.16 {d6[3]}, [r5,:16], ip
|
||||||
|
vst1.16 {d7[1]}, [r5,:16], ip
|
||||||
|
vst1.16 {d7[3]}, [r5,:16], ip
|
||||||
|
bgt 6b
|
||||||
|
pop {r4-r8,pc}
|
||||||
|
8: subs lr, lr, #8
|
||||||
|
vst1.16 {d0[1]}, [r5,:16], ip
|
||||||
|
vst1.16 {d0[3]}, [r5,:16], ip
|
||||||
|
vst1.16 {d1[1]}, [r5,:16], ip
|
||||||
|
vst1.16 {d1[3]}, [r5,:16], ip
|
||||||
|
vst1.16 {d2[1]}, [r5,:16], ip
|
||||||
|
vst1.16 {d2[3]}, [r5,:16], ip
|
||||||
|
vst1.16 {d3[1]}, [r5,:16], ip
|
||||||
|
vst1.16 {d3[3]}, [r5,:16], ip
|
||||||
|
popeq {r4-r8,pc}
|
||||||
|
vld1.64 {d0-d1}, [r4,:128]!
|
||||||
|
vcvt.s32.f32 q0, q0, #16
|
||||||
|
vld1.64 {d2-d3}, [r4,:128]!
|
||||||
|
vcvt.s32.f32 q1, q1, #16
|
||||||
|
b 6b
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
@ Convert 32-bit integers to float and multiply each result by one scalar.
@ r0 is the output pointer (written by vst1.32) and r1 the input pointer
@ (read by vld1.32); both are accessed with a :128 alignment qualifier, so
@ they have to be 16-byte aligned.
@ The loop is software pipelined: 8 elements are loaded and converted before
@ the loop is entered, and the only exit is the "beq 2f" taken when the
@ remaining count reaches exactly zero, so len must be a non-zero multiple
@ of 8.
@ NOTE(review): presumably the NEON backend for
@ FmtConvertContext.int32_to_float_fmul_scalar(dst, src, mul, len) -- confirm
@ against the ARM init code.
function ff_int32_to_float_fmul_scalar_neon, export=1
|
||||||
|
@ The VFP/NOVFP prefixes select one of two argument layouts: multiplier in
@ d0[0] with len in r2, or multiplier in r2 with len in r3.
@ NOTE(review): presumably hard-float vs. soft-float calling convention --
@ confirm against the macro definitions.
VFP vdup.32 q0, d0[0]
|
||||||
|
VFP len .req r2
|
||||||
|
NOVFP vdup.32 q0, r2
|
||||||
|
NOVFP len .req r3
|
||||||
|
|
||||||
|
@ Prologue: load and convert the first 8 integers (q3, q8 hold the floats).
vld1.32 {q1},[r1,:128]!
|
||||||
|
vcvt.f32.s32 q3, q1
|
||||||
|
vld1.32 {q2},[r1,:128]!
|
||||||
|
vcvt.f32.s32 q8, q2
|
||||||
|
@ Main loop: scale the current batch into q9/q10, then either finish (2:)
@ or fetch/convert the next batch before storing the scaled one.
1: subs len, len, #8
|
||||||
|
pld [r1, #16]
|
||||||
|
vmul.f32 q9, q3, q0
|
||||||
|
vmul.f32 q10, q8, q0
|
||||||
|
beq 2f
|
||||||
|
vld1.32 {q1},[r1,:128]!
|
||||||
|
vcvt.f32.s32 q3, q1
|
||||||
|
vld1.32 {q2},[r1,:128]!
|
||||||
|
vcvt.f32.s32 q8, q2
|
||||||
|
vst1.32 {q9}, [r0,:128]!
|
||||||
|
vst1.32 {q10},[r0,:128]!
|
||||||
|
b 1b
|
||||||
|
@ Epilogue: store the last scaled batch; nothing further is loaded.
2: vst1.32 {q9}, [r0,:128]!
|
||||||
|
vst1.32 {q10},[r0,:128]!
|
||||||
|
bx lr
|
||||||
|
.unreq len
|
||||||
|
endfunc
|
|
@ -0,0 +1,77 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "config.h"
|
||||||
|
#include "asm.S"
|
||||||
|
|
||||||
|
.syntax unified
|
||||||
|
|
||||||
|
/**
|
||||||
|
* ARM VFP optimized float to int16 conversion.
|
||||||
|
* Assume that len is a positive number and is a multiple of 8, destination
|
||||||
|
* buffer is at least 4 bytes aligned (8 bytes alignment is better for
|
||||||
|
* performance), little-endian byte order
|
||||||
|
*/
|
||||||
|
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
|
||||||
|
@ Software-pipelined loop handling 8 samples per pass (r0 = dst, r1 = src,
@ r2 = len).  Eight floats are loaded and converted before the loop; inside
@ the loop the previous batch is clamped and packed in core registers while
@ the next batch is loaded and converted, but only when samples remain.
function ff_float_to_int16_vfp, export=1
|
||||||
|
@ r4-r8/lr and d8-d11 (the registers holding s16-s23) are overwritten
@ below, so they are saved here and restored before returning.
push {r4-r8,lr}
|
||||||
|
vpush {d8-d11}
|
||||||
|
vldmia r1!, {s16-s23}
|
||||||
|
@ NOTE(review): per the ARM ISA, vcvt (unlike vcvtr) rounds toward zero,
@ whereas the C implementation in fmtconvert.c rounds with lrintf() --
@ confirm the difference is intended.
vcvt.s32.f32 s0, s16
|
||||||
|
vcvt.s32.f32 s1, s17
|
||||||
|
vcvt.s32.f32 s2, s18
|
||||||
|
vcvt.s32.f32 s3, s19
|
||||||
|
vcvt.s32.f32 s4, s20
|
||||||
|
vcvt.s32.f32 s5, s21
|
||||||
|
vcvt.s32.f32 s6, s22
|
||||||
|
vcvt.s32.f32 s7, s23
|
||||||
|
1:
|
||||||
|
@ r2 becomes the number of samples left after this batch; the flags set
@ here drive every gt-conditional instruction below and the final bgt.
subs r2, r2, #8
|
||||||
|
@ Move the 8 converted integers into core registers.
vmov r3, r4, s0, s1
|
||||||
|
vmov r5, r6, s2, s3
|
||||||
|
vmov r7, r8, s4, s5
|
||||||
|
vmov ip, lr, s6, s7
|
||||||
|
@ Fetch the next 8 floats only if more samples remain.
vldmiagt r1!, {s16-s23}
|
||||||
|
@ Clamp samples 0-3 to the signed 16-bit range.
ssat r4, #16, r4
|
||||||
|
ssat r3, #16, r3
|
||||||
|
ssat r6, #16, r6
|
||||||
|
ssat r5, #16, r5
|
||||||
|
@ Pack two samples per word: even sample in the low halfword, odd sample
@ in the high halfword.
pkhbt r3, r3, r4, lsl #16
|
||||||
|
pkhbt r4, r5, r6, lsl #16
|
||||||
|
@ Convert the next batch (skipped on the last pass).
vcvtgt.s32.f32 s0, s16
|
||||||
|
vcvtgt.s32.f32 s1, s17
|
||||||
|
vcvtgt.s32.f32 s2, s18
|
||||||
|
vcvtgt.s32.f32 s3, s19
|
||||||
|
vcvtgt.s32.f32 s4, s20
|
||||||
|
vcvtgt.s32.f32 s5, s21
|
||||||
|
vcvtgt.s32.f32 s6, s22
|
||||||
|
vcvtgt.s32.f32 s7, s23
|
||||||
|
@ Clamp and pack samples 4-7 the same way.
ssat r8, #16, r8
|
||||||
|
ssat r7, #16, r7
|
||||||
|
ssat lr, #16, lr
|
||||||
|
ssat ip, #16, ip
|
||||||
|
pkhbt r5, r7, r8, lsl #16
|
||||||
|
pkhbt r6, ip, lr, lsl #16
|
||||||
|
@ Store four words (8 int16 samples) and advance the output pointer.
stmia r0!, {r3-r6}
|
||||||
|
bgt 1b
|
||||||
|
|
||||||
|
vpop {d8-d11}
|
||||||
|
pop {r4-r8,pc}
|
||||||
|
endfunc
|
|
@ -33,6 +33,7 @@
|
||||||
#include "get_bits.h"
|
#include "get_bits.h"
|
||||||
#include "dsputil.h"
|
#include "dsputil.h"
|
||||||
#include "fft.h"
|
#include "fft.h"
|
||||||
|
#include "fmtconvert.h"
|
||||||
|
|
||||||
extern const uint16_t ff_wma_critical_freqs[25];
|
extern const uint16_t ff_wma_critical_freqs[25];
|
||||||
|
|
||||||
|
@ -43,6 +44,7 @@ typedef struct {
|
||||||
AVCodecContext *avctx;
|
AVCodecContext *avctx;
|
||||||
GetBitContext gb;
|
GetBitContext gb;
|
||||||
DSPContext dsp;
|
DSPContext dsp;
|
||||||
|
FmtConvertContext fmt_conv;
|
||||||
int first;
|
int first;
|
||||||
int channels;
|
int channels;
|
||||||
int frame_len; ///< transform size (samples)
|
int frame_len; ///< transform size (samples)
|
||||||
|
@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
|
||||||
|
|
||||||
s->avctx = avctx;
|
s->avctx = avctx;
|
||||||
dsputil_init(&s->dsp, avctx);
|
dsputil_init(&s->dsp, avctx);
|
||||||
|
ff_fmt_convert_init(&s->fmt_conv, avctx);
|
||||||
|
|
||||||
/* determine frame length */
|
/* determine frame length */
|
||||||
if (avctx->sample_rate < 22050) {
|
if (avctx->sample_rate < 22050) {
|
||||||
|
@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct)
|
||||||
ff_rdft_calc(&s->trans.rdft, coeffs);
|
ff_rdft_calc(&s->trans.rdft, coeffs);
|
||||||
}
|
}
|
||||||
|
|
||||||
s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels);
|
s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr,
|
||||||
|
s->frame_len, s->channels);
|
||||||
|
|
||||||
if (!s->first) {
|
if (!s->first) {
|
||||||
int count = s->overlap_len * s->channels;
|
int count = s->overlap_len * s->channels;
|
||||||
|
|
|
@ -40,6 +40,7 @@
|
||||||
#include "dca.h"
|
#include "dca.h"
|
||||||
#include "synth_filter.h"
|
#include "synth_filter.h"
|
||||||
#include "dcadsp.h"
|
#include "dcadsp.h"
|
||||||
|
#include "fmtconvert.h"
|
||||||
|
|
||||||
//#define TRACE
|
//#define TRACE
|
||||||
|
|
||||||
|
@ -347,6 +348,7 @@ typedef struct {
|
||||||
FFTContext imdct;
|
FFTContext imdct;
|
||||||
SynthFilterContext synth;
|
SynthFilterContext synth;
|
||||||
DCADSPContext dcadsp;
|
DCADSPContext dcadsp;
|
||||||
|
FmtConvertContext fmt_conv;
|
||||||
} DCAContext;
|
} DCAContext;
|
||||||
|
|
||||||
static const uint16_t dca_vlc_offs[] = {
|
static const uint16_t dca_vlc_offs[] = {
|
||||||
|
@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index)
|
||||||
block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel);
|
block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel);
|
||||||
}
|
}
|
||||||
|
|
||||||
s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l],
|
s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l],
|
||||||
block, rscale, 8);
|
block, rscale, 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
|
s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
|
||||||
samples += 256 * channels;
|
samples += 256 * channels;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx)
|
||||||
ff_mdct_init(&s->imdct, 6, 1, 1.0);
|
ff_mdct_init(&s->imdct, 6, 1, 1.0);
|
||||||
ff_synth_filter_init(&s->synth);
|
ff_synth_filter_init(&s->synth);
|
||||||
ff_dcadsp_init(&s->dcadsp);
|
ff_dcadsp_init(&s->dcadsp);
|
||||||
|
ff_fmt_convert_init(&s->fmt_conv, avctx);
|
||||||
|
|
||||||
for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++)
|
for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++)
|
||||||
s->samples_chanptr[i] = s->samples + i * 256;
|
s->samples_chanptr[i] = s->samples + i * 256;
|
||||||
|
|
|
@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len)
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
|
|
||||||
int i;
|
|
||||||
for(i=0; i<len; i++)
|
|
||||||
dst[i] = src[i] * mul;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
|
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
|
||||||
uint32_t maxi, uint32_t maxisign)
|
uint32_t maxi, uint32_t maxisign)
|
||||||
{
|
{
|
||||||
|
@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static av_always_inline int float_to_int16_one(const float *src){
|
|
||||||
return av_clip_int16(lrintf(*src));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
|
|
||||||
int i;
|
|
||||||
for(i=0; i<len; i++)
|
|
||||||
dst[i] = float_to_int16_one(src+i);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
|
|
||||||
int i,j,c;
|
|
||||||
if(channels==2){
|
|
||||||
for(i=0; i<len; i++){
|
|
||||||
dst[2*i] = float_to_int16_one(src[0]+i);
|
|
||||||
dst[2*i+1] = float_to_int16_one(src[1]+i);
|
|
||||||
}
|
|
||||||
}else{
|
|
||||||
for(c=0; c<channels; c++)
|
|
||||||
for(i=0, j=c; i<len; i++, j+=channels)
|
|
||||||
dst[j] = float_to_int16_one(src[c]+i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
|
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
|
||||||
{
|
{
|
||||||
int res = 0;
|
int res = 0;
|
||||||
|
@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
|
||||||
c->vector_fmul_reverse = vector_fmul_reverse_c;
|
c->vector_fmul_reverse = vector_fmul_reverse_c;
|
||||||
c->vector_fmul_add = vector_fmul_add_c;
|
c->vector_fmul_add = vector_fmul_add_c;
|
||||||
c->vector_fmul_window = vector_fmul_window_c;
|
c->vector_fmul_window = vector_fmul_window_c;
|
||||||
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
|
|
||||||
c->vector_clipf = vector_clipf_c;
|
c->vector_clipf = vector_clipf_c;
|
||||||
c->float_to_int16 = ff_float_to_int16_c;
|
|
||||||
c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
|
|
||||||
c->scalarproduct_int16 = scalarproduct_int16_c;
|
c->scalarproduct_int16 = scalarproduct_int16_c;
|
||||||
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
|
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
|
||||||
c->scalarproduct_float = scalarproduct_float_c;
|
c->scalarproduct_float = scalarproduct_float_c;
|
||||||
|
|
|
@ -392,7 +392,6 @@ typedef struct DSPContext {
|
||||||
/* assume len is a multiple of 4, and arrays are 16-byte aligned */
|
/* assume len is a multiple of 4, and arrays are 16-byte aligned */
|
||||||
void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
|
void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
|
||||||
/* assume len is a multiple of 8, and arrays are 16-byte aligned */
|
/* assume len is a multiple of 8, and arrays are 16-byte aligned */
|
||||||
void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
|
|
||||||
void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
|
void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
|
||||||
/**
|
/**
|
||||||
* Multiply a vector of floats by a scalar float. Source and
|
* Multiply a vector of floats by a scalar float. Source and
|
||||||
|
@ -445,10 +444,6 @@ typedef struct DSPContext {
|
||||||
*/
|
*/
|
||||||
void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
|
void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
|
||||||
|
|
||||||
/* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
|
|
||||||
void (*float_to_int16)(int16_t *dst, const float *src, long len);
|
|
||||||
void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
|
|
||||||
|
|
||||||
/* (I)DCT */
|
/* (I)DCT */
|
||||||
void (*fdct)(DCTELEM *block/* align 16*/);
|
void (*fdct)(DCTELEM *block/* align 16*/);
|
||||||
void (*fdct248)(DCTELEM *block/* align 16*/);
|
void (*fdct248)(DCTELEM *block/* align 16*/);
|
||||||
|
|
|
@ -0,0 +1,68 @@
|
||||||
|
/*
|
||||||
|
* Format Conversion Utils
|
||||||
|
* Copyright (c) 2000, 2001 Fabrice Bellard
|
||||||
|
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "avcodec.h"
|
||||||
|
#include "fmtconvert.h"
|
||||||
|
|
||||||
|
/**
 * Portable C version of int32_to_float_fmul_scalar: every one of the len
 * integers in src is converted to float, multiplied by mul and written to
 * the same index of dst.
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len)
{
    int n = 0;

    while (n < len) {
        dst[n] = src[n] * mul;
        n++;
    }
}
|
||||||
|
|
||||||
|
/**
 * Round the float that src points at with lrintf() and clip the result to
 * the int16_t range with av_clip_int16().
 */
static av_always_inline int float_to_int16_one(const float *src)
{
    const long rounded = lrintf(*src);

    return av_clip_int16(rounded);
}
|
||||||
|
|
||||||
|
/**
 * Portable C version of float_to_int16: convert len floats from src to
 * int16_t in dst, rounding and clipping each one with float_to_int16_one().
 *
 * @param dst destination array of int16_t
 * @param src source array of float
 * @param len number of elements to convert
 */
static void float_to_int16_c(int16_t *dst, const float *src, long len)
{
    /* The counter has the same type as len: an int counter would overflow
     * (undefined behaviour) for len > INT_MAX where long is wider than int. */
    long i;

    for (i = 0; i < len; i++)
        dst[i] = float_to_int16_one(src + i);
}
|
||||||
|
|
||||||
|
/**
 * Portable C version of float_to_int16_interleave: convert len floats from
 * each of the channels planar arrays in src and write them interleaved to
 * dst, so that sample i of channel c lands at dst[i * channels + c].
 *
 * @param dst      destination array of interleaved int16_t
 * @param src      one source float array per channel
 * @param len      number of samples to convert per channel
 * @param channels number of channels
 */
static void float_to_int16_interleave_c(int16_t *dst, const float **src,
                                        long len, int channels)
{
    /* i and j have the same type as len: int counters would overflow
     * (undefined behaviour) for large len where long is wider than int,
     * and j grows up to len * channels. */
    long i, j;
    int c;

    if (channels == 2) {
        /* stereo: fill both channels in a single pass over the output */
        for (i = 0; i < len; i++) {
            dst[2*i]     = float_to_int16_one(src[0] + i);
            dst[2*i + 1] = float_to_int16_one(src[1] + i);
        }
    } else {
        /* generic: one strided pass over the output per channel */
        for (c = 0; c < channels; c++)
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = float_to_int16_one(src[c] + i);
    }
}
|
||||||
|
|
||||||
|
/**
 * Fill a FmtConvertContext: the portable C routines are installed first,
 * then the init function of each architecture enabled at build time is
 * called with the same context.
 */
av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
{
    /* portable defaults */
    c->float_to_int16_interleave  = float_to_int16_interleave_c;
    c->float_to_int16             = float_to_int16_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;

    /* per-architecture hooks, called after the defaults are in place */
    if (ARCH_ARM) {
        ff_fmt_convert_init_arm(c, avctx);
    }
    if (ARCH_PPC) {
        ff_fmt_convert_init_ppc(c, avctx);
    }
    if (HAVE_MMX) {
        ff_fmt_convert_init_x86(c, avctx);
    }
}
|
|
@ -0,0 +1,79 @@
|
||||||
|
/*
|
||||||
|
* Format Conversion Utils
|
||||||
|
* Copyright (c) 2000, 2001 Fabrice Bellard
|
||||||
|
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef AVCODEC_FMTCONVERT_H
|
||||||
|
#define AVCODEC_FMTCONVERT_H
|
||||||
|
|
||||||
|
#include "avcodec.h"
|
||||||
|
|
||||||
|
/**
 * Table of sample format conversion routines, filled in by
 * ff_fmt_convert_init().
 */
typedef struct FmtConvertContext {
|
||||||
|
/**
|
||||||
|
* Convert an array of int32_t to float and multiply by a float value.
|
||||||
|
* @param dst destination array of float.
|
||||||
|
* constraints: 16-byte aligned
|
||||||
|
* @param src source array of int32_t.
|
||||||
|
* constraints: 16-byte aligned
|
||||||
|
* @param mul value each converted element is multiplied by.
* @param len number of elements to convert.
|
||||||
|
* constraints: multiple of 8
|
||||||
|
*/
|
||||||
|
void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert an array of float to an array of int16_t.
|
||||||
|
*
|
||||||
|
* Convert floats in the range [-32768.0,32767.0] to ints
|
||||||
|
* without rescaling.
|
||||||
|
*
|
||||||
|
* @param dst destination array of int16_t.
|
||||||
|
* constraints: 16-byte aligned
|
||||||
|
* @param src source array of float.
|
||||||
|
* constraints: 16-byte aligned
|
||||||
|
* @param len number of elements to convert.
|
||||||
|
* constraints: multiple of 8
|
||||||
|
*/
|
||||||
|
void (*float_to_int16)(int16_t *dst, const float *src, long len);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert multiple arrays of float to an interleaved array of int16_t.
|
||||||
|
*
|
||||||
|
* Convert floats in the range [-32768.0,32767.0] to ints
|
||||||
|
* without rescaling.
|
||||||
|
*
|
||||||
|
* @param dst destination array of interleaved int16_t.
|
||||||
|
* constraints: 16-byte aligned
|
||||||
|
* @param src source array of float arrays, one for each channel.
|
||||||
|
* constraints: 16-byte aligned
|
||||||
|
* @param len number of elements to convert per channel.
|
||||||
|
* constraints: multiple of 8
|
||||||
|
* @param channels number of channels
|
||||||
|
*/
|
||||||
|
void (*float_to_int16_interleave)(int16_t *dst, const float **src,
|
||||||
|
long len, int channels);
|
||||||
|
} FmtConvertContext;
|
||||||
|
|
||||||
|
/**
 * Initialize all function pointers of a FmtConvertContext.
 *
 * @param c     context to fill in
 * @param avctx codec context, handed on to the per-architecture init functions
 */
void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
|
||||||
|
|
||||||
|
/* Per-architecture init hooks; ff_fmt_convert_init() calls the ones enabled
 * for the build (ARCH_ARM, ARCH_PPC, HAVE_MMX respectively). */
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
|
||||||
|
void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
|
||||||
|
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
|
||||||
|
|
||||||
|
#endif /* AVCODEC_FMTCONVERT_H */
|
|
@ -38,6 +38,7 @@
|
||||||
#include "avcodec.h"
|
#include "avcodec.h"
|
||||||
#include "dsputil.h"
|
#include "dsputil.h"
|
||||||
#include "fft.h"
|
#include "fft.h"
|
||||||
|
#include "fmtconvert.h"
|
||||||
|
|
||||||
#define ALT_BITSTREAM_READER_LE
|
#define ALT_BITSTREAM_READER_LE
|
||||||
#include "get_bits.h"
|
#include "get_bits.h"
|
||||||
|
@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext {
|
||||||
float scale_bias;
|
float scale_bias;
|
||||||
DSPContext dsp;
|
DSPContext dsp;
|
||||||
FFTContext imdct_ctx;
|
FFTContext imdct_ctx;
|
||||||
|
FmtConvertContext fmt_conv;
|
||||||
DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
|
DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
|
||||||
} NellyMoserDecodeContext;
|
} NellyMoserDecodeContext;
|
||||||
|
|
||||||
|
@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) {
|
||||||
ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);
|
ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);
|
||||||
|
|
||||||
dsputil_init(&s->dsp, avctx);
|
dsputil_init(&s->dsp, avctx);
|
||||||
|
ff_fmt_convert_init(&s->fmt_conv, avctx);
|
||||||
|
|
||||||
s->scale_bias = 1.0/(1*8);
|
s->scale_bias = 1.0/(1*8);
|
||||||
|
|
||||||
|
@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx,
|
||||||
|
|
||||||
for (i=0 ; i<blocks ; i++) {
|
for (i=0 ; i<blocks ; i++) {
|
||||||
nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf);
|
nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf);
|
||||||
s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
|
s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
|
||||||
*data_size += NELLY_SAMPLES*sizeof(int16_t);
|
*data_size += NELLY_SAMPLES*sizeof(int16_t);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,7 @@ ALTIVEC-OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o \
|
||||||
OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \
|
OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \
|
||||||
ppc/fdct_altivec.o \
|
ppc/fdct_altivec.o \
|
||||||
ppc/float_altivec.o \
|
ppc/float_altivec.o \
|
||||||
|
ppc/fmtconvert_altivec.o \
|
||||||
ppc/gmc_altivec.o \
|
ppc/gmc_altivec.o \
|
||||||
ppc/idct_altivec.o \
|
ppc/idct_altivec.o \
|
||||||
ppc/int_altivec.o \
|
ppc/int_altivec.o \
|
||||||
|
|
|
@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
|
|
||||||
{
|
|
||||||
union {
|
|
||||||
vector float v;
|
|
||||||
float s[4];
|
|
||||||
} mul_u;
|
|
||||||
int i;
|
|
||||||
vector float src1, src2, dst1, dst2, mul_v, zero;
|
|
||||||
|
|
||||||
zero = (vector float)vec_splat_u32(0);
|
|
||||||
mul_u.s[0] = mul;
|
|
||||||
mul_v = vec_splat(mul_u.v, 0);
|
|
||||||
|
|
||||||
for(i=0; i<len; i+=8) {
|
|
||||||
src1 = vec_ctf(vec_ld(0, src+i), 0);
|
|
||||||
src2 = vec_ctf(vec_ld(16, src+i), 0);
|
|
||||||
dst1 = vec_madd(src1, mul_v, zero);
|
|
||||||
dst2 = vec_madd(src2, mul_v, zero);
|
|
||||||
vec_st(dst1, 0, dst+i);
|
|
||||||
vec_st(dst2, 16, dst+i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static vector signed short
|
|
||||||
float_to_int16_one_altivec(const float *src)
|
|
||||||
{
|
|
||||||
vector float s0 = vec_ld(0, src);
|
|
||||||
vector float s1 = vec_ld(16, src);
|
|
||||||
vector signed int t0 = vec_cts(s0, 0);
|
|
||||||
vector signed int t1 = vec_cts(s1, 0);
|
|
||||||
return vec_packs(t0,t1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
vector signed short d0, d1, d;
|
|
||||||
vector unsigned char align;
|
|
||||||
if(((long)dst)&15) //FIXME
|
|
||||||
for(i=0; i<len-7; i+=8) {
|
|
||||||
d0 = vec_ld(0, dst+i);
|
|
||||||
d = float_to_int16_one_altivec(src+i);
|
|
||||||
d1 = vec_ld(15, dst+i);
|
|
||||||
d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
|
|
||||||
align = vec_lvsr(0, dst+i);
|
|
||||||
d0 = vec_perm(d1, d, align);
|
|
||||||
d1 = vec_perm(d, d1, align);
|
|
||||||
vec_st(d0, 0, dst+i);
|
|
||||||
vec_st(d1,15, dst+i);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
for(i=0; i<len-7; i+=8) {
|
|
||||||
d = float_to_int16_one_altivec(src+i);
|
|
||||||
vec_st(d, 0, dst+i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
float_to_int16_interleave_altivec(int16_t *dst, const float **src,
|
|
||||||
long len, int channels)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
vector signed short d0, d1, d2, c0, c1, t0, t1;
|
|
||||||
vector unsigned char align;
|
|
||||||
if(channels == 1)
|
|
||||||
float_to_int16_altivec(dst, src[0], len);
|
|
||||||
else
|
|
||||||
if (channels == 2) {
|
|
||||||
if(((long)dst)&15)
|
|
||||||
for(i=0; i<len-7; i+=8) {
|
|
||||||
d0 = vec_ld(0, dst + i);
|
|
||||||
t0 = float_to_int16_one_altivec(src[0] + i);
|
|
||||||
d1 = vec_ld(31, dst + i);
|
|
||||||
t1 = float_to_int16_one_altivec(src[1] + i);
|
|
||||||
c0 = vec_mergeh(t0, t1);
|
|
||||||
c1 = vec_mergel(t0, t1);
|
|
||||||
d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
|
|
||||||
align = vec_lvsr(0, dst + i);
|
|
||||||
d0 = vec_perm(d2, c0, align);
|
|
||||||
d1 = vec_perm(c0, c1, align);
|
|
||||||
vec_st(d0, 0, dst + i);
|
|
||||||
d0 = vec_perm(c1, d2, align);
|
|
||||||
vec_st(d1, 15, dst + i);
|
|
||||||
vec_st(d0, 31, dst + i);
|
|
||||||
dst+=8;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
for(i=0; i<len-7; i+=8) {
|
|
||||||
t0 = float_to_int16_one_altivec(src[0] + i);
|
|
||||||
t1 = float_to_int16_one_altivec(src[1] + i);
|
|
||||||
d0 = vec_mergeh(t0, t1);
|
|
||||||
d1 = vec_mergel(t0, t1);
|
|
||||||
vec_st(d0, 0, dst + i);
|
|
||||||
vec_st(d1, 16, dst + i);
|
|
||||||
dst+=8;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
DECLARE_ALIGNED(16, int16_t, tmp)[len];
|
|
||||||
int c, j;
|
|
||||||
for (c = 0; c < channels; c++) {
|
|
||||||
float_to_int16_altivec(tmp, src[c], len);
|
|
||||||
for (i = 0, j = c; i < len; i++, j+=channels) {
|
|
||||||
dst[j] = tmp[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
|
void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
|
||||||
{
|
{
|
||||||
c->vector_fmul = vector_fmul_altivec;
|
c->vector_fmul = vector_fmul_altivec;
|
||||||
c->vector_fmul_reverse = vector_fmul_reverse_altivec;
|
c->vector_fmul_reverse = vector_fmul_reverse_altivec;
|
||||||
c->vector_fmul_add = vector_fmul_add_altivec;
|
c->vector_fmul_add = vector_fmul_add_altivec;
|
||||||
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
|
|
||||||
if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
||||||
c->vector_fmul_window = vector_fmul_window_altivec;
|
c->vector_fmul_window = vector_fmul_window_altivec;
|
||||||
c->float_to_int16 = float_to_int16_altivec;
|
|
||||||
c->float_to_int16_interleave = float_to_int16_interleave_altivec;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,142 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "libavcodec/fmtconvert.h"
|
||||||
|
|
||||||
|
#include "dsputil_altivec.h"
|
||||||
|
#include "util_altivec.h"
|
||||||
|
|
||||||
|
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
|
||||||
|
{
|
||||||
|
union {
|
||||||
|
vector float v;
|
||||||
|
float s[4];
|
||||||
|
} mul_u;
|
||||||
|
int i;
|
||||||
|
vector float src1, src2, dst1, dst2, mul_v, zero;
|
||||||
|
|
||||||
|
zero = (vector float)vec_splat_u32(0);
|
||||||
|
mul_u.s[0] = mul;
|
||||||
|
mul_v = vec_splat(mul_u.v, 0);
|
||||||
|
|
||||||
|
for(i=0; i<len; i+=8) {
|
||||||
|
src1 = vec_ctf(vec_ld(0, src+i), 0);
|
||||||
|
src2 = vec_ctf(vec_ld(16, src+i), 0);
|
||||||
|
dst1 = vec_madd(src1, mul_v, zero);
|
||||||
|
dst2 = vec_madd(src2, mul_v, zero);
|
||||||
|
vec_st(dst1, 0, dst+i);
|
||||||
|
vec_st(dst2, 16, dst+i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static vector signed short
|
||||||
|
float_to_int16_one_altivec(const float *src)
|
||||||
|
{
|
||||||
|
vector float s0 = vec_ld(0, src);
|
||||||
|
vector float s1 = vec_ld(16, src);
|
||||||
|
vector signed int t0 = vec_cts(s0, 0);
|
||||||
|
vector signed int t1 = vec_cts(s1, 0);
|
||||||
|
return vec_packs(t0,t1);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
vector signed short d0, d1, d;
|
||||||
|
vector unsigned char align;
|
||||||
|
if(((long)dst)&15) //FIXME
|
||||||
|
for(i=0; i<len-7; i+=8) {
|
||||||
|
d0 = vec_ld(0, dst+i);
|
||||||
|
d = float_to_int16_one_altivec(src+i);
|
||||||
|
d1 = vec_ld(15, dst+i);
|
||||||
|
d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
|
||||||
|
align = vec_lvsr(0, dst+i);
|
||||||
|
d0 = vec_perm(d1, d, align);
|
||||||
|
d1 = vec_perm(d, d1, align);
|
||||||
|
vec_st(d0, 0, dst+i);
|
||||||
|
vec_st(d1,15, dst+i);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
for(i=0; i<len-7; i+=8) {
|
||||||
|
d = float_to_int16_one_altivec(src+i);
|
||||||
|
vec_st(d, 0, dst+i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
float_to_int16_interleave_altivec(int16_t *dst, const float **src,
|
||||||
|
long len, int channels)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
vector signed short d0, d1, d2, c0, c1, t0, t1;
|
||||||
|
vector unsigned char align;
|
||||||
|
if(channels == 1)
|
||||||
|
float_to_int16_altivec(dst, src[0], len);
|
||||||
|
else
|
||||||
|
if (channels == 2) {
|
||||||
|
if(((long)dst)&15)
|
||||||
|
for(i=0; i<len-7; i+=8) {
|
||||||
|
d0 = vec_ld(0, dst + i);
|
||||||
|
t0 = float_to_int16_one_altivec(src[0] + i);
|
||||||
|
d1 = vec_ld(31, dst + i);
|
||||||
|
t1 = float_to_int16_one_altivec(src[1] + i);
|
||||||
|
c0 = vec_mergeh(t0, t1);
|
||||||
|
c1 = vec_mergel(t0, t1);
|
||||||
|
d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
|
||||||
|
align = vec_lvsr(0, dst + i);
|
||||||
|
d0 = vec_perm(d2, c0, align);
|
||||||
|
d1 = vec_perm(c0, c1, align);
|
||||||
|
vec_st(d0, 0, dst + i);
|
||||||
|
d0 = vec_perm(c1, d2, align);
|
||||||
|
vec_st(d1, 15, dst + i);
|
||||||
|
vec_st(d0, 31, dst + i);
|
||||||
|
dst+=8;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
for(i=0; i<len-7; i+=8) {
|
||||||
|
t0 = float_to_int16_one_altivec(src[0] + i);
|
||||||
|
t1 = float_to_int16_one_altivec(src[1] + i);
|
||||||
|
d0 = vec_mergeh(t0, t1);
|
||||||
|
d1 = vec_mergel(t0, t1);
|
||||||
|
vec_st(d0, 0, dst + i);
|
||||||
|
vec_st(d1, 16, dst + i);
|
||||||
|
dst+=8;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
DECLARE_ALIGNED(16, int16_t, tmp)[len];
|
||||||
|
int c, j;
|
||||||
|
for (c = 0; c < channels; c++) {
|
||||||
|
float_to_int16_altivec(tmp, src[c], len);
|
||||||
|
for (i = 0, j = c; i < len; i++, j+=channels) {
|
||||||
|
dst[j] = tmp[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx)
|
||||||
|
{
|
||||||
|
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
|
||||||
|
if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
||||||
|
c->float_to_int16 = float_to_int16_altivec;
|
||||||
|
c->float_to_int16_interleave = float_to_int16_interleave_altivec;
|
||||||
|
}
|
||||||
|
}
|
|
@ -31,6 +31,7 @@
|
||||||
#include "get_bits.h"
|
#include "get_bits.h"
|
||||||
#include "dsputil.h"
|
#include "dsputil.h"
|
||||||
#include "fft.h"
|
#include "fft.h"
|
||||||
|
#include "fmtconvert.h"
|
||||||
|
|
||||||
#include "vorbis.h"
|
#include "vorbis.h"
|
||||||
#include "xiph.h"
|
#include "xiph.h"
|
||||||
|
@ -127,6 +128,7 @@ typedef struct vorbis_context_s {
|
||||||
AVCodecContext *avccontext;
|
AVCodecContext *avccontext;
|
||||||
GetBitContext gb;
|
GetBitContext gb;
|
||||||
DSPContext dsp;
|
DSPContext dsp;
|
||||||
|
FmtConvertContext fmt_conv;
|
||||||
|
|
||||||
FFTContext mdct[2];
|
FFTContext mdct[2];
|
||||||
uint_fast8_t first_frame;
|
uint_fast8_t first_frame;
|
||||||
|
@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext)
|
||||||
|
|
||||||
vc->avccontext = avccontext;
|
vc->avccontext = avccontext;
|
||||||
dsputil_init(&vc->dsp, avccontext);
|
dsputil_init(&vc->dsp, avccontext);
|
||||||
|
ff_fmt_convert_init(&vc->fmt_conv, avccontext);
|
||||||
|
|
||||||
vc->scale_bias = 32768.0f;
|
vc->scale_bias = 32768.0f;
|
||||||
|
|
||||||
|
@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext,
|
||||||
len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i];
|
len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i];
|
||||||
}
|
}
|
||||||
|
|
||||||
vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels);
|
vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len,
|
||||||
|
vc->audio_channels);
|
||||||
*data_size = len * 2 * vc->audio_channels;
|
*data_size = len * 2 * vc->audio_channels;
|
||||||
|
|
||||||
return buf_size ;
|
return buf_size ;
|
||||||
|
|
|
@ -126,6 +126,7 @@ int ff_wma_init(AVCodecContext *avctx, int flags2)
|
||||||
s->block_align = avctx->block_align;
|
s->block_align = avctx->block_align;
|
||||||
|
|
||||||
dsputil_init(&s->dsp, avctx);
|
dsputil_init(&s->dsp, avctx);
|
||||||
|
ff_fmt_convert_init(&s->fmt_conv, avctx);
|
||||||
|
|
||||||
if (avctx->codec->id == CODEC_ID_WMAV1) {
|
if (avctx->codec->id == CODEC_ID_WMAV1) {
|
||||||
s->version = 1;
|
s->version = 1;
|
||||||
|
|
|
@ -26,6 +26,7 @@
|
||||||
#include "put_bits.h"
|
#include "put_bits.h"
|
||||||
#include "dsputil.h"
|
#include "dsputil.h"
|
||||||
#include "fft.h"
|
#include "fft.h"
|
||||||
|
#include "fmtconvert.h"
|
||||||
|
|
||||||
/* size of blocks */
|
/* size of blocks */
|
||||||
#define BLOCK_MIN_BITS 7
|
#define BLOCK_MIN_BITS 7
|
||||||
|
@ -134,6 +135,7 @@ typedef struct WMACodecContext {
|
||||||
float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
|
float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
|
||||||
float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
|
float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
|
||||||
DSPContext dsp;
|
DSPContext dsp;
|
||||||
|
FmtConvertContext fmt_conv;
|
||||||
|
|
||||||
#ifdef TRACE
|
#ifdef TRACE
|
||||||
int frame_count;
|
int frame_count;
|
||||||
|
|
|
@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples)
|
||||||
incr = s->nb_channels;
|
incr = s->nb_channels;
|
||||||
for (ch = 0; ch < MAX_CHANNELS; ch++)
|
for (ch = 0; ch < MAX_CHANNELS; ch++)
|
||||||
output[ch] = s->frame_out[ch];
|
output[ch] = s->frame_out[ch];
|
||||||
s->dsp.float_to_int16_interleave(samples, output, n, incr);
|
s->fmt_conv.float_to_int16_interleave(samples, output, n, incr);
|
||||||
for (ch = 0; ch < incr; ch++) {
|
for (ch = 0; ch < incr; ch++) {
|
||||||
/* prepare for next block */
|
/* prepare for next block */
|
||||||
memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float));
|
memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float));
|
||||||
|
|
|
@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
|
||||||
MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o
|
MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o
|
||||||
MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \
|
MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \
|
||||||
x86/deinterlace.o \
|
x86/deinterlace.o \
|
||||||
|
x86/fmtconvert.o \
|
||||||
x86/h264_chromamc.o \
|
x86/h264_chromamc.o \
|
||||||
$(YASM-OBJS-yes)
|
$(YASM-OBJS-yes)
|
||||||
|
|
||||||
|
@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT) += x86/fft.o
|
||||||
OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \
|
OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \
|
||||||
x86/dsputil_mmx.o \
|
x86/dsputil_mmx.o \
|
||||||
x86/fdct_mmx.o \
|
x86/fdct_mmx.o \
|
||||||
|
x86/fmtconvert_mmx.o \
|
||||||
x86/idct_mmx_xvid.o \
|
x86/idct_mmx_xvid.o \
|
||||||
x86/idct_sse2_xvid.o \
|
x86/idct_sse2_xvid.o \
|
||||||
x86/motion_est_mmx.o \
|
x86/motion_est_mmx.o \
|
||||||
|
|
|
@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s
|
||||||
}
|
}
|
||||||
#endif /* HAVE_6REGS */
|
#endif /* HAVE_6REGS */
|
||||||
|
|
||||||
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
|
|
||||||
{
|
|
||||||
x86_reg i = -4*len;
|
|
||||||
__asm__ volatile(
|
|
||||||
"movss %3, %%xmm4 \n"
|
|
||||||
"shufps $0, %%xmm4, %%xmm4 \n"
|
|
||||||
"1: \n"
|
|
||||||
"cvtpi2ps (%2,%0), %%xmm0 \n"
|
|
||||||
"cvtpi2ps 8(%2,%0), %%xmm1 \n"
|
|
||||||
"cvtpi2ps 16(%2,%0), %%xmm2 \n"
|
|
||||||
"cvtpi2ps 24(%2,%0), %%xmm3 \n"
|
|
||||||
"movlhps %%xmm1, %%xmm0 \n"
|
|
||||||
"movlhps %%xmm3, %%xmm2 \n"
|
|
||||||
"mulps %%xmm4, %%xmm0 \n"
|
|
||||||
"mulps %%xmm4, %%xmm2 \n"
|
|
||||||
"movaps %%xmm0, (%1,%0) \n"
|
|
||||||
"movaps %%xmm2, 16(%1,%0) \n"
|
|
||||||
"add $32, %0 \n"
|
|
||||||
"jl 1b \n"
|
|
||||||
:"+r"(i)
|
|
||||||
:"r"(dst+len), "r"(src+len), "m"(mul)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
|
|
||||||
{
|
|
||||||
x86_reg i = -4*len;
|
|
||||||
__asm__ volatile(
|
|
||||||
"movss %3, %%xmm4 \n"
|
|
||||||
"shufps $0, %%xmm4, %%xmm4 \n"
|
|
||||||
"1: \n"
|
|
||||||
"cvtdq2ps (%2,%0), %%xmm0 \n"
|
|
||||||
"cvtdq2ps 16(%2,%0), %%xmm1 \n"
|
|
||||||
"mulps %%xmm4, %%xmm0 \n"
|
|
||||||
"mulps %%xmm4, %%xmm1 \n"
|
|
||||||
"movaps %%xmm0, (%1,%0) \n"
|
|
||||||
"movaps %%xmm1, 16(%1,%0) \n"
|
|
||||||
"add $32, %0 \n"
|
|
||||||
"jl 1b \n"
|
|
||||||
:"+r"(i)
|
|
||||||
:"r"(dst+len), "r"(src+len), "m"(mul)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void vector_clipf_sse(float *dst, const float *src, float min, float max,
|
static void vector_clipf_sse(float *dst, const float *src, float min, float max,
|
||||||
int len)
|
int len)
|
||||||
{
|
{
|
||||||
|
@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
|
|
||||||
x86_reg reglen = len;
|
|
||||||
// not bit-exact: pf2id uses different rounding than C and SSE
|
|
||||||
__asm__ volatile(
|
|
||||||
"add %0 , %0 \n\t"
|
|
||||||
"lea (%2,%0,2) , %2 \n\t"
|
|
||||||
"add %0 , %1 \n\t"
|
|
||||||
"neg %0 \n\t"
|
|
||||||
"1: \n\t"
|
|
||||||
"pf2id (%2,%0,2) , %%mm0 \n\t"
|
|
||||||
"pf2id 8(%2,%0,2) , %%mm1 \n\t"
|
|
||||||
"pf2id 16(%2,%0,2) , %%mm2 \n\t"
|
|
||||||
"pf2id 24(%2,%0,2) , %%mm3 \n\t"
|
|
||||||
"packssdw %%mm1 , %%mm0 \n\t"
|
|
||||||
"packssdw %%mm3 , %%mm2 \n\t"
|
|
||||||
"movq %%mm0 , (%1,%0) \n\t"
|
|
||||||
"movq %%mm2 , 8(%1,%0) \n\t"
|
|
||||||
"add $16 , %0 \n\t"
|
|
||||||
" js 1b \n\t"
|
|
||||||
"femms \n\t"
|
|
||||||
:"+r"(reglen), "+r"(dst), "+r"(src)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
static void float_to_int16_sse(int16_t *dst, const float *src, long len){
|
|
||||||
x86_reg reglen = len;
|
|
||||||
__asm__ volatile(
|
|
||||||
"add %0 , %0 \n\t"
|
|
||||||
"lea (%2,%0,2) , %2 \n\t"
|
|
||||||
"add %0 , %1 \n\t"
|
|
||||||
"neg %0 \n\t"
|
|
||||||
"1: \n\t"
|
|
||||||
"cvtps2pi (%2,%0,2) , %%mm0 \n\t"
|
|
||||||
"cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
|
|
||||||
"cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
|
|
||||||
"cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
|
|
||||||
"packssdw %%mm1 , %%mm0 \n\t"
|
|
||||||
"packssdw %%mm3 , %%mm2 \n\t"
|
|
||||||
"movq %%mm0 , (%1,%0) \n\t"
|
|
||||||
"movq %%mm2 , 8(%1,%0) \n\t"
|
|
||||||
"add $16 , %0 \n\t"
|
|
||||||
" js 1b \n\t"
|
|
||||||
"emms \n\t"
|
|
||||||
:"+r"(reglen), "+r"(dst), "+r"(src)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
|
|
||||||
x86_reg reglen = len;
|
|
||||||
__asm__ volatile(
|
|
||||||
"add %0 , %0 \n\t"
|
|
||||||
"lea (%2,%0,2) , %2 \n\t"
|
|
||||||
"add %0 , %1 \n\t"
|
|
||||||
"neg %0 \n\t"
|
|
||||||
"1: \n\t"
|
|
||||||
"cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
|
|
||||||
"cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
|
|
||||||
"packssdw %%xmm1 , %%xmm0 \n\t"
|
|
||||||
"movdqa %%xmm0 , (%1,%0) \n\t"
|
|
||||||
"add $16 , %0 \n\t"
|
|
||||||
" js 1b \n\t"
|
|
||||||
:"+r"(reglen), "+r"(dst), "+r"(src)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ff_vp3_idct_mmx(int16_t *input_data);
|
void ff_vp3_idct_mmx(int16_t *input_data);
|
||||||
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
||||||
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
||||||
|
@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data);
|
||||||
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
|
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
|
||||||
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
|
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
|
||||||
|
|
||||||
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
|
|
||||||
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
|
|
||||||
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
|
|
||||||
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
|
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
|
||||||
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
|
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
|
||||||
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
|
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
|
||||||
|
@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const
|
||||||
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
|
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
|
||||||
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
|
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
|
||||||
|
|
||||||
#if !HAVE_YASM
|
|
||||||
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
|
|
||||||
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
|
|
||||||
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
|
|
||||||
#endif
|
|
||||||
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
|
|
||||||
|
|
||||||
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
|
|
||||||
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
|
|
||||||
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
|
||||||
DECLARE_ALIGNED(16, int16_t, tmp)[len];\
|
|
||||||
int i,j,c;\
|
|
||||||
for(c=0; c<channels; c++){\
|
|
||||||
float_to_int16_##cpu(tmp, src[c], len);\
|
|
||||||
for(i=0, j=c; i<len; i++, j+=channels)\
|
|
||||||
dst[j] = tmp[i];\
|
|
||||||
}\
|
|
||||||
}\
|
|
||||||
\
|
|
||||||
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
|
||||||
if(channels==1)\
|
|
||||||
float_to_int16_##cpu(dst, src[0], len);\
|
|
||||||
else if(channels==2){\
|
|
||||||
x86_reg reglen = len; \
|
|
||||||
const float *src0 = src[0];\
|
|
||||||
const float *src1 = src[1];\
|
|
||||||
__asm__ volatile(\
|
|
||||||
"shl $2, %0 \n"\
|
|
||||||
"add %0, %1 \n"\
|
|
||||||
"add %0, %2 \n"\
|
|
||||||
"add %0, %3 \n"\
|
|
||||||
"neg %0 \n"\
|
|
||||||
body\
|
|
||||||
:"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
|
|
||||||
);\
|
|
||||||
}else if(channels==6){\
|
|
||||||
ff_float_to_int16_interleave6_##cpu(dst, src, len);\
|
|
||||||
}else\
|
|
||||||
float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
|
|
||||||
}
|
|
||||||
|
|
||||||
FLOAT_TO_INT16_INTERLEAVE(3dnow,
|
|
||||||
"1: \n"
|
|
||||||
"pf2id (%2,%0), %%mm0 \n"
|
|
||||||
"pf2id 8(%2,%0), %%mm1 \n"
|
|
||||||
"pf2id (%3,%0), %%mm2 \n"
|
|
||||||
"pf2id 8(%3,%0), %%mm3 \n"
|
|
||||||
"packssdw %%mm1, %%mm0 \n"
|
|
||||||
"packssdw %%mm3, %%mm2 \n"
|
|
||||||
"movq %%mm0, %%mm1 \n"
|
|
||||||
"punpcklwd %%mm2, %%mm0 \n"
|
|
||||||
"punpckhwd %%mm2, %%mm1 \n"
|
|
||||||
"movq %%mm0, (%1,%0)\n"
|
|
||||||
"movq %%mm1, 8(%1,%0)\n"
|
|
||||||
"add $16, %0 \n"
|
|
||||||
"js 1b \n"
|
|
||||||
"femms \n"
|
|
||||||
)
|
|
||||||
|
|
||||||
FLOAT_TO_INT16_INTERLEAVE(sse,
|
|
||||||
"1: \n"
|
|
||||||
"cvtps2pi (%2,%0), %%mm0 \n"
|
|
||||||
"cvtps2pi 8(%2,%0), %%mm1 \n"
|
|
||||||
"cvtps2pi (%3,%0), %%mm2 \n"
|
|
||||||
"cvtps2pi 8(%3,%0), %%mm3 \n"
|
|
||||||
"packssdw %%mm1, %%mm0 \n"
|
|
||||||
"packssdw %%mm3, %%mm2 \n"
|
|
||||||
"movq %%mm0, %%mm1 \n"
|
|
||||||
"punpcklwd %%mm2, %%mm0 \n"
|
|
||||||
"punpckhwd %%mm2, %%mm1 \n"
|
|
||||||
"movq %%mm0, (%1,%0)\n"
|
|
||||||
"movq %%mm1, 8(%1,%0)\n"
|
|
||||||
"add $16, %0 \n"
|
|
||||||
"js 1b \n"
|
|
||||||
"emms \n"
|
|
||||||
)
|
|
||||||
|
|
||||||
FLOAT_TO_INT16_INTERLEAVE(sse2,
|
|
||||||
"1: \n"
|
|
||||||
"cvtps2dq (%2,%0), %%xmm0 \n"
|
|
||||||
"cvtps2dq (%3,%0), %%xmm1 \n"
|
|
||||||
"packssdw %%xmm1, %%xmm0 \n"
|
|
||||||
"movhlps %%xmm0, %%xmm1 \n"
|
|
||||||
"punpcklwd %%xmm1, %%xmm0 \n"
|
|
||||||
"movdqa %%xmm0, (%1,%0) \n"
|
|
||||||
"add $16, %0 \n"
|
|
||||||
"js 1b \n"
|
|
||||||
)
|
|
||||||
|
|
||||||
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
|
|
||||||
if(channels==6)
|
|
||||||
ff_float_to_int16_interleave6_3dn2(dst, src, len);
|
|
||||||
else
|
|
||||||
float_to_int16_interleave_3dnow(dst, src, len, channels);
|
|
||||||
}
|
|
||||||
|
|
||||||
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
|
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
|
||||||
|
|
||||||
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||||
|
@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||||
if(mm_flags & AV_CPU_FLAG_3DNOW){
|
if(mm_flags & AV_CPU_FLAG_3DNOW){
|
||||||
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
|
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
|
||||||
c->vector_fmul = vector_fmul_3dnow;
|
c->vector_fmul = vector_fmul_3dnow;
|
||||||
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
|
|
||||||
c->float_to_int16 = float_to_int16_3dnow;
|
|
||||||
c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
|
if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
|
||||||
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
|
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
|
||||||
#if HAVE_6REGS
|
#if HAVE_6REGS
|
||||||
c->vector_fmul_window = vector_fmul_window_3dnow2;
|
c->vector_fmul_window = vector_fmul_window_3dnow2;
|
||||||
#endif
|
#endif
|
||||||
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
|
|
||||||
c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if(mm_flags & AV_CPU_FLAG_MMX2){
|
if(mm_flags & AV_CPU_FLAG_MMX2){
|
||||||
#if HAVE_YASM
|
#if HAVE_YASM
|
||||||
|
@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||||
#if HAVE_6REGS
|
#if HAVE_6REGS
|
||||||
c->vector_fmul_window = vector_fmul_window_sse;
|
c->vector_fmul_window = vector_fmul_window_sse;
|
||||||
#endif
|
#endif
|
||||||
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
|
|
||||||
c->vector_clipf = vector_clipf_sse;
|
c->vector_clipf = vector_clipf_sse;
|
||||||
c->float_to_int16 = float_to_int16_sse;
|
|
||||||
c->float_to_int16_interleave = float_to_int16_interleave_sse;
|
|
||||||
#if HAVE_YASM
|
#if HAVE_YASM
|
||||||
c->scalarproduct_float = ff_scalarproduct_float_sse;
|
c->scalarproduct_float = ff_scalarproduct_float_sse;
|
||||||
#endif
|
#endif
|
||||||
|
@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||||
if(mm_flags & AV_CPU_FLAG_3DNOW)
|
if(mm_flags & AV_CPU_FLAG_3DNOW)
|
||||||
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
|
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
|
||||||
if(mm_flags & AV_CPU_FLAG_SSE2){
|
if(mm_flags & AV_CPU_FLAG_SSE2){
|
||||||
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
|
|
||||||
c->float_to_int16 = float_to_int16_sse2;
|
|
||||||
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
|
|
||||||
#if HAVE_YASM
|
#if HAVE_YASM
|
||||||
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
|
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
|
||||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
|
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
|
||||||
|
|
|
@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
|
||||||
|
|
||||||
section .text align=16
|
section .text align=16
|
||||||
|
|
||||||
%macro PSWAPD_SSE 2
|
|
||||||
pshufw %1, %2, 0x4e
|
|
||||||
%endmacro
|
|
||||||
%macro PSWAPD_3DN1 2
|
|
||||||
movq %1, %2
|
|
||||||
psrlq %1, 32
|
|
||||||
punpckldq %1, %2
|
|
||||||
%endmacro
|
|
||||||
|
|
||||||
%macro FLOAT_TO_INT16_INTERLEAVE6 1
|
|
||||||
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
|
|
||||||
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
|
|
||||||
%ifdef ARCH_X86_64
|
|
||||||
%define lend r10d
|
|
||||||
mov lend, r2d
|
|
||||||
%else
|
|
||||||
%define lend dword r2m
|
|
||||||
%endif
|
|
||||||
mov src1q, [srcq+1*gprsize]
|
|
||||||
mov src2q, [srcq+2*gprsize]
|
|
||||||
mov src3q, [srcq+3*gprsize]
|
|
||||||
mov src4q, [srcq+4*gprsize]
|
|
||||||
mov src5q, [srcq+5*gprsize]
|
|
||||||
mov srcq, [srcq]
|
|
||||||
sub src1q, srcq
|
|
||||||
sub src2q, srcq
|
|
||||||
sub src3q, srcq
|
|
||||||
sub src4q, srcq
|
|
||||||
sub src5q, srcq
|
|
||||||
.loop:
|
|
||||||
cvtps2pi mm0, [srcq]
|
|
||||||
cvtps2pi mm1, [srcq+src1q]
|
|
||||||
cvtps2pi mm2, [srcq+src2q]
|
|
||||||
cvtps2pi mm3, [srcq+src3q]
|
|
||||||
cvtps2pi mm4, [srcq+src4q]
|
|
||||||
cvtps2pi mm5, [srcq+src5q]
|
|
||||||
packssdw mm0, mm3
|
|
||||||
packssdw mm1, mm4
|
|
||||||
packssdw mm2, mm5
|
|
||||||
pswapd mm3, mm0
|
|
||||||
punpcklwd mm0, mm1
|
|
||||||
punpckhwd mm1, mm2
|
|
||||||
punpcklwd mm2, mm3
|
|
||||||
pswapd mm3, mm0
|
|
||||||
punpckldq mm0, mm2
|
|
||||||
punpckhdq mm2, mm1
|
|
||||||
punpckldq mm1, mm3
|
|
||||||
movq [dstq ], mm0
|
|
||||||
movq [dstq+16], mm2
|
|
||||||
movq [dstq+ 8], mm1
|
|
||||||
add srcq, 8
|
|
||||||
add dstq, 24
|
|
||||||
sub lend, 2
|
|
||||||
jg .loop
|
|
||||||
emms
|
|
||||||
RET
|
|
||||||
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
|
|
||||||
|
|
||||||
%define pswapd PSWAPD_SSE
|
|
||||||
FLOAT_TO_INT16_INTERLEAVE6 sse
|
|
||||||
%define cvtps2pi pf2id
|
|
||||||
%define pswapd PSWAPD_3DN1
|
|
||||||
FLOAT_TO_INT16_INTERLEAVE6 3dnow
|
|
||||||
%undef pswapd
|
|
||||||
FLOAT_TO_INT16_INTERLEAVE6 3dn2
|
|
||||||
%undef cvtps2pi
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
%macro SCALARPRODUCT 1
|
%macro SCALARPRODUCT 1
|
||||||
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
|
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
|
||||||
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
|
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
|
||||||
|
|
|
@ -0,0 +1,91 @@
|
||||||
|
;******************************************************************************
|
||||||
|
;* x86 optimized Format Conversion Utils
|
||||||
|
;* Copyright (c) 2008 Loren Merritt
|
||||||
|
;*
|
||||||
|
;* This file is part of FFmpeg.
|
||||||
|
;*
|
||||||
|
;* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
;* modify it under the terms of the GNU Lesser General Public
|
||||||
|
;* License as published by the Free Software Foundation; either
|
||||||
|
;* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
;*
|
||||||
|
;* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
;* Lesser General Public License for more details.
|
||||||
|
;*
|
||||||
|
;* You should have received a copy of the GNU Lesser General Public
|
||||||
|
;* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
;******************************************************************************
|
||||||
|
|
||||||
|
%include "x86inc.asm"
|
||||||
|
|
||||||
|
section .text align=16
|
||||||
|
|
||||||
|
%macro PSWAPD_SSE 2
|
||||||
|
pshufw %1, %2, 0x4e
|
||||||
|
%endmacro
|
||||||
|
%macro PSWAPD_3DN1 2
|
||||||
|
movq %1, %2
|
||||||
|
psrlq %1, 32
|
||||||
|
punpckldq %1, %2
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro FLOAT_TO_INT16_INTERLEAVE6 1
|
||||||
|
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
|
||||||
|
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
|
||||||
|
%ifdef ARCH_X86_64
|
||||||
|
%define lend r10d
|
||||||
|
mov lend, r2d
|
||||||
|
%else
|
||||||
|
%define lend dword r2m
|
||||||
|
%endif
|
||||||
|
mov src1q, [srcq+1*gprsize]
|
||||||
|
mov src2q, [srcq+2*gprsize]
|
||||||
|
mov src3q, [srcq+3*gprsize]
|
||||||
|
mov src4q, [srcq+4*gprsize]
|
||||||
|
mov src5q, [srcq+5*gprsize]
|
||||||
|
mov srcq, [srcq]
|
||||||
|
sub src1q, srcq
|
||||||
|
sub src2q, srcq
|
||||||
|
sub src3q, srcq
|
||||||
|
sub src4q, srcq
|
||||||
|
sub src5q, srcq
|
||||||
|
.loop:
|
||||||
|
cvtps2pi mm0, [srcq]
|
||||||
|
cvtps2pi mm1, [srcq+src1q]
|
||||||
|
cvtps2pi mm2, [srcq+src2q]
|
||||||
|
cvtps2pi mm3, [srcq+src3q]
|
||||||
|
cvtps2pi mm4, [srcq+src4q]
|
||||||
|
cvtps2pi mm5, [srcq+src5q]
|
||||||
|
packssdw mm0, mm3
|
||||||
|
packssdw mm1, mm4
|
||||||
|
packssdw mm2, mm5
|
||||||
|
pswapd mm3, mm0
|
||||||
|
punpcklwd mm0, mm1
|
||||||
|
punpckhwd mm1, mm2
|
||||||
|
punpcklwd mm2, mm3
|
||||||
|
pswapd mm3, mm0
|
||||||
|
punpckldq mm0, mm2
|
||||||
|
punpckhdq mm2, mm1
|
||||||
|
punpckldq mm1, mm3
|
||||||
|
movq [dstq ], mm0
|
||||||
|
movq [dstq+16], mm2
|
||||||
|
movq [dstq+ 8], mm1
|
||||||
|
add srcq, 8
|
||||||
|
add dstq, 24
|
||||||
|
sub lend, 2
|
||||||
|
jg .loop
|
||||||
|
emms
|
||||||
|
RET
|
||||||
|
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
|
||||||
|
|
||||||
|
%define pswapd PSWAPD_SSE
|
||||||
|
FLOAT_TO_INT16_INTERLEAVE6 sse
|
||||||
|
%define cvtps2pi pf2id
|
||||||
|
%define pswapd PSWAPD_3DN1
|
||||||
|
FLOAT_TO_INT16_INTERLEAVE6 3dnow
|
||||||
|
%undef pswapd
|
||||||
|
FLOAT_TO_INT16_INTERLEAVE6 3dn2
|
||||||
|
%undef cvtps2pi
|
|
@ -0,0 +1,266 @@
|
||||||
|
/*
|
||||||
|
* Format Conversion Utils
|
||||||
|
* Copyright (c) 2000, 2001 Fabrice Bellard
|
||||||
|
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*
|
||||||
|
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "libavutil/cpu.h"
|
||||||
|
#include "libavutil/x86_cpu.h"
|
||||||
|
#include "libavcodec/fmtconvert.h"
|
||||||
|
|
||||||
|
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
|
||||||
|
{
|
||||||
|
x86_reg i = -4*len;
|
||||||
|
__asm__ volatile(
|
||||||
|
"movss %3, %%xmm4 \n"
|
||||||
|
"shufps $0, %%xmm4, %%xmm4 \n"
|
||||||
|
"1: \n"
|
||||||
|
"cvtpi2ps (%2,%0), %%xmm0 \n"
|
||||||
|
"cvtpi2ps 8(%2,%0), %%xmm1 \n"
|
||||||
|
"cvtpi2ps 16(%2,%0), %%xmm2 \n"
|
||||||
|
"cvtpi2ps 24(%2,%0), %%xmm3 \n"
|
||||||
|
"movlhps %%xmm1, %%xmm0 \n"
|
||||||
|
"movlhps %%xmm3, %%xmm2 \n"
|
||||||
|
"mulps %%xmm4, %%xmm0 \n"
|
||||||
|
"mulps %%xmm4, %%xmm2 \n"
|
||||||
|
"movaps %%xmm0, (%1,%0) \n"
|
||||||
|
"movaps %%xmm2, 16(%1,%0) \n"
|
||||||
|
"add $32, %0 \n"
|
||||||
|
"jl 1b \n"
|
||||||
|
:"+r"(i)
|
||||||
|
:"r"(dst+len), "r"(src+len), "m"(mul)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
|
||||||
|
{
|
||||||
|
x86_reg i = -4*len;
|
||||||
|
__asm__ volatile(
|
||||||
|
"movss %3, %%xmm4 \n"
|
||||||
|
"shufps $0, %%xmm4, %%xmm4 \n"
|
||||||
|
"1: \n"
|
||||||
|
"cvtdq2ps (%2,%0), %%xmm0 \n"
|
||||||
|
"cvtdq2ps 16(%2,%0), %%xmm1 \n"
|
||||||
|
"mulps %%xmm4, %%xmm0 \n"
|
||||||
|
"mulps %%xmm4, %%xmm1 \n"
|
||||||
|
"movaps %%xmm0, (%1,%0) \n"
|
||||||
|
"movaps %%xmm1, 16(%1,%0) \n"
|
||||||
|
"add $32, %0 \n"
|
||||||
|
"jl 1b \n"
|
||||||
|
:"+r"(i)
|
||||||
|
:"r"(dst+len), "r"(src+len), "m"(mul)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
|
||||||
|
x86_reg reglen = len;
|
||||||
|
// not bit-exact: pf2id uses different rounding than C and SSE
|
||||||
|
__asm__ volatile(
|
||||||
|
"add %0 , %0 \n\t"
|
||||||
|
"lea (%2,%0,2) , %2 \n\t"
|
||||||
|
"add %0 , %1 \n\t"
|
||||||
|
"neg %0 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"pf2id (%2,%0,2) , %%mm0 \n\t"
|
||||||
|
"pf2id 8(%2,%0,2) , %%mm1 \n\t"
|
||||||
|
"pf2id 16(%2,%0,2) , %%mm2 \n\t"
|
||||||
|
"pf2id 24(%2,%0,2) , %%mm3 \n\t"
|
||||||
|
"packssdw %%mm1 , %%mm0 \n\t"
|
||||||
|
"packssdw %%mm3 , %%mm2 \n\t"
|
||||||
|
"movq %%mm0 , (%1,%0) \n\t"
|
||||||
|
"movq %%mm2 , 8(%1,%0) \n\t"
|
||||||
|
"add $16 , %0 \n\t"
|
||||||
|
" js 1b \n\t"
|
||||||
|
"femms \n\t"
|
||||||
|
:"+r"(reglen), "+r"(dst), "+r"(src)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void float_to_int16_sse(int16_t *dst, const float *src, long len){
|
||||||
|
x86_reg reglen = len;
|
||||||
|
__asm__ volatile(
|
||||||
|
"add %0 , %0 \n\t"
|
||||||
|
"lea (%2,%0,2) , %2 \n\t"
|
||||||
|
"add %0 , %1 \n\t"
|
||||||
|
"neg %0 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"cvtps2pi (%2,%0,2) , %%mm0 \n\t"
|
||||||
|
"cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
|
||||||
|
"cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
|
||||||
|
"cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
|
||||||
|
"packssdw %%mm1 , %%mm0 \n\t"
|
||||||
|
"packssdw %%mm3 , %%mm2 \n\t"
|
||||||
|
"movq %%mm0 , (%1,%0) \n\t"
|
||||||
|
"movq %%mm2 , 8(%1,%0) \n\t"
|
||||||
|
"add $16 , %0 \n\t"
|
||||||
|
" js 1b \n\t"
|
||||||
|
"emms \n\t"
|
||||||
|
:"+r"(reglen), "+r"(dst), "+r"(src)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
|
||||||
|
x86_reg reglen = len;
|
||||||
|
__asm__ volatile(
|
||||||
|
"add %0 , %0 \n\t"
|
||||||
|
"lea (%2,%0,2) , %2 \n\t"
|
||||||
|
"add %0 , %1 \n\t"
|
||||||
|
"neg %0 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
|
||||||
|
"cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
|
||||||
|
"packssdw %%xmm1 , %%xmm0 \n\t"
|
||||||
|
"movdqa %%xmm0 , (%1,%0) \n\t"
|
||||||
|
"add $16 , %0 \n\t"
|
||||||
|
" js 1b \n\t"
|
||||||
|
:"+r"(reglen), "+r"(dst), "+r"(src)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Six-channel float -> int16 interleaving routines, one per instruction set.
 * Only the prototypes are given here; the implementations live outside this
 * file (NOTE(review): presumably the yasm FLOAT_TO_INT16_INTERLEAVE6
 * instantiations - confirm). */
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
|
||||||
|
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
|
||||||
|
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
|
||||||
|
|
||||||
|
/* Without yasm, redirect the six-channel entry points to the generic
 * any-channel-count C helpers with channels fixed to 6. The 3dn2 name has no
 * helper of its own and reuses the 3dnow one. */
#if !HAVE_YASM
|
||||||
|
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
|
||||||
|
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
|
||||||
|
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
|
||||||
|
#endif
|
||||||
|
/* There is no dedicated SSE2 six-channel routine; the SSE one is used. */
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
|
||||||
|
|
||||||
|
/**
 * Generate the float -> interleaved int16 functions for one instruction set.
 *
 * The expansion defines two functions:
 *  - float_to_int16_interleave_misc_<cpu>(): generic path for any channel
 *    count; converts one channel at a time into a temporary buffer with
 *    float_to_int16_<cpu>() and scatters it into dst with a stride of
 *    `channels`;
 *  - float_to_int16_interleave_<cpu>(): dispatcher that handles 1 channel
 *    (plain conversion), 2 channels (inline asm), 6 channels
 *    (ff_float_to_int16_interleave6_<cpu>) and falls back to the generic
 *    path otherwise.
 *
 * @param cpu  suffix selecting the per-instruction-set helpers
 * @param body asm string literals forming the stereo loop. They are emitted
 *             after the setup code, at which point %0 is a negative byte
 *             offset (-4*len) that the loop must advance to zero, %1 points
 *             to the end of dst and %2 / %3 point to the end of src[0] /
 *             src[1].
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    /* one channel worth of converted samples, aligned for the SIMD stores */\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            /* reglen = 4*len bytes; move all three pointers to their ends */\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
            :\
            /* the loop reads src0[]/src1[] and writes dst[] through plain */\
            /* register operands, so memory must be declared as clobbered */\
            :"memory"\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
|
||||||
|
|
||||||
|
/* 3DNow! instantiation of the interleave functions. The strings below are the
 * stereo loop: each pass converts four floats from each of the two sources
 * (%2 and %3) and stores eight interleaved int16 samples (16 bytes) to %1.
 * %0 is the shared negative byte offset, advanced by 16 until it reaches 0. */
FLOAT_TO_INT16_INTERLEAVE(3dnow,
|
||||||
|
"1: \n"
|
||||||
|
/* float -> int32, two values per instruction */
"pf2id (%2,%0), %%mm0 \n"
|
||||||
|
"pf2id 8(%2,%0), %%mm1 \n"
|
||||||
|
"pf2id (%3,%0), %%mm2 \n"
|
||||||
|
"pf2id 8(%3,%0), %%mm3 \n"
|
||||||
|
/* int32 -> int16 with signed saturation: mm0 = first source, mm2 = second */
"packssdw %%mm1, %%mm0 \n"
|
||||||
|
"packssdw %%mm3, %%mm2 \n"
|
||||||
|
/* interleave the words of both sources: low half in mm0, high half in mm1 */
"movq %%mm0, %%mm1 \n"
|
||||||
|
"punpcklwd %%mm2, %%mm0 \n"
|
||||||
|
"punpckhwd %%mm2, %%mm1 \n"
|
||||||
|
"movq %%mm0, (%1,%0)\n"
|
||||||
|
"movq %%mm1, 8(%1,%0)\n"
|
||||||
|
"add $16, %0 \n"
|
||||||
|
"js 1b \n"
|
||||||
|
/* leave MMX state once the loop is done */
"femms \n"
|
||||||
|
)
|
||||||
|
|
||||||
|
/* SSE instantiation of the interleave functions. The strings below are the
 * stereo loop: each pass converts four floats from each of the two sources
 * (%2 and %3) and stores eight interleaved int16 samples (16 bytes) to %1.
 * %0 is the shared negative byte offset, advanced by 16 until it reaches 0. */
FLOAT_TO_INT16_INTERLEAVE(sse,
|
||||||
|
"1: \n"
|
||||||
|
/* float -> int32 into MMX registers, two values per instruction */
"cvtps2pi (%2,%0), %%mm0 \n"
|
||||||
|
"cvtps2pi 8(%2,%0), %%mm1 \n"
|
||||||
|
"cvtps2pi (%3,%0), %%mm2 \n"
|
||||||
|
"cvtps2pi 8(%3,%0), %%mm3 \n"
|
||||||
|
/* int32 -> int16 with signed saturation: mm0 = first source, mm2 = second */
"packssdw %%mm1, %%mm0 \n"
|
||||||
|
"packssdw %%mm3, %%mm2 \n"
|
||||||
|
/* interleave the words of both sources: low half in mm0, high half in mm1 */
"movq %%mm0, %%mm1 \n"
|
||||||
|
"punpcklwd %%mm2, %%mm0 \n"
|
||||||
|
"punpckhwd %%mm2, %%mm1 \n"
|
||||||
|
"movq %%mm0, (%1,%0)\n"
|
||||||
|
"movq %%mm1, 8(%1,%0)\n"
|
||||||
|
"add $16, %0 \n"
|
||||||
|
"js 1b \n"
|
||||||
|
/* leave MMX state once the loop is done */
"emms \n"
|
||||||
|
)
|
||||||
|
|
||||||
|
/* SSE2 instantiation of the interleave functions. The strings below are the
 * stereo loop: each pass converts four floats from each of the two sources
 * (%2 and %3) and stores eight interleaved int16 samples (16 bytes) to %1
 * with an aligned store. %0 is the shared negative byte offset, advanced by
 * 16 until it reaches 0. */
FLOAT_TO_INT16_INTERLEAVE(sse2,
|
||||||
|
"1: \n"
|
||||||
|
/* float -> int32, four values per instruction */
"cvtps2dq (%2,%0), %%xmm0 \n"
|
||||||
|
"cvtps2dq (%3,%0), %%xmm1 \n"
|
||||||
|
/* xmm0 = four int16 of the first source followed by four of the second */
"packssdw %%xmm1, %%xmm0 \n"
|
||||||
|
/* copy the second source's samples to the low half of xmm1 ... */
"movhlps %%xmm0, %%xmm1 \n"
|
||||||
|
/* ... and interleave them word by word with the first source's samples */
"punpcklwd %%xmm1, %%xmm0 \n"
|
||||||
|
"movdqa %%xmm0, (%1,%0) \n"
|
||||||
|
"add $16, %0 \n"
|
||||||
|
"js 1b \n"
|
||||||
|
)
|
||||||
|
|
||||||
|
/**
 * Interleaving float -> int16 conversion for the 3dn2 CPU flavour.
 *
 * Only the six-channel layout has a dedicated 3dn2 routine; every other
 * channel count is forwarded unchanged to the plain 3DNow! implementation.
 *
 * @param dst      interleaved output samples
 * @param src      array of per-channel input pointers
 * @param len      number of samples per channel
 * @param channels number of channels
 */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if (channels != 6) {
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }
    ff_float_to_int16_interleave6_3dn2(dst, src, len);
}
|
||||||
|
|
||||||
|
/**
 * Install the x86 SIMD implementations into a format conversion context.
 *
 * Function pointers are overwritten in increasing order of preference
 * (3DNow!, 3dn2, SSE, SSE2), so the last matching CPU feature wins. The
 * 3DNow! based routines are only installed when the codec context does not
 * request bit-exact output. Nothing is changed on CPUs without MMX.
 *
 * @param c     context whose function pointers are updated
 * @param avctx codec context; its flags are only read when a 3DNow! CPU flag
 *              is set
 */
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
    const int cpu_flags = av_get_cpu_flags();

    if (!(cpu_flags & AV_CPU_FLAG_MMX))
        return;

    /* avctx is deliberately consulted only after the CPU flag test, exactly
     * as in the nested form of these conditions. */
    if ((cpu_flags & AV_CPU_FLAG_3DNOW) && !(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->float_to_int16            = float_to_int16_3dnow;
        c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
    }

    if ((cpu_flags & AV_CPU_FLAG_3DNOWEXT) && !(avctx->flags & CODEC_FLAG_BITEXACT))
        c->float_to_int16_interleave = float_to_int16_interleave_3dn2;

    if (cpu_flags & AV_CPU_FLAG_SSE) {
        c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
        c->float_to_int16             = float_to_int16_sse;
        c->float_to_int16_interleave  = float_to_int16_interleave_sse;
    }

    if (cpu_flags & AV_CPU_FLAG_SSE2) {
        c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
        c->float_to_int16             = float_to_int16_sse2;
        c->float_to_int16_interleave  = float_to_int16_interleave_sse2;
    }
}
|
Loading…
Reference in New Issue