mirror of https://git.ffmpeg.org/ffmpeg.git
ac3enc: Add x86-optimized function to speed up log2_tab().
AC3DSPContext.ac3_max_msb_abs_int16() takes an array of int16_t and returns a value whose most significant set bit is the same as that of the largest absolute value in the array.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
parent
1a973feb45
commit
fbb6b49dab
|
@ -42,9 +42,18 @@ static void ac3_exponent_min_c(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Plain C implementation of AC3DSPContext.ac3_max_msb_abs_int16().
 *
 * Instead of tracking a true maximum, the absolute values of all elements
 * are OR-ed together. The OR of a set of non-negative integers has the same
 * most significant set bit as the largest of them, which is all the caller
 * needs in order to take a log2 of the result.
 *
 * @param src input array
 * @param len number of values in the array
 * @return bitwise OR of abs(src[i]) for every i in [0, len);
 *         0 when len is not positive
 */
static int ac3_max_msb_abs_int16_c(const int16_t *src, int len)
{
    int i, v = 0;

    /* src[i] is promoted to int before abs(), so the accumulator is a
     * plain non-negative int. */
    for (i = 0; i < len; i++)
        v |= abs(src[i]);

    return v;
}
|
||||||
|
|
||||||
av_cold void ff_ac3dsp_init(AC3DSPContext *c)
|
av_cold void ff_ac3dsp_init(AC3DSPContext *c)
|
||||||
{
|
{
|
||||||
c->ac3_exponent_min = ac3_exponent_min_c;
|
c->ac3_exponent_min = ac3_exponent_min_c;
|
||||||
|
c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
|
||||||
|
|
||||||
if (HAVE_MMX)
|
if (HAVE_MMX)
|
||||||
ff_ac3dsp_init_x86(c);
|
ff_ac3dsp_init_x86(c);
|
||||||
|
|
|
@ -35,6 +35,17 @@ typedef struct AC3DSPContext {
|
||||||
* @param nb_coefs number of frequency coefficients.
|
* @param nb_coefs number of frequency coefficients.
|
||||||
*/
|
*/
|
||||||
void (*ac3_exponent_min)(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
|
void (*ac3_exponent_min)(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculate the maximum MSB of the absolute value of each element in an
|
||||||
|
* array of int16_t.
|
||||||
|
* @param src input array
|
||||||
|
* constraints: align 16. values must be in range [-32767,32767]
|
||||||
|
* @param len number of values in the array
|
||||||
|
* constraints: multiple of 16 greater than 0
|
||||||
|
* @return a value with the same MSB as max(abs(src[]))
|
||||||
|
*/
|
||||||
|
int (*ac3_max_msb_abs_int16)(const int16_t *src, int len);
|
||||||
} AC3DSPContext;
|
} AC3DSPContext;
|
||||||
|
|
||||||
void ff_ac3dsp_init (AC3DSPContext *c);
|
void ff_ac3dsp_init (AC3DSPContext *c);
|
||||||
|
|
|
@ -270,14 +270,9 @@ static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input,
|
||||||
* @param n number of values in the array
|
* @param n number of values in the array
|
||||||
* @return log2(max(abs(tab[])))
|
* @return log2(max(abs(tab[])))
|
||||||
*/
|
*/
|
||||||
/* Two forms of log2_tab() are interleaved below, separated by "|" marker
 * lines. The (tab, n) form ORs abs() of every element inline; the
 * (s, src, len) form obtains the same kind of value by calling the
 * ac3_max_msb_abs_int16 function pointer stored in s->ac3dsp. Both pass the
 * accumulated value to av_log2(). */
static int log2_tab(int16_t *tab, int n)
|
static int log2_tab(AC3EncodeContext *s, int16_t *src, int len)
|
||||||
{
|
{
|
||||||
int i, v;
|
/* v is whatever the DSP function pointer returns for src[0..len) */
int v = s->ac3dsp.ac3_max_msb_abs_int16(src, len);
|
||||||
|
|
||||||
v = 0;
|
|
||||||
/* accumulate abs() of each of the n elements into v with bitwise OR */
for (i = 0; i < n; i++)
|
|
||||||
v |= abs(tab[i]);
|
|
||||||
|
|
||||||
/* NOTE(review): av_log2() is defined elsewhere; presumably it returns the
 * index of the highest set bit of v -- confirm. */
return av_log2(v);
|
return av_log2(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -308,7 +303,7 @@ static void lshift_tab(int16_t *tab, int n, unsigned int lshift)
|
||||||
*/
|
*/
|
||||||
/* Two copies of normalize_samples() are interleaved below, separated by "|"
 * marker lines. They differ only in the log2_tab() call: the second one
 * passes the context s as an additional first argument. */
static int normalize_samples(AC3EncodeContext *s)
|
static int normalize_samples(AC3EncodeContext *s)
|
||||||
{
|
{
|
||||||
/* v: 14 minus the log2_tab() result for the AC3_WINDOW_SIZE windowed samples */
int v = 14 - log2_tab(s->windowed_samples, AC3_WINDOW_SIZE);
|
int v = 14 - log2_tab(s, s->windowed_samples, AC3_WINDOW_SIZE);
|
||||||
/* v is handed to lshift_tab() as the shift count for the same buffer */
lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, v);
|
lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, v);
|
||||||
/* NOTE(review): the meaning of the constant 9 is not visible in this
 * excerpt -- confirm against the caller before changing it. */
return v - 9;
|
return v - 9;
|
||||||
}
|
}
|
||||||
|
|
|
@ -65,3 +65,72 @@ AC3_EXPONENT_MIN sse2
|
||||||
%endif
|
%endif
|
||||||
%undef PMINUB
|
%undef PMINUB
|
||||||
%undef LOOP_ALIGN
|
%undef LOOP_ALIGN
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
|
||||||
|
;
|
||||||
|
; This function uses 2 different methods to calculate a valid result.
|
||||||
|
; 1) logical 'or' of abs of each element
|
||||||
|
; This is used for ssse3 because of the pabsw instruction.
|
||||||
|
; It is also used for mmx because of the lack of min/max instructions.
|
||||||
|
; 2) calculate min/max for the array, then or(abs(min),abs(max))
|
||||||
|
; This is used for mmxext and sse2 because they have pminsw/pmaxsw.
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
; AC3_MAX_MSB_ABS_INT16 <suffix>, <method>
;   %1  suffix appended to the function name (mmx, mmxext, sse2, ssse3)
;   %2  min_max : keep a per-lane signed minimum (m2) and maximum (m3) and
;                 take abs() of both only once, after the loop
;       or_abs  : take abs() of every loaded vector and OR it into m2
;
; Every loop iteration reads two vectors (2*mmsize bytes), i.e. mmsize
; int16_t values, so len is decremented by mmsize per iteration. The loop
; body always runs at least once; len must therefore be positive.
;
; The result is returned in eax as a 16-bit value (upper bits cleared).
%macro AC3_MAX_MSB_ABS_INT16 2
cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
    pxor        m2, m2              ; min_max: running min / or_abs: OR accumulator
    pxor        m3, m3              ; min_max: running max
.loop:
%ifidn %2, min_max
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    pminsw      m2, m0
    pminsw      m2, m1
    pmaxsw      m3, m0
    pmaxsw      m3, m1
%else ; or_abs
%ifidn %1, mmx
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    ABS2        m0, m1, m3, m4      ; m3/m4 are handed to ABS2 as extra (presumably scratch) registers
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw       m0, [srcq]
    pabsw       m1, [srcq+mmsize]
%endif
    por         m2, m0
    por         m2, m1
%endif
    add       srcq, mmsize*2        ; advance past the two vectors just read
    sub       lend, mmsize          ; mmsize int16_t values consumed
    ja .loop                        ; unsigned compare: stop at zero or on borrow
%ifidn %2, min_max
    ABS2        m2, m3, m0, m1      ; abs(min), abs(max)
    por         m2, m3
%endif
%ifidn mmsize, 16
    ; fold the upper 64 bits onto the lower 64 bits
    mova        m0, m2
    punpckhqdq  m0, m0
    por         m2, m0
%endif
    ; Horizontal OR of the four low words into word 0.
    ; Done with 64-bit logical shifts instead of pshufw/pshuflw: pshufw is
    ; not part of the base MMX instruction set, so using it here would make
    ; the "mmx" variant fault on CPUs that only report MMX. psrlq is
    ; available for both MMX and SSE2 registers.
    mova        m0, m2
    psrlq       m0, 32              ; words 2,3 -> words 0,1
    por         m2, m0
    mova        m0, m2
    psrlq       m0, 16              ; word 1 -> word 0
    por         m2, m0
    movd       eax, m2
    and        eax, 0xFFFF          ; keep only word 0
    RET
%endmacro

; Instantiate one function per instruction set. ABS2 selects the
; absolute-value helper macro used by the variant that follows it.
INIT_MMX
%define ABS2 ABS2_MMX
AC3_MAX_MSB_ABS_INT16 mmx, or_abs
%define ABS2 ABS2_MMX2
AC3_MAX_MSB_ABS_INT16 mmxext, min_max
INIT_XMM
AC3_MAX_MSB_ABS_INT16 sse2, min_max
%define ABS2 ABS2_SSSE3
AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
|
||||||
|
|
|
@ -27,6 +27,11 @@ extern void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int n
|
||||||
extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
|
extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
|
||||||
extern void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
|
extern void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
|
||||||
|
|
||||||
|
/* Prototypes for the x86 variants of AC3DSPContext.ac3_max_msb_abs_int16,
 * one per instruction-set suffix (mmx, mmxext, sse2, ssse3). All share the
 * signature of the function pointer: (const int16_t *src, int len) -> int. */
extern int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);
|
||||||
|
extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
|
||||||
|
extern int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
|
||||||
|
extern int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);
|
||||||
|
|
||||||
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
|
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
|
||||||
{
|
{
|
||||||
int mm_flags = av_get_cpu_flags();
|
int mm_flags = av_get_cpu_flags();
|
||||||
|
@ -34,12 +39,18 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
|
||||||
#if HAVE_YASM
|
#if HAVE_YASM
|
||||||
if (mm_flags & AV_CPU_FLAG_MMX) {
|
if (mm_flags & AV_CPU_FLAG_MMX) {
|
||||||
c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
|
c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
|
||||||
|
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
|
||||||
}
|
}
|
||||||
if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
|
if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
|
||||||
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
|
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
|
||||||
|
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
|
||||||
}
|
}
|
||||||
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
|
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
|
||||||
c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
|
c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
|
||||||
|
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
|
||||||
|
}
|
||||||
|
if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
|
||||||
|
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue