SBR DSP x86: implement SSE sbr_sum_square_sse

The 32bits targets have been compiled with -mfpmath=sse for proper reference.
sbr_sum_square C  /32bits: 82c (unrolled)/102c
               C  /64bits: 69c (unrolled)/82c
               SSE/32bits: 42c
               SSE/64bits: 31c

Use of SSE4.1 dpps to perform the final sum is slower.
Not unrolling to perform 8 operations in a loop yields 10 more cycles.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
Christophe GISQUET 2012-02-23 19:48:58 +01:00 committed by Ronald S. Bultje
parent 2e74a5abc2
commit 34454c761f
5 changed files with 116 additions and 0 deletions

View File

@ -238,4 +238,6 @@ av_cold void ff_sbrdsp_init(SBRDSPContext *s)
if (ARCH_ARM)
ff_sbrdsp_init_arm(s);
if (HAVE_MMX)
ff_sbrdsp_init_x86(s);
}

View File

@ -46,5 +46,6 @@ extern const float ff_sbr_noise_table[][2];
void ff_sbrdsp_init(SBRDSPContext *s);
void ff_sbrdsp_init_arm(SBRDSPContext *s);
void ff_sbrdsp_init_x86(SBRDSPContext *s);
#endif

View File

@ -47,6 +47,8 @@ YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
MMX-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp-init.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o
MMX-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o
MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o

74
libavcodec/x86/sbrdsp.asm Normal file
View File

@ -0,0 +1,74 @@
;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
;SECTION_RODATA
SECTION .text
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
mov r2, r1
xorps m0, m0
xorps m1, m1
sar r2, 3
jz .prepare
.loop:
movu m2, [r0 + 0]
movu m3, [r0 + 16]
movu m4, [r0 + 32]
movu m5, [r0 + 48]
mulps m2, m2
mulps m3, m3
mulps m4, m4
mulps m5, m5
addps m0, m2
addps m1, m3
addps m0, m4
addps m1, m5
add r0, 64
dec r2
jnz .loop
.prepare:
and r1, 7
sar r1, 1
jz .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
movu m2, [r0]
add r0, 16
mulps m2, m2
dec r1
addps m0, m2
jnz .endloop
.end:
addps m0, m1
movhlps m2, m0
addps m0, m2
movss m1, m0
shufps m0, m0, 1
addss m0, m1
%if ARCH_X86_64 == 0
movd r0m, m0
fld dword r0m
%endif
RET

View File

@ -0,0 +1,37 @@
/*
* AAC Spectral Band Replication decoding functions
* Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/cpu.h"
#include "libavcodec/sbrdsp.h"
float ff_sbr_sum_square_sse(float (*x)[2], int n);
void ff_sbrdsp_init_x86(SBRDSPContext *s)
{
if (HAVE_YASM) {
int mm_flags = av_get_cpu_flags();
if (mm_flags & AV_CPU_FLAG_SSE) {
s->sum_square = ff_sbr_sum_square_sse;
}
}
}