x86: huffyuvdsp: port add_bytes to yasm

            C  MMX  SSE2
Cycles:  2972  587   302

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Author: Christophe Gisquet, 2014-05-28 15:52:24 +02:00 (committed by Michael Niedermayer)
parent 2267003981
commit 99a319c4e7
6 changed files with 48 additions and 36 deletions
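
For reference, add_bytes() accumulates one byte array into another: each byte of src is added to the corresponding byte of dst, wrapping on overflow exactly like the packed paddb instruction used by the SIMD versions. A minimal C sketch of that behaviour (add_bytes_ref is a hypothetical name, not FFmpeg's actual add_bytes_c, which, judging from the pb_7f/pb_80 masks in the first hunk below, also has a word-at-a-time fast path):

    #include <stdint.h>

    /* Byte-wise accumulation: dst[i] += src[i] for i in [0, w).
     * Additions wrap modulo 256, matching paddb. */
    static void add_bytes_ref(uint8_t *dst, uint8_t *src, intptr_t w)
    {
        intptr_t i;
        for (i = 0; i < w; i++)
            dst[i] += src[i];
    }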

@@ -27,7 +27,7 @@
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
-static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
+static void add_bytes_c(uint8_t *dst, uint8_t *src, intptr_t w)
{
long i;

@@ -35,7 +35,7 @@
typedef struct HuffYUVDSPContext {
void (*add_bytes)(uint8_t *dst /* align 16 */, uint8_t *src /* align 16 */,
-                  int w);
+                  intptr_t w);
void (*add_hfyu_median_pred)(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top);

@@ -31,7 +31,7 @@
#include "libavcodec/huffyuvdsp.h"
#if HAVE_ALTIVEC
-static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
+static void add_bytes_altivec(uint8_t *dst, uint8_t *src, intptr_t w)
{
register int i;
register vector unsigned char vdst, vsrc;

@@ -163,3 +163,40 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
ADD_HFYU_LEFT_LOOP 0, 0
+%macro ADD_BYTES 0
+cglobal add_bytes, 3,4,2, dst, src, w, size
+    mov  sizeq, wq
+    and  sizeq, -2*mmsize
+    jz   .2
+    add  dstq, sizeq
+    add  srcq, sizeq
+    neg  sizeq
+.1:
+    mova  m0, [srcq + sizeq]
+    mova  m1, [srcq + sizeq + mmsize]
+    paddb m0, [dstq + sizeq]
+    paddb m1, [dstq + sizeq + mmsize]
+    mova  [dstq + sizeq], m0
+    mova  [dstq + sizeq + mmsize], m1
+    add  sizeq, 2*mmsize
+    jl   .1
+.2:
+    and  wq, 2*mmsize-1
+    jz   .end
+    add  dstq, wq
+    add  srcq, wq
+    neg  wq
+.3:
+    mov  sizeb, [srcq + wq]
+    add  [dstq + wq], sizeb
+    inc  wq
+    jl   .3
+.end:
+    REP_RET
+%endmacro
+
+INIT_MMX mmx
+ADD_BYTES
+
+INIT_XMM sse2
+ADD_BYTES
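
The ADD_BYTES macro above is instantiated twice, so a single body yields both the MMX and the SSE2 version; mmsize expands to the SIMD register width (8 bytes for MMX, 16 for SSE2). The main .1 loop adds 2*mmsize bytes per iteration, and the .3 loop finishes the remaining w & (2*mmsize - 1) bytes one at a time. A rough C equivalent of that control flow, for illustration only (add_bytes_sketch and VECSIZE are made-up names standing in for the assembler code and its mmsize):

    #include <stdint.h>

    #define VECSIZE 16  /* stand-in for mmsize: 8 for the MMX build, 16 for SSE2 */

    static void add_bytes_sketch(uint8_t *dst, uint8_t *src, intptr_t w)
    {
        /* .1 loop: the largest prefix that is a multiple of 2*VECSIZE,
         * handled as two register-wide paddb additions per iteration */
        intptr_t size = w & ~(intptr_t)(2 * VECSIZE - 1);
        intptr_t i, k;

        for (i = 0; i < size; i += 2 * VECSIZE)
            for (k = 0; k < 2 * VECSIZE; k++)
                dst[i + k] += src[i + k];

        /* .3 loop: byte-at-a-time tail for the remaining bytes */
        for (; i < w; i++)
            dst[i] += src[i];
    }

The SSE2 instantiation uses aligned mova loads and stores, which is consistent with the "align 16" documented on the add_bytes pointers in huffyuvdsp.h.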

@@ -23,7 +23,8 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvdsp.h"
-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, intptr_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w);
void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
@@ -46,7 +47,7 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
c->add_hfyu_median_pred = ff_add_hfyu_median_pred_cmov;
#endif
-if (INLINE_MMX(cpu_flags))
+if (EXTERNAL_MMX(cpu_flags))
c->add_bytes = ff_add_bytes_mmx;
if (EXTERNAL_MMXEXT(cpu_flags)) {
@@ -55,6 +56,10 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
}
+if (EXTERNAL_SSE2(cpu_flags)) {
+    c->add_bytes = ff_add_bytes_sse2;
+}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe

@@ -22,9 +22,7 @@
#include "libavutil/x86/asm.h"
#include "huffyuvdsp.h"
-#if HAVE_INLINE_ASM
-#if HAVE_7REGS
+#if HAVE_INLINE_ASM && HAVE_7REGS
void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top)
@@ -61,31 +59,3 @@ void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
*left_top = tl;
}
#endif
-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
-    x86_reg i = 0;
-    __asm__ volatile (
-        "jmp 2f                       \n\t"
-        "1:                           \n\t"
-        "movq   (%1, %0), %%mm0       \n\t"
-        "movq   (%2, %0), %%mm1       \n\t"
-        "paddb     %%mm0, %%mm1       \n\t"
-        "movq      %%mm1, (%2, %0)    \n\t"
-        "movq  8(%1, %0), %%mm0       \n\t"
-        "movq  8(%2, %0), %%mm1       \n\t"
-        "paddb     %%mm0, %%mm1       \n\t"
-        "movq      %%mm1, 8(%2, %0)   \n\t"
-        "add         $16, %0          \n\t"
-        "2:                           \n\t"
-        "cmp          %3, %0          \n\t"
-        "js           1b              \n\t"
-        : "+r" (i)
-        : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-    for (; i < w; i++)
-        dst[i + 0] += src[i + 0];
-}
-#endif /* HAVE_INLINE_ASM */