mirror of https://git.ffmpeg.org/ffmpeg.git
x86: huffyuvdsp: port add_bytes to yasm
Cycles — C: 2972, MMX: 587, SSE2: 302. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
2267003981
commit
99a319c4e7
|
@ -27,7 +27,7 @@
|
|||
#define pb_7f (~0UL / 255 * 0x7f)
|
||||
#define pb_80 (~0UL / 255 * 0x80)
|
||||
|
||||
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
|
||||
static void add_bytes_c(uint8_t *dst, uint8_t *src, intptr_t w)
|
||||
{
|
||||
long i;
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
|
||||
typedef struct HuffYUVDSPContext {
|
||||
void (*add_bytes)(uint8_t *dst /* align 16 */, uint8_t *src /* align 16 */,
|
||||
int w);
|
||||
intptr_t w);
|
||||
void (*add_hfyu_median_pred)(uint8_t *dst, const uint8_t *top,
|
||||
const uint8_t *diff, int w,
|
||||
int *left, int *left_top);
|
||||
|
|
|
@ -31,7 +31,7 @@
|
|||
#include "libavcodec/huffyuvdsp.h"
|
||||
|
||||
#if HAVE_ALTIVEC
|
||||
static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
|
||||
static void add_bytes_altivec(uint8_t *dst, uint8_t *src, intptr_t w)
|
||||
{
|
||||
register int i;
|
||||
register vector unsigned char vdst, vsrc;
|
||||
|
|
|
@ -163,3 +163,40 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
|
|||
ADD_HFYU_LEFT_LOOP 0, 1
|
||||
.src_unaligned:
|
||||
ADD_HFYU_LEFT_LOOP 0, 0
|
||||
|
||||
;------------------------------------------------------------------------------
; ADD_BYTES: emits add_bytes for the currently selected instruction set.
; The C side declares the results as
;     void ff_add_bytes_mmx (uint8_t *dst, uint8_t *src, intptr_t w);
;     void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w);
;
; Adds the bytes at src to the bytes at dst (paddb / byte add) and stores the
; result back to dst, for w bytes.
;
; The work is split in two parts:
;   .1  vector loop, 2*mmsize bytes per iteration (two registers of mmsize)
;   .3  byte loop for the remaining w & (2*mmsize-1) bytes
; Both loops advance the pointers past the region first and then walk it with
; a negative offset that counts up to zero, so the loop-closing jl can use the
; flags of the offset increment directly.
;
; NOTE(review): the vector loop uses mova loads/stores and paddb with memory
; operands; this presumably relies on the "align 16" requirement written next
; to the add_bytes pointer in HuffYUVDSPContext -- confirm against callers.
;------------------------------------------------------------------------------
%macro ADD_BYTES 0
cglobal add_bytes, 3,4,2, dst, src, w, size
    mov     sizeq, wq
    and     sizeq, -2*mmsize            ; size = w rounded down to a multiple of 2*mmsize
    jz .2                               ; less than one full vector step: bytes only
    add     dstq, sizeq                 ; point past the vector part ...
    add     srcq, sizeq
    neg     sizeq                       ; ... and index it with -size .. 0
.1:
    mova    m0, [srcq + sizeq]
    mova    m1, [srcq + sizeq + mmsize]
    paddb   m0, [dstq + sizeq]
    paddb   m1, [dstq + sizeq + mmsize]
    mova    [dstq + sizeq], m0
    mova    [dstq + sizeq + mmsize], m1
    add     sizeq, 2*mmsize
    jl .1                               ; loop while the offset is still negative
.2:
    and     wq, 2*mmsize-1              ; w = number of leftover bytes
    jz .end
    add     dstq, wq                    ; same negative-offset scheme for the tail
    add     srcq, wq
    neg     wq
.3:
    mov     sizeb, [srcq + wq]          ; size is free now, reuse its low byte as a temporary
    add     [dstq + wq], sizeb
    inc     wq
    jl .3
.end:
    REP_RET
%endmacro

INIT_MMX mmx
ADD_BYTES
INIT_XMM sse2
ADD_BYTES
|
||||
|
|
|
@ -23,7 +23,8 @@
|
|||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/huffyuvdsp.h"
|
||||
|
||||
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
|
||||
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, intptr_t w);
|
||||
void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w);
|
||||
|
||||
void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
|
||||
const uint8_t *diff, int w,
|
||||
|
@ -46,7 +47,7 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
|
|||
c->add_hfyu_median_pred = ff_add_hfyu_median_pred_cmov;
|
||||
#endif
|
||||
|
||||
if (INLINE_MMX(cpu_flags))
|
||||
if (EXTERNAL_MMX(cpu_flags))
|
||||
c->add_bytes = ff_add_bytes_mmx;
|
||||
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
|
@ -55,6 +56,10 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
|
|||
c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
c->add_bytes = ff_add_bytes_sse2;
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSSE3(cpu_flags)) {
|
||||
c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
|
||||
if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
|
||||
|
|
|
@ -22,9 +22,7 @@
|
|||
#include "libavutil/x86/asm.h"
|
||||
#include "huffyuvdsp.h"
|
||||
|
||||
#if HAVE_INLINE_ASM
|
||||
|
||||
#if HAVE_7REGS
|
||||
#if HAVE_INLINE_ASM && HAVE_7REGS
|
||||
void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
|
||||
const uint8_t *diff, int w,
|
||||
int *left, int *left_top)
|
||||
|
@ -61,31 +59,3 @@ void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
|
|||
*left_top = tl;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
|
||||
{
|
||||
x86_reg i = 0;
|
||||
|
||||
__asm__ volatile (
|
||||
"jmp 2f \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %0), %%mm0 \n\t"
|
||||
"movq (%2, %0), %%mm1 \n\t"
|
||||
"paddb %%mm0, %%mm1 \n\t"
|
||||
"movq %%mm1, (%2, %0) \n\t"
|
||||
"movq 8(%1, %0), %%mm0 \n\t"
|
||||
"movq 8(%2, %0), %%mm1 \n\t"
|
||||
"paddb %%mm0, %%mm1 \n\t"
|
||||
"movq %%mm1, 8(%2, %0) \n\t"
|
||||
"add $16, %0 \n\t"
|
||||
"2: \n\t"
|
||||
"cmp %3, %0 \n\t"
|
||||
"js 1b \n\t"
|
||||
: "+r" (i)
|
||||
: "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
|
||||
|
||||
for (; i < w; i++)
|
||||
dst[i + 0] += src[i + 0];
|
||||
}
|
||||
|
||||
#endif /* HAVE_INLINE_ASM */
|
||||
|
|
Loading…
Reference in New Issue