From bcc223523e68a52050dc3f7d0e6a07c82f6f2bff Mon Sep 17 00:00:00 2001
From: Timothy Gu
Date: Sun, 14 Feb 2016 04:22:48 +0000
Subject: [PATCH] x86/vc1dsp: Port vc1_*_hor_16b_shift2 to NASM format

Reviewed-by: Christophe Gisquet
---
 libavcodec/x86/vc1dsp.asm   | 90 +++++++++++++++++++++++++++++++++++++
 libavcodec/x86/vc1dsp_mmx.c | 61 ++++---------------------
 2 files changed, 98 insertions(+), 53 deletions(-)

diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index 91a1991851..eee42c27e6 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -25,6 +25,7 @@
 cextern pw_4
 cextern pw_5
 cextern pw_9
+cextern pw_128
 
 section .text
 
@@ -319,6 +320,44 @@ cglobal vc1_h_loop_filter8, 3,5,8
     RET
 
 %if HAVE_MMX_INLINE
+
+; XXX some of these macros are not used right now, but they will be used
+; once more functions are ported.
+
+%macro OP_PUT 2 ; dst, src
+%endmacro
+
+%macro OP_AVG 2 ; dst, src
+    pavgb          %1, %2
+%endmacro
+
+%macro NORMALIZE_MMX 1 ; shift
+    paddw          m3, m7 ; +bias-r
+    paddw          m4, m7 ; +bias-r
+    psraw          m3, %1
+    psraw          m4, %1
+%endmacro
+
+%macro TRANSFER_DO_PACK 2 ; op, dst
+    packuswb       m3, m4
+    %1             m3, [%2]
+    mova         [%2], m3
+%endmacro
+
+%macro TRANSFER_DONT_PACK 2 ; op, dst
+    %1             m3, [%2]
+    %1             m4, [%2 + mmsize]
+    mova         [%2], m3
+    mova [mmsize + %2], m4
+%endmacro
+
+; see MSPEL_FILTER13_CORE for use as UNPACK macro
+%macro DO_UNPACK 1 ; reg
+    punpcklbw      %1, m0
+%endmacro
+%macro DONT_UNPACK 1 ; reg
+%endmacro
+
 ; Compute the rounder 32-r or 8-r and unpacks it to m7
 %macro LOAD_ROUNDER_MMX 1 ; round
     movd      m7, %1
@@ -394,6 +433,57 @@ cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
     dec i
     jnz .loop
     REP_RET
+%undef rnd
+%undef shift
+%undef stride_neg2
+%undef stride_9minus4
+%undef i
+
+; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+;                                  const int16_t *src, int rnd);
+; Data is already unpacked, so some operations can directly be made from
+; memory.
+%macro HOR_16B_SHIFT2 2 ; op, opname
+cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
+    mov            hq, 8
+    sub          srcq, 2
+    sub          rndd, (-1+9+9-1) * 1024 ; add -1024 bias
+    LOAD_ROUNDER_MMX rndq
+    mova           m5, [pw_9]
+    mova           m6, [pw_128]
+    pxor           m0, m0
+
+.loop:
+    mova           m1, [srcq + 2 * 0]
+    mova           m2, [srcq + 2 * 0 + mmsize]
+    mova           m3, [srcq + 2 * 1]
+    mova           m4, [srcq + 2 * 1 + mmsize]
+    paddw          m3, [srcq + 2 * 2]
+    paddw          m4, [srcq + 2 * 2 + mmsize]
+    paddw          m1, [srcq + 2 * 3]
+    paddw          m2, [srcq + 2 * 3 + mmsize]
+    pmullw         m3, m5
+    pmullw         m4, m5
+    psubw          m3, m1
+    psubw          m4, m2
+    NORMALIZE_MMX  7
+    ; remove bias
+    paddw          m3, m6
+    paddw          m4, m6
+    TRANSFER_DO_PACK %1, dstq
+    add          srcq, 24
+    add          dstq, strideq
+    dec            hq
+    jnz .loop
+
+    RET
+%endmacro
+
+INIT_MMX mmx
+HOR_16B_SHIFT2 OP_PUT, put
+
+INIT_MMX mmxext
+HOR_16B_SHIFT2 OP_AVG, avg
 %endif ; HAVE_MMX_INLINE
 
 %macro INV_TRANS_INIT 0
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index ff13d9b119..832564837b 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -38,6 +38,10 @@
 void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                    const uint8_t *src, x86_reg stride,
                                    int rnd, int64_t shift);
+void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+                                   const int16_t *src, int rnd);
+void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
+                                      const int16_t *src, int rnd);
 
 #define OP_PUT(S,D)
 #define OP_AVG(S,D)     "pavgb " #S ", " #D " \n\t"
@@ -70,55 +74,6 @@ void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
     "punpcklwd %%mm7, %%mm7 \n\t" \
     "punpckldq %%mm7, %%mm7 \n\t"
 
-/**
- * Data is already unpacked, so some operations can directly be made from
- * memory.
- */
-#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
-static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
-                                             const int16_t *src, int rnd)\
-{\
-    int h = 8;\
-\
-    src -= 1;\
-    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
-    __asm__ volatile(\
-        LOAD_ROUNDER_MMX("%4")\
-        "movq      "MANGLE(ff_pw_128)", %%mm6\n\t"\
-        "movq      "MANGLE(ff_pw_9)", %%mm5 \n\t"\
-        "1:                            \n\t"\
-        "movq      2*0+0(%1), %%mm1    \n\t"\
-        "movq      2*0+8(%1), %%mm2    \n\t"\
-        "movq      2*1+0(%1), %%mm3    \n\t"\
-        "movq      2*1+8(%1), %%mm4    \n\t"\
-        "paddw     2*3+0(%1), %%mm1    \n\t"\
-        "paddw     2*3+8(%1), %%mm2    \n\t"\
-        "paddw     2*2+0(%1), %%mm3    \n\t"\
-        "paddw     2*2+8(%1), %%mm4    \n\t"\
-        "pmullw    %%mm5, %%mm3        \n\t"\
-        "pmullw    %%mm5, %%mm4        \n\t"\
-        "psubw     %%mm1, %%mm3        \n\t"\
-        "psubw     %%mm2, %%mm4        \n\t"\
-        NORMALIZE_MMX("$7")\
-        /* Remove bias */\
-        "paddw     %%mm6, %%mm3        \n\t"\
-        "paddw     %%mm6, %%mm4        \n\t"\
-        TRANSFER_DO_PACK(OP)\
-        "add       $24, %1             \n\t"\
-        "add       %3, %2              \n\t"\
-        "decl      %0                  \n\t"\
-        "jnz 1b                        \n\t"\
-        : "+r"(h), "+r" (src), "+r" (dst)\
-        : "r"(stride), "m"(rnd)\
-          NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\
-        : "memory"\
-    );\
-}
-
-VC1_HOR_16b_SHIFT2(OP_PUT, put_)
-VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
-
-
 /**
  * Purely vertical or horizontal 1/2 shift interpolation.
  * Sacrify mm6 for *9 factor.
@@ -380,14 +335,14 @@ typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_
 * @param hmode Vertical filter.
 * @param rnd Rounding bias.
 */
-#define VC1_MSPEL_MC(OP)\
+#define VC1_MSPEL_MC(OP, INSTR)\
 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                                int hmode, int vmode, int rnd)\
 {\
     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
         { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
-        { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
+        { NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
         { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
 \
@@ -428,8 +383,8 @@ static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
 }
 
-VC1_MSPEL_MC(put_)
-VC1_MSPEL_MC(avg_)
+VC1_MSPEL_MC(put_, mmx)
+VC1_MSPEL_MC(avg_, mmxext)
 
 /** Macro to ease bicubic filter interpolation functions declarations */
 #define DECLARE_FUNCTION(a, b) \
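
Reviewer note (not part of the patch): a minimal scalar sketch in C of what HOR_16B_SHIFT2 computes, for checking the port against the removed inline asm. It assumes the intermediate buffer from the vertical pass is laid out with 12 int16_t (24 bytes) per row, as the "add srcq, 24" implies; the function and helper names below are hypothetical and exist only for illustration.

#include <stddef.h>
#include <stdint.h>

/* packuswb-style unsigned 8-bit saturation */
static uint8_t clip_uint8_ref(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void vc1_hor_16b_shift2_ref(uint8_t *dst, ptrdiff_t stride,
                                   const int16_t *src, int rnd, int avg)
{
    rnd -= (-1 + 9 + 9 - 1) * 1024;  /* same bias adjustment as "sub rndd, ..." */
    src -= 1;                        /* "sub srcq, 2": 2 bytes = 1 int16_t      */

    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++) {
            /* 9*(b+c) - (a+d), then NORMALIZE_MMX 7 and the pw_128 bias removal */
            int v = 9 * (src[x + 1] + src[x + 2]) - (src[x] + src[x + 3]);
            v = ((v + rnd) >> 7) + 128;
            uint8_t p = clip_uint8_ref(v);                   /* packuswb            */
            dst[x] = avg ? (uint8_t)((dst[x] + p + 1) >> 1)  /* OP_AVG (pavgb)      */
                         : p;                                /* OP_PUT              */
        }
        src += 12;                   /* "add srcq, 24" counts bytes              */
        dst += stride;
    }
}

The "2 * N" source offsets in the asm are byte offsets into the int16_t buffer, which is why the scalar model steps src by 12 elements per row where the asm adds 24.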