diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index a15d735733..30b8effb70 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -10,7 +10,8 @@ OBJS-$(CONFIG_H264CHROMA)              += x86/h264chroma_init.o
 OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
 OBJS-$(CONFIG_H264PRED)                += x86/h264_intrapred_init.o
 OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
-OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
+OBJS-$(CONFIG_HPELDSP)                 += x86/fpel_mmx.o             \
+                                          x86/hpeldsp_init.o
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
 OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
@@ -33,6 +34,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 
 MMX-OBJS-$(CONFIG_DSPUTIL)             += x86/dsputil_mmx.o          \
                                           x86/fdct.o                 \
+                                          x86/fpel_mmx.o             \
                                           x86/idct_mmx_xvid.o        \
                                           x86/idct_sse2_xvid.o       \
                                           x86/simple_idct.o          \
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 70b2249efd..097cab8285 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -253,68 +253,6 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
     } while (--i);
 }
 
-static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
-                            ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"REG_a"      \n\t"
-        ".p2align     3                 \n\t"
-        "1:                             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq  %%mm0, (%2)              \n\t"
-        "movq  %%mm1, (%2, %3)          \n\t"
-        "add   %%"REG_a", %1            \n\t"
-        "add   %%"REG_a", %2            \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq  %%mm0, (%2)              \n\t"
-        "movq  %%mm1, (%2, %3)          \n\t"
-        "add   %%"REG_a", %1            \n\t"
-        "add   %%"REG_a", %2            \n\t"
-        "subl  $4, %0                   \n\t"
-        "jnz   1b                       \n\t"
-        : "+g"(h), "+r"(pixels), "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"REG_a, "memory"
-        );
-}
-
-static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                             ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"REG_a"      \n\t"
-        ".p2align     3                 \n\t"
-        "1:                             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq 8(%1    ), %%mm4          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq 8(%1, %3), %%mm5          \n\t"
-        "movq  %%mm0,  (%2)             \n\t"
-        "movq  %%mm4, 8(%2)             \n\t"
-        "movq  %%mm1,  (%2, %3)         \n\t"
-        "movq  %%mm5, 8(%2, %3)         \n\t"
-        "add   %%"REG_a", %1            \n\t"
-        "add   %%"REG_a", %2            \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq 8(%1    ), %%mm4          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq 8(%1, %3), %%mm5          \n\t"
-        "movq  %%mm0,  (%2)             \n\t"
-        "movq  %%mm4, 8(%2)             \n\t"
-        "movq  %%mm1,  (%2, %3)         \n\t"
-        "movq  %%mm5, 8(%2, %3)         \n\t"
-        "add   %%"REG_a", %1            \n\t"
-        "add   %%"REG_a", %2            \n\t"
-        "subl  $4, %0                   \n\t"
-        "jnz   1b                       \n\t"
-        : "+g"(h), "+r"(pixels), "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"REG_a, "memory"
-        );
-}
-
 #define CLEAR_BLOCKS(name, n)                           \
 static void name(int16_t *blocks)                       \
 {                                                       \
@@ -1075,7 +1013,7 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
 /* CAVS-specific */
 void ff_put_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
-    put_pixels8_mmx(dst, src, stride, 8);
+    ff_put_pixels8_mmx(dst, src, stride, 8);
 }
 
 void ff_avg_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
@@ -1085,7 +1023,7 @@ void ff_avg_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 
 void ff_put_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
-    put_pixels16_mmx(dst, src, stride, 16);
+    ff_put_pixels16_mmx(dst, src, stride, 16);
 }
 
 void ff_avg_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
@@ -1097,7 +1035,7 @@ void ff_avg_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride, int rnd)
 {
-    put_pixels8_mmx(dst, src, stride, 8);
+    ff_put_pixels8_mmx(dst, src, stride, 8);
 }
 
 static void vector_clipf_sse(float *dst, const float *src,
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
index ca80e072da..df0c85ad61 100644
--- a/libavcodec/x86/dsputil_mmx.h
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -155,6 +155,11 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_s
 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
+
+void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int h);
 
 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c
new file mode 100644
index 0000000000..bb8b788024
--- /dev/null
+++ b/libavcodec/x86/fpel_mmx.c
@@ -0,0 +1,94 @@
+/*
+ * MMX-optimized avg/put pixel routines
+ *
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config.h"
+#include "dsputil_mmx.h"
+
+#if HAVE_MMX_INLINE
+
+void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h)
+{
+    __asm__ volatile (
+        "lea   (%3, %3), %%"REG_a"      \n\t"
+        ".p2align     3                 \n\t"
+        "1:                             \n\t"
+        "movq  (%1    ), %%mm0          \n\t"
+        "movq  (%1, %3), %%mm1          \n\t"
+        "movq  %%mm0, (%2)              \n\t"
+        "movq  %%mm1, (%2, %3)          \n\t"
+        "add   %%"REG_a", %1            \n\t"
+        "add   %%"REG_a", %2            \n\t"
+        "movq  (%1    ), %%mm0          \n\t"
+        "movq  (%1, %3), %%mm1          \n\t"
+        "movq  %%mm0, (%2)              \n\t"
+        "movq  %%mm1, (%2, %3)          \n\t"
+        "add   %%"REG_a", %1            \n\t"
+        "add   %%"REG_a", %2            \n\t"
+        "subl  $4, %0                   \n\t"
+        "jnz   1b                       \n\t"
+        : "+g"(h), "+r"(pixels), "+r"(block)
+        : "r"((x86_reg)line_size)
+        : "%"REG_a, "memory"
+        );
+}
+
+void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int h)
+{
+    __asm__ volatile (
+        "lea   (%3, %3), %%"REG_a"      \n\t"
+        ".p2align     3                 \n\t"
+        "1:                             \n\t"
+        "movq  (%1    ), %%mm0          \n\t"
+        "movq 8(%1    ), %%mm4          \n\t"
+        "movq  (%1, %3), %%mm1          \n\t"
+        "movq 8(%1, %3), %%mm5          \n\t"
+        "movq  %%mm0,  (%2)             \n\t"
+        "movq  %%mm4, 8(%2)             \n\t"
+        "movq  %%mm1,  (%2, %3)         \n\t"
+        "movq  %%mm5, 8(%2, %3)         \n\t"
+        "add   %%"REG_a", %1            \n\t"
+        "add   %%"REG_a", %2            \n\t"
+        "movq  (%1    ), %%mm0          \n\t"
+        "movq 8(%1    ), %%mm4          \n\t"
+        "movq  (%1, %3), %%mm1          \n\t"
+        "movq 8(%1, %3), %%mm5          \n\t"
+        "movq  %%mm0,  (%2)             \n\t"
+        "movq  %%mm4, 8(%2)             \n\t"
+        "movq  %%mm1,  (%2, %3)         \n\t"
+        "movq  %%mm5, 8(%2, %3)         \n\t"
+        "add   %%"REG_a", %1            \n\t"
+        "add   %%"REG_a", %2            \n\t"
+        "subl  $4, %0                   \n\t"
+        "jnz   1b                       \n\t"
+        : "+g"(h), "+r"(pixels), "+r"(block)
+        : "r"((x86_reg)line_size)
+        : "%"REG_a, "memory"
+        );
+}
+
+#endif /* HAVE_MMX_INLINE */
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 824ea1c5eb..b0a87c8c9f 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -74,6 +74,10 @@ void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 
+#define put_pixels8_mmx         ff_put_pixels8_mmx
+#define put_pixels16_mmx        ff_put_pixels16_mmx
+#define put_no_rnd_pixels8_mmx  ff_put_pixels8_mmx
+#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx
 
 #if HAVE_INLINE_ASM
 
@@ -133,74 +137,6 @@ void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
 
 #endif /* HAVE_YASM */
 
-
-#if HAVE_INLINE_ASM
-#define put_no_rnd_pixels16_mmx put_pixels16_mmx
-#define put_no_rnd_pixels8_mmx  put_pixels8_mmx
-
-static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
-                            ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"REG_a"      \n\t"
-        ".p2align     3                 \n\t"
-        "1:                             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq  %%mm0, (%2)              \n\t"
-        "movq  %%mm1, (%2, %3)          \n\t"
-        "add   %%"REG_a", %1            \n\t"
-        "add   %%"REG_a", %2            \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq  %%mm0, (%2)              \n\t"
-        "movq  %%mm1, (%2, %3)          \n\t"
-        "add   %%"REG_a", %1            \n\t"
-        "add   %%"REG_a", %2            \n\t"
-        "subl  $4, %0                   \n\t"
-        "jnz   1b                       \n\t"
-        : "+g"(h), "+r"(pixels), "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"REG_a, "memory"
-        );
-}
-
-static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                             ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"REG_a"      \n\t"
-        ".p2align     3                 \n\t"
-        "1:                             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq 8(%1    ), %%mm4          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq 8(%1, %3), %%mm5          \n\t"
-        "movq  %%mm0,  (%2)             \n\t"
-        "movq  %%mm4, 8(%2)             \n\t"
-        "movq  %%mm1,  (%2, %3)         \n\t"
-        "movq  %%mm5, 8(%2, %3)         \n\t"
-        "add   %%"REG_a", %1            \n\t"
-        "add   %%"REG_a", %2            \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq 8(%1    ), %%mm4          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq 8(%1, %3), %%mm5          \n\t"
-        "movq  %%mm0,  (%2)             \n\t"
-        "movq  %%mm4, 8(%2)             \n\t"
-        "movq  %%mm1,  (%2, %3)         \n\t"
-        "movq  %%mm5, 8(%2, %3)         \n\t"
-        "add   %%"REG_a", %1            \n\t"
-        "add   %%"REG_a", %2            \n\t"
-        "subl  $4, %0                   \n\t"
-        "jnz   1b                       \n\t"
-        : "+g"(h), "+r"(pixels), "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"REG_a, "memory"
-        );
-}
-#endif /* HAVE_INLINE_ASM */
-
 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
     do {                                                                        \
         c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU;     \