Move SUMSUB_BA and transpose4x4() out of libavcodec/x86/dsputil_mmx.h.

SUMSUB_BA, a string macro expanding to paddw/paddw/psubw, is removed from
dsputil_mmx.h and added to cavsdsp_mmx.c right after that file's
#include "dsputil_mmx.h".

transpose4x4(), a static inline helper that loads four 4-byte rows from src
with movd, interleaves them with punpcklbw/punpcklwd/punpckhwd and stores
four 4-byte rows to dst, is removed from dsputil_mmx.h and added to
dsputil_mmx.c in front of the H263_LOOP_FILTER macro.

The added text is token-for-token the same as the removed text, so this is a
pure code move with no instruction changes.

NOTE(review): the whitespace inside each line below (indentation and column
alignment of the asm strings) was reconstructed and may differ from the
target tree. Verify it before applying, or apply with whitespace-insensitive
matching.

diff --git a/libavcodec/x86/cavsdsp_mmx.c b/libavcodec/x86/cavsdsp_mmx.c
index 3bc62ea156..f56f85932a 100644
--- a/libavcodec/x86/cavsdsp_mmx.c
+++ b/libavcodec/x86/cavsdsp_mmx.c
@@ -29,6 +29,12 @@
 #include "libavcodec/cavsdsp.h"
 #include "dsputil_mmx.h"
 
+/* in/out: mma=mma+mmb, mmb=mmb-mma */
+#define SUMSUB_BA( a, b ) \
+    "paddw "#b", "#a" \n\t"\
+    "paddw "#b", "#b" \n\t"\
+    "psubw "#a", "#b" \n\t"
+
 /*****************************************************************************
  *
  * inverse transform
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 434d1859ec..71a65e753b 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -630,6 +630,34 @@ static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
 }
 #endif
 
+static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
+    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
+        "movd (%1), %%mm0 \n\t"
+        "add %3, %1 \n\t"
+        "movd (%1), %%mm1 \n\t"
+        "movd (%1,%3,1), %%mm2 \n\t"
+        "movd (%1,%3,2), %%mm3 \n\t"
+        "punpcklbw %%mm1, %%mm0 \n\t"
+        "punpcklbw %%mm3, %%mm2 \n\t"
+        "movq %%mm0, %%mm1 \n\t"
+        "punpcklwd %%mm2, %%mm0 \n\t"
+        "punpckhwd %%mm2, %%mm1 \n\t"
+        "movd %%mm0, (%0) \n\t"
+        "add %2, %0 \n\t"
+        "punpckhdq %%mm0, %%mm0 \n\t"
+        "movd %%mm0, (%0) \n\t"
+        "movd %%mm1, (%0,%2,1) \n\t"
+        "punpckhdq %%mm1, %%mm1 \n\t"
+        "movd %%mm1, (%0,%2,2) \n\t"
+
+        : "+&r" (dst),
+          "+&r" (src)
+        : "r" (dst_stride),
+          "r" (src_stride)
+        : "memory"
+    );
+}
+
 #define H263_LOOP_FILTER \
     "pxor %%mm7, %%mm7 \n\t" \
     "movq %0, %%mm0 \n\t" \
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
index 37f4581b9c..fa42be6469 100644
--- a/libavcodec/x86/dsputil_mmx.h
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -78,12 +78,6 @@ extern const double ff_pd_2[2];
     "movq "#c", 2*"#stride"+"#out"\n\t"\
     "movq "#d", 3*"#stride"+"#out"\n\t"
 
-/* in/out: mma=mma+mmb, mmb=mmb-mma */
-#define SUMSUB_BA( a, b ) \
-    "paddw "#b", "#a" \n\t"\
-    "paddw "#b", "#b" \n\t"\
-    "psubw "#a", "#b" \n\t"
-
 #define SBUTTERFLY(a,b,t,n,m)\
     "mov" #m " " #a ", " #t " \n\t" /* abcd */\
     "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
@@ -95,34 +89,6 @@ extern const double ff_pd_2[2];
     SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
     SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
 
-static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
-    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
-        "movd (%1), %%mm0 \n\t"
-        "add %3, %1 \n\t"
-        "movd (%1), %%mm1 \n\t"
-        "movd (%1,%3,1), %%mm2 \n\t"
-        "movd (%1,%3,2), %%mm3 \n\t"
-        "punpcklbw %%mm1, %%mm0 \n\t"
-        "punpcklbw %%mm3, %%mm2 \n\t"
-        "movq %%mm0, %%mm1 \n\t"
-        "punpcklwd %%mm2, %%mm0 \n\t"
-        "punpckhwd %%mm2, %%mm1 \n\t"
-        "movd %%mm0, (%0) \n\t"
-        "add %2, %0 \n\t"
-        "punpckhdq %%mm0, %%mm0 \n\t"
-        "movd %%mm0, (%0) \n\t"
-        "movd %%mm1, (%0,%2,1) \n\t"
-        "punpckhdq %%mm1, %%mm1 \n\t"
-        "movd %%mm1, (%0,%2,2) \n\t"
-
-        : "+&r" (dst),
-          "+&r" (src)
-        : "r" (dst_stride),
-          "r" (src_stride)
-        : "memory"
-    );
-}
-
 // e,f,g,h can be memory
 // out: a,d,t,c
 #define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\