From 03c804e1ed65f52fd99020e1777103f4fb7abf15 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 20 May 2011 14:56:44 -0400 Subject: [PATCH 01/11] mp3lame: add #include required for AV_RB32 macro. Fixes compilation with mp3lame enabled. --- libavcodec/libmp3lame.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libavcodec/libmp3lame.c b/libavcodec/libmp3lame.c index 62b55c9337..68f1bc9e2b 100644 --- a/libavcodec/libmp3lame.c +++ b/libavcodec/libmp3lame.c @@ -24,6 +24,7 @@ * Interface to libmp3lame for mp3 encoding. */ +#include "libavutil/intreadwrite.h" #include "avcodec.h" #include "mpegaudio.h" #include From 21bbca5b4422ddd10363bf1d8494564c54639b39 Mon Sep 17 00:00:00 2001 From: Reinhard Tartler Date: Thu, 12 May 2011 18:18:54 +0200 Subject: [PATCH 02/11] add changelog entries for 0.7_beta2 --- Changelog | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Changelog b/Changelog index f78150e03e..b05755503c 100644 --- a/Changelog +++ b/Changelog @@ -4,11 +4,15 @@ releases are sorted from youngest to oldest. version : + +version 0.7_beta2: + - Lots of deprecated API cruft removed - fft and imdct optimizations for AVX (Sandy Bridge) processors - DPX image encoder - SMPTE 302M AES3 audio decoder - Remove support for quitting ffmpeg with 'q', ctrl+c should be used. +- 9bit and 10bit per sample support in the h264 decoder version 0.7_beta1: From cdca7c378ed46cf67a7583a102ba1b2b91d00b9c Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 13 May 2011 10:24:31 +0300 Subject: [PATCH 03/11] svq3: Do initialization after parsing the extradata If done before, some parameters aren't known yet. With svq3/rtp, initializing before some parameters are known can lead to calling av_malloc(0), which on OS X currently returns broken pointers. 
--- libavcodec/svq3.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c index 1e4c962ba9..bc0215eff5 100644 --- a/libavcodec/svq3.c +++ b/libavcodec/svq3.c @@ -804,20 +804,11 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx) avctx->pix_fmt = avctx->codec->pix_fmts[0]; if (!s->context_initialized) { - s->width = avctx->width; - s->height = avctx->height; h->halfpel_flag = 1; h->thirdpel_flag = 1; h->unknown_svq3_flag = 0; h->chroma_qp[0] = h->chroma_qp[1] = 4; - if (MPV_common_init(s) < 0) - return -1; - - h->b_stride = 4*s->mb_width; - - ff_h264_alloc_tables(h); - /* prowl for the "SEQH" marker in the extradata */ extradata = (unsigned char *)avctx->extradata; for (m = 0; m < avctx->extradata_size; m++) { @@ -904,6 +895,16 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx) #endif } } + + s->width = avctx->width; + s->height = avctx->height; + + if (MPV_common_init(s) < 0) + return -1; + + h->b_stride = 4*s->mb_width; + + ff_h264_alloc_tables(h); } return 0; From 7d4c4394b5c94a665cc807fb8b92ea153b6225b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= Date: Thu, 5 May 2011 21:25:46 +0200 Subject: [PATCH 04/11] swscale: point out an alternative to sws_getContext Signed-off-by: Anton Khirnov --- libswscale/swscale.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libswscale/swscale.h b/libswscale/swscale.h index 5d0c9e9049..dd4de76b0a 100644 --- a/libswscale/swscale.h +++ b/libswscale/swscale.h @@ -187,6 +187,7 @@ void sws_freeContext(struct SwsContext *swsContext); * @return a pointer to an allocated context, or NULL in case of error * @note this function is to be removed after a saner alternative is * written + * @deprecated Use sws_getCachedContext() instead. 
*/ struct SwsContext *sws_getContext(int srcW, int srcH, enum PixelFormat srcFormat, int dstW, int dstH, enum PixelFormat dstFormat, From 9e2dabed4a7bf21e3e0c0f4ddc895f8ed90fa839 Mon Sep 17 00:00:00 2001 From: Can Wu Date: Sat, 14 May 2011 17:27:31 +0800 Subject: [PATCH 05/11] avio: check AVIOContext malloc failure Signed-off-by: Anton Khirnov --- libavformat/aviobuf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libavformat/aviobuf.c b/libavformat/aviobuf.c index 2b14d48ff5..fa63ddf2b9 100644 --- a/libavformat/aviobuf.c +++ b/libavformat/aviobuf.c @@ -113,6 +113,8 @@ AVIOContext *avio_alloc_context( int64_t (*seek)(void *opaque, int64_t offset, int whence)) { AVIOContext *s = av_mallocz(sizeof(AVIOContext)); + if (!s) + return NULL; ffio_init_context(s, buffer, buffer_size, write_flag, opaque, read_packet, write_packet, seek); return s; From d49051e0742c09345495ae0486c3601a15222ac4 Mon Sep 17 00:00:00 2001 From: Can Wu Date: Sat, 14 May 2011 17:34:28 +0800 Subject: [PATCH 06/11] avio: document buffer must be created with av_malloc() and friends Else a later buffer resize in ffio_set_buf_size() will ABORT. Signed-off-by: Anton Khirnov --- libavformat/avio.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libavformat/avio.h b/libavformat/avio.h index b98137b83b..e07e3c3c92 100644 --- a/libavformat/avio.h +++ b/libavformat/avio.h @@ -373,6 +373,7 @@ void avio_set_interrupt_cb(int (*interrupt_cb)(void)); * freed with av_free(). * * @param buffer Memory block for input/output operations via AVIOContext. + * The buffer must be allocated with av_malloc() and friends. * @param buffer_size The buffer size is very important for performance. * For protocols with fixed blocksize it should be set to this blocksize. * For others a typical size is a cache page, e.g. 4kb. 
From 153382e1b6b428a1dcb8dc3f06f64a6959d722c5 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Tue, 17 May 2011 16:58:04 +0200 Subject: [PATCH 07/11] multiple inclusion guard cleanup Add missing multiple inclusion guards; clean up #endif comments; add missing library prefixes; keep guard names consistent. --- cmdutils.h | 6 +++--- ffserver.h | 7 ++++--- libavcodec/aac_tablegen.h | 6 +++--- libavcodec/aac_tablegen_decl.h | 6 +++--- libavcodec/amrwbdata.h | 2 +- libavcodec/arm/asm-offsets.h | 2 +- libavcodec/arm/dsputil_arm.h | 2 +- libavcodec/arm/mpegvideo_arm.h | 2 +- libavcodec/arm/vp56_arith.h | 2 +- libavcodec/arm/vp8.h | 2 +- libavcodec/cavsdsp.h | 2 +- libavcodec/cbrt_tablegen.h | 6 +++--- libavcodec/cga_data.h | 2 +- libavcodec/dct.h | 2 +- libavcodec/dctref.h | 2 +- libavcodec/dv_tablegen.h | 6 +++--- libavcodec/flv.h | 3 +-- libavcodec/gsmdec_data.h | 6 +++--- libavcodec/h263.h | 3 ++- libavcodec/kbdwin.h | 2 +- libavcodec/motionpixels_tablegen.h | 6 +++--- libavcodec/mpeg4video.h | 2 +- libavcodec/mpegaudio_tablegen.h | 6 +++--- libavcodec/mpegaudiodsp.h | 2 +- libavcodec/msgsmdec.h | 6 +++--- libavcodec/nellymoser.h | 2 +- libavcodec/opt.h | 2 +- libavcodec/pcm_tablegen.h | 6 +++--- libavcodec/qdm2_tablegen.h | 6 +++--- libavcodec/rdft.h | 2 +- libavcodec/sh4/dsputil_sh4.h | 2 +- libavcodec/sinewin.h | 2 +- libavcodec/sinewin_tablegen.h | 5 +++++ libavcodec/sparc/dsputil_vis.h | 2 +- libavcodec/targa.h | 5 +++++ libavcodec/vp8.h | 2 +- libavcodec/vp8data.h | 2 +- libavcodec/x86/fft.h | 2 +- libavcodec/xvmc_internal.h | 6 +++--- libavfilter/avfilter.h | 2 +- libavfilter/avfiltergraph.h | 2 +- libavfilter/internal.h | 2 +- libavfilter/vsrc_buffer.h | 4 ++++ libavformat/avio_internal.h | 2 +- libavformat/ffmeta.h | 6 +++--- libavformat/mms.h | 3 ++- libavformat/spdif.h | 5 +++++ libavformat/url.h | 2 +- libavformat/version.h | 2 +- libavutil/avassert.h | 2 +- libavutil/cpu.h | 2 +- 51 files changed, 97 insertions(+), 76 deletions(-) diff --git a/cmdutils.h 
b/cmdutils.h index c99c8653fa..3bb1cd616b 100644 --- a/cmdutils.h +++ b/cmdutils.h @@ -19,8 +19,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef FFMPEG_CMDUTILS_H -#define FFMPEG_CMDUTILS_H +#ifndef LIBAV_CMDUTILS_H +#define LIBAV_CMDUTILS_H #include @@ -295,4 +295,4 @@ extern AVFilter ffsink; int get_filtered_video_frame(AVFilterContext *sink, AVFrame *frame, AVFilterBufferRef **picref, AVRational *pts_tb); -#endif /* FFMPEG_CMDUTILS_H */ +#endif /* LIBAV_CMDUTILS_H */ diff --git a/ffserver.h b/ffserver.h index 868b83bca4..43bc79c2c6 100644 --- a/ffserver.h +++ b/ffserver.h @@ -18,11 +18,12 @@ * License along with Libav; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef FFMPEG_FFSERVER_H -#define FFMPEG_FFSERVER_H + +#ifndef LIBAV_FFSERVER_H +#define LIBAV_FFSERVER_H /* interface between ffserver and modules */ void ffserver_module_init(void); -#endif /* FFMPEG_FFSERVER_H */ +#endif /* LIBAV_FFSERVER_H */ diff --git a/libavcodec/aac_tablegen.h b/libavcodec/aac_tablegen.h index 98895694a8..4486e1a9fc 100644 --- a/libavcodec/aac_tablegen.h +++ b/libavcodec/aac_tablegen.h @@ -20,8 +20,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef AAC_TABLEGEN_H -#define AAC_TABLEGEN_H +#ifndef AVCODEC_AAC_TABLEGEN_H +#define AVCODEC_AAC_TABLEGEN_H #include "aac_tablegen_decl.h" @@ -40,4 +40,4 @@ void ff_aac_tableinit(void) } #endif /* CONFIG_HARDCODED_TABLES */ -#endif /* AAC_TABLEGEN_H */ +#endif /* AVCODEC_AAC_TABLEGEN_H */ diff --git a/libavcodec/aac_tablegen_decl.h b/libavcodec/aac_tablegen_decl.h index ce4ecb5bf1..496ca0c677 100644 --- a/libavcodec/aac_tablegen_decl.h +++ b/libavcodec/aac_tablegen_decl.h @@ -20,8 +20,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef AAC_TABLEGEN_DECL_H -#define AAC_TABLEGEN_DECL_H +#ifndef AVCODEC_AAC_TABLEGEN_DECL_H +#define 
AVCODEC_AAC_TABLEGEN_DECL_H #if CONFIG_HARDCODED_TABLES #define ff_aac_tableinit() @@ -31,4 +31,4 @@ void ff_aac_tableinit(void); extern float ff_aac_pow2sf_tab[428]; #endif /* CONFIG_HARDCODED_TABLES */ -#endif /* AAC_TABLEGEN_DECL_H */ +#endif /* AVCODEC_AAC_TABLEGEN_DECL_H */ diff --git a/libavcodec/amrwbdata.h b/libavcodec/amrwbdata.h index f4db99e208..5421c23afb 100644 --- a/libavcodec/amrwbdata.h +++ b/libavcodec/amrwbdata.h @@ -1887,4 +1887,4 @@ static const uint16_t cf_sizes_wb[] = { 40 /// SID/comfort noise frame }; -#endif +#endif /* AVCODEC_AMRWBDATA_H */ diff --git a/libavcodec/arm/asm-offsets.h b/libavcodec/arm/asm-offsets.h index 43c16301c0..110d33dbb5 100644 --- a/libavcodec/arm/asm-offsets.h +++ b/libavcodec/arm/asm-offsets.h @@ -36,4 +36,4 @@ #define H263_AIC 0xf0 #define INTER_SCANTAB_RASTER_END 0x138 -#endif +#endif /* AVCODEC_ARM_ASM_OFFSETS_H */ diff --git a/libavcodec/arm/dsputil_arm.h b/libavcodec/arm/dsputil_arm.h index 14d9836114..6d7e6a6d16 100644 --- a/libavcodec/arm/dsputil_arm.h +++ b/libavcodec/arm/dsputil_arm.h @@ -30,4 +30,4 @@ void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx); void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx); void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx); -#endif +#endif /* AVCODEC_ARM_DSPUTIL_H */ diff --git a/libavcodec/arm/mpegvideo_arm.h b/libavcodec/arm/mpegvideo_arm.h index 0812ca1657..a36da6112b 100644 --- a/libavcodec/arm/mpegvideo_arm.h +++ b/libavcodec/arm/mpegvideo_arm.h @@ -24,4 +24,4 @@ void MPV_common_init_iwmmxt(MpegEncContext *s); void MPV_common_init_armv5te(MpegEncContext *s); -#endif +#endif /* AVCODEC_ARM_MPEGVIDEO_H */ diff --git a/libavcodec/arm/vp56_arith.h b/libavcodec/arm/vp56_arith.h index f5dbd1d4c7..0591d614a9 100644 --- a/libavcodec/arm/vp56_arith.h +++ b/libavcodec/arm/vp56_arith.h @@ -89,4 +89,4 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr) #endif -#endif +#endif /* AVCODEC_ARM_VP56_ARITH_H */ diff --git 
a/libavcodec/arm/vp8.h b/libavcodec/arm/vp8.h index 35cdd8b2bf..76a0397a8d 100644 --- a/libavcodec/arm/vp8.h +++ b/libavcodec/arm/vp8.h @@ -26,4 +26,4 @@ int ff_decode_block_coeffs_armv6(VP56RangeCoder *rc, DCTELEM block[16], int i, uint8_t *token_prob, int16_t qmul[2]); #endif -#endif +#endif /* AVCODEC_ARM_VP8_H */ diff --git a/libavcodec/cavsdsp.h b/libavcodec/cavsdsp.h index de2f530d83..b1133b7264 100644 --- a/libavcodec/cavsdsp.h +++ b/libavcodec/cavsdsp.h @@ -38,4 +38,4 @@ typedef struct CAVSDSPContext { void ff_cavsdsp_init(CAVSDSPContext* c, AVCodecContext *avctx); void ff_cavsdsp_init_mmx(CAVSDSPContext* c, AVCodecContext *avctx); -#endif +#endif /* AVCODEC_CAVSDSP_H */ diff --git a/libavcodec/cbrt_tablegen.h b/libavcodec/cbrt_tablegen.h index 977450c124..01963a3f9d 100644 --- a/libavcodec/cbrt_tablegen.h +++ b/libavcodec/cbrt_tablegen.h @@ -20,8 +20,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef CBRT_TABLEGEN_H -#define CBRT_TABLEGEN_H +#ifndef AVCODEC_CBRT_TABLEGEN_H +#define AVCODEC_CBRT_TABLEGEN_H #include #include @@ -48,4 +48,4 @@ static void cbrt_tableinit(void) } #endif /* CONFIG_HARDCODED_TABLES */ -#endif /* CBRT_TABLEGEN_H */ +#endif /* AVCODEC_CBRT_TABLEGEN_H */ diff --git a/libavcodec/cga_data.h b/libavcodec/cga_data.h index 60f572f9b6..2149cfd2f1 100644 --- a/libavcodec/cga_data.h +++ b/libavcodec/cga_data.h @@ -45,4 +45,4 @@ extern const uint32_t ff_ega_palette[64]; */ void ff_draw_pc_font(uint8_t *dst, int linesize, const uint8_t *font, int font_height, int ch, int fg, int bg); -#endif +#endif /* AVCODEC_CGA_DATA_H */ diff --git a/libavcodec/dct.h b/libavcodec/dct.h index faddaa3d7b..c898856279 100644 --- a/libavcodec/dct.h +++ b/libavcodec/dct.h @@ -49,4 +49,4 @@ void ff_dct_end (DCTContext *s); void ff_dct_init_mmx(DCTContext *s); -#endif +#endif /* AVCODEC_DCT_H */ diff --git a/libavcodec/dctref.h b/libavcodec/dctref.h index ffd3533439..ba89abd752 100644 --- a/libavcodec/dctref.h +++ 
b/libavcodec/dctref.h @@ -28,4 +28,4 @@ void ff_ref_fdct(DCTELEM *block); void ff_ref_idct(DCTELEM *block); void ff_ref_dct_init(void); -#endif +#endif /* AVCODEC_DCTREF_H */ diff --git a/libavcodec/dv_tablegen.h b/libavcodec/dv_tablegen.h index 0810f8e7a5..4fa8d91374 100644 --- a/libavcodec/dv_tablegen.h +++ b/libavcodec/dv_tablegen.h @@ -20,8 +20,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef DV_TABLEGEN_H -#define DV_TABLEGEN_H +#ifndef AVCODEC_DV_TABLEGEN_H +#define AVCODEC_DV_TABLEGEN_H #include #include "dv_vlc_data.h" @@ -93,4 +93,4 @@ static void dv_vlc_map_tableinit(void) } #endif /* CONFIG_HARDCODED_TABLES */ -#endif /* DV_TABLEGEN_H */ +#endif /* AVCODEC_DV_TABLEGEN_H */ diff --git a/libavcodec/flv.h b/libavcodec/flv.h index 84111175c6..3d9a2d5232 100644 --- a/libavcodec/flv.h +++ b/libavcodec/flv.h @@ -30,5 +30,4 @@ void ff_flv2_encode_ac_esc(PutBitContext *pb, int slevel, int level, int run, in int ff_flv_decode_picture_header(MpegEncContext *s); void ff_flv2_decode_ac_esc(GetBitContext *gb, int *level, int *run, int *last); -#endif - +#endif /* AVCODEC_FLV_H */ diff --git a/libavcodec/gsmdec_data.h b/libavcodec/gsmdec_data.h index 32cd01ea7a..b78daa7335 100644 --- a/libavcodec/gsmdec_data.h +++ b/libavcodec/gsmdec_data.h @@ -19,8 +19,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef GSMDEC_DATA -#define GSMDEC_DATA +#ifndef AVCODEC_GSMDEC_DATA +#define AVCODEC_GSMDEC_DATA #include @@ -44,4 +44,4 @@ typedef struct { extern const uint16_t ff_gsm_long_term_gain_tab[4]; extern const int16_t ff_gsm_dequant_tab[64][8]; -#endif +#endif /* AVCODEC_GSMDEC_DATA */ diff --git a/libavcodec/h263.h b/libavcodec/h263.h index cdbe44eb90..1dc300709e 100644 --- a/libavcodec/h263.h +++ b/libavcodec/h263.h @@ -248,4 +248,5 @@ static inline void memsetw(short *tab, int val, int n) for(i=0;i @@ -88,4 +88,4 @@ static void motionpixels_tableinit(void) } #endif /* 
CONFIG_HARDCODED_TABLES */ -#endif /* MOTIONPIXELS_TABLEGEN_H */ +#endif /* AVCODEC_MOTIONPIXELS_TABLEGEN_H */ diff --git a/libavcodec/mpeg4video.h b/libavcodec/mpeg4video.h index 015193e13b..d34e73149c 100644 --- a/libavcodec/mpeg4video.h +++ b/libavcodec/mpeg4video.h @@ -196,4 +196,4 @@ static inline int ff_mpeg4_pred_dc(MpegEncContext * s, int n, int level, int *di return ret; } -#endif +#endif /* AVCODEC_MPEG4VIDEO_H */ diff --git a/libavcodec/mpegaudio_tablegen.h b/libavcodec/mpegaudio_tablegen.h index 2264b739d2..a222f2c423 100644 --- a/libavcodec/mpegaudio_tablegen.h +++ b/libavcodec/mpegaudio_tablegen.h @@ -20,8 +20,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef MPEGAUDIO_TABLEGEN_H -#define MPEGAUDIO_TABLEGEN_H +#ifndef AVCODEC_MPEGAUDIO_TABLEGEN_H +#define AVCODEC_MPEGAUDIO_TABLEGEN_H #include #include @@ -68,4 +68,4 @@ static void mpegaudio_tableinit(void) } #endif /* CONFIG_HARDCODED_TABLES */ -#endif /* MPEGAUDIO_TABLEGEN_H */ +#endif /* AVCODEC_MPEGAUDIO_TABLEGEN_H */ diff --git a/libavcodec/mpegaudiodsp.h b/libavcodec/mpegaudiodsp.h index 597e2533f5..a47019cc4b 100644 --- a/libavcodec/mpegaudiodsp.h +++ b/libavcodec/mpegaudiodsp.h @@ -60,4 +60,4 @@ void ff_mpadsp_apply_window_fixed(int32_t *synth_buf, int32_t *window, int *dither_state, int16_t *samples, int incr); -#endif +#endif /* AVCODEC_MPEGAUDIODSP_H */ diff --git a/libavcodec/msgsmdec.h b/libavcodec/msgsmdec.h index cf58baaa47..76c87f1bd9 100644 --- a/libavcodec/msgsmdec.h +++ b/libavcodec/msgsmdec.h @@ -19,12 +19,12 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef MSGSMDEC_H -#define MSGSMDEC_H +#ifndef AVCODEC_MSGSMDEC_H +#define AVCODEC_MSGSMDEC_H #include "avcodec.h" int ff_msgsm_decode_block(AVCodecContext *avctx, int16_t *samples, const uint8_t *buf); -#endif +#endif /* AVCODEC_MSGSMDEC_H */ diff --git a/libavcodec/nellymoser.h b/libavcodec/nellymoser.h index 88d9aa6245..027fc7ed23 100644 --- 
a/libavcodec/nellymoser.h +++ b/libavcodec/nellymoser.h @@ -54,4 +54,4 @@ extern const int16_t ff_nelly_delta_table[32]; void ff_nelly_get_sample_bits(const float *buf, int *bits); -#endif +#endif /* AVCODEC_NELLYMOSER_H */ diff --git a/libavcodec/opt.h b/libavcodec/opt.h index e754bb93d8..70de27d192 100644 --- a/libavcodec/opt.h +++ b/libavcodec/opt.h @@ -13,4 +13,4 @@ #include "libavutil/opt.h" #endif -#endif +#endif /* AVCODEC_OPT_H */ diff --git a/libavcodec/pcm_tablegen.h b/libavcodec/pcm_tablegen.h index 838052e0d6..79d6561646 100644 --- a/libavcodec/pcm_tablegen.h +++ b/libavcodec/pcm_tablegen.h @@ -20,8 +20,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef PCM_TABLEGEN_H -#define PCM_TABLEGEN_H +#ifndef AVCODEC_PCM_TABLEGEN_H +#define AVCODEC_PCM_TABLEGEN_H #include #include "libavutil/attributes.h" @@ -116,4 +116,4 @@ static void pcm_ulaw_tableinit(void) } #endif /* CONFIG_HARDCODED_TABLES */ -#endif /* PCM_TABLEGEN_H */ +#endif /* AVCODEC_PCM_TABLEGEN_H */ diff --git a/libavcodec/qdm2_tablegen.h b/libavcodec/qdm2_tablegen.h index 769d53bb26..b2bb294f58 100644 --- a/libavcodec/qdm2_tablegen.h +++ b/libavcodec/qdm2_tablegen.h @@ -20,8 +20,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef QDM2_TABLEGEN_H -#define QDM2_TABLEGEN_H +#ifndef AVCODEC_QDM2_TABLEGEN_H +#define AVCODEC_QDM2_TABLEGEN_H #include #include @@ -99,4 +99,4 @@ static av_cold void init_noise_samples(void) { } #endif /* CONFIG_HARDCODED_TABLES */ -#endif /* QDM2_TABLEGEN_H */ +#endif /* AVCODEC_QDM2_TABLEGEN_H */ diff --git a/libavcodec/rdft.h b/libavcodec/rdft.h index 7572c6c76d..8ff620fb59 100644 --- a/libavcodec/rdft.h +++ b/libavcodec/rdft.h @@ -71,4 +71,4 @@ void ff_rdft_end(RDFTContext *s); void ff_rdft_init_arm(RDFTContext *s); -#endif +#endif /* AVCODEC_RDFT_H */ diff --git a/libavcodec/sh4/dsputil_sh4.h b/libavcodec/sh4/dsputil_sh4.h index 1a8b7afaed..5abe34557b 100644 --- 
a/libavcodec/sh4/dsputil_sh4.h +++ b/libavcodec/sh4/dsputil_sh4.h @@ -25,4 +25,4 @@ void idct_sh4(DCTELEM *block); void dsputil_init_align(DSPContext* c, AVCodecContext *avctx); -#endif +#endif /* AVCODEC_SH4_DSPUTIL_SH4_H */ diff --git a/libavcodec/sinewin.h b/libavcodec/sinewin.h index 2ed386a32d..eefe5bfe7f 100644 --- a/libavcodec/sinewin.h +++ b/libavcodec/sinewin.h @@ -56,4 +56,4 @@ extern SINETABLE(4096); extern SINETABLE_CONST float * const ff_sine_windows[13]; -#endif +#endif /* AVCODEC_SINEWIN_H */ diff --git a/libavcodec/sinewin_tablegen.h b/libavcodec/sinewin_tablegen.h index 91c26c1551..720f1ab6b8 100644 --- a/libavcodec/sinewin_tablegen.h +++ b/libavcodec/sinewin_tablegen.h @@ -20,6 +20,9 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#ifndef AVCODEC_SINEWIN_TABLEGEN_H +#define AVCODEC_SINEWIN_TABLEGEN_H + #include // do not use libavutil/libm.h since this is compiled both // for the host and the target and config.h is only valid for the target @@ -58,3 +61,5 @@ av_cold void ff_init_ff_sine_windows(int index) { ff_sine_window_init(ff_sine_windows[index], 1 << index); #endif } + +#endif /* AVCODEC_SINEWIN_TABLEGEN_H */ diff --git a/libavcodec/sparc/dsputil_vis.h b/libavcodec/sparc/dsputil_vis.h index b590e59361..4be86e25e0 100644 --- a/libavcodec/sparc/dsputil_vis.h +++ b/libavcodec/sparc/dsputil_vis.h @@ -26,4 +26,4 @@ void ff_simple_idct_put_vis(uint8_t *dest, int line_size, DCTELEM *data); void ff_simple_idct_add_vis(uint8_t *dest, int line_size, DCTELEM *data); void ff_simple_idct_vis(DCTELEM *data); -#endif +#endif /* AVCODEC_SPARC_DSPUTIL_VIS_H */ diff --git a/libavcodec/targa.h b/libavcodec/targa.h index d7c3f451a1..f4ef5537b1 100644 --- a/libavcodec/targa.h +++ b/libavcodec/targa.h @@ -16,6 +16,9 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#ifndef AVCODEC_TARGA_H +#define AVCODEC_TARGA_H + /** * @file * targa file common definitions @@ -34,3 +37,5 @@ enum TargaCompr 
{ TGA_BW = 3, // black & white or grayscale TGA_RLE = 8, // flag pointing that data is RLE-coded }; + +#endif /* AVCODEC_TARGA_H */ diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h index 3a6eee52d4..5a96cd436c 100644 --- a/libavcodec/vp8.h +++ b/libavcodec/vp8.h @@ -239,4 +239,4 @@ typedef struct { AVFrame frames[5]; } VP8Context; -#endif +#endif /* AVCODEC_VP8_H */ diff --git a/libavcodec/vp8data.h b/libavcodec/vp8data.h index 6d1c070a05..4ea4581bc9 100644 --- a/libavcodec/vp8data.h +++ b/libavcodec/vp8data.h @@ -685,4 +685,4 @@ static const uint8_t vp8_mv_default_prob[2][19] = { 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 } }; -#endif +#endif /* AVCODEC_VP8DATA_H */ diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h index e6eace235d..c6379050d9 100644 --- a/libavcodec/x86/fft.h +++ b/libavcodec/x86/fft.h @@ -36,4 +36,4 @@ void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input) void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_dct32_float_sse(FFTSample *out, const FFTSample *in); -#endif +#endif /* AVCODEC_X86_FFT_H */ diff --git a/libavcodec/xvmc_internal.h b/libavcodec/xvmc_internal.h index 7a4e908df9..3c6aed8361 100644 --- a/libavcodec/xvmc_internal.h +++ b/libavcodec/xvmc_internal.h @@ -18,8 +18,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef AVCODEC_INTERNAL_XVMC_H -#define AVCODEC_INTERNAL_XVMC_H +#ifndef AVCODEC_XVMC_INTERNAL_H +#define AVCODEC_XVMC_INTERNAL_H #include "avcodec.h" #include "mpegvideo.h" @@ -30,4 +30,4 @@ int ff_xvmc_field_start(MpegEncContext*s, AVCodecContext *avctx); void ff_xvmc_field_end(MpegEncContext *s); void ff_xvmc_decode_mb(MpegEncContext *s); -#endif /* AVCODEC_INTERNAL_XVMC_H */ +#endif /* AVCODEC_XVMC_INTERNAL_H */ diff --git a/libavfilter/avfilter.h b/libavfilter/avfilter.h index c126cae093..33e93e27fc 100644 --- a/libavfilter/avfilter.h +++ b/libavfilter/avfilter.h @@ -859,4 +859,4 @@ static inline void 
avfilter_insert_outpad(AVFilterContext *f, unsigned index, &f->output_pads, &f->outputs, p); } -#endif /* AVFILTER_AVFILTER_H */ +#endif /* AVFILTER_AVFILTER_H */ diff --git a/libavfilter/avfiltergraph.h b/libavfilter/avfiltergraph.h index 801e50176f..a0f6b2e01f 100644 --- a/libavfilter/avfiltergraph.h +++ b/libavfilter/avfiltergraph.h @@ -120,4 +120,4 @@ int avfilter_graph_parse(AVFilterGraph *graph, const char *filters, AVFilterInOut *inputs, AVFilterInOut *outputs, AVClass *log_ctx); -#endif /* AVFILTER_AVFILTERGRAPH_H */ +#endif /* AVFILTER_AVFILTERGRAPH_H */ diff --git a/libavfilter/internal.h b/libavfilter/internal.h index 0406a0d27e..64b3f3b865 100644 --- a/libavfilter/internal.h +++ b/libavfilter/internal.h @@ -52,4 +52,4 @@ int ff_avfilter_graph_config_formats(AVFilterGraph *graphctx, AVClass *log_ctx); /** default handler for freeing audio/video buffer when there are no references left */ void ff_avfilter_default_free_buffer(AVFilterBuffer *buf); -#endif /* AVFILTER_INTERNAL_H */ +#endif /* AVFILTER_INTERNAL_H */ diff --git a/libavfilter/vsrc_buffer.h b/libavfilter/vsrc_buffer.h index c7fc3824e0..6867f81e1c 100644 --- a/libavfilter/vsrc_buffer.h +++ b/libavfilter/vsrc_buffer.h @@ -19,9 +19,13 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#ifndef AVFILTER_VSRC_BUFFER_H +#define AVFILTER_VSRC_BUFFER_H + #include "libavcodec/avcodec.h" /* AVFrame */ #include "avfilter.h" int av_vsrc_buffer_add_frame(AVFilterContext *buffer_filter, AVFrame *frame, int64_t pts, AVRational pixel_aspect); +#endif /* AVFILTER_VSRC_BUFFER_H */ diff --git a/libavformat/avio_internal.h b/libavformat/avio_internal.h index 6630aaf61d..1369c43891 100644 --- a/libavformat/avio_internal.h +++ b/libavformat/avio_internal.h @@ -98,4 +98,4 @@ int ffio_open_dyn_packet_buf(AVIOContext **s, int max_packet_size); */ int ffio_fdopen(AVIOContext **s, URLContext *h); -#endif // AVFORMAT_AVIO_INTERNAL_H +#endif /* AVFORMAT_AVIO_INTERNAL_H */ diff --git 
a/libavformat/ffmeta.h b/libavformat/ffmeta.h index bce272a087..a5380ca13d 100644 --- a/libavformat/ffmeta.h +++ b/libavformat/ffmeta.h @@ -19,11 +19,11 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef AVFORMAT_META_H -#define AVFORMAT_META_H +#ifndef AVFORMAT_FFMETA_H +#define AVFORMAT_FFMETA_H #define ID_STRING ";FFMETADATA" #define ID_CHAPTER "[CHAPTER]" #define ID_STREAM "[STREAM]" -#endif /* AVFORMAT_META_H */ +#endif /* AVFORMAT_FFMETA_H */ diff --git a/libavformat/mms.h b/libavformat/mms.h index 12e9ef0962..36e772c7f9 100644 --- a/libavformat/mms.h +++ b/libavformat/mms.h @@ -60,4 +60,5 @@ typedef struct { int ff_mms_asf_header_parser(MMSContext * mms); int ff_mms_read_data(MMSContext *mms, uint8_t *buf, const int size); int ff_mms_read_header(MMSContext * mms, uint8_t * buf, const int size); -#endif + +#endif /* AVFORMAT_MMS_H */ diff --git a/libavformat/spdif.h b/libavformat/spdif.h index dedb4e8832..b2a6b63be4 100644 --- a/libavformat/spdif.h +++ b/libavformat/spdif.h @@ -19,6 +19,9 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#ifndef AVFORMAT_SPDIF_H +#define AVFORMAT_SPDIF_H + #include #define SYNCWORD1 0xF872 @@ -55,3 +58,5 @@ static const uint16_t spdif_mpeg_pkt_offset[2][3] = { }; void ff_spdif_bswap_buf16(uint16_t *dst, const uint16_t *src, int w); + +#endif /* AVFORMAT_SPDIF_H */ diff --git a/libavformat/url.h b/libavformat/url.h index c5732c64c6..caafe07cce 100644 --- a/libavformat/url.h +++ b/libavformat/url.h @@ -173,4 +173,4 @@ int ffurl_register_protocol(URLProtocol *protocol, int size); int ff_udp_set_remote_url(URLContext *h, const char *uri); int ff_udp_get_local_port(URLContext *h); -#endif //AVFORMAT_URL_H +#endif /* AVFORMAT_URL_H */ diff --git a/libavformat/version.h b/libavformat/version.h index 22b5dc9791..63f419125b 100644 --- a/libavformat/version.h +++ b/libavformat/version.h @@ -69,4 +69,4 @@ #define FF_API_SDP_CREATE (LIBAVFORMAT_VERSION_MAJOR < 
54) #endif -#endif //AVFORMAT_VERSION_H +#endif /* AVFORMAT_VERSION_H */ diff --git a/libavutil/avassert.h b/libavutil/avassert.h index 87333132fd..b223d26e8d 100644 --- a/libavutil/avassert.h +++ b/libavutil/avassert.h @@ -63,4 +63,4 @@ #define av_assert2(cond) ((void)0) #endif -#endif +#endif /* AVUTIL_AVASSERT_H */ diff --git a/libavutil/cpu.h b/libavutil/cpu.h index 11ba368678..777cdc01d1 100644 --- a/libavutil/cpu.h +++ b/libavutil/cpu.h @@ -51,4 +51,4 @@ int ff_get_cpu_flags_arm(void); int ff_get_cpu_flags_ppc(void); int ff_get_cpu_flags_x86(void); -#endif /* AVUTIL_CPU_H */ +#endif /* AVUTIL_CPU_H */ From 3758eb0eb96217c6968d47487533337f96aeecfb Mon Sep 17 00:00:00 2001 From: Vitor Sessak Date: Tue, 17 May 2011 18:26:01 +0200 Subject: [PATCH 08/11] dct32: port SSE 32-point DCT to YASM --- libavcodec/x86/Makefile | 3 +- libavcodec/x86/dct32_sse.asm | 289 ++++++++++++++++++++++++++++++++++ libavcodec/x86/dct32_sse.c | 296 ----------------------------------- 3 files changed, 291 insertions(+), 297 deletions(-) create mode 100644 libavcodec/x86/dct32_sse.asm delete mode 100644 libavcodec/x86/dct32_sse.c diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 5f428501e3..ba664abb1e 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -1,6 +1,8 @@ OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o +YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o + YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o YASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o @@ -54,4 +56,3 @@ OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \ x86/mpegvideo_mmx.o \ x86/simple_idct_mmx.o \ -MMX-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm new file mode 100644 index 0000000000..f6d5bcf844 --- /dev/null +++ b/libavcodec/x86/dct32_sse.asm @@ -0,0 +1,289 @@ +;****************************************************************************** +;* 32 
point SSE-optimized DCT transform +;* Copyright (c) 2010 Vitor Sessak +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA 32 + +align 32 +ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 + dd 0.553104, 0.582935, 0.622504, 0.674808 + dd -1.169440, -0.972568, -0.839350, -0.744536 + dd -10.190008, -3.407609, -2.057781, -1.484165 + dd 0.502419, 0.522499, 0.566944, 0.646822 + dd 0.788155, 1.060678, 1.722447, 5.101149 + dd 0.509796, 0.601345, 0.899976, 2.562916 + dd 1.000000, 1.000000, 1.306563, 0.541196 + dd 1.000000, 0.707107, 1.000000, -0.707107 + + +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 + +%macro BUTTERFLY 4 + movaps %4, %1 + subps %1, %2 + addps %2, %4 + mulps %1, %3 +%endmacro + +%macro BUTTERFLY0 5 + movaps %4, %1 + shufps %1, %1, %5 + xorps %4, %2 + addps %1, %4 + mulps %1, %3 +%endmacro + +%macro BUTTERFLY2 4 + BUTTERFLY0 %1, %2, %3, %4, 0x1b +%endmacro + +%macro BUTTERFLY3 4 + BUTTERFLY0 %1, %2, %3, %4, 0xb1 +%endmacro + +INIT_XMM +section .text align=16 +; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) +cglobal dct32_float_sse, 2,3,8, out, in, tmp + ; pass 1 + + movaps m0, [inq+0] + movaps m1, [inq+112] + 
shufps m1, m1, 0x1b + BUTTERFLY m0, m1, [ps_cos_vec], m3 + + movaps m7, [inq+64] + movaps m4, [inq+48] + shufps m4, m4, 0x1b + BUTTERFLY m7, m4, [ps_cos_vec+48], m3 + + + ; pass 2 + movaps m2, [ps_cos_vec+64] + BUTTERFLY m1, m4, m2, m3 + movaps [outq+48], m1 + movaps [outq+ 0], m4 + + ; pass 1 + movaps m1, [inq+16] + movaps m6, [inq+96] + shufps m6, m6, 0x1b + BUTTERFLY m1, m6, [ps_cos_vec+16], m3 + + movaps m4, [inq+80] + movaps m5, [inq+32] + shufps m5, m5, 0x1b + BUTTERFLY m4, m5, [ps_cos_vec+32], m3 + + ; pass 2 + BUTTERFLY m0, m7, m2, m3 + + movaps m2, [ps_cos_vec+80] + BUTTERFLY m6, m5, m2, m3 + + BUTTERFLY m1, m4, m2, m3 + + ; pass 3 + movaps m2, [ps_cos_vec+96] + shufps m1, m1, 0x1b + BUTTERFLY m0, m1, m2, m3 + movaps [outq+112], m0 + movaps [outq+ 96], m1 + + movaps m0, [outq+0] + shufps m5, m5, 0x1b + BUTTERFLY m0, m5, m2, m3 + + movaps m1, [outq+48] + shufps m6, m6, 0x1b + BUTTERFLY m1, m6, m2, m3 + movaps [outq+48], m1 + + shufps m4, m4, 0x1b + BUTTERFLY m7, m4, m2, m3 + + ; pass 4 + movaps m3, [ps_p1p1m1m1+0] + movaps m2, [ps_cos_vec+112] + + BUTTERFLY2 m5, m3, m2, m1 + + BUTTERFLY2 m0, m3, m2, m1 + movaps [outq+16], m0 + + BUTTERFLY2 m6, m3, m2, m1 + movaps [outq+32], m6 + + movaps m0, [outq+48] + BUTTERFLY2 m0, m3, m2, m1 + movaps [outq+48], m0 + + BUTTERFLY2 m4, m3, m2, m1 + + BUTTERFLY2 m7, m3, m2, m1 + + movaps m6, [outq+96] + BUTTERFLY2 m6, m3, m2, m1 + + movaps m0, [outq+112] + BUTTERFLY2 m0, m3, m2, m1 + + ; pass 5 + movaps m2, [ps_cos_vec+128] + shufps m3, m3, 0xcc + + BUTTERFLY3 m5, m3, m2, m1 + movaps [outq+0], m5 + + movaps m1, [outq+16] + BUTTERFLY3 m1, m3, m2, m5 + movaps [outq+16], m1 + + BUTTERFLY3 m4, m3, m2, m5 + movaps [outq+64], m4 + + BUTTERFLY3 m7, m3, m2, m5 + movaps [outq+80], m7 + + movaps m5, [outq+32] + BUTTERFLY3 m5, m3, m2, m7 + movaps [outq+32], m5 + + movaps m4, [outq+48] + BUTTERFLY3 m4, m3, m2, m7 + movaps [outq+48], m4 + + BUTTERFLY3 m6, m3, m2, m7 + movaps [outq+96], m6 + + BUTTERFLY3 m0, m3, m2, m7 + movaps 
[outq+112], m0 + + + ; pass 6, no SIMD... + movss m3, [outq+56] + mov tmpd, [outq+4] + addss m3, [outq+60] + movss m7, [outq+72] + addss m4, m3 + movss m2, [outq+52] + addss m2, m3 + movss m3, [outq+24] + addss m3, [outq+28] + addss m7, [outq+76] + addss m1, m3 + addss m5, m4 + movss [outq+ 16], m1 + movss m1, [outq+20] + addss m1, m3 + movss m3, [outq+40] + movss [outq+ 48], m1 + addss m3, [outq+44] + movss m1, [outq+20] + addss m4, m3 + addss m3, m2 + addss m1, [outq+28] + movss [outq+ 40], m3 + addss m2, [outq+36] + movss m3, [outq+8] + movss [outq+ 56], m2 + addss m3, [outq+12] + movss [outq+ 8], m5 + movss [outq+ 32], m3 + movss m2, [outq+52] + movss m3, [outq+80] + movss m5, [outq+120] + movss [outq+ 80], m1 + movss [outq+ 24], m4 + addss m5, [outq+124] + movss m1, [outq+64] + addss m2, [outq+60] + addss m0, m5 + addss m5, [outq+116] + mov [outq+64], tmpd + addss m6, m0 + addss m1, m6 + mov tmpd, [outq+12] + movss [outq+ 4], m1 + movss m1, [outq+88] + mov [outq+ 96], tmpd + addss m1, [outq+92] + movss m4, [outq+104] + mov tmpd, [outq+28] + addss m4, [outq+108] + addss m0, m4 + addss m3, m1 + addss m1, [outq+84] + addss m4, m5 + addss m6, m3 + addss m3, m0 + addss m0, m7 + addss m5, [outq+100] + addss m7, m4 + mov [outq+112], tmpd + movss [outq+ 28], m0 + movss m0, [outq+36] + movss [outq+ 36], m7 + addss m4, m1 + movss m7, [outq+116] + addss m0, m2 + addss m7, [outq+124] + movss [outq+ 72], m0 + movss m0, [outq+44] + movss [outq+ 12], m6 + movss [outq+ 20], m3 + addss m2, m0 + movss [outq+ 44], m4 + movss [outq+ 88], m2 + addss m0, [outq+60] + mov tmpd, [outq+60] + mov [outq+120], tmpd + movss [outq+104], m0 + addss m1, m5 + addss m5, [outq+68] + movss [outq+52], m1 + movss [outq+60], m5 + movss m1, [outq+68] + movss m5, [outq+100] + addss m5, m7 + addss m7, [outq+108] + addss m1, m5 + movss m2, [outq+84] + addss m2, [outq+92] + addss m5, m2 + movss [outq+ 68], m1 + addss m2, m7 + movss m1, [outq+76] + movss [outq+ 84], m2 + movss [outq+ 76], m5 + movss m2, 
[outq+108] + addss m7, m1 + addss m2, [outq+124] + addss m1, m2 + addss m2, [outq+92] + movss [outq+100], m1 + movss [outq+108], m2 + movss m2, [outq+92] + movss [outq+ 92], m7 + addss m2, [outq+124] + movss [outq+116], m2 + RET diff --git a/libavcodec/x86/dct32_sse.c b/libavcodec/x86/dct32_sse.c deleted file mode 100644 index 5303c6d5ef..0000000000 --- a/libavcodec/x86/dct32_sse.c +++ /dev/null @@ -1,296 +0,0 @@ -/* - * 32 point SSE-optimized DCT transform - * Copyright (c) 2010 Vitor Sessak - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/x86_cpu.h" -#include "libavutil/mem.h" -#include "libavcodec/dsputil.h" -#include "fft.h" - -DECLARE_ALIGNED(16, static const float, b1)[] = { - 0.500603, 0.505471, 0.515447, 0.531043, - 0.553104, 0.582935, 0.622504, 0.674808, - -1.169440, -0.972568, -0.839350, -0.744536, - -10.190008, -3.407609, -2.057781, -1.484165, - 0.502419, 0.522499, 0.566944, 0.646822, - 0.788155, 1.060678, 1.722447, 5.101149, - 0.509796, 0.601345, 0.899976, 2.562916, - 1.000000, 1.000000, 1.306563, 0.541196, - 1.000000, 0.707107, 1.000000, -0.707107 -}; - -DECLARE_ALIGNED(16, static const int32_t, smask)[4] = { - 0, 0, 0x80000000, 0x80000000 -}; - -/* butterfly operator */ -#define BUTTERFLY(a,b,c,tmp) \ - "movaps %%" #a ", %%" #tmp " \n\t" \ - "subps %%" #b ", %%" #a " \n\t" \ - "addps %%" #tmp ", %%" #b " \n\t" \ - "mulps " #c ", %%" #a " \n\t" - -///* Same as BUTTERFLY when vectors a and b overlap */ -#define BUTTERFLY0(val, mask, cos, tmp, shuf) \ - "movaps %%" #val ", %%" #tmp " \n\t" \ - "shufps " #shuf ", %%" #val ",%%" #val " \n\t" \ - "xorps %%" #mask ", %%" #tmp " \n\t" /* flip signs */ \ - "addps %%" #tmp ", %%" #val " \n\t" \ - "mulps %%" #cos ", %%" #val " \n\t" - -#define BUTTERFLY2(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0x1b) -#define BUTTERFLY3(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0xb1) - -void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) -{ - int32_t tmp1 = 0; - __asm__ volatile( - /* pass 1 */ - - "movaps (%4), %%xmm0 \n\t" - "movaps 112(%4), %%xmm1 \n\t" - "shufps $0x1b, %%xmm1, %%xmm1 \n\t" - BUTTERFLY(xmm0, xmm1, (%2), xmm3) - - "movaps 64(%4), %%xmm7 \n\t" - "movaps 48(%4), %%xmm4 \n\t" - "shufps $0x1b, %%xmm4, %%xmm4 \n\t" - BUTTERFLY(xmm7, xmm4, 48(%2), xmm3) - - - /*
pass 2 */ - "movaps 64(%2), %%xmm2 \n\t" - BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3) - "movaps %%xmm1, 48(%1) \n\t" - "movaps %%xmm4, (%1) \n\t" - - /* pass 1 */ - "movaps 16(%4), %%xmm1 \n\t" - "movaps 96(%4), %%xmm6 \n\t" - "shufps $0x1b, %%xmm6, %%xmm6 \n\t" - BUTTERFLY(xmm1, xmm6, 16(%2), xmm3) - - "movaps 80(%4), %%xmm4 \n\t" - "movaps 32(%4), %%xmm5 \n\t" - "shufps $0x1b, %%xmm5, %%xmm5 \n\t" - BUTTERFLY(xmm4, xmm5, 32(%2), xmm3) - - /* pass 2 */ - BUTTERFLY(xmm0, xmm7, %%xmm2, xmm3) - - "movaps 80(%2), %%xmm2 \n\t" - BUTTERFLY(xmm6, xmm5, %%xmm2, xmm3) - - BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3) - - /* pass 3 */ - "movaps 96(%2), %%xmm2 \n\t" - "shufps $0x1b, %%xmm1, %%xmm1 \n\t" - BUTTERFLY(xmm0, xmm1, %%xmm2, xmm3) - "movaps %%xmm0, 112(%1) \n\t" - "movaps %%xmm1, 96(%1) \n\t" - - "movaps 0(%1), %%xmm0 \n\t" - "shufps $0x1b, %%xmm5, %%xmm5 \n\t" - BUTTERFLY(xmm0, xmm5, %%xmm2, xmm3) - - "movaps 48(%1), %%xmm1 \n\t" - "shufps $0x1b, %%xmm6, %%xmm6 \n\t" - BUTTERFLY(xmm1, xmm6, %%xmm2, xmm3) - "movaps %%xmm1, 48(%1) \n\t" - - "shufps $0x1b, %%xmm4, %%xmm4 \n\t" - BUTTERFLY(xmm7, xmm4, %%xmm2, xmm3) - - /* pass 4 */ - "movaps (%3), %%xmm3 \n\t" - "movaps 112(%2), %%xmm2 \n\t" - - BUTTERFLY2(xmm5, xmm3, xmm2, xmm1) - - BUTTERFLY2(xmm0, xmm3, xmm2, xmm1) - "movaps %%xmm0, 16(%1) \n\t" - - BUTTERFLY2(xmm6, xmm3, xmm2, xmm1) - "movaps %%xmm6, 32(%1) \n\t" - - "movaps 48(%1), %%xmm0 \n\t" - BUTTERFLY2(xmm0, xmm3, xmm2, xmm1) - "movaps %%xmm0, 48(%1) \n\t" - - BUTTERFLY2(xmm4, xmm3, xmm2, xmm1) - - BUTTERFLY2(xmm7, xmm3, xmm2, xmm1) - - "movaps 96(%1), %%xmm6 \n\t" - BUTTERFLY2(xmm6, xmm3, xmm2, xmm1) - - "movaps 112(%1), %%xmm0 \n\t" - BUTTERFLY2(xmm0, xmm3, xmm2, xmm1) - - /* pass 5 */ - "movaps 128(%2), %%xmm2 \n\t" - "shufps $0xCC, %%xmm3,%%xmm3 \n\t" - - BUTTERFLY3(xmm5, xmm3, xmm2, xmm1) - "movaps %%xmm5, (%1) \n\t" - - "movaps 16(%1), %%xmm1 \n\t" - BUTTERFLY3(xmm1, xmm3, xmm2, xmm5) - "movaps %%xmm1, 16(%1) \n\t" - - BUTTERFLY3(xmm4, xmm3, xmm2, xmm5) - "movaps 
%%xmm4, 64(%1) \n\t" - - BUTTERFLY3(xmm7, xmm3, xmm2, xmm5) - "movaps %%xmm7, 80(%1) \n\t" - - "movaps 32(%1), %%xmm5 \n\t" - BUTTERFLY3(xmm5, xmm3, xmm2, xmm7) - "movaps %%xmm5, 32(%1) \n\t" - - "movaps 48(%1), %%xmm4 \n\t" - BUTTERFLY3(xmm4, xmm3, xmm2, xmm7) - "movaps %%xmm4, 48(%1) \n\t" - - BUTTERFLY3(xmm6, xmm3, xmm2, xmm7) - "movaps %%xmm6, 96(%1) \n\t" - - BUTTERFLY3(xmm0, xmm3, xmm2, xmm7) - "movaps %%xmm0, 112(%1) \n\t" - - - /* pass 6, no SIMD... */ - "movss 56(%1), %%xmm3 \n\t" - "movl 4(%1), %0 \n\t" - "addss 60(%1), %%xmm3 \n\t" - "movss 72(%1), %%xmm7 \n\t" - "addss %%xmm3, %%xmm4 \n\t" - "movss 52(%1), %%xmm2 \n\t" - "addss %%xmm3, %%xmm2 \n\t" - "movss 24(%1), %%xmm3 \n\t" - "addss 28(%1), %%xmm3 \n\t" - "addss 76(%1), %%xmm7 \n\t" - "addss %%xmm3, %%xmm1 \n\t" - "addss %%xmm4, %%xmm5 \n\t" - "movss %%xmm1, 16(%1) \n\t" - "movss 20(%1), %%xmm1 \n\t" - "addss %%xmm3, %%xmm1 \n\t" - "movss 40(%1), %%xmm3 \n\t" - "movss %%xmm1, 48(%1) \n\t" - "addss 44(%1), %%xmm3 \n\t" - "movss 20(%1), %%xmm1 \n\t" - "addss %%xmm3, %%xmm4 \n\t" - "addss %%xmm2, %%xmm3 \n\t" - "addss 28(%1), %%xmm1 \n\t" - "movss %%xmm3, 40(%1) \n\t" - "addss 36(%1), %%xmm2 \n\t" - "movss 8(%1), %%xmm3 \n\t" - "movss %%xmm2, 56(%1) \n\t" - "addss 12(%1), %%xmm3 \n\t" - "movss %%xmm5, 8(%1) \n\t" - "movss %%xmm3, 32(%1) \n\t" - "movss 52(%1), %%xmm2 \n\t" - "movss 80(%1), %%xmm3 \n\t" - "movss 120(%1), %%xmm5 \n\t" - "movss %%xmm1, 80(%1) \n\t" - "movss %%xmm4, 24(%1) \n\t" - "addss 124(%1), %%xmm5 \n\t" - "movss 64(%1), %%xmm1 \n\t" - "addss 60(%1), %%xmm2 \n\t" - "addss %%xmm5, %%xmm0 \n\t" - "addss 116(%1), %%xmm5 \n\t" - "movl %0, 64(%1) \n\t" - "addss %%xmm0, %%xmm6 \n\t" - "addss %%xmm6, %%xmm1 \n\t" - "movl 12(%1), %0 \n\t" - "movss %%xmm1, 4(%1) \n\t" - "movss 88(%1), %%xmm1 \n\t" - "movl %0, 96(%1) \n\t" - "addss 92(%1), %%xmm1 \n\t" - "movss 104(%1), %%xmm4 \n\t" - "movl 28(%1), %0 \n\t" - "addss 108(%1), %%xmm4 \n\t" - "addss %%xmm4, %%xmm0 \n\t" - "addss %%xmm1, %%xmm3 
\n\t" - "addss 84(%1), %%xmm1 \n\t" - "addss %%xmm5, %%xmm4 \n\t" - "addss %%xmm3, %%xmm6 \n\t" - "addss %%xmm0, %%xmm3 \n\t" - "addss %%xmm7, %%xmm0 \n\t" - "addss 100(%1), %%xmm5 \n\t" - "addss %%xmm4, %%xmm7 \n\t" - "movl %0, 112(%1) \n\t" - "movss %%xmm0, 28(%1) \n\t" - "movss 36(%1), %%xmm0 \n\t" - "movss %%xmm7, 36(%1) \n\t" - "addss %%xmm1, %%xmm4 \n\t" - "movss 116(%1), %%xmm7 \n\t" - "addss %%xmm2, %%xmm0 \n\t" - "addss 124(%1), %%xmm7 \n\t" - "movss %%xmm0, 72(%1) \n\t" - "movss 44(%1), %%xmm0 \n\t" - "movss %%xmm6, 12(%1) \n\t" - "movss %%xmm3, 20(%1) \n\t" - "addss %%xmm0, %%xmm2 \n\t" - "movss %%xmm4, 44(%1) \n\t" - "movss %%xmm2, 88(%1) \n\t" - "addss 60(%1), %%xmm0 \n\t" - "movl 60(%1), %0 \n\t" - "movl %0, 120(%1) \n\t" - "movss %%xmm0, 104(%1) \n\t" - "addss %%xmm5, %%xmm1 \n\t" - "addss 68(%1), %%xmm5 \n\t" - "movss %%xmm1, 52(%1) \n\t" - "movss %%xmm5, 60(%1) \n\t" - "movss 68(%1), %%xmm1 \n\t" - "movss 100(%1), %%xmm5 \n\t" - "addss %%xmm7, %%xmm5 \n\t" - "addss 108(%1), %%xmm7 \n\t" - "addss %%xmm5, %%xmm1 \n\t" - "movss 84(%1), %%xmm2 \n\t" - "addss 92(%1), %%xmm2 \n\t" - "addss %%xmm2, %%xmm5 \n\t" - "movss %%xmm1, 68(%1) \n\t" - "addss %%xmm7, %%xmm2 \n\t" - "movss 76(%1), %%xmm1 \n\t" - "movss %%xmm2, 84(%1) \n\t" - "movss %%xmm5, 76(%1) \n\t" - "movss 108(%1), %%xmm2 \n\t" - "addss %%xmm1, %%xmm7 \n\t" - "addss 124(%1), %%xmm2 \n\t" - "addss %%xmm2, %%xmm1 \n\t" - "addss 92(%1), %%xmm2 \n\t" - "movss %%xmm1, 100(%1) \n\t" - "movss %%xmm2, 108(%1) \n\t" - "movss 92(%1), %%xmm2 \n\t" - "movss %%xmm7, 92(%1) \n\t" - "addss 124(%1), %%xmm2 \n\t" - "movss %%xmm2, 116(%1) \n\t" - :"+&r"(tmp1) - :"r"(out), "r"(b1), "r"(smask), "r"(in) - :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7",) - "memory" - ); -} - From 4e653b98c888a922ee192c6c8f914dde6ea2dc40 Mon Sep 17 00:00:00 2001 From: Vitor Sessak Date: Sat, 14 May 2011 14:16:30 +0200 Subject: [PATCH 09/11] dct32: Change pass 6 permutation to allow for AVX 
implementation --- libavcodec/x86/dct32_sse.asm | 98 ++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm index f6d5bcf844..fa0a502acf 100644 --- a/libavcodec/x86/dct32_sse.asm +++ b/libavcodec/x86/dct32_sse.asm @@ -156,7 +156,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp movaps m1, [outq+16] BUTTERFLY3 m1, m3, m2, m5 - movaps [outq+16], m1 + movaps [outq+96], m1 BUTTERFLY3 m4, m3, m2, m5 movaps [outq+64], m4 @@ -173,47 +173,46 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp movaps [outq+48], m4 BUTTERFLY3 m6, m3, m2, m7 - movaps [outq+96], m6 + movaps [outq+16], m6 BUTTERFLY3 m0, m3, m2, m7 movaps [outq+112], m0 ; pass 6, no SIMD... - movss m3, [outq+56] mov tmpd, [outq+4] - addss m3, [outq+60] movss m7, [outq+72] + addss m7, [outq+76] + movss m3, [outq+56] + addss m3, [outq+60] addss m4, m3 movss m2, [outq+52] addss m2, m3 - movss m3, [outq+24] - addss m3, [outq+28] - addss m7, [outq+76] + movss m3, [outq+104] + addss m3, [outq+108] addss m1, m3 addss m5, m4 movss [outq+ 16], m1 - movss m1, [outq+20] + movss m1, [outq+100] addss m1, m3 movss m3, [outq+40] movss [outq+ 48], m1 addss m3, [outq+44] - movss m1, [outq+20] + movss m1, [outq+100] addss m4, m3 addss m3, m2 - addss m1, [outq+28] + addss m1, [outq+108] movss [outq+ 40], m3 addss m2, [outq+36] movss m3, [outq+8] movss [outq+ 56], m2 addss m3, [outq+12] - movss [outq+ 8], m5 movss [outq+ 32], m3 - movss m2, [outq+52] movss m3, [outq+80] - movss m5, [outq+120] + movss [outq+ 8], m5 movss [outq+ 80], m1 - movss [outq+ 24], m4 + movss m2, [outq+52] + movss m5, [outq+120] addss m5, [outq+124] movss m1, [outq+64] addss m2, [outq+60] @@ -223,67 +222,68 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp addss m6, m0 addss m1, m6 mov tmpd, [outq+12] - movss [outq+ 4], m1 - movss m1, [outq+88] mov [outq+ 96], tmpd - addss m1, [outq+92] - movss m4, [outq+104] - mov tmpd, [outq+28] - addss m4, [outq+108] - addss 
m0, m4 - addss m3, m1 - addss m1, [outq+84] - addss m4, m5 + movss [outq+ 4], m1 + movss m1, [outq+24] + movss [outq+ 24], m4 + movss m4, [outq+88] + addss m4, [outq+92] + addss m3, m4 + addss m4, [outq+84] + mov tmpd, [outq+108] + addss m1, [outq+28] + addss m0, m1 + addss m1, m5 addss m6, m3 addss m3, m0 addss m0, m7 - addss m5, [outq+100] - addss m7, m4 + addss m5, [outq+20] + addss m7, m1 + movss [outq+ 12], m6 mov [outq+112], tmpd + movss m6, [outq+28] movss [outq+ 28], m0 movss m0, [outq+36] movss [outq+ 36], m7 - addss m4, m1 + addss m1, m4 movss m7, [outq+116] addss m0, m2 addss m7, [outq+124] movss [outq+ 72], m0 movss m0, [outq+44] - movss [outq+ 12], m6 - movss [outq+ 20], m3 addss m2, m0 - movss [outq+ 44], m4 + movss [outq+ 44], m1 movss [outq+ 88], m2 addss m0, [outq+60] mov tmpd, [outq+60] mov [outq+120], tmpd movss [outq+104], m0 - addss m1, m5 + addss m4, m5 addss m5, [outq+68] - movss [outq+52], m1 + movss [outq+52], m4 movss [outq+60], m5 - movss m1, [outq+68] - movss m5, [outq+100] + movss m4, [outq+68] + movss m5, [outq+20] + movss [outq+ 20], m3 addss m5, m7 - addss m7, [outq+108] - addss m1, m5 + addss m7, m6 + addss m4, m5 movss m2, [outq+84] addss m2, [outq+92] addss m5, m2 - movss [outq+ 68], m1 + movss [outq+ 68], m4 addss m2, m7 - movss m1, [outq+76] + movss m4, [outq+76] movss [outq+ 84], m2 movss [outq+ 76], m5 - movss m2, [outq+108] - addss m7, m1 - addss m2, [outq+124] - addss m1, m2 - addss m2, [outq+92] - movss [outq+100], m1 - movss [outq+108], m2 - movss m2, [outq+92] - movss [outq+ 92], m7 - addss m2, [outq+124] - movss [outq+116], m2 + addss m7, m4 + addss m6, [outq+124] + addss m4, m6 + addss m6, [outq+92] + movss [outq+100], m4 + movss [outq+108], m6 + movss m6, [outq+92] + movss [outq+92], m7 + addss m6, [outq+124] + movss [outq+116], m6 RET From 6204feb160c843320f6001d7e2bb2361c82b90ca Mon Sep 17 00:00:00 2001 From: Vitor Sessak Date: Sat, 14 May 2011 14:17:15 +0200 Subject: [PATCH 10/11] dct32: Add AVX implementation of 
32-point DCT --- libavcodec/mpegaudiodec.c | 4 +- libavcodec/x86/dct32_sse.asm | 350 ++++++++++++++++++++++------------- libavcodec/x86/fft.c | 4 +- libavcodec/x86/fft.h | 1 + 4 files changed, 232 insertions(+), 127 deletions(-) diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index 960d13d1e8..ccc93ad78a 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -69,9 +69,9 @@ typedef struct MPADecodeContext { uint32_t free_format_next_header; GetBitContext gb; GetBitContext in_gb; - DECLARE_ALIGNED(16, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2]; + DECLARE_ALIGNED(32, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2]; int synth_buf_offset[MPA_MAX_CHANNELS]; - DECLARE_ALIGNED(16, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT]; + DECLARE_ALIGNED(32, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT]; INTFLOAT mdct_buf[MPA_MAX_CHANNELS][SBLIMIT * 18]; /* previous samples, for layer 3 MDCT */ GranuleDef granules[2][2]; /* Used in Layer 3 */ #ifdef DEBUG diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm index fa0a502acf..2e1176cd84 100644 --- a/libavcodec/x86/dct32_sse.asm +++ b/libavcodec/x86/dct32_sse.asm @@ -20,31 +20,41 @@ ;****************************************************************************** %include "x86inc.asm" +%include "config.asm" SECTION_RODATA 32 align 32 ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 dd 0.553104, 0.582935, 0.622504, 0.674808 - dd -1.169440, -0.972568, -0.839350, -0.744536 dd -10.190008, -3.407609, -2.057781, -1.484165 + dd -1.169440, -0.972568, -0.839350, -0.744536 dd 0.502419, 0.522499, 0.566944, 0.646822 dd 0.788155, 1.060678, 1.722447, 5.101149 dd 0.509796, 0.601345, 0.899976, 2.562916 + dd 0.509796, 0.601345, 0.899976, 2.562916 dd 1.000000, 1.000000, 1.306563, 0.541196 + dd 1.000000, 1.000000, 1.306563, 0.541196 + dd 1.000000, 0.707107, 1.000000, -0.707107 dd 1.000000, 0.707107, 1.000000, -0.707107 -ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 
+ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 -%macro BUTTERFLY 4 +%macro BUTTERFLY_SSE 4 movaps %4, %1 subps %1, %2 addps %2, %4 mulps %1, %3 %endmacro -%macro BUTTERFLY0 5 +%macro BUTTERFLY_AVX 4 + vsubps %4, %1, %2 + vaddps %2, %2, %1 + vmulps %1, %4, %3 +%endmacro + +%macro BUTTERFLY0_SSE 5 movaps %4, %1 shufps %1, %1, %5 xorps %4, %2 @@ -52,6 +62,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 mulps %1, %3 %endmacro +%macro BUTTERFLY0_AVX 5 + vshufps %4, %1, %1, %5 + vxorps %1, %1, %2 + vaddps %4, %4, %1 + vmulps %1, %4, %3 +%endmacro + %macro BUTTERFLY2 4 BUTTERFLY0 %1, %2, %3, %4, 0x1b %endmacro @@ -60,126 +77,7 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 BUTTERFLY0 %1, %2, %3, %4, 0xb1 %endmacro -INIT_XMM -section .text align=16 -; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) -cglobal dct32_float_sse, 2,3,8, out, in, tmp - ; pass 1 - - movaps m0, [inq+0] - movaps m1, [inq+112] - shufps m1, m1, 0x1b - BUTTERFLY m0, m1, [ps_cos_vec], m3 - - movaps m7, [inq+64] - movaps m4, [inq+48] - shufps m4, m4, 0x1b - BUTTERFLY m7, m4, [ps_cos_vec+48], m3 - - - ; pass 2 - movaps m2, [ps_cos_vec+64] - BUTTERFLY m1, m4, m2, m3 - movaps [outq+48], m1 - movaps [outq+ 0], m4 - - ; pass 1 - movaps m1, [inq+16] - movaps m6, [inq+96] - shufps m6, m6, 0x1b - BUTTERFLY m1, m6, [ps_cos_vec+16], m3 - - movaps m4, [inq+80] - movaps m5, [inq+32] - shufps m5, m5, 0x1b - BUTTERFLY m4, m5, [ps_cos_vec+32], m3 - - ; pass 2 - BUTTERFLY m0, m7, m2, m3 - - movaps m2, [ps_cos_vec+80] - BUTTERFLY m6, m5, m2, m3 - - BUTTERFLY m1, m4, m2, m3 - - ; pass 3 - movaps m2, [ps_cos_vec+96] - shufps m1, m1, 0x1b - BUTTERFLY m0, m1, m2, m3 - movaps [outq+112], m0 - movaps [outq+ 96], m1 - - movaps m0, [outq+0] - shufps m5, m5, 0x1b - BUTTERFLY m0, m5, m2, m3 - - movaps m1, [outq+48] - shufps m6, m6, 0x1b - BUTTERFLY m1, m6, m2, m3 - movaps [outq+48], m1 - - shufps m4, m4, 0x1b - BUTTERFLY m7, m4, m2, m3 - - ; pass 4 - movaps m3, [ps_p1p1m1m1+0] - movaps 
m2, [ps_cos_vec+112] - - BUTTERFLY2 m5, m3, m2, m1 - - BUTTERFLY2 m0, m3, m2, m1 - movaps [outq+16], m0 - - BUTTERFLY2 m6, m3, m2, m1 - movaps [outq+32], m6 - - movaps m0, [outq+48] - BUTTERFLY2 m0, m3, m2, m1 - movaps [outq+48], m0 - - BUTTERFLY2 m4, m3, m2, m1 - - BUTTERFLY2 m7, m3, m2, m1 - - movaps m6, [outq+96] - BUTTERFLY2 m6, m3, m2, m1 - - movaps m0, [outq+112] - BUTTERFLY2 m0, m3, m2, m1 - - ; pass 5 - movaps m2, [ps_cos_vec+128] - shufps m3, m3, 0xcc - - BUTTERFLY3 m5, m3, m2, m1 - movaps [outq+0], m5 - - movaps m1, [outq+16] - BUTTERFLY3 m1, m3, m2, m5 - movaps [outq+96], m1 - - BUTTERFLY3 m4, m3, m2, m5 - movaps [outq+64], m4 - - BUTTERFLY3 m7, m3, m2, m5 - movaps [outq+80], m7 - - movaps m5, [outq+32] - BUTTERFLY3 m5, m3, m2, m7 - movaps [outq+32], m5 - - movaps m4, [outq+48] - BUTTERFLY3 m4, m3, m2, m7 - movaps [outq+48], m4 - - BUTTERFLY3 m6, m3, m2, m7 - movaps [outq+16], m6 - - BUTTERFLY3 m0, m3, m2, m7 - movaps [outq+112], m0 - - - ; pass 6, no SIMD... +%macro PASS6_AND_PERMUTE 0 mov tmpd, [outq+4] movss m7, [outq+72] addss m7, [outq+76] @@ -286,4 +184,208 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp movss [outq+92], m7 addss m6, [outq+124] movss [outq+116], m6 +%endmacro + +%define BUTTERFLY BUTTERFLY_AVX +%define BUTTERFLY0 BUTTERFLY0_AVX + +INIT_YMM +section .text align=16 +%ifdef HAVE_AVX +; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in) +cglobal dct32_float_avx, 2,3,8, out, in, tmp + ; pass 1 + vmovaps m4, [inq+0] + vinsertf128 m5, m5, [inq+96], 1 + vinsertf128 m5, m5, [inq+112], 0 + vshufps m5, m5, m5, 0x1b + BUTTERFLY m4, m5, [ps_cos_vec], m6 + + vmovaps m2, [inq+64] + vinsertf128 m6, m6, [inq+32], 1 + vinsertf128 m6, m6, [inq+48], 0 + vshufps m6, m6, m6, 0x1b + BUTTERFLY m2, m6, [ps_cos_vec+32], m0 + + ; pass 2 + + BUTTERFLY m5, m6, [ps_cos_vec+64], m0 + BUTTERFLY m4, m2, [ps_cos_vec+64], m7 + + + ; pass 3 + vperm2f128 m3, m6, m4, 0x31 + vperm2f128 m1, m6, m4, 0x20 + vshufps m3, m3, m3, 0x1b + + BUTTERFLY m1, m3, 
[ps_cos_vec+96], m6 + + + vperm2f128 m4, m5, m2, 0x20 + vperm2f128 m5, m5, m2, 0x31 + vshufps m5, m5, m5, 0x1b + + BUTTERFLY m4, m5, [ps_cos_vec+96], m6 + + ; pass 4 + vmovaps m6, [ps_p1p1m1m1+0] + vmovaps m2, [ps_cos_vec+128] + + BUTTERFLY2 m5, m6, m2, m7 + BUTTERFLY2 m4, m6, m2, m7 + BUTTERFLY2 m1, m6, m2, m7 + BUTTERFLY2 m3, m6, m2, m7 + + + ; pass 5 + vshufps m6, m6, m6, 0xcc + vmovaps m2, [ps_cos_vec+160] + + BUTTERFLY3 m5, m6, m2, m7 + BUTTERFLY3 m4, m6, m2, m7 + BUTTERFLY3 m1, m6, m2, m7 + BUTTERFLY3 m3, m6, m2, m7 + + vperm2f128 m6, m3, m3, 0x31 + vmovaps [outq], m3 + + vextractf128 [outq+64], m5, 1 + vextractf128 [outq+32], m5, 0 + + vextractf128 [outq+80], m4, 1 + vextractf128 [outq+48], m4, 0 + + vperm2f128 m0, m1, m1, 0x31 + vmovaps [outq+96], m1 + + vzeroupper + + ; pass 6, no SIMD... +INIT_XMM + PASS6_AND_PERMUTE + RET +%endif + +%define BUTTERFLY BUTTERFLY_SSE +%define BUTTERFLY0 BUTTERFLY0_SSE + +INIT_XMM +; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) +cglobal dct32_float_sse, 2,3,8, out, in, tmp + ; pass 1 + + movaps m0, [inq+0] + movaps m1, [inq+112] + shufps m1, m1, 0x1b + BUTTERFLY m0, m1, [ps_cos_vec], m3 + + movaps m7, [inq+64] + movaps m4, [inq+48] + shufps m4, m4, 0x1b + BUTTERFLY m7, m4, [ps_cos_vec+32], m3 + + ; pass 2 + movaps m2, [ps_cos_vec+64] + BUTTERFLY m1, m4, m2, m3 + movaps [outq+48], m1 + movaps [outq+ 0], m4 + + ; pass 1 + movaps m1, [inq+16] + movaps m6, [inq+96] + shufps m6, m6, 0x1b + BUTTERFLY m1, m6, [ps_cos_vec+16], m3 + + movaps m4, [inq+80] + movaps m5, [inq+32] + shufps m5, m5, 0x1b + BUTTERFLY m4, m5, [ps_cos_vec+48], m3 + + ; pass 2 + BUTTERFLY m0, m7, m2, m3 + + movaps m2, [ps_cos_vec+80] + BUTTERFLY m6, m5, m2, m3 + + BUTTERFLY m1, m4, m2, m3 + + ; pass 3 + movaps m2, [ps_cos_vec+96] + shufps m1, m1, 0x1b + BUTTERFLY m0, m1, m2, m3 + movaps [outq+112], m0 + movaps [outq+ 96], m1 + + movaps m0, [outq+0] + shufps m5, m5, 0x1b + BUTTERFLY m0, m5, m2, m3 + + movaps m1, [outq+48] + shufps m6, m6, 0x1b + 
BUTTERFLY m1, m6, m2, m3 + movaps [outq+48], m1 + + shufps m4, m4, 0x1b + BUTTERFLY m7, m4, m2, m3 + + ; pass 4 + movaps m3, [ps_p1p1m1m1+0] + movaps m2, [ps_cos_vec+128] + + BUTTERFLY2 m5, m3, m2, m1 + + BUTTERFLY2 m0, m3, m2, m1 + movaps [outq+16], m0 + + BUTTERFLY2 m6, m3, m2, m1 + movaps [outq+32], m6 + + movaps m0, [outq+48] + BUTTERFLY2 m0, m3, m2, m1 + movaps [outq+48], m0 + + BUTTERFLY2 m4, m3, m2, m1 + + BUTTERFLY2 m7, m3, m2, m1 + + movaps m6, [outq+96] + BUTTERFLY2 m6, m3, m2, m1 + + movaps m0, [outq+112] + BUTTERFLY2 m0, m3, m2, m1 + + ; pass 5 + movaps m2, [ps_cos_vec+160] + shufps m3, m3, 0xcc + + BUTTERFLY3 m5, m3, m2, m1 + movaps [outq+0], m5 + + movaps m1, [outq+16] + BUTTERFLY3 m1, m3, m2, m5 + movaps [outq+96], m1 + + BUTTERFLY3 m4, m3, m2, m5 + movaps [outq+64], m4 + + BUTTERFLY3 m7, m3, m2, m5 + movaps [outq+80], m7 + + movaps m5, [outq+32] + BUTTERFLY3 m5, m3, m2, m7 + movaps [outq+32], m5 + + movaps m4, [outq+48] + BUTTERFLY3 m4, m3, m2, m7 + movaps [outq+48], m4 + + BUTTERFLY3 m6, m3, m2, m7 + movaps [outq+16], m6 + + BUTTERFLY3 m0, m3, m2, m7 + movaps [outq+112], m0 + + + ; pass 6, no SIMD... 
+ PASS6_AND_PERMUTE RET diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c index b29412c1dc..8eef4214a2 100644 --- a/libavcodec/x86/fft.c +++ b/libavcodec/x86/fft.c @@ -57,7 +57,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s) av_cold void ff_dct_init_mmx(DCTContext *s) { int has_vectors = av_get_cpu_flags(); - if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) + if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX) + s->dct32 = ff_dct32_float_avx; + else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) s->dct32 = ff_dct32_float_sse; } #endif diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h index c6379050d9..0ade2b2e7b 100644 --- a/libavcodec/x86/fft.h +++ b/libavcodec/x86/fft.h @@ -35,5 +35,6 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_dct32_float_sse(FFTSample *out, const FFTSample *in); +void ff_dct32_float_avx(FFTSample *out, const FFTSample *in); #endif /* AVCODEC_X86_FFT_H */ From 71cc331cab8d61130048f3003f2ca77cfb94e3f3 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Sat, 21 May 2011 12:05:49 +0200 Subject: [PATCH 11/11] ffmpeg: get rid of the -vglobal option. It's badly documented and does the same thing as -flags global_header, so it's redundant. 
--- ffmpeg.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/ffmpeg.c b/ffmpeg.c index 71be22e05e..0c9545172f 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -208,7 +208,6 @@ static float audio_drift_threshold= 0.1; static int copy_ts= 0; static int copy_tb; static int opt_shortest = 0; -static int video_global_header = 0; static char *vstats_filename; static FILE *vstats_file; static int opt_programid = 0; @@ -3450,15 +3449,10 @@ static void new_video_stream(AVFormatContext *oc, int file_idx) if(video_codec_tag) video_enc->codec_tag= video_codec_tag; - if( (video_global_header&1) - || (video_global_header==0 && (oc->oformat->flags & AVFMT_GLOBALHEADER))){ + if(oc->oformat->flags & AVFMT_GLOBALHEADER) { video_enc->flags |= CODEC_FLAG_GLOBAL_HEADER; avcodec_opts[AVMEDIA_TYPE_VIDEO]->flags|= CODEC_FLAG_GLOBAL_HEADER; } - if(video_global_header&2){ - video_enc->flags2 |= CODEC_FLAG2_LOCAL_HEADER; - avcodec_opts[AVMEDIA_TYPE_VIDEO]->flags2|= CODEC_FLAG2_LOCAL_HEADER; - } if (video_stream_copy) { st->stream_copy = 1; @@ -4310,7 +4304,6 @@ static const OptionDef options[] = { { "vsync", HAS_ARG | OPT_INT | OPT_EXPERT, {(void*)&video_sync_method}, "video sync method", "" }, { "async", HAS_ARG | OPT_INT | OPT_EXPERT, {(void*)&audio_sync_method}, "audio sync method", "" }, { "adrift_threshold", HAS_ARG | OPT_FLOAT | OPT_EXPERT, {(void*)&audio_drift_threshold}, "audio drift threshold", "threshold" }, - { "vglobal", HAS_ARG | OPT_INT | OPT_EXPERT, {(void*)&video_global_header}, "video global header storage type", "" }, { "copyts", OPT_BOOL | OPT_EXPERT, {(void*)&copy_ts}, "copy timestamps" }, { "copytb", OPT_BOOL | OPT_EXPERT, {(void*)&copy_tb}, "copy input stream time base when stream copying" }, { "shortest", OPT_BOOL | OPT_EXPERT, {(void*)&opt_shortest}, "finish encoding within shortest input" }, //