From 921063877e39e8cb7826928e6a7b08e2422e1f5c Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Tue, 31 Jul 2012 21:32:25 +0200 Subject: [PATCH 01/10] apetag: change a forgotten return to return 0 --- libavformat/apetag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavformat/apetag.c b/libavformat/apetag.c index dd746313b5..d075040166 100644 --- a/libavformat/apetag.c +++ b/libavformat/apetag.c @@ -159,7 +159,7 @@ int64_t ff_ape_parse_tag(AVFormatContext *s) val = avio_rl32(pb); /* flags */ if (val & APE_TAG_FLAG_IS_HEADER) { av_log(s, AV_LOG_ERROR, "APE Tag is a header\n"); - return; + return 0; } avio_seek(pb, file_size - tag_bytes, SEEK_SET); From 6376a3ad24cb6a3c8ccaaa87e82846931d48045f Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Tue, 31 Jul 2012 17:12:51 +0200 Subject: [PATCH 02/10] x86: h264dsp: Remove unused variable ff_pb_3_1 --- libavcodec/x86/h264dsp_mmx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 4bea78807f..16de15e66f 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -23,8 +23,6 @@ #include "libavcodec/h264dsp.h" #include "dsputil_mmx.h" -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; - /***********************************/ /* IDCT */ #define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \ From 13a79cf84e073d0ca8489047660352eee216d059 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Tue, 31 Jul 2012 20:00:35 +0200 Subject: [PATCH 03/10] dca: Rename dca.c ---> dcadec.c This will allow adding dca.c with tables used from other files. 
--- libavcodec/Makefile | 4 ++-- libavcodec/{dca.c => dcadec.c} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename libavcodec/{dca.c => dcadec.c} (100%) diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 3a74338518..2d0006f74b 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -119,8 +119,8 @@ OBJS-$(CONFIG_CLJR_ENCODER) += cljr.o OBJS-$(CONFIG_COOK_DECODER) += cook.o OBJS-$(CONFIG_CSCD_DECODER) += cscd.o OBJS-$(CONFIG_CYUV_DECODER) += cyuv.o -OBJS-$(CONFIG_DCA_DECODER) += dca.o synth_filter.o dcadsp.o \ - dca_parser.o +OBJS-$(CONFIG_DCA_DECODER) += dcadec.o dcadsp.o \ + dca_parser.o synth_filter.o OBJS-$(CONFIG_DFA_DECODER) += dfa.o OBJS-$(CONFIG_DNXHD_DECODER) += dnxhddec.o dnxhddata.o OBJS-$(CONFIG_DNXHD_ENCODER) += dnxhdenc.o dnxhddata.o \ diff --git a/libavcodec/dca.c b/libavcodec/dcadec.c similarity index 100% rename from libavcodec/dca.c rename to libavcodec/dcadec.c From 9e4bca16f89bc12c58b58f4611d580a30d5f9638 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Tue, 31 Jul 2012 20:09:23 +0200 Subject: [PATCH 04/10] dca: Move tables used outside of dcadec.c to a separate file. 
--- libavcodec/Makefile | 5 +++-- libavcodec/dca.c | 29 +++++++++++++++++++++++++++++ libavcodec/dca.h | 4 ++++ libavcodec/dca_parser.c | 3 +-- libavcodec/dcadata.h | 6 ------ libavcodec/dcadec.c | 2 +- libavformat/spdifenc.c | 3 +-- 7 files changed, 39 insertions(+), 13 deletions(-) create mode 100644 libavcodec/dca.c diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 2d0006f74b..7fc50594ff 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -119,7 +119,7 @@ OBJS-$(CONFIG_CLJR_ENCODER) += cljr.o OBJS-$(CONFIG_COOK_DECODER) += cook.o OBJS-$(CONFIG_CSCD_DECODER) += cscd.o OBJS-$(CONFIG_CYUV_DECODER) += cyuv.o -OBJS-$(CONFIG_DCA_DECODER) += dcadec.o dcadsp.o \ +OBJS-$(CONFIG_DCA_DECODER) += dcadec.o dca.o dcadsp.o \ dca_parser.o synth_filter.o OBJS-$(CONFIG_DFA_DECODER) += dfa.o OBJS-$(CONFIG_DNXHD_DECODER) += dnxhddec.o dnxhddata.o @@ -596,6 +596,7 @@ OBJS-$(CONFIG_OGG_DEMUXER) += xiph.o flac.o flacdata.o \ OBJS-$(CONFIG_OGG_MUXER) += xiph.o flac.o flacdata.o OBJS-$(CONFIG_RTP_MUXER) += mpeg4audio.o mpegvideo.o xiph.o OBJS-$(CONFIG_SPDIF_DEMUXER) += aacadtsdec.o mpeg4audio.o +OBJS-$(CONFIG_SPDIF_MUXER) += dca.o OBJS-$(CONFIG_WEBM_MUXER) += mpeg4audio.o mpegaudiodata.o \ xiph.o flac.o flacdata.o OBJS-$(CONFIG_WTV_DEMUXER) += mpeg4audio.o mpegaudiodata.o @@ -641,7 +642,7 @@ OBJS-$(CONFIG_AC3_PARSER) += ac3_parser.o ac3tab.o \ OBJS-$(CONFIG_ADX_PARSER) += adx_parser.o adx.o OBJS-$(CONFIG_CAVSVIDEO_PARSER) += cavs_parser.o OBJS-$(CONFIG_COOK_PARSER) += cook_parser.o -OBJS-$(CONFIG_DCA_PARSER) += dca_parser.o +OBJS-$(CONFIG_DCA_PARSER) += dca_parser.o dca.o OBJS-$(CONFIG_DIRAC_PARSER) += dirac_parser.o OBJS-$(CONFIG_DNXHD_PARSER) += dnxhd_parser.o OBJS-$(CONFIG_DVBSUB_PARSER) += dvbsub_parser.o diff --git a/libavcodec/dca.c b/libavcodec/dca.c new file mode 100644 index 0000000000..4194f58aa9 --- /dev/null +++ b/libavcodec/dca.c @@ -0,0 +1,29 @@ +/* + * DCA compatible decoder data + * + * This file is part of Libav. 
+ * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "dca.h" + +const uint32_t ff_dca_sample_rates[16] = +{ + 0, 8000, 16000, 32000, 0, 0, 11025, 22050, 44100, 0, 0, + 12000, 24000, 48000, 96000, 192000 +}; diff --git a/libavcodec/dca.h b/libavcodec/dca.h index 8ea6049e0d..9235fa4f0b 100644 --- a/libavcodec/dca.h +++ b/libavcodec/dca.h @@ -25,6 +25,8 @@ #ifndef AVCODEC_DCA_H #define AVCODEC_DCA_H +#include <stdint.h> + /** DCA syncwords, also used for bitstream type detection */ #define DCA_MARKER_RAW_BE 0x7FFE8001 #define DCA_MARKER_RAW_LE 0xFE7F0180 @@ -34,4 +36,6 @@ /** DCA-HD specific block starts with this marker. 
*/ #define DCA_HD_MARKER 0x64582025 +extern const uint32_t ff_dca_sample_rates[16]; + #endif /* AVCODEC_DCA_H */ diff --git a/libavcodec/dca_parser.c b/libavcodec/dca_parser.c index e7b2ce42cc..553e69c41c 100644 --- a/libavcodec/dca_parser.c +++ b/libavcodec/dca_parser.c @@ -24,7 +24,6 @@ #include "parser.h" #include "dca.h" -#include "dcadata.h" #include "dca_parser.h" #include "get_bits.h" #include "put_bits.h" @@ -162,7 +161,7 @@ static int dca_parse_params(const uint8_t *buf, int buf_size, int *duration, skip_bits(&gb, 20); sr_code = get_bits(&gb, 4); - *sample_rate = dca_sample_rates[sr_code]; + *sample_rate = ff_dca_sample_rates[sr_code]; if (*sample_rate == 0) return AVERROR_INVALIDDATA; diff --git a/libavcodec/dcadata.h b/libavcodec/dcadata.h index 4b58ef7c38..324e40f104 100644 --- a/libavcodec/dcadata.h +++ b/libavcodec/dcadata.h @@ -28,12 +28,6 @@ /* Generic tables */ -static const uint32_t dca_sample_rates[16] = -{ - 0, 8000, 16000, 32000, 0, 0, 11025, 22050, 44100, 0, 0, - 12000, 24000, 48000, 96000, 192000 -}; - static const uint32_t dca_bit_rates[32] = { 32000, 56000, 64000, 96000, 112000, 128000, diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c index b37dc49d3f..f488da6d3f 100644 --- a/libavcodec/dcadec.c +++ b/libavcodec/dcadec.c @@ -561,7 +561,7 @@ static int dca_parse_frame_header(DCAContext *s) if (s->frame_size < 95) return AVERROR_INVALIDDATA; s->amode = get_bits(&s->gb, 6); - s->sample_rate = dca_sample_rates[get_bits(&s->gb, 4)]; + s->sample_rate = ff_dca_sample_rates[get_bits(&s->gb, 4)]; if (!s->sample_rate) return AVERROR_INVALIDDATA; s->bit_rate_index = get_bits(&s->gb, 5); diff --git a/libavformat/spdifenc.c b/libavformat/spdifenc.c index b25c7fa722..c563008b2e 100644 --- a/libavformat/spdifenc.c +++ b/libavformat/spdifenc.c @@ -49,7 +49,6 @@ #include "spdif.h" #include "libavcodec/ac3.h" #include "libavcodec/dca.h" -#include "libavcodec/dcadata.h" #include "libavcodec/aacadtsdec.h" #include "libavutil/opt.h" @@ -253,7 +252,7 @@ 
static int spdif_header_dts(AVFormatContext *s, AVPacket *pkt) case DCA_MARKER_RAW_BE: blocks = (AV_RB16(pkt->data + 4) >> 2) & 0x7f; core_size = ((AV_RB24(pkt->data + 5) >> 4) & 0x3fff) + 1; - sample_rate = dca_sample_rates[(pkt->data[8] >> 2) & 0x0f]; + sample_rate = ff_dca_sample_rates[(pkt->data[8] >> 2) & 0x0f]; break; case DCA_MARKER_RAW_LE: blocks = (AV_RL16(pkt->data + 4) >> 2) & 0x7f; From 53dfaedc014ba6bd073fece6582da1805b3c44c2 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Wed, 25 Jul 2012 20:30:19 -0700 Subject: [PATCH 05/10] x86/dsputilenc: bury inline asm under HAVE_INLINE_ASM. --- libavcodec/x86/dsputilenc_mmx.c | 80 +++++++++++++++++++-------------- libavcodec/x86/fdct_mmx.c | 4 ++ libavcodec/x86/motion_est_mmx.c | 6 +++ libavcodec/x86/mpegvideo_mmx.c | 6 +++ 4 files changed, 63 insertions(+), 33 deletions(-) diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c index 47fa5ca43c..3cac979ef0 100644 --- a/libavcodec/x86/dsputilenc_mmx.c +++ b/libavcodec/x86/dsputilenc_mmx.c @@ -30,6 +30,8 @@ #include "dsputil_mmx.h" +#if HAVE_INLINE_ASM + static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) { __asm__ volatile( @@ -323,8 +325,6 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int return tmp; } -int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h); - static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { int tmp; __asm__ volatile ( @@ -925,17 +925,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c "paddusw "#t", "#a" \n\t"\ "movd "#a", "#dst" \n\t"\ -#define hadamard_func(cpu) \ -int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \ - int stride, int h); \ -int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \ - int stride, int h); - -hadamard_func(mmx) -hadamard_func(mmx2) -hadamard_func(sse2) -hadamard_func(ssse3) - #define DCT_SAD4(m,mm,o)\ "mov"#m" 
"#o"+ 0(%1), "#mm"2 \n\t"\ "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ @@ -1094,10 +1083,26 @@ static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int si #undef PHADDD #endif //HAVE_SSSE3 +#endif /* HAVE_INLINE_ASM */ + +int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h); + +#define hadamard_func(cpu) \ +int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \ + int stride, int h); \ +int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \ + int stride, int h); + +hadamard_func(mmx) +hadamard_func(mmx2) +hadamard_func(sse2) +hadamard_func(ssse3) void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) { int mm_flags = av_get_cpu_flags(); + +#if HAVE_INLINE_ASM int bit_depth = avctx->bits_per_raw_sample; if (mm_flags & AV_CPU_FLAG_MMX) { @@ -1121,11 +1126,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) c->diff_bytes= diff_bytes_mmx; c->sum_abs_dctelem= sum_abs_dctelem_mmx; -#if HAVE_YASM - c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx; - c->hadamard8_diff[1]= ff_hadamard8_diff_mmx; -#endif - c->pix_norm1 = pix_norm1_mmx; c->sse[0] = sse16_mmx; c->sse[1] = sse8_mmx; @@ -1146,10 +1146,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) if (mm_flags & AV_CPU_FLAG_MMX2) { -#if HAVE_YASM - c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2; - c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2; -#endif c->sum_abs_dctelem= sum_abs_dctelem_mmx2; c->vsad[4]= vsad_intra16_mmx2; @@ -1164,13 +1160,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) if (bit_depth <= 8) c->get_pixels = get_pixels_sse2; c->sum_abs_dctelem= sum_abs_dctelem_sse2; -#if HAVE_YASM - c->sse[0] = ff_sse16_sse2; -#if HAVE_ALIGNED_STACK - c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2; - c->hadamard8_diff[1]= ff_hadamard8_diff_sse2; -#endif -#endif } #if HAVE_SSSE3 @@ -1180,10 +1169,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) } c->add_8x8basis= 
add_8x8basis_ssse3; c->sum_abs_dctelem= sum_abs_dctelem_ssse3; -#if HAVE_YASM && HAVE_ALIGNED_STACK - c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3; - c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3; -#endif } #endif @@ -1194,6 +1179,35 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) c->add_8x8basis= add_8x8basis_3dnow; } } +#endif /* HAVE_INLINE_ASM */ + +#if HAVE_YASM + if (mm_flags & AV_CPU_FLAG_MMX) { + c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx; + c->hadamard8_diff[1] = ff_hadamard8_diff_mmx; + + if (mm_flags & AV_CPU_FLAG_MMX2) { + c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx2; + c->hadamard8_diff[1] = ff_hadamard8_diff_mmx2; + } + + if (mm_flags & AV_CPU_FLAG_SSE2){ + c->sse[0] = ff_sse16_sse2; + +#if HAVE_ALIGNED_STACK + c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; + c->hadamard8_diff[1] = ff_hadamard8_diff_sse2; +#endif + } + +#if HAVE_SSSE3 && HAVE_ALIGNED_STACK + if (mm_flags & AV_CPU_FLAG_SSSE3) { + c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3; + c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3; + } +#endif + } +#endif /* HAVE_YASM */ ff_dsputil_init_pix_mmx(c, avctx); } diff --git a/libavcodec/x86/fdct_mmx.c b/libavcodec/x86/fdct_mmx.c index cc3036bc33..3614fd151a 100644 --- a/libavcodec/x86/fdct_mmx.c +++ b/libavcodec/x86/fdct_mmx.c @@ -34,6 +34,8 @@ #include "libavutil/x86_cpu.h" #include "libavcodec/dsputil.h" +#if HAVE_INLINE_ASM + ////////////////////////////////////////////////////////////////////// // // constants for the forward DCT @@ -579,3 +581,5 @@ void ff_fdct_sse2(int16_t *block) fdct_col_sse2(block, block1, 0); fdct_row_sse2(block1, block); } + +#endif /* HAVE_INLINE_ASM */ diff --git a/libavcodec/x86/motion_est_mmx.c b/libavcodec/x86/motion_est_mmx.c index a522a5e7ff..5aed655657 100644 --- a/libavcodec/x86/motion_est_mmx.c +++ b/libavcodec/x86/motion_est_mmx.c @@ -26,6 +26,8 @@ #include "libavcodec/dsputil.h" #include "dsputil_mmx.h" +#if HAVE_INLINE_ASM + DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={ 
0x0000000000000000ULL, 0x0001000100010001ULL, @@ -422,8 +424,11 @@ static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, PIX_SAD(mmx) PIX_SAD(mmx2) +#endif /* HAVE_INLINE_ASM */ + void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx) { +#if HAVE_INLINE_ASM int mm_flags = av_get_cpu_flags(); if (mm_flags & AV_CPU_FLAG_MMX) { @@ -458,4 +463,5 @@ void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx) if ((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != CODEC_ID_SNOW) { c->sad[0]= sad16_sse2; } +#endif /* HAVE_INLINE_ASM */ } diff --git a/libavcodec/x86/mpegvideo_mmx.c b/libavcodec/x86/mpegvideo_mmx.c index dcce48638b..a242c19aec 100644 --- a/libavcodec/x86/mpegvideo_mmx.c +++ b/libavcodec/x86/mpegvideo_mmx.c @@ -29,6 +29,8 @@ #include "libavcodec/mpegvideo.h" #include "dsputil_mmx.h" +#if HAVE_INLINE_ASM + extern uint16_t ff_inv_zigzag_direct16[64]; @@ -626,8 +628,11 @@ static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ #include "mpegvideo_mmx_template.c" #endif +#endif /* HAVE_INLINE_ASM */ + void ff_MPV_common_init_mmx(MpegEncContext *s) { +#if HAVE_INLINE_ASM int mm_flags = av_get_cpu_flags(); if (mm_flags & AV_CPU_FLAG_MMX) { @@ -662,4 +667,5 @@ void ff_MPV_common_init_mmx(MpegEncContext *s) } } } +#endif /* HAVE_INLINE_ASM */ } From ddbe71b44fc810cc39b576bf0047f89090cfabcb Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 31 Jul 2012 20:30:29 -0700 Subject: [PATCH 06/10] dct-test: allow to compile without HAVE_INLINE_ASM. 
--- libavcodec/dct-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c index 5046544500..9e19e0c6df 100644 --- a/libavcodec/dct-test.c +++ b/libavcodec/dct-test.c @@ -85,7 +85,7 @@ static const struct algo fdct_tab[] = { { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM }, { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM }, -#if HAVE_MMX +#if HAVE_MMX && HAVE_INLINE_ASM { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX }, { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 }, { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 }, From b6a3849adb0381a437952a785d39e22cb3b00282 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 31 Jul 2012 20:23:39 -0700 Subject: [PATCH 07/10] fft: port FFT/IMDCT 3dnow functions to yasm, and disable on x86-64. 64-bit CPUs always have SSE available, thus there is no need to compile in the 3dnow functions. This results in smaller binaries. --- libavcodec/x86/Makefile | 2 - libavcodec/x86/fft.c | 2 + libavcodec/x86/fft_3dn.c | 23 ----- libavcodec/x86/fft_3dn2.c | 173 -------------------------------- libavcodec/x86/fft_mmx.asm | 196 +++++++++++++++++++++++++++++++------ 5 files changed, 170 insertions(+), 226 deletions(-) delete mode 100644 libavcodec/x86/fft_3dn.c delete mode 100644 libavcodec/x86/fft_3dn2.c diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 1d2635749f..0ae70b2dd0 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -38,8 +38,6 @@ YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o -YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o -YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \ $(YASM-OBJS-FFT-yes) YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c index 6349c239c3..f1c1c9d36b 100644 
--- a/libavcodec/x86/fft.c +++ b/libavcodec/x86/fft.c @@ -25,6 +25,7 @@ av_cold void ff_fft_init_mmx(FFTContext *s) { #if HAVE_YASM int has_vectors = av_get_cpu_flags(); +#if ARCH_X86_32 if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) { /* 3DNow! for K6-2/3 */ s->imdct_calc = ff_imdct_calc_3dnow; @@ -37,6 +38,7 @@ av_cold void ff_fft_init_mmx(FFTContext *s) s->imdct_half = ff_imdct_half_3dnow2; s->fft_calc = ff_fft_calc_3dnow2; } +#endif if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) { /* SSE for P3/P4/K8 */ s->imdct_calc = ff_imdct_calc_sse; diff --git a/libavcodec/x86/fft_3dn.c b/libavcodec/x86/fft_3dn.c deleted file mode 100644 index 5a4d3ad2c8..0000000000 --- a/libavcodec/x86/fft_3dn.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * FFT/MDCT transform with 3DNow! optimizations - * Copyright (c) 2008 Loren Merritt - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#define EMULATE_3DNOWEXT -#include "fft_3dn2.c" diff --git a/libavcodec/x86/fft_3dn2.c b/libavcodec/x86/fft_3dn2.c deleted file mode 100644 index e684cc745f..0000000000 --- a/libavcodec/x86/fft_3dn2.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * FFT/MDCT transform with Extended 3DNow! optimizations - * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt - * - * This file is part of Libav. 
- * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" -#include "fft.h" - -DECLARE_ALIGNED(8, static const unsigned int, m1m1)[2] = { 1U<<31, 1U<<31 }; - -#ifdef EMULATE_3DNOWEXT -#define PSWAPD(s,d)\ - "movq "#s","#d"\n"\ - "psrlq $32,"#d"\n"\ - "punpckldq "#s","#d"\n" -#define ff_fft_calc_3dnow2 ff_fft_calc_3dnow -#define ff_fft_dispatch_3dnow2 ff_fft_dispatch_3dnow -#define ff_fft_dispatch_interleave_3dnow2 ff_fft_dispatch_interleave_3dnow -#define ff_imdct_calc_3dnow2 ff_imdct_calc_3dnow -#define ff_imdct_half_3dnow2 ff_imdct_half_3dnow -#else -#define PSWAPD(s,d) "pswapd "#s","#d"\n" -#endif - -void ff_fft_dispatch_3dnow2(FFTComplex *z, int nbits); -void ff_fft_dispatch_interleave_3dnow2(FFTComplex *z, int nbits); - -void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z) -{ - int n = 1<<s->nbits; - int i; - ff_fft_dispatch_interleave_3dnow2(z, s->nbits); - __asm__ volatile("femms"); - if(n <= 8) - for(i=0; i<n; i+=2) - FFSWAP(FFTSample, z[i].im, z[i+1].re); -} - -void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - x86_reg j, k; - long n = s->mdct_size; - long n2 = n >> 1; - long n4 = n >> 2; - long n8 = n >> 3; - const uint16_t *revtab = s->revtab; - const FFTSample *tcos = s->tcos; - const FFTSample *tsin = s->tsin; - const FFTSample *in1, *in2; - FFTComplex *z = (FFTComplex *)output; - - /* pre rotation */ - in1 = input; - in2 = input + n2 - 1; -#ifdef 
EMULATE_3DNOWEXT - __asm__ volatile("movd %0, %%mm7" ::"r"(1U<<31)); -#endif - for(k = 0; k < n4; k++) { - // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it - __asm__ volatile( - "movd %0, %%mm0 \n" - "movd %2, %%mm1 \n" - "punpckldq %1, %%mm0 \n" - "punpckldq %3, %%mm1 \n" - "movq %%mm0, %%mm2 \n" - PSWAPD( %%mm1, %%mm3 ) - "pfmul %%mm1, %%mm0 \n" - "pfmul %%mm3, %%mm2 \n" -#ifdef EMULATE_3DNOWEXT - "movq %%mm0, %%mm1 \n" - "punpckhdq %%mm2, %%mm0 \n" - "punpckldq %%mm2, %%mm1 \n" - "pxor %%mm7, %%mm0 \n" - "pfadd %%mm1, %%mm0 \n" -#else - "pfpnacc %%mm2, %%mm0 \n" -#endif - ::"m"(in2[-2*k]), "m"(in1[2*k]), - "m"(tcos[k]), "m"(tsin[k]) - ); - __asm__ volatile( - "movq %%mm0, %0 \n\t" - :"=m"(z[revtab[k]]) - ); - } - - ff_fft_dispatch_3dnow2(z, s->nbits); - -#define CMUL(j,mm0,mm1)\ - "movq (%2,"#j",2), %%mm6 \n"\ - "movq 8(%2,"#j",2), "#mm0"\n"\ - "movq %%mm6, "#mm1"\n"\ - "movq "#mm0",%%mm7 \n"\ - "pfmul (%3,"#j"), %%mm6 \n"\ - "pfmul (%4,"#j"), "#mm0"\n"\ - "pfmul (%4,"#j"), "#mm1"\n"\ - "pfmul (%3,"#j"), %%mm7 \n"\ - "pfsub %%mm6, "#mm0"\n"\ - "pfadd %%mm7, "#mm1"\n" - - /* post rotation */ - j = -n2; - k = n2-8; - __asm__ volatile( - "1: \n" - CMUL(%0, %%mm0, %%mm1) - CMUL(%1, %%mm2, %%mm3) - "movd %%mm0, (%2,%0,2) \n" - "movd %%mm1,12(%2,%1,2) \n" - "movd %%mm2, (%2,%1,2) \n" - "movd %%mm3,12(%2,%0,2) \n" - "psrlq $32, %%mm0 \n" - "psrlq $32, %%mm1 \n" - "psrlq $32, %%mm2 \n" - "psrlq $32, %%mm3 \n" - "movd %%mm0, 8(%2,%0,2) \n" - "movd %%mm1, 4(%2,%1,2) \n" - "movd %%mm2, 8(%2,%1,2) \n" - "movd %%mm3, 4(%2,%0,2) \n" - "sub $8, %1 \n" - "add $8, %0 \n" - "jl 1b \n" - :"+r"(j), "+r"(k) - :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) - :"memory" - ); - __asm__ volatile("femms"); -} - -void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - x86_reg j, k; - long n = s->mdct_size; - long n4 = n >> 2; - - ff_imdct_half_3dnow2(s, output+n4, input); - - j = -n; - k = n-8; - __asm__ volatile( - "movq %4, 
%%mm7 \n" - "1: \n" - PSWAPD((%2,%1), %%mm0) - PSWAPD((%3,%0), %%mm1) - "pxor %%mm7, %%mm0 \n" - "movq %%mm1, (%3,%1) \n" - "movq %%mm0, (%2,%0) \n" - "sub $8, %1 \n" - "add $8, %0 \n" - "jl 1b \n" - :"+r"(j), "+r"(k) - :"r"(output+n4), "r"(output+n4*3), - "m"(*m1m1) - ); - __asm__ volatile("femms"); -} diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index 5c6583b3b7..81e4411dcb 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -29,6 +29,7 @@ ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) %include "x86inc.asm" +%include "x86util.asm" %if ARCH_X86_64 %define pointer resq @@ -105,7 +106,7 @@ SECTION_TEXT pfadd %5, %4 ; {t6,t5} pxor %3, [ps_m1p1] ; {t8,t7} mova %6, %1 - pswapd %3, %3 + PSWAPD %3, %3 pfadd %1, %5 ; {r0,i0} pfsub %6, %5 ; {r2,i2} mova %4, %2 @@ -396,7 +397,6 @@ fft32_interleave_avx: %endif INIT_XMM sse -%define movdqa movaps align 16 fft4_avx: @@ -469,8 +469,8 @@ fft8 %+ SUFFIX: mova Z(2), m2 T2_3DN m4, m5, Z(4), Z(5) T2_3DN m6, m7, Z2(6), Z2(7) - pswapd m0, m5 - pswapd m2, m7 + PSWAPD m0, m5 + PSWAPD m2, m7 pxor m0, [ps_m1p1] pxor m2, [ps_m1p1] pfsub m5, m0 @@ -498,11 +498,11 @@ fft8 %+ SUFFIX: ret %endmacro -INIT_MMX 3dnow2 -FFT48_3DN - -%macro pswapd 2 -%ifidn %1, %2 +%if ARCH_X86_32 +%macro PSWAPD 2 +%if cpuflag(3dnow2) + pswapd %1, %2 +%elifidn %1, %2 movd [r0+12], %1 punpckhdq %1, [r0+8] %else @@ -512,9 +512,12 @@ FFT48_3DN %endif %endmacro -INIT_MMX 3dnow +INIT_MMX 3dnow2 FFT48_3DN +INIT_MMX 3dnow +FFT48_3DN +%endif %define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] %define Z2(x) [zcq + o3q + mmsize*(x&1)] @@ -588,6 +591,7 @@ INIT_XMM sse DECL_PASS pass_sse, PASS_BIG 1 DECL_PASS pass_interleave_sse, PASS_BIG 0 +%macro FFT_CALC_FUNC 0 cglobal fft_calc, 2,5,8 mov r3d, [r0 + FFTContext.nbits] PUSH r1 @@ -597,23 +601,43 @@ cglobal fft_calc, 2,5,8 FFT_DISPATCH _interleave %+ SUFFIX, r1 POP rcx POP r4 - cmp rcx, 4 + cmp rcx, 3+(mmsize/16) jg .end mov r2, -1 add rcx, 3 shl r2, cl sub r4, r2 
.loop +%if mmsize == 8 + PSWAPD m0, [r4 + r2 + 4] + mova [r4 + r2 + 4], m0 +%else movaps xmm0, [r4 + r2] movaps xmm1, xmm0 unpcklps xmm0, [r4 + r2 + 16] unpckhps xmm1, [r4 + r2 + 16] movaps [r4 + r2], xmm0 movaps [r4 + r2 + 16], xmm1 - add r2, 32 +%endif + add r2, mmsize*2 jl .loop .end: +%if cpuflag(3dnow) + femms + RET +%else REP_RET +%endif +%endmacro + +%if ARCH_X86_32 +INIT_MMX 3dnow +FFT_CALC_FUNC +INIT_MMX 3dnow2 +FFT_CALC_FUNC +%endif +INIT_XMM sse +FFT_CALC_FUNC cglobal fft_permute, 2,7,1 mov r4, [r0 + FFTContext.revtab] @@ -648,6 +672,7 @@ cglobal fft_permute, 2,7,1 jl .loopcopy REP_RET +%macro IMDCT_CALC_FUNC 0 cglobal imdct_calc, 3,5,3 mov r3d, [r0 + FFTContext.mdctsize] mov r4, [r0 + FFTContext.imdcthalf] @@ -671,22 +696,45 @@ cglobal imdct_calc, 3,5,3 POP r3 lea r0, [r1 + 2*r3] mov r2, r3 - sub r3, 16 + sub r3, mmsize neg r2 - movaps xmm2, [ps_m1m1m1m1] + mova m2, [ps_m1m1m1m1] .loop: - movaps xmm0, [r1 + r3] - movaps xmm1, [r0 + r2] - shufps xmm0, xmm0, 0x1b - shufps xmm1, xmm1, 0x1b - xorps xmm0, xmm2 - movaps [r0 + r3], xmm1 - movaps [r1 + r2], xmm0 - sub r3, 16 - add r2, 16 +%if mmsize == 8 + PSWAPD m0, [r1 + r3] + PSWAPD m1, [r0 + r2] + pxor m0, m2 +%else + mova m0, [r1 + r3] + mova m1, [r0 + r2] + shufps m0, m0, 0x1b + shufps m1, m1, 0x1b + xorps m0, m2 +%endif + mova [r0 + r3], m1 + mova [r1 + r2], m0 + sub r3, mmsize + add r2, mmsize jl .loop +%if cpuflag(3dnow) + femms + RET +%else REP_RET +%endif +%endmacro +%if ARCH_X86_32 +INIT_MMX 3dnow +IMDCT_CALC_FUNC +INIT_MMX 3dnow2 +IMDCT_CALC_FUNC +%endif + +INIT_XMM sse +IMDCT_CALC_FUNC + +%if ARCH_X86_32 INIT_MMX 3dnow %define mulps pfmul %define addps pfadd @@ -697,6 +745,7 @@ DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q] DECL_PASS pass_interleave_3dnow, PASS_BIG 0 %define pass_3dnow2 pass_3dnow %define pass_interleave_3dnow2 pass_interleave_3dnow +%endif %ifdef PIC %define SECTION_REL - $$ @@ -760,12 +809,14 @@ DECL_FFT 6, _interleave INIT_XMM sse DECL_FFT 5 DECL_FFT 5, _interleave +%if 
ARCH_X86_32 INIT_MMX 3dnow DECL_FFT 4 DECL_FFT 4, _interleave INIT_MMX 3dnow2 DECL_FFT 4 DECL_FFT 4, _interleave +%endif INIT_XMM sse %undef mulps @@ -775,6 +826,37 @@ INIT_XMM sse %undef unpckhps %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 +%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8 + PSWAPD m0, [%3+%2*4] + movq m2, [%3+%1*4-8] + movq m3, m0 + punpckldq m0, m2 + punpckhdq m2, m3 + movd m1, [%4+%1*2-4] ; tcos[j] + movd m3, [%4+%2*2] ; tcos[n4-j-1] + punpckldq m1, [%5+%1*2-4] ; tsin[j] + punpckldq m3, [%5+%2*2] ; tsin[n4-j-1] + + mova m4, m0 + PSWAPD m5, m1 + pfmul m0, m1 + pfmul m4, m5 + mova m6, m2 + PSWAPD m5, m3 + pfmul m2, m3 + pfmul m6, m5 +%if cpuflag(3dnow2) + pfpnacc m0, m4 + pfpnacc m2, m6 +%else + SBUTTERFLY dq, 0, 4, 1 + SBUTTERFLY dq, 2, 6, 3 + pxor m4, m7 + pxor m6, m7 + pfadd m0, m4 + pfadd m2, m6 +%endif +%else movaps xmm0, [%3+%2*4] movaps xmm1, [%3+%1*4-0x10] movaps xmm2, xmm0 @@ -795,6 +877,7 @@ INIT_XMM sse movaps xmm0, xmm1 unpcklps xmm1, xmm2 unpckhps xmm0, xmm2 +%endif %endmacro %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 @@ -863,6 +946,40 @@ INIT_XMM sse jl .post %endmacro +%macro CMUL_3DNOW 6 + mova m6, [%1+%2*2] + mova %3, [%1+%2*2+8] + mova %4, m6 + mova m7, %3 + pfmul m6, [%5+%2] + pfmul %3, [%6+%2] + pfmul %4, [%6+%2] + pfmul m7, [%5+%2] + pfsub %3, m6 + pfadd %4, m7 +%endmacro + +%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8 +.post: + CMUL_3DNOW %3, %1, m0, m1, %4, %5 + CMUL_3DNOW %3, %2, m2, m3, %4, %5 + movd [%3+%1*2+ 0], m0 + movd [%3+%2*2+12], m1 + movd [%3+%2*2+ 0], m2 + movd [%3+%1*2+12], m3 + psrlq m0, 32 + psrlq m1, 32 + psrlq m2, 32 + psrlq m3, 32 + movd [%3+%1*2+ 8], m0 + movd [%3+%2*2+ 4], m1 + movd [%3+%2*2+ 8], m2 + movd [%3+%1*2+ 4], m3 + sub %2, 8 + add %1, 8 + jl .post +%endmacro + %macro DECL_IMDCT 1 cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input %if ARCH_X86_64 @@ -892,22 +1009,34 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample 
*output, const FFTSample *i push rrevtab %endif - sub r3, 4 -%if ARCH_X86_64 + sub r3, mmsize/4 +%if ARCH_X86_64 || mmsize == 8 xor r4, r4 sub r4, r3 %endif +%if notcpuflag(3dnow2) && mmsize == 8 + movd m7, [ps_m1m1m1m1] +%endif .pre: %if ARCH_X86_64 == 0 ;unspill +%if mmsize != 8 xor r4, r4 sub r4, r3 - mov rtsin, [esp+4] +%endif mov rtcos, [esp+8] + mov rtsin, [esp+4] %endif PREROTATER r4, r3, r2, rtcos, rtsin -%if ARCH_X86_64 +%if mmsize == 8 + mov r6, [esp] ; rrevtab = ptr+n8 + movzx r5, word [rrevtab+r4-2] ; rrevtab[j] + movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1] + mova [r1+r5*8], m0 + mova [r1+r6*8], m2 + add r4, 2 +%elif ARCH_X86_64 movzx r5, word [rrevtab+r4-4] movzx r6, word [rrevtab+r4-2] movzx r10, word [rrevtab+r3] @@ -928,7 +1057,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i movlps [r1+r5*8], xmm1 movhps [r1+r4*8], xmm1 %endif - sub r3, 4 + sub r3, mmsize/4 jns .pre mov r5, r0 @@ -953,12 +1082,23 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i %1 r0, r1, r6, rtcos, rtsin %if ARCH_X86_64 == 0 add esp, 12 +%endif +%if mmsize == 8 + femms %endif RET %endmacro DECL_IMDCT POSROTATESHUF +%if ARCH_X86_32 +INIT_MMX 3dnow +DECL_IMDCT POSROTATESHUF_3DNOW + +INIT_MMX 3dnow2 +DECL_IMDCT POSROTATESHUF_3DNOW +%endif + INIT_YMM avx %if HAVE_AVX From 998170913c759c45f913f9c20d7b18e6505f7cde Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Tue, 31 Jul 2012 23:58:58 +0100 Subject: [PATCH 08/10] ARM: use standard syntax for all LDRD/STRD instructions The standard syntax requires two destination registers for LDRD/STRD instructions. Some versions of the GNU assembler allow using only one with the second implicit, others are more strict. 
Signed-off-by: Mans Rullgard --- libavcodec/arm/h264cmc_neon.S | 4 ++-- libavcodec/arm/h264dsp_neon.S | 12 ++++++------ libavcodec/arm/mpegvideo_armv5te_s.S | 6 +++--- libavcodec/arm/simple_idct_armv5te.S | 14 +++++++------- libavcodec/arm/simple_idct_neon.S | 8 ++++---- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S index e82394d899..c7e54605bb 100644 --- a/libavcodec/arm/h264cmc_neon.S +++ b/libavcodec/arm/h264cmc_neon.S @@ -24,7 +24,7 @@ .macro h264_chroma_mc8 type, codec=h264 function ff_\type\()_\codec\()_chroma_mc8_neon, export=1 push {r4-r7, lr} - ldrd r4, [sp, #20] + ldrd r4, r5, [sp, #20] .ifc \type,avg mov lr, r0 .endif @@ -182,7 +182,7 @@ endfunc .macro h264_chroma_mc4 type, codec=h264 function ff_\type\()_\codec\()_chroma_mc4_neon, export=1 push {r4-r7, lr} - ldrd r4, [sp, #20] + ldrd r4, r5, [sp, #20] .ifc \type,avg mov lr, r0 .endif diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S index 4ad886334c..9daabe03b6 100644 --- a/libavcodec/arm/h264dsp_neon.S +++ b/libavcodec/arm/h264dsp_neon.S @@ -886,7 +886,7 @@ T mov sp, r0 mov r12, #8 vpush {d8-d15} bl put_h264_qpel8_h_lowpass_neon - ldrd r0, [r11], #8 + ldrd r0, r1, [r11], #8 mov r3, r2 add r12, sp, #64 sub r1, r1, r2, lsl #1 @@ -913,7 +913,7 @@ T mov sp, r0 vpush {d8-d15} bl put_h264_qpel8_h_lowpass_neon mov r4, r0 - ldrd r0, [r11], #8 + ldrd r0, r1, [r11], #8 sub r1, r1, r2, lsl #1 sub r1, r1, #2 mov r3, r2 @@ -958,7 +958,7 @@ T mov sp, r0 vpush {d8-d15} bl put_h264_qpel8_v_lowpass_neon mov r4, r0 - ldrd r0, [r11], #8 + ldrd r0, r1, [r11], #8 sub r1, r1, r3, lsl #1 sub r1, r1, #2 sub r2, r4, #64 @@ -1071,7 +1071,7 @@ T mov sp, r0 mov r3, #16 vpush {d8-d15} bl put_h264_qpel16_h_lowpass_neon - ldrd r0, [r11], #8 + ldrd r0, r1, [r11], #8 mov r3, r2 add r12, sp, #64 sub r1, r1, r2, lsl #1 @@ -1096,7 +1096,7 @@ T mov sp, r0 vpush {d8-d15} bl put_h264_qpel16_h_lowpass_neon_packed mov r4, r0 - ldrd r0, 
[r11], #8 + ldrd r0, r1, [r11], #8 sub r1, r1, r2, lsl #1 sub r1, r1, #2 mov r3, r2 @@ -1139,7 +1139,7 @@ T mov sp, r0 vpush {d8-d15} bl put_h264_qpel16_v_lowpass_neon_packed mov r4, r0 - ldrd r0, [r11], #8 + ldrd r0, r1, [r11], #8 sub r1, r1, r3, lsl #1 sub r1, r1, #2 mov r2, r3 diff --git a/libavcodec/arm/mpegvideo_armv5te_s.S b/libavcodec/arm/mpegvideo_armv5te_s.S index ec95346d37..4426e15e91 100644 --- a/libavcodec/arm/mpegvideo_armv5te_s.S +++ b/libavcodec/arm/mpegvideo_armv5te_s.S @@ -61,9 +61,9 @@ function ff_dct_unquantize_h263_armv5te, export=1 mov ip, #0 subs r3, r3, #2 ble 2f - ldrd r4, [r0, #0] + ldrd r4, r5, [r0, #0] 1: - ldrd r6, [r0, #8] + ldrd r6, r7, [r0, #8] dequant_t r9, r4, r1, r2, r9 dequant_t lr, r5, r1, r2, lr @@ -87,7 +87,7 @@ function ff_dct_unquantize_h263_armv5te, export=1 subs r3, r3, #8 it gt - ldrdgt r4, [r0, #0] /* load data early to avoid load/use pipeline stall */ + ldrdgt r4, r5, [r0, #0] /* load data early to avoid load/use pipeline stall */ bgt 1b adds r3, r3, #2 diff --git a/libavcodec/arm/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S index e880a8aab2..804becd539 100644 --- a/libavcodec/arm/simple_idct_armv5te.S +++ b/libavcodec/arm/simple_idct_armv5te.S @@ -46,8 +46,8 @@ w57: .long W57 function idct_row_armv5te str lr, [sp, #-4]! 
- ldrd v1, [a1, #8] - ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */ + ldrd v1, v2, [a1, #8] + ldrd a3, a4, [a1] /* a3 = row[1:0], a4 = row[3:2] */ orrs v1, v1, v2 itt eq cmpeq v1, a4 @@ -78,7 +78,7 @@ function idct_row_armv5te smultt fp, lr, a3 sub v7, v7, a2 smulbt a2, lr, a4 - ldrd a3, [a1, #8] /* a3=row[5:4] a4=row[7:6] */ + ldrd a3, a4, [a1, #8] /* a3=row[5:4] a4=row[7:6] */ sub fp, fp, a2 orrs a2, a3, a4 @@ -121,7 +121,7 @@ function idct_row_armv5te add a2, v4, fp mov a2, a2, lsr #11 add a4, a4, a2, lsl #16 - strd a3, [a1] + strd a3, a4, [a1] sub a2, v4, fp mov a3, a2, lsr #11 @@ -135,7 +135,7 @@ function idct_row_armv5te sub a2, v1, v5 mov a2, a2, lsr #11 add a4, a4, a2, lsl #16 - strd a3, [a1, #8] + strd a3, a4, [a1, #8] ldr pc, [sp], #4 @@ -144,8 +144,8 @@ row_dc_only: bic a3, a3, #0xe000 mov a3, a3, lsl #3 mov a4, a3 - strd a3, [a1] - strd a3, [a1, #8] + strd a3, a4, [a1] + strd a3, a4, [a1, #8] ldr pc, [sp], #4 endfunc diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S index df24a45270..b3e97d515d 100644 --- a/libavcodec/arm/simple_idct_neon.S +++ b/libavcodec/arm/simple_idct_neon.S @@ -159,8 +159,8 @@ function idct_col4_neon vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1< Date: Wed, 1 Aug 2012 00:59:35 +0100 Subject: [PATCH 09/10] ARM: use =const syntax instead of explicit literal pools Signed-off-by: Mans Rullgard --- libavcodec/arm/simple_idct_arm.S | 56 ++++++++-------------------- libavcodec/arm/simple_idct_armv5te.S | 20 ++++------ libavcodec/arm/simple_idct_armv6.S | 33 ++++++---------- 3 files changed, 35 insertions(+), 74 deletions(-) diff --git a/libavcodec/arm/simple_idct_arm.S b/libavcodec/arm/simple_idct_arm.S index c6540a1e40..8ba6c48b5a 100644 --- a/libavcodec/arm/simple_idct_arm.S +++ b/libavcodec/arm/simple_idct_arm.S @@ -25,8 +25,7 @@ #include "libavutil/arm/asm.S" -/* useful constants for the algorithm, they are save in __constant_ptr__ at */ -/* the end of the source code.*/ +/* useful constants for 
the algorithm */ #define W1 22725 #define W2 21407 #define W3 19266 @@ -36,16 +35,6 @@ #define W7 4520 #define MASK_MSHW 0xFFFF0000 -/* offsets of the constants in the vector */ -#define offW1 0 -#define offW2 4 -#define offW3 8 -#define offW4 12 -#define offW5 16 -#define offW6 20 -#define offW7 24 -#define offMASK_MSHW 28 - #define ROW_SHIFT 11 #define ROW_SHIFT2MSHW (16-11) #define COL_SHIFT 20 @@ -63,7 +52,6 @@ function ff_simple_idct_arm, export=1 stmfd sp!, {r4-r11, r14} @ R14 is also called LR @@ at this point, R0=block, other registers are free. add r14, r0, #112 @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block. - adr r12, __constant_ptr__ @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it @@ add 2 temporary variables in the stack: R0 and R14 sub sp, sp, #8 @ allow 2 local variables str r0, [sp, #0] @ save block in sp[0] @@ -109,13 +97,13 @@ __b_evaluation: @@ MAC16(b1, -W7, row[3]); @@ MAC16(b2, -W1, row[3]); @@ MAC16(b3, -W5, row[3]); - ldr r8, [r12, #offW1] @ R8=W1 + ldr r8, =W1 @ R8=W1 mov r2, r2, asr #16 @ R2=ROWr16[3] mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r9, [r12, #offW3] @ R9=W3 - ldr r10, [r12, #offW5] @ R10=W5 + ldr r9, =W3 @ R9=W3 + ldr r10, =W5 @ R10=W5 mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r11, [r12, #offW7] @ R11=W7 + ldr r11, =W7 @ R11=W7 mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) teq r2, #0 @ if null avoid muls @@ -177,14 +165,14 @@ __a_evaluation: @@ a2 = a0 - W6 * row[2]; @@ a3 = a0 - W2 * row[2]; @@ a0 = a0 + W2 * row[2]; - ldr r9, [r12, #offW4] @ R9=W4 + ldr r9, =W4 @ R9=W4 mul r6, r9, r6 
@ R6=W4*ROWr16[0] - ldr r10, [r12, #offW6] @ R10=W6 + ldr r10, =W6 @ R10=W6 ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet) add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0) mul r11, r10, r4 @ R11=W6*ROWr16[2] - ldr r8, [r12, #offW2] @ R8=W2 + ldr r8, =W2 @ R8=W2 sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; @@ if (temp != 0) {} @@ -248,7 +236,7 @@ __end_a_evaluation: add r9, r2, r1 @ R9=a1+b1 @@ put 2 16 bits half-words in a 32bits word @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!) - ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000 + ldr r10, =MASK_MSHW @ R10=0xFFFF0000 and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5) mvn r11, r10 @ R11= NOT R10= 0x0000FFFF and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11) @@ -319,13 +307,13 @@ __b_evaluation2: @@ MAC16(b1, -W7, col[8x3]); @@ MAC16(b2, -W1, col[8x3]); @@ MAC16(b3, -W5, col[8x3]); - ldr r8, [r12, #offW1] @ R8=W1 + ldr r8, =W1 @ R8=W1 ldrsh r7, [r14, #16] mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r9, [r12, #offW3] @ R9=W3 - ldr r10, [r12, #offW5] @ R10=W5 + ldr r9, =W3 @ R9=W3 + ldr r10, =W5 @ R10=W5 mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r11, [r12, #offW7] @ R11=W7 + ldr r11, =W7 @ R11=W7 mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) ldrsh r2, [r14, #48] mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) @@ -381,13 +369,13 @@ __a_evaluation2: @@ a3 = a0 - W2 * row[2]; @@ a0 = a0 + W2 * row[2]; ldrsh r6, [r14, #0] - ldr r9, [r12, #offW4] @ R9=W4 + ldr r9, =W4 @ R9=W4 mul r6, r9, r6 @ R6=W4*ROWr16[0] - ldr r10, [r12, #offW6] @ R10=W6 + ldr r10, =W6 @ R10=W6 ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not 
defined yet) add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0) mul r11, r10, r4 @ R11=W6*ROWr16[2] - ldr r8, [r12, #offW2] @ R8=W2 + ldr r8, =W2 @ R8=W2 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) mul r11, r8, r4 @ R11=W2*ROWr16[2] @@ -489,15 +477,3 @@ __end_bef_a_evaluation: sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) bal __end_a_evaluation - - - .align -__constant_ptr__: @@ see #defines at the beginning of the source code for values. - .word W1 - .word W2 - .word W3 - .word W4 - .word W5 - .word W6 - .word W7 - .word MASK_MSHW diff --git a/libavcodec/arm/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S index 804becd539..bf509eeffc 100644 --- a/libavcodec/arm/simple_idct_armv5te.S +++ b/libavcodec/arm/simple_idct_armv5te.S @@ -37,12 +37,6 @@ #define W26 (W2 | (W6 << 16)) #define W57 (W5 | (W7 << 16)) - .text - .align -w13: .long W13 -w26: .long W26 -w57: .long W57 - function idct_row_armv5te str lr, [sp, #-4]! 
@@ -58,7 +52,7 @@ function idct_row_armv5te mov ip, #16384 sub ip, ip, #1 /* ip = W4 */ smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */ - ldr ip, w26 /* ip = W2 | (W6 << 16) */ + ldr ip, =W26 /* ip = W2 | (W6 << 16) */ smultb a2, ip, a4 smulbb lr, ip, a4 add v2, v1, a2 @@ -66,8 +60,8 @@ function idct_row_armv5te sub v4, v1, lr add v1, v1, lr - ldr ip, w13 /* ip = W1 | (W3 << 16) */ - ldr lr, w57 /* lr = W5 | (W7 << 16) */ + ldr ip, =W13 /* ip = W1 | (W3 << 16) */ + ldr lr, =W57 /* lr = W5 | (W7 << 16) */ smulbt v5, ip, a3 smultt v6, lr, a4 smlatt v5, ip, a4, v5 @@ -94,7 +88,7 @@ function idct_row_armv5te smlatt v7, ip, a4, v7 sub fp, fp, a2 - ldr ip, w26 /* ip = W2 | (W6 << 16) */ + ldr ip, =W26 /* ip = W2 | (W6 << 16) */ mov a2, #16384 sub a2, a2, #1 /* a2 = W4 */ smulbb a2, a2, a3 /* a2 = W4*row[4] */ @@ -178,7 +172,7 @@ endfunc sub v4, v2, a3 sub v6, v2, a3 add fp, v2, a3 - ldr ip, w26 + ldr ip, =W26 ldr a4, [a1, #(16*2)] add v2, v2, a3 @@ -211,9 +205,9 @@ endfunc stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp} - ldr ip, w13 + ldr ip, =W13 ldr a4, [a1, #(16*1)] - ldr lr, w57 + ldr lr, =W57 smulbb v1, ip, a4 smultb v3, ip, a4 smulbb v5, lr, a4 diff --git a/libavcodec/arm/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S index 9395f88084..0342b0619a 100644 --- a/libavcodec/arm/simple_idct_armv6.S +++ b/libavcodec/arm/simple_idct_armv6.S @@ -40,15 +40,6 @@ #define W46 (W4 | (W6 << 16)) #define W57 (W5 | (W7 << 16)) - .text - .align -w13: .long W13 -w26: .long W26 -w42: .long W42 -w42n: .long W42n -w46: .long W46 -w57: .long W57 - /* Compute partial IDCT of single row. 
shift = left-shift amount @@ -60,12 +51,12 @@ w57: .long W57 Output in registers r4--r11 */ .macro idct_row shift - ldr lr, w46 /* lr = W4 | (W6 << 16) */ + ldr lr, =W46 /* lr = W4 | (W6 << 16) */ mov r1, #(1<<(\shift-1)) smlad r4, r2, ip, r1 smlsd r7, r2, ip, r1 - ldr ip, w13 /* ip = W1 | (W3 << 16) */ - ldr r10,w57 /* r10 = W5 | (W7 << 16) */ + ldr ip, =W13 /* ip = W1 | (W3 << 16) */ + ldr r10,=W57 /* r10 = W5 | (W7 << 16) */ smlad r5, r2, lr, r1 smlsd r6, r2, lr, r1 @@ -78,11 +69,11 @@ w57: .long W57 smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */ smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ - ldr r3, w42n /* r3 = -W4 | (-W2 << 16) */ + ldr r3, =W42n /* r3 = -W4 | (-W2 << 16) */ smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */ ldr r2, [r0, #4] /* r2 = row[6,4] */ smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */ - ldr ip, w46 /* ip = W4 | (W6 << 16) */ + ldr ip, =W46 /* ip = W4 | (W6 << 16) */ smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */ smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */ @@ -101,12 +92,12 @@ w57: .long W57 Output in registers r4--r11 */ .macro idct_row4 shift - ldr lr, w46 /* lr = W4 | (W6 << 16) */ - ldr r10,w57 /* r10 = W5 | (W7 << 16) */ + ldr lr, =W46 /* lr = W4 | (W6 << 16) */ + ldr r10,=W57 /* r10 = W5 | (W7 << 16) */ mov r1, #(1<<(\shift-1)) smlad r4, r2, ip, r1 smlsd r7, r2, ip, r1 - ldr ip, w13 /* ip = W1 | (W3 << 16) */ + ldr ip, =W13 /* ip = W1 | (W3 << 16) */ smlad r5, r2, lr, r1 smlsd r6, r2, lr, r1 smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ @@ -205,7 +196,7 @@ function idct_row_armv6 cmpeq lr, r2, lsr #16 beq 1f push {r1} - ldr ip, w42 /* ip = W4 | (W2 << 16) */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ cmp lr, #0 beq 2f @@ -249,7 +240,7 @@ function idct_col_armv6 push {r1, lr} ldr r2, [r0] /* r2 = row[2,0] */ - ldr ip, w42 /* ip = W4 | (W2 << 16) */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ ldr r3, [r0, #8] /* r3 = row[3,1] */ idct_row COL_SHIFT pop {r1} @@ -277,7 
+268,7 @@ function idct_col_put_armv6 push {r1, r2, lr} ldr r2, [r0] /* r2 = row[2,0] */ - ldr ip, w42 /* ip = W4 | (W2 << 16) */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ ldr r3, [r0, #8] /* r3 = row[3,1] */ idct_row COL_SHIFT pop {r1, r2} @@ -307,7 +298,7 @@ function idct_col_add_armv6 push {r1, r2, lr} ldr r2, [r0] /* r2 = row[2,0] */ - ldr ip, w42 /* ip = W4 | (W2 << 16) */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ ldr r3, [r0, #8] /* r3 = row[3,1] */ idct_row COL_SHIFT pop {r1, r2} From 19cf7163c1576e7b03ea33d7bf633e14d7516db8 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Wed, 1 Aug 2012 11:12:08 +0200 Subject: [PATCH 10/10] dca: Switch dca_sample_rates to avpriv_ prefix; it is used across libs --- libavcodec/dca.c | 2 +- libavcodec/dca.h | 2 +- libavcodec/dca_parser.c | 2 +- libavcodec/dcadec.c | 2 +- libavformat/spdifenc.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libavcodec/dca.c b/libavcodec/dca.c index 4194f58aa9..0f1eeecf7b 100644 --- a/libavcodec/dca.c +++ b/libavcodec/dca.c @@ -22,7 +22,7 @@ #include "dca.h" -const uint32_t ff_dca_sample_rates[16] = +const uint32_t avpriv_dca_sample_rates[16] = { 0, 8000, 16000, 32000, 0, 0, 11025, 22050, 44100, 0, 0, 12000, 24000, 48000, 96000, 192000 diff --git a/libavcodec/dca.h b/libavcodec/dca.h index 9235fa4f0b..1515270471 100644 --- a/libavcodec/dca.h +++ b/libavcodec/dca.h @@ -36,6 +36,6 @@ /** DCA-HD specific block starts with this marker. 
*/ #define DCA_HD_MARKER 0x64582025 -extern const uint32_t ff_dca_sample_rates[16]; +extern const uint32_t avpriv_dca_sample_rates[16]; #endif /* AVCODEC_DCA_H */ diff --git a/libavcodec/dca_parser.c b/libavcodec/dca_parser.c index 553e69c41c..73611e0233 100644 --- a/libavcodec/dca_parser.c +++ b/libavcodec/dca_parser.c @@ -161,7 +161,7 @@ static int dca_parse_params(const uint8_t *buf, int buf_size, int *duration, skip_bits(&gb, 20); sr_code = get_bits(&gb, 4); - *sample_rate = ff_dca_sample_rates[sr_code]; + *sample_rate = avpriv_dca_sample_rates[sr_code]; if (*sample_rate == 0) return AVERROR_INVALIDDATA; diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c index f488da6d3f..d4fd23e215 100644 --- a/libavcodec/dcadec.c +++ b/libavcodec/dcadec.c @@ -561,7 +561,7 @@ static int dca_parse_frame_header(DCAContext *s) if (s->frame_size < 95) return AVERROR_INVALIDDATA; s->amode = get_bits(&s->gb, 6); - s->sample_rate = ff_dca_sample_rates[get_bits(&s->gb, 4)]; + s->sample_rate = avpriv_dca_sample_rates[get_bits(&s->gb, 4)]; if (!s->sample_rate) return AVERROR_INVALIDDATA; s->bit_rate_index = get_bits(&s->gb, 5); diff --git a/libavformat/spdifenc.c b/libavformat/spdifenc.c index c563008b2e..f8c38c44ab 100644 --- a/libavformat/spdifenc.c +++ b/libavformat/spdifenc.c @@ -252,7 +252,7 @@ static int spdif_header_dts(AVFormatContext *s, AVPacket *pkt) case DCA_MARKER_RAW_BE: blocks = (AV_RB16(pkt->data + 4) >> 2) & 0x7f; core_size = ((AV_RB24(pkt->data + 5) >> 4) & 0x3fff) + 1; - sample_rate = ff_dca_sample_rates[(pkt->data[8] >> 2) & 0x0f]; + sample_rate = avpriv_dca_sample_rates[(pkt->data[8] >> 2) & 0x0f]; break; case DCA_MARKER_RAW_LE: blocks = (AV_RL16(pkt->data + 4) >> 2) & 0x7f;