From 07eeeb1d4fa6dea0fb3ad7cd11859db760a76528 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 4 Jul 2012 22:25:53 +0300 Subject: [PATCH 01/16] vp8: Add ifdef guards around the sse2 loopfilter in the sse2slow branch too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was missed in the previous commit in 70a1c800. Signed-off-by: Martin Storsjö --- libavcodec/x86/vp8dsp-init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c index a0e8f9be1f..589804fa34 100644 --- a/libavcodec/x86/vp8dsp-init.c +++ b/libavcodec/x86/vp8dsp-init.c @@ -389,11 +389,13 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; +#if ARCH_X86_64 || HAVE_ALIGNED_STACK c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; +#endif } if (mm_flags & AV_CPU_FLAG_SSE2) { From 715129cdc409499245fc9519da3db1436b660d3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 30 Jun 2012 11:26:11 +0300 Subject: [PATCH 02/16] avconv: Set audio filter time base to the sample rate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the output frame size is smaller than the input sample rate, and the input stream time base corresponds exactly to the input frame size (getting input packet timestamps like 0, 1, 2, 3, 4 etc), the output timestamps from the filter will be like 0, 1, 2, 3, 4, 4, 5 ..., leading to non-monotone timestamps later. A concrete example is input mp3 data having frame sizes of 1152 samples, transcoded to aac with 1024 sample frames. 
By setting the audio filter time base to the sample rate, we will get sensible timestamps for all output packets, regardless of the ratio between the input and output frame sizes. Signed-off-by: Martin Storsjö --- avconv.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/avconv.c b/avconv.c index 3a7cebfc95..961356752a 100644 --- a/avconv.c +++ b/avconv.c @@ -853,7 +853,7 @@ static int configure_input_audio_filter(FilterGraph *fg, InputFilter *ifilter, snprintf(args, sizeof(args), "time_base=%d/%d:sample_rate=%d:sample_fmt=%s" ":channel_layout=0x%"PRIx64, - ist->st->time_base.num, ist->st->time_base.den, + 1, ist->st->codec->sample_rate, ist->st->codec->sample_rate, av_get_sample_fmt_name(ist->st->codec->sample_fmt), ist->st->codec->channel_layout); @@ -2029,6 +2029,10 @@ static int decode_audio(InputStream *ist, AVPacket *pkt, int *got_output) } } + if (decoded_frame->pts != AV_NOPTS_VALUE) + decoded_frame->pts = av_rescale_q(decoded_frame->pts, + ist->st->time_base, + (AVRational){1, ist->st->codec->sample_rate}); for (i = 0; i < ist->nb_filters; i++) av_buffersrc_write_frame(ist->filters[i]->filter, decoded_frame); From 5b54a90c8b54e3db862ec3bb3a28b0b9e4fc6554 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Wed, 4 Jul 2012 13:33:41 +0200 Subject: [PATCH 03/16] flvdec: optionally trust the metadata In certain conditions video or audio frames might appear way later in the stream. 
--- libavformat/flvdec.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/libavformat/flvdec.c b/libavformat/flvdec.c index 093cd0a39f..602cb9ebee 100644 --- a/libavformat/flvdec.c +++ b/libavformat/flvdec.c @@ -26,6 +26,7 @@ #include "libavutil/avstring.h" #include "libavutil/dict.h" +#include "libavutil/opt.h" #include "libavutil/intfloat.h" #include "libavutil/mathematics.h" #include "libavcodec/bytestream.h" @@ -42,6 +43,8 @@ #define VALIDATE_INDEX_TS_THRESH 2500 typedef struct { + const AVClass *class; ///< Class for private options. + int trust_metadata; ///< configure streams according onMetaData int wrong_dts; ///< wrong dts due to negative cts uint8_t *new_extradata[2]; int new_extradata_size[2]; @@ -327,6 +330,7 @@ finish: static int amf_parse_object(AVFormatContext *s, AVStream *astream, AVStream *vstream, const char *key, int64_t max_pos, int depth) { AVCodecContext *acodec, *vcodec; + FLVContext *flv = s->priv_data; AVIOContext *ioc; AMFDataType amf_type; char str_val[256]; @@ -406,6 +410,22 @@ static int amf_parse_object(AVFormatContext *s, AVStream *astream, AVStream *vst if (!st) return AVERROR(ENOMEM); st->codec->codec_id = CODEC_ID_TEXT; + } else if (flv->trust_metadata) { + if (!strcmp(key, "videocodecid") && vcodec) { + flv_set_video_codec(s, vstream, num_val); + } else + if (!strcmp(key, "audiocodecid") && acodec) { + flv_set_audio_codec(s, astream, acodec, num_val); + } else + if (!strcmp(key, "audiosamplerate") && acodec) { + acodec->sample_rate = num_val; + } else + if (!strcmp(key, "width") && vcodec) { + vcodec->width = num_val; + } else + if (!strcmp(key, "height") && vcodec) { + vcodec->height = num_val; + } } } @@ -844,6 +864,20 @@ static int flv_read_seek(AVFormatContext *s, int stream_index, return avio_seek_time(s->pb, stream_index, ts, flags); } +#define OFFSET(x) offsetof(FLVContext, x) +#define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM +static const AVOption options[] = { + { 
"flv_metadata", "Allocate streams according the onMetaData array", OFFSET(trust_metadata), AV_OPT_TYPE_INT, { 0 }, 0, 1, VD}, + { NULL } +}; + +static const AVClass class = { + .class_name = "flvdec", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, +}; + AVInputFormat ff_flv_demuxer = { .name = "flv", .long_name = NULL_IF_CONFIG_SMALL("FLV format"), @@ -854,4 +888,5 @@ AVInputFormat ff_flv_demuxer = { .read_seek = flv_read_seek, .read_close = flv_read_close, .extensions = "flv", + .priv_class = &class, }; From 669bbedfa863f8a1491a186fac4238baba407037 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Thu, 5 Jul 2012 09:50:59 +0200 Subject: [PATCH 04/16] blowfish: invert branch and loop precedence Should slightly improve performance depending on the compiler used. --- libavutil/blowfish.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/libavutil/blowfish.c b/libavutil/blowfish.c index b7f5294677..554953e865 100644 --- a/libavutil/blowfish.c +++ b/libavutil/blowfish.c @@ -381,8 +381,8 @@ void av_blowfish_crypt(AVBlowfish *ctx, uint8_t *dst, const uint8_t *src, uint32_t v0, v1; int i; - while (count > 0) { - if (decrypt) { + if (decrypt) { + while (count > 0) { v0 = AV_RB32(src); v1 = AV_RB32(src + 4); @@ -396,7 +396,13 @@ void av_blowfish_crypt(AVBlowfish *ctx, uint8_t *dst, const uint8_t *src, dst[i] = dst[i] ^ iv[i]; memcpy(iv, src, 8); } - } else { + + src += 8; + dst += 8; + count -= 8; + } + } else { + while (count > 0) { if (iv) { for (i = 0; i < 8; i++) dst[i] = src[i] ^ iv[i]; @@ -414,11 +420,11 @@ void av_blowfish_crypt(AVBlowfish *ctx, uint8_t *dst, const uint8_t *src, if (iv) memcpy(iv, dst, 8); - } - src += 8; - dst += 8; - count -= 8; + src += 8; + dst += 8; + count -= 8; + } } } From f6687bf5f8989d397cdef6d9d05bcb13a7ef8c4f Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Thu, 5 Jul 2012 09:52:04 +0200 Subject: [PATCH 05/16] xtea: invert branch and loop precedence 
Should slightly improve performance depending on the compiler used. --- libavutil/xtea.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/libavutil/xtea.c b/libavutil/xtea.c index 138657f88b..07a66e5666 100644 --- a/libavutil/xtea.c +++ b/libavutil/xtea.c @@ -71,8 +71,8 @@ void av_xtea_crypt(AVXTEA *ctx, uint8_t *dst, const uint8_t *src, int count, { int i; - while (count > 0) { - if (decrypt) { + if (decrypt) { + while (count > 0) { xtea_crypt_ecb(ctx, dst, src, decrypt); if (iv) { @@ -80,7 +80,13 @@ void av_xtea_crypt(AVXTEA *ctx, uint8_t *dst, const uint8_t *src, int count, dst[i] = dst[i] ^ iv[i]; memcpy(iv, src, 8); } - } else { + + src += 8; + dst += 8; + count -= 8; + } + } else { + while (count > 0) { if (iv) { for (i = 0; i < 8; i++) dst[i] = src[i] ^ iv[i]; @@ -89,11 +95,10 @@ void av_xtea_crypt(AVXTEA *ctx, uint8_t *dst, const uint8_t *src, int count, } else { xtea_crypt_ecb(ctx, dst, src, decrypt); } + src += 8; + dst += 8; + count -= 8; } - - src += 8; - dst += 8; - count -= 8; } } From 18f2d5cb9c48d06895960f37467576725c9dc2d1 Mon Sep 17 00:00:00 2001 From: Michael Niedermayer Date: Sun, 20 Nov 2011 17:19:25 +0100 Subject: [PATCH 06/16] mpegvideo: Don't use ff_mspel_motion() for vc1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using ff_mspel_motion assumes that s (a MpegEncContext pointer) really is a Wmv2Context. This fixes crashes in error resilience on vc1/wmv3 videos. 
CC: libav-stable@libav.org Signed-off-by: Martin Storsjö --- libavcodec/mpegvideo_common.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libavcodec/mpegvideo_common.h b/libavcodec/mpegvideo_common.h index ebf9c7d619..0a731266e2 100644 --- a/libavcodec/mpegvideo_common.h +++ b/libavcodec/mpegvideo_common.h @@ -719,7 +719,8 @@ static av_always_inline void MPV_motion_internal(MpegEncContext *s, 0, 0, 0, ref_picture, pix_op, qpix_op, s->mv[dir][0][0], s->mv[dir][0][1], 16); - }else if(!is_mpeg12 && (CONFIG_WMV2_DECODER || CONFIG_WMV2_ENCODER) && s->mspel){ + } else if (!is_mpeg12 && (CONFIG_WMV2_DECODER || CONFIG_WMV2_ENCODER) && + s->mspel && s->codec_id == CODEC_ID_WMV2) { ff_mspel_motion(s, dest_y, dest_cb, dest_cr, ref_picture, pix_op, s->mv[dir][0][0], s->mv[dir][0][1], 16); From e4a7fb3da33d98e3c5bbd4e58faf8b8945a07f9c Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Thu, 5 Jul 2012 11:33:54 +0200 Subject: [PATCH 07/16] blowfish: Make the count parameter match the documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously it was interpreted as number of bytes, while the documentation stated that it was the number of 8 byte blocks. This makes it behave similarly to the existing AES code. 
Signed-off-by: Martin Storsjö --- libavutil/blowfish.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libavutil/blowfish.c b/libavutil/blowfish.c index 554953e865..5df3dfcf1e 100644 --- a/libavutil/blowfish.c +++ b/libavutil/blowfish.c @@ -382,7 +382,7 @@ void av_blowfish_crypt(AVBlowfish *ctx, uint8_t *dst, const uint8_t *src, int i; if (decrypt) { - while (count > 0) { + while (count--) { v0 = AV_RB32(src); v1 = AV_RB32(src + 4); @@ -399,10 +399,9 @@ void av_blowfish_crypt(AVBlowfish *ctx, uint8_t *dst, const uint8_t *src, src += 8; dst += 8; - count -= 8; } } else { - while (count > 0) { + while (count--) { if (iv) { for (i = 0; i < 8; i++) dst[i] = src[i] ^ iv[i]; @@ -423,7 +422,6 @@ void av_blowfish_crypt(AVBlowfish *ctx, uint8_t *dst, const uint8_t *src, src += 8; dst += 8; - count -= 8; } } } From 983db9b2b4c753507d1cf8427675fca80d598b4c Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Thu, 5 Jul 2012 11:19:13 +0200 Subject: [PATCH 08/16] xtea: Make the count parameter match the documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously it was interpreted as number of bytes, while the documentation stated that it was the number of 8 byte blocks. This makes it behave similarly to the existing AES code. 
Signed-off-by: Martin Storsjö --- libavutil/xtea.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libavutil/xtea.c b/libavutil/xtea.c index 07a66e5666..7c3a14c2be 100644 --- a/libavutil/xtea.c +++ b/libavutil/xtea.c @@ -72,7 +72,7 @@ void av_xtea_crypt(AVXTEA *ctx, uint8_t *dst, const uint8_t *src, int count, int i; if (decrypt) { - while (count > 0) { + while (count--) { xtea_crypt_ecb(ctx, dst, src, decrypt); if (iv) { @@ -83,10 +83,9 @@ void av_xtea_crypt(AVXTEA *ctx, uint8_t *dst, const uint8_t *src, int count, src += 8; dst += 8; - count -= 8; } } else { - while (count > 0) { + while (count--) { if (iv) { for (i = 0; i < 8; i++) dst[i] = src[i] ^ iv[i]; @@ -97,7 +96,6 @@ void av_xtea_crypt(AVXTEA *ctx, uint8_t *dst, const uint8_t *src, int count, } src += 8; dst += 8; - count -= 8; } } } From 28fff0d9740e00c2ee82f72a4be55bdbb5e0c8c6 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Tue, 3 Jul 2012 23:16:11 +0100 Subject: [PATCH 09/16] h264: use templates to avoid excessive inlining Instead of inlining everything into ff_h264_hl_decode_mb(), use explicit templating to create versions of the called functions with constant parameters filled in. This greatly speeds up compilation of h264.c and reduces the code size without any measurable impact on performance. Compilation time for h264.c on an i7 goes from 30s to 5.5s. Code size is reduced by 430kB. 
Signed-off-by: Mans Rullgard --- libavcodec/h264.c | 543 +--------------------------------- libavcodec/h264_mb_template.c | 380 ++++++++++++++++++++++++ libavcodec/h264_mc_template.c | 160 ++++++++++ 3 files changed, 550 insertions(+), 533 deletions(-) create mode 100644 libavcodec/h264_mb_template.c create mode 100644 libavcodec/h264_mc_template.c diff --git a/libavcodec/h264.c b/libavcodec/h264.c index 2d6a08e032..a4afcc870e 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -714,33 +714,6 @@ static av_always_inline void mc_part_weighted(H264Context *h, int n, int square, } } -static av_always_inline void mc_part(H264Context *h, int n, int square, - int height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, - uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, - h264_chroma_mc_func chroma_put, - qpel_mc_func *qpix_avg, - h264_chroma_mc_func chroma_avg, - h264_weight_func *weight_op, - h264_biweight_func *weight_avg, - int list0, int list1, - int pixel_shift, int chroma_idc) -{ - if ((h->use_weight == 2 && list0 && list1 && - (h->implicit_weight[h->ref_cache[0][scan8[n]]][h->ref_cache[1][scan8[n]]][h->s.mb_y & 1] != 32)) || - h->use_weight == 1) - mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr, - x_offset, y_offset, qpix_put, chroma_put, - weight_op[0], weight_op[1], weight_avg[0], - weight_avg[1], list0, list1, pixel_shift, chroma_idc); - else - mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr, - x_offset, y_offset, qpix_put, chroma_put, qpix_avg, - chroma_avg, list0, list1, pixel_shift, chroma_idc); -} - static av_always_inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma_idc) { @@ -768,146 +741,6 @@ static av_always_inline void prefetch_motion(H264Context *h, int list, } } -static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, - uint8_t *dest_cb, uint8_t *dest_cr, - qpel_mc_func(*qpix_put)[16], - h264_chroma_mc_func(*chroma_put), - 
qpel_mc_func(*qpix_avg)[16], - h264_chroma_mc_func(*chroma_avg), - h264_weight_func *weight_op, - h264_biweight_func *weight_avg, - int pixel_shift, int chroma_idc) -{ - MpegEncContext *const s = &h->s; - const int mb_xy = h->mb_xy; - const int mb_type = s->current_picture.f.mb_type[mb_xy]; - - assert(IS_INTER(mb_type)); - - if (HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME)) - await_references(h); - prefetch_motion(h, 0, pixel_shift, chroma_idc); - - if (IS_16X16(mb_type)) { - mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0, - qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], - weight_op, weight_avg, - IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), - pixel_shift, chroma_idc); - } else if (IS_16X8(mb_type)) { - mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, - qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], - weight_op, weight_avg, - IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), - pixel_shift, chroma_idc); - mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, - qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], - weight_op, weight_avg, - IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), - pixel_shift, chroma_idc); - } else if (IS_8X16(mb_type)) { - mc_part(h, 0, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, - qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[1], &weight_avg[1], - IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), - pixel_shift, chroma_idc); - mc_part(h, 4, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, - qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[1], &weight_avg[1], - IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), - pixel_shift, chroma_idc); - } else { - int i; - - assert(IS_8X8(mb_type)); - - for (i = 0; i < 4; i++) { - const int sub_mb_type = h->sub_mb_type[i]; - const int n = 4 * i; - int x_offset = (i & 1) << 2; - int y_offset = (i & 2) << 1; - - if (IS_SUB_8X8(sub_mb_type)) { - mc_part(h, n, 1, 8, 0, 
dest_y, dest_cb, dest_cr, - x_offset, y_offset, - qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[1], &weight_avg[1], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma_idc); - } else if (IS_SUB_8X4(sub_mb_type)) { - mc_part(h, n, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, - x_offset, y_offset, - qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], - &weight_op[1], &weight_avg[1], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma_idc); - mc_part(h, n + 2, 0, 4, 4 << pixel_shift, - dest_y, dest_cb, dest_cr, x_offset, y_offset + 2, - qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], - &weight_op[1], &weight_avg[1], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma_idc); - } else if (IS_SUB_4X8(sub_mb_type)) { - mc_part(h, n, 0, 8, 4 * h->mb_linesize, - dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[2], &weight_avg[2], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma_idc); - mc_part(h, n + 1, 0, 8, 4 * h->mb_linesize, - dest_y, dest_cb, dest_cr, x_offset + 2, y_offset, - qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[2], &weight_avg[2], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma_idc); - } else { - int j; - assert(IS_SUB_4X4(sub_mb_type)); - for (j = 0; j < 4; j++) { - int sub_x_offset = x_offset + 2 * (j & 1); - int sub_y_offset = y_offset + (j & 2); - mc_part(h, n + j, 1, 4, 0, - dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, - qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[2], &weight_avg[2], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma_idc); - } - } - } - } - - prefetch_motion(h, 1, pixel_shift, chroma_idc); -} - -static av_always_inline void hl_motion_420(H264Context *h, uint8_t *dest_y, - uint8_t *dest_cb, uint8_t *dest_cr, - 
qpel_mc_func(*qpix_put)[16], - h264_chroma_mc_func(*chroma_put), - qpel_mc_func(*qpix_avg)[16], - h264_chroma_mc_func(*chroma_avg), - h264_weight_func *weight_op, - h264_biweight_func *weight_avg, - int pixel_shift) -{ - hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, - qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 1); -} - -static av_always_inline void hl_motion_422(H264Context *h, uint8_t *dest_y, - uint8_t *dest_cb, uint8_t *dest_cr, - qpel_mc_func(*qpix_put)[16], - h264_chroma_mc_func(*chroma_put), - qpel_mc_func(*qpix_avg)[16], - h264_chroma_mc_func(*chroma_avg), - h264_weight_func *weight_op, - h264_biweight_func *weight_avg, - int pixel_shift) -{ - hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, - qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 2); -} - static void free_tables(H264Context *h, int free_rbsp) { int i; @@ -2077,373 +1910,17 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, } } -static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, - int pixel_shift) -{ - MpegEncContext *const s = &h->s; - const int mb_x = s->mb_x; - const int mb_y = s->mb_y; - const int mb_xy = h->mb_xy; - const int mb_type = s->current_picture.f.mb_type[mb_xy]; - uint8_t *dest_y, *dest_cb, *dest_cr; - int linesize, uvlinesize /*dct_offset*/; - int i, j; - int *block_offset = &h->block_offset[0]; - const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass); - /* is_h264 should always be true if SVQ3 is disabled. 
*/ - const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264; - void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); - const int block_h = 16 >> s->chroma_y_shift; - const int chroma422 = CHROMA422; +#define BITS 8 +#define SIMPLE 1 +#include "h264_mb_template.c" - dest_y = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize) * 16; - dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift) * 8 + mb_y * s->uvlinesize * block_h; - dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift) * 8 + mb_y * s->uvlinesize * block_h; +#undef BITS +#define BITS 16 +#include "h264_mb_template.c" - s->dsp.prefetch(dest_y + (s->mb_x & 3) * 4 * s->linesize + (64 << pixel_shift), s->linesize, 4); - s->dsp.prefetch(dest_cb + (s->mb_x & 7) * s->uvlinesize + (64 << pixel_shift), dest_cr - dest_cb, 2); - - h->list_counts[mb_xy] = h->list_count; - - if (!simple && MB_FIELD) { - linesize = h->mb_linesize = s->linesize * 2; - uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2; - block_offset = &h->block_offset[48]; - if (mb_y & 1) { // FIXME move out of this function? 
- dest_y -= s->linesize * 15; - dest_cb -= s->uvlinesize * (block_h - 1); - dest_cr -= s->uvlinesize * (block_h - 1); - } - if (FRAME_MBAFF) { - int list; - for (list = 0; list < h->list_count; list++) { - if (!USES_LIST(mb_type, list)) - continue; - if (IS_16X16(mb_type)) { - int8_t *ref = &h->ref_cache[list][scan8[0]]; - fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1); - } else { - for (i = 0; i < 16; i += 4) { - int ref = h->ref_cache[list][scan8[i]]; - if (ref >= 0) - fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, - 8, (16 + ref) ^ (s->mb_y & 1), 1); - } - } - } - } - } else { - linesize = h->mb_linesize = s->linesize; - uvlinesize = h->mb_uvlinesize = s->uvlinesize; - // dct_offset = s->linesize * 16; - } - - if (!simple && IS_INTRA_PCM(mb_type)) { - if (pixel_shift) { - const int bit_depth = h->sps.bit_depth_luma; - int j; - GetBitContext gb; - init_get_bits(&gb, (uint8_t *)h->mb, - ff_h264_mb_sizes[h->sps.chroma_format_idc] * bit_depth); - - for (i = 0; i < 16; i++) { - uint16_t *tmp_y = (uint16_t *)(dest_y + i * linesize); - for (j = 0; j < 16; j++) - tmp_y[j] = get_bits(&gb, bit_depth); - } - if (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) { - if (!h->sps.chroma_format_idc) { - for (i = 0; i < block_h; i++) { - uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize); - for (j = 0; j < 8; j++) - tmp_cb[j] = 1 << (bit_depth - 1); - } - for (i = 0; i < block_h; i++) { - uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize); - for (j = 0; j < 8; j++) - tmp_cr[j] = 1 << (bit_depth - 1); - } - } else { - for (i = 0; i < block_h; i++) { - uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize); - for (j = 0; j < 8; j++) - tmp_cb[j] = get_bits(&gb, bit_depth); - } - for (i = 0; i < block_h; i++) { - uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize); - for (j = 0; j < 8; j++) - tmp_cr[j] = get_bits(&gb, bit_depth); - } - } - } - } else { - for (i = 0; i < 16; i++) - memcpy(dest_y + i * linesize, (uint8_t *)h->mb + i * 
16, 16); - if (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) { - if (!h->sps.chroma_format_idc) { - for (i = 0; i < block_h; i++) { - memset(dest_cb + i * uvlinesize, 128, 8); - memset(dest_cr + i * uvlinesize, 128, 8); - } - } else { - uint8_t *src_cb = (uint8_t *)h->mb + 256; - uint8_t *src_cr = (uint8_t *)h->mb + 256 + block_h * 8; - for (i = 0; i < block_h; i++) { - memcpy(dest_cb + i * uvlinesize, src_cb + i * 8, 8); - memcpy(dest_cr + i * uvlinesize, src_cr + i * 8, 8); - } - } - } - } - } else { - if (IS_INTRA(mb_type)) { - if (h->deblocking_filter) - xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, - uvlinesize, 1, 0, simple, pixel_shift); - - if (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) { - h->hpc.pred8x8[h->chroma_pred_mode](dest_cb, uvlinesize); - h->hpc.pred8x8[h->chroma_pred_mode](dest_cr, uvlinesize); - } - - hl_decode_mb_predict_luma(h, mb_type, is_h264, simple, - transform_bypass, pixel_shift, - block_offset, linesize, dest_y, 0); - - if (h->deblocking_filter) - xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, - uvlinesize, 0, 0, simple, pixel_shift); - } else if (is_h264) { - if (chroma422) { - hl_motion_422(h, dest_y, dest_cb, dest_cr, - s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, - s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, - h->h264dsp.weight_h264_pixels_tab, - h->h264dsp.biweight_h264_pixels_tab, - pixel_shift); - } else { - hl_motion_420(h, dest_y, dest_cb, dest_cr, - s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, - s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, - h->h264dsp.weight_h264_pixels_tab, - h->h264dsp.biweight_h264_pixels_tab, - pixel_shift); - } - } - - hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass, - pixel_shift, block_offset, linesize, dest_y, 0); - - if ((simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) && - (h->cbp & 0x30)) { - uint8_t *dest[2] = { dest_cb, dest_cr }; - if (transform_bypass) { - if (IS_INTRA(mb_type) && h->sps.profile_idc 
== 244 && - (h->chroma_pred_mode == VERT_PRED8x8 || - h->chroma_pred_mode == HOR_PRED8x8)) { - h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], - block_offset + 16, - h->mb + (16 * 16 * 1 << pixel_shift), - uvlinesize); - h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], - block_offset + 32, - h->mb + (16 * 16 * 2 << pixel_shift), - uvlinesize); - } else { - idct_add = s->dsp.add_pixels4; - for (j = 1; j < 3; j++) { - for (i = j * 16; i < j * 16 + 4; i++) - if (h->non_zero_count_cache[scan8[i]] || - dctcoef_get(h->mb, pixel_shift, i * 16)) - idct_add(dest[j - 1] + block_offset[i], - h->mb + (i * 16 << pixel_shift), - uvlinesize); - if (chroma422) { - for (i = j * 16 + 4; i < j * 16 + 8; i++) - if (h->non_zero_count_cache[scan8[i + 4]] || - dctcoef_get(h->mb, pixel_shift, i * 16)) - idct_add(dest[j - 1] + block_offset[i + 4], - h->mb + (i * 16 << pixel_shift), - uvlinesize); - } - } - } - } else { - if (is_h264) { - int qp[2]; - if (chroma422) { - qp[0] = h->chroma_qp[0] + 3; - qp[1] = h->chroma_qp[1] + 3; - } else { - qp[0] = h->chroma_qp[0]; - qp[1] = h->chroma_qp[1]; - } - if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 0]]) - h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 1 << pixel_shift), - h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][qp[0]][0]); - if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 1]]) - h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 2 << pixel_shift), - h->dequant4_coeff[IS_INTRA(mb_type) ? 2 : 5][qp[1]][0]); - h->h264dsp.h264_idct_add8(dest, block_offset, - h->mb, uvlinesize, - h->non_zero_count_cache); - } else if (CONFIG_SVQ3_DECODER) { - h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 1, - h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][h->chroma_qp[0]][0]); - h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 2, - h->dequant4_coeff[IS_INTRA(mb_type) ? 
2 : 5][h->chroma_qp[1]][0]); - for (j = 1; j < 3; j++) { - for (i = j * 16; i < j * 16 + 4; i++) - if (h->non_zero_count_cache[scan8[i]] || h->mb[i * 16]) { - uint8_t *const ptr = dest[j - 1] + block_offset[i]; - ff_svq3_add_idct_c(ptr, h->mb + i * 16, - uvlinesize, - ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2); - } - } - } - } - } - } - if (h->cbp || IS_INTRA(mb_type)) { - s->dsp.clear_blocks(h->mb); - s->dsp.clear_blocks(h->mb + (24 * 16 << pixel_shift)); - } -} - -static av_always_inline void hl_decode_mb_444_internal(H264Context *h, - int simple, - int pixel_shift) -{ - MpegEncContext *const s = &h->s; - const int mb_x = s->mb_x; - const int mb_y = s->mb_y; - const int mb_xy = h->mb_xy; - const int mb_type = s->current_picture.f.mb_type[mb_xy]; - uint8_t *dest[3]; - int linesize; - int i, j, p; - int *block_offset = &h->block_offset[0]; - const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass); - const int plane_count = (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) ? 3 : 1; - - for (p = 0; p < plane_count; p++) { - dest[p] = s->current_picture.f.data[p] + - ((mb_x << pixel_shift) + mb_y * s->linesize) * 16; - s->dsp.prefetch(dest[p] + (s->mb_x & 3) * 4 * s->linesize + (64 << pixel_shift), - s->linesize, 4); - } - - h->list_counts[mb_xy] = h->list_count; - - if (!simple && MB_FIELD) { - linesize = h->mb_linesize = h->mb_uvlinesize = s->linesize * 2; - block_offset = &h->block_offset[48]; - if (mb_y & 1) // FIXME move out of this function? 
- for (p = 0; p < 3; p++) - dest[p] -= s->linesize * 15; - if (FRAME_MBAFF) { - int list; - for (list = 0; list < h->list_count; list++) { - if (!USES_LIST(mb_type, list)) - continue; - if (IS_16X16(mb_type)) { - int8_t *ref = &h->ref_cache[list][scan8[0]]; - fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1); - } else { - for (i = 0; i < 16; i += 4) { - int ref = h->ref_cache[list][scan8[i]]; - if (ref >= 0) - fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, - 8, (16 + ref) ^ (s->mb_y & 1), 1); - } - } - } - } - } else { - linesize = h->mb_linesize = h->mb_uvlinesize = s->linesize; - } - - if (!simple && IS_INTRA_PCM(mb_type)) { - if (pixel_shift) { - const int bit_depth = h->sps.bit_depth_luma; - GetBitContext gb; - init_get_bits(&gb, (uint8_t *)h->mb, 768 * bit_depth); - - for (p = 0; p < plane_count; p++) - for (i = 0; i < 16; i++) { - uint16_t *tmp = (uint16_t *)(dest[p] + i * linesize); - for (j = 0; j < 16; j++) - tmp[j] = get_bits(&gb, bit_depth); - } - } else { - for (p = 0; p < plane_count; p++) - for (i = 0; i < 16; i++) - memcpy(dest[p] + i * linesize, - (uint8_t *)h->mb + p * 256 + i * 16, 16); - } - } else { - if (IS_INTRA(mb_type)) { - if (h->deblocking_filter) - xchg_mb_border(h, dest[0], dest[1], dest[2], linesize, - linesize, 1, 1, simple, pixel_shift); - - for (p = 0; p < plane_count; p++) - hl_decode_mb_predict_luma(h, mb_type, 1, simple, - transform_bypass, pixel_shift, - block_offset, linesize, dest[p], p); - - if (h->deblocking_filter) - xchg_mb_border(h, dest[0], dest[1], dest[2], linesize, - linesize, 0, 1, simple, pixel_shift); - } else { - hl_motion(h, dest[0], dest[1], dest[2], - s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, - s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, - h->h264dsp.weight_h264_pixels_tab, - h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 3); - } - - for (p = 0; p < plane_count; p++) - hl_decode_mb_idct_luma(h, mb_type, 1, simple, transform_bypass, - pixel_shift, block_offset, linesize, - 
dest[p], p); - } - if (h->cbp || IS_INTRA(mb_type)) { - s->dsp.clear_blocks(h->mb); - s->dsp.clear_blocks(h->mb + (24 * 16 << pixel_shift)); - } -} - -/** - * Process a macroblock; this case avoids checks for expensive uncommon cases. - */ -#define hl_decode_mb_simple(sh, bits) \ -static void hl_decode_mb_simple_ ## bits(H264Context *h) \ -{ \ - hl_decode_mb_internal(h, 1, sh); \ -} - -hl_decode_mb_simple(0, 8) -hl_decode_mb_simple(1, 16) - -/** - * Process a macroblock; this handles edge cases, such as interlacing. - */ -static av_noinline void hl_decode_mb_complex(H264Context *h) -{ - hl_decode_mb_internal(h, 0, h->pixel_shift); -} - -static av_noinline void hl_decode_mb_444_complex(H264Context *h) -{ - hl_decode_mb_444_internal(h, 0, h->pixel_shift); -} - -static av_noinline void hl_decode_mb_444_simple(H264Context *h) -{ - hl_decode_mb_444_internal(h, 1, 0); -} +#undef SIMPLE +#define SIMPLE 0 +#include "h264_mb_template.c" void ff_h264_hl_decode_mb(H264Context *h) { @@ -2456,7 +1933,7 @@ void ff_h264_hl_decode_mb(H264Context *h) if (is_complex || h->pixel_shift) hl_decode_mb_444_complex(h); else - hl_decode_mb_444_simple(h); + hl_decode_mb_444_simple_8(h); } else if (is_complex) { hl_decode_mb_complex(h); } else if (h->pixel_shift) { diff --git a/libavcodec/h264_mb_template.c b/libavcodec/h264_mb_template.c new file mode 100644 index 0000000000..b7856cb3d8 --- /dev/null +++ b/libavcodec/h264_mb_template.c @@ -0,0 +1,380 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... decoder + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#undef FUNC +#undef PIXEL_SHIFT + +#if SIMPLE +# define FUNC(n) AV_JOIN(n ## _simple_, BITS) +# define PIXEL_SHIFT (BITS >> 4) +#else +# define FUNC(n) n ## _complex +# define PIXEL_SHIFT h->pixel_shift +#endif + +#undef CHROMA_IDC +#define CHROMA_IDC 1 +#include "h264_mc_template.c" + +#undef CHROMA_IDC +#define CHROMA_IDC 2 +#include "h264_mc_template.c" + +static av_noinline void FUNC(hl_decode_mb)(H264Context *h) +{ + MpegEncContext *const s = &h->s; + const int mb_x = s->mb_x; + const int mb_y = s->mb_y; + const int mb_xy = h->mb_xy; + const int mb_type = s->current_picture.f.mb_type[mb_xy]; + uint8_t *dest_y, *dest_cb, *dest_cr; + int linesize, uvlinesize /*dct_offset*/; + int i, j; + int *block_offset = &h->block_offset[0]; + const int transform_bypass = !SIMPLE && (s->qscale == 0 && h->sps.transform_bypass); + /* is_h264 should always be true if SVQ3 is disabled. 
*/ + const int is_h264 = !CONFIG_SVQ3_DECODER || SIMPLE || s->codec_id == CODEC_ID_H264; + void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); + const int block_h = 16 >> s->chroma_y_shift; + const int chroma422 = CHROMA422; + + dest_y = s->current_picture.f.data[0] + ((mb_x << PIXEL_SHIFT) + mb_y * s->linesize) * 16; + dest_cb = s->current_picture.f.data[1] + (mb_x << PIXEL_SHIFT) * 8 + mb_y * s->uvlinesize * block_h; + dest_cr = s->current_picture.f.data[2] + (mb_x << PIXEL_SHIFT) * 8 + mb_y * s->uvlinesize * block_h; + + s->dsp.prefetch(dest_y + (s->mb_x & 3) * 4 * s->linesize + (64 << PIXEL_SHIFT), s->linesize, 4); + s->dsp.prefetch(dest_cb + (s->mb_x & 7) * s->uvlinesize + (64 << PIXEL_SHIFT), dest_cr - dest_cb, 2); + + h->list_counts[mb_xy] = h->list_count; + + if (!SIMPLE && MB_FIELD) { + linesize = h->mb_linesize = s->linesize * 2; + uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2; + block_offset = &h->block_offset[48]; + if (mb_y & 1) { // FIXME move out of this function? 
+ dest_y -= s->linesize * 15; + dest_cb -= s->uvlinesize * (block_h - 1); + dest_cr -= s->uvlinesize * (block_h - 1); + } + if (FRAME_MBAFF) { + int list; + for (list = 0; list < h->list_count; list++) { + if (!USES_LIST(mb_type, list)) + continue; + if (IS_16X16(mb_type)) { + int8_t *ref = &h->ref_cache[list][scan8[0]]; + fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1); + } else { + for (i = 0; i < 16; i += 4) { + int ref = h->ref_cache[list][scan8[i]]; + if (ref >= 0) + fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, + 8, (16 + ref) ^ (s->mb_y & 1), 1); + } + } + } + } + } else { + linesize = h->mb_linesize = s->linesize; + uvlinesize = h->mb_uvlinesize = s->uvlinesize; + // dct_offset = s->linesize * 16; + } + + if (!SIMPLE && IS_INTRA_PCM(mb_type)) { + if (PIXEL_SHIFT) { + const int bit_depth = h->sps.bit_depth_luma; + int j; + GetBitContext gb; + init_get_bits(&gb, (uint8_t *)h->mb, + ff_h264_mb_sizes[h->sps.chroma_format_idc] * bit_depth); + + for (i = 0; i < 16; i++) { + uint16_t *tmp_y = (uint16_t *)(dest_y + i * linesize); + for (j = 0; j < 16; j++) + tmp_y[j] = get_bits(&gb, bit_depth); + } + if (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) { + if (!h->sps.chroma_format_idc) { + for (i = 0; i < block_h; i++) { + uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize); + for (j = 0; j < 8; j++) + tmp_cb[j] = 1 << (bit_depth - 1); + } + for (i = 0; i < block_h; i++) { + uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize); + for (j = 0; j < 8; j++) + tmp_cr[j] = 1 << (bit_depth - 1); + } + } else { + for (i = 0; i < block_h; i++) { + uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize); + for (j = 0; j < 8; j++) + tmp_cb[j] = get_bits(&gb, bit_depth); + } + for (i = 0; i < block_h; i++) { + uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize); + for (j = 0; j < 8; j++) + tmp_cr[j] = get_bits(&gb, bit_depth); + } + } + } + } else { + for (i = 0; i < 16; i++) + memcpy(dest_y + i * linesize, (uint8_t *)h->mb + i * 
16, 16); + if (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) { + if (!h->sps.chroma_format_idc) { + for (i = 0; i < block_h; i++) { + memset(dest_cb + i * uvlinesize, 128, 8); + memset(dest_cr + i * uvlinesize, 128, 8); + } + } else { + uint8_t *src_cb = (uint8_t *)h->mb + 256; + uint8_t *src_cr = (uint8_t *)h->mb + 256 + block_h * 8; + for (i = 0; i < block_h; i++) { + memcpy(dest_cb + i * uvlinesize, src_cb + i * 8, 8); + memcpy(dest_cr + i * uvlinesize, src_cr + i * 8, 8); + } + } + } + } + } else { + if (IS_INTRA(mb_type)) { + if (h->deblocking_filter) + xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, + uvlinesize, 1, 0, SIMPLE, PIXEL_SHIFT); + + if (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) { + h->hpc.pred8x8[h->chroma_pred_mode](dest_cb, uvlinesize); + h->hpc.pred8x8[h->chroma_pred_mode](dest_cr, uvlinesize); + } + + hl_decode_mb_predict_luma(h, mb_type, is_h264, SIMPLE, + transform_bypass, PIXEL_SHIFT, + block_offset, linesize, dest_y, 0); + + if (h->deblocking_filter) + xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, + uvlinesize, 0, 0, SIMPLE, PIXEL_SHIFT); + } else if (is_h264) { + if (chroma422) { + FUNC(hl_motion_422)(h, dest_y, dest_cb, dest_cr, + s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, + s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, + h->h264dsp.weight_h264_pixels_tab, + h->h264dsp.biweight_h264_pixels_tab); + } else { + FUNC(hl_motion_420)(h, dest_y, dest_cb, dest_cr, + s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, + s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, + h->h264dsp.weight_h264_pixels_tab, + h->h264dsp.biweight_h264_pixels_tab); + } + } + + hl_decode_mb_idct_luma(h, mb_type, is_h264, SIMPLE, transform_bypass, + PIXEL_SHIFT, block_offset, linesize, dest_y, 0); + + if ((SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) && + (h->cbp & 0x30)) { + uint8_t *dest[2] = { dest_cb, dest_cr }; + if (transform_bypass) { + if (IS_INTRA(mb_type) && h->sps.profile_idc == 244 && + 
(h->chroma_pred_mode == VERT_PRED8x8 || + h->chroma_pred_mode == HOR_PRED8x8)) { + h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], + block_offset + 16, + h->mb + (16 * 16 * 1 << PIXEL_SHIFT), + uvlinesize); + h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], + block_offset + 32, + h->mb + (16 * 16 * 2 << PIXEL_SHIFT), + uvlinesize); + } else { + idct_add = s->dsp.add_pixels4; + for (j = 1; j < 3; j++) { + for (i = j * 16; i < j * 16 + 4; i++) + if (h->non_zero_count_cache[scan8[i]] || + dctcoef_get(h->mb, PIXEL_SHIFT, i * 16)) + idct_add(dest[j - 1] + block_offset[i], + h->mb + (i * 16 << PIXEL_SHIFT), + uvlinesize); + if (chroma422) { + for (i = j * 16 + 4; i < j * 16 + 8; i++) + if (h->non_zero_count_cache[scan8[i + 4]] || + dctcoef_get(h->mb, PIXEL_SHIFT, i * 16)) + idct_add(dest[j - 1] + block_offset[i + 4], + h->mb + (i * 16 << PIXEL_SHIFT), + uvlinesize); + } + } + } + } else { + if (is_h264) { + int qp[2]; + if (chroma422) { + qp[0] = h->chroma_qp[0] + 3; + qp[1] = h->chroma_qp[1] + 3; + } else { + qp[0] = h->chroma_qp[0]; + qp[1] = h->chroma_qp[1]; + } + if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 0]]) + h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 1 << PIXEL_SHIFT), + h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][qp[0]][0]); + if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 1]]) + h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 2 << PIXEL_SHIFT), + h->dequant4_coeff[IS_INTRA(mb_type) ? 2 : 5][qp[1]][0]); + h->h264dsp.h264_idct_add8(dest, block_offset, + h->mb, uvlinesize, + h->non_zero_count_cache); + } else if (CONFIG_SVQ3_DECODER) { + h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 1, + h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][h->chroma_qp[0]][0]); + h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 2, + h->dequant4_coeff[IS_INTRA(mb_type) ? 
2 : 5][h->chroma_qp[1]][0]); + for (j = 1; j < 3; j++) { + for (i = j * 16; i < j * 16 + 4; i++) + if (h->non_zero_count_cache[scan8[i]] || h->mb[i * 16]) { + uint8_t *const ptr = dest[j - 1] + block_offset[i]; + ff_svq3_add_idct_c(ptr, h->mb + i * 16, + uvlinesize, + ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2); + } + } + } + } + } + } + if (h->cbp || IS_INTRA(mb_type)) { + s->dsp.clear_blocks(h->mb); + s->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT)); + } +} + +#if !SIMPLE || BITS == 8 + +#undef CHROMA_IDC +#define CHROMA_IDC 3 +#include "h264_mc_template.c" + +static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h) +{ + MpegEncContext *const s = &h->s; + const int mb_x = s->mb_x; + const int mb_y = s->mb_y; + const int mb_xy = h->mb_xy; + const int mb_type = s->current_picture.f.mb_type[mb_xy]; + uint8_t *dest[3]; + int linesize; + int i, j, p; + int *block_offset = &h->block_offset[0]; + const int transform_bypass = !SIMPLE && (s->qscale == 0 && h->sps.transform_bypass); + const int plane_count = (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) ? 3 : 1; + + for (p = 0; p < plane_count; p++) { + dest[p] = s->current_picture.f.data[p] + + ((mb_x << PIXEL_SHIFT) + mb_y * s->linesize) * 16; + s->dsp.prefetch(dest[p] + (s->mb_x & 3) * 4 * s->linesize + (64 << PIXEL_SHIFT), + s->linesize, 4); + } + + h->list_counts[mb_xy] = h->list_count; + + if (!SIMPLE && MB_FIELD) { + linesize = h->mb_linesize = h->mb_uvlinesize = s->linesize * 2; + block_offset = &h->block_offset[48]; + if (mb_y & 1) // FIXME move out of this function? 
+ for (p = 0; p < 3; p++) + dest[p] -= s->linesize * 15; + if (FRAME_MBAFF) { + int list; + for (list = 0; list < h->list_count; list++) { + if (!USES_LIST(mb_type, list)) + continue; + if (IS_16X16(mb_type)) { + int8_t *ref = &h->ref_cache[list][scan8[0]]; + fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1); + } else { + for (i = 0; i < 16; i += 4) { + int ref = h->ref_cache[list][scan8[i]]; + if (ref >= 0) + fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, + 8, (16 + ref) ^ (s->mb_y & 1), 1); + } + } + } + } + } else { + linesize = h->mb_linesize = h->mb_uvlinesize = s->linesize; + } + + if (!SIMPLE && IS_INTRA_PCM(mb_type)) { + if (PIXEL_SHIFT) { + const int bit_depth = h->sps.bit_depth_luma; + GetBitContext gb; + init_get_bits(&gb, (uint8_t *)h->mb, 768 * bit_depth); + + for (p = 0; p < plane_count; p++) + for (i = 0; i < 16; i++) { + uint16_t *tmp = (uint16_t *)(dest[p] + i * linesize); + for (j = 0; j < 16; j++) + tmp[j] = get_bits(&gb, bit_depth); + } + } else { + for (p = 0; p < plane_count; p++) + for (i = 0; i < 16; i++) + memcpy(dest[p] + i * linesize, + (uint8_t *)h->mb + p * 256 + i * 16, 16); + } + } else { + if (IS_INTRA(mb_type)) { + if (h->deblocking_filter) + xchg_mb_border(h, dest[0], dest[1], dest[2], linesize, + linesize, 1, 1, SIMPLE, PIXEL_SHIFT); + + for (p = 0; p < plane_count; p++) + hl_decode_mb_predict_luma(h, mb_type, 1, SIMPLE, + transform_bypass, PIXEL_SHIFT, + block_offset, linesize, dest[p], p); + + if (h->deblocking_filter) + xchg_mb_border(h, dest[0], dest[1], dest[2], linesize, + linesize, 0, 1, SIMPLE, PIXEL_SHIFT); + } else { + FUNC(hl_motion_444)(h, dest[0], dest[1], dest[2], + s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, + s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, + h->h264dsp.weight_h264_pixels_tab, + h->h264dsp.biweight_h264_pixels_tab); + } + + for (p = 0; p < plane_count; p++) + hl_decode_mb_idct_luma(h, mb_type, 1, SIMPLE, transform_bypass, + PIXEL_SHIFT, block_offset, linesize, + dest[p], 
p); + } + if (h->cbp || IS_INTRA(mb_type)) { + s->dsp.clear_blocks(h->mb); + s->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT)); + } +} + +#endif diff --git a/libavcodec/h264_mc_template.c b/libavcodec/h264_mc_template.c new file mode 100644 index 0000000000..a3af39bb39 --- /dev/null +++ b/libavcodec/h264_mc_template.c @@ -0,0 +1,160 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... decoder + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#undef MCFUNC + +#if CHROMA_IDC == 1 +# define MCFUNC(n) FUNC(n ## _420) +#elif CHROMA_IDC == 2 +# define MCFUNC(n) FUNC(n ## _422) +#elif CHROMA_IDC == 3 +# define MCFUNC(n) FUNC(n ## _444) +#endif + +#undef mc_part +#define mc_part MCFUNC(mc_part) + +static void mc_part(H264Context *h, int n, int square, + int height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, + uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, + h264_chroma_mc_func chroma_put, + qpel_mc_func *qpix_avg, + h264_chroma_mc_func chroma_avg, + h264_weight_func *weight_op, + h264_biweight_func *weight_avg, + int list0, int list1) +{ + if ((h->use_weight == 2 && list0 && list1 && + (h->implicit_weight[h->ref_cache[0][scan8[n]]][h->ref_cache[1][scan8[n]]][h->s.mb_y & 1] != 32)) || + 
h->use_weight == 1) + mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr, + x_offset, y_offset, qpix_put, chroma_put, + weight_op[0], weight_op[1], weight_avg[0], + weight_avg[1], list0, list1, PIXEL_SHIFT, CHROMA_IDC); + else + mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr, + x_offset, y_offset, qpix_put, chroma_put, qpix_avg, + chroma_avg, list0, list1, PIXEL_SHIFT, CHROMA_IDC); +} + +static void MCFUNC(hl_motion)(H264Context *h, uint8_t *dest_y, + uint8_t *dest_cb, uint8_t *dest_cr, + qpel_mc_func(*qpix_put)[16], + h264_chroma_mc_func(*chroma_put), + qpel_mc_func(*qpix_avg)[16], + h264_chroma_mc_func(*chroma_avg), + h264_weight_func *weight_op, + h264_biweight_func *weight_avg) +{ + MpegEncContext *const s = &h->s; + const int mb_xy = h->mb_xy; + const int mb_type = s->current_picture.f.mb_type[mb_xy]; + + assert(IS_INTER(mb_type)); + + if (HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME)) + await_references(h); + prefetch_motion(h, 0, PIXEL_SHIFT, CHROMA_IDC); + + if (IS_16X16(mb_type)) { + mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0, + qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], + weight_op, weight_avg, + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); + } else if (IS_16X8(mb_type)) { + mc_part(h, 0, 0, 8, 8 << PIXEL_SHIFT, dest_y, dest_cb, dest_cr, 0, 0, + qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], + weight_op, weight_avg, + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); + mc_part(h, 8, 0, 8, 8 << PIXEL_SHIFT, dest_y, dest_cb, dest_cr, 0, 4, + qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], + weight_op, weight_avg, + IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); + } else if (IS_8X16(mb_type)) { + mc_part(h, 0, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, + qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], + &weight_op[1], &weight_avg[1], + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); + mc_part(h, 4, 0, 16, 8 * h->mb_linesize, dest_y, 
dest_cb, dest_cr, 4, 0, + qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], + &weight_op[1], &weight_avg[1], + IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); + } else { + int i; + + assert(IS_8X8(mb_type)); + + for (i = 0; i < 4; i++) { + const int sub_mb_type = h->sub_mb_type[i]; + const int n = 4 * i; + int x_offset = (i & 1) << 2; + int y_offset = (i & 2) << 1; + + if (IS_SUB_8X8(sub_mb_type)) { + mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, + x_offset, y_offset, + qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], + &weight_op[1], &weight_avg[1], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + } else if (IS_SUB_8X4(sub_mb_type)) { + mc_part(h, n, 0, 4, 4 << PIXEL_SHIFT, dest_y, dest_cb, dest_cr, + x_offset, y_offset, + qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], + &weight_op[1], &weight_avg[1], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + mc_part(h, n + 2, 0, 4, 4 << PIXEL_SHIFT, + dest_y, dest_cb, dest_cr, x_offset, y_offset + 2, + qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], + &weight_op[1], &weight_avg[1], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + } else if (IS_SUB_4X8(sub_mb_type)) { + mc_part(h, n, 0, 8, 4 * h->mb_linesize, + dest_y, dest_cb, dest_cr, x_offset, y_offset, + qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], + &weight_op[2], &weight_avg[2], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + mc_part(h, n + 1, 0, 8, 4 * h->mb_linesize, + dest_y, dest_cb, dest_cr, x_offset + 2, y_offset, + qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], + &weight_op[2], &weight_avg[2], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + } else { + int j; + assert(IS_SUB_4X4(sub_mb_type)); + for (j = 0; j < 4; j++) { + int sub_x_offset = x_offset + 2 * (j & 1); + int sub_y_offset = y_offset + (j & 2); + mc_part(h, n + j, 1, 4, 0, + dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, + qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], + 
&weight_op[2], &weight_avg[2], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + } + } + } + } + + prefetch_motion(h, 1, PIXEL_SHIFT, CHROMA_IDC); +} + From 4c679750cb4cb112c19f862bd733bf6660a935bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Reimar=20D=C3=B6ffinger?= Date: Mon, 30 Apr 2012 22:48:42 +0200 Subject: [PATCH 10/16] avconv: fix parsing of -force_key_frames option. Currently it always exits with an error when more than one position is specified. CC: libav-stable@libav.org --- avconv.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/avconv.c b/avconv.c index 961356752a..6517c4bcfc 100644 --- a/avconv.c +++ b/avconv.c @@ -2350,10 +2350,18 @@ static void parse_forced_key_frames(char *kf, OutputStream *ost, av_log(NULL, AV_LOG_FATAL, "Could not allocate forced key frames array.\n"); exit_program(1); } + + p = kf; for (i = 0; i < n; i++) { - p = i ? strchr(p, ',') + 1 : kf; + char *next = strchr(p, ','); + + if (next) + *next++ = 0; + t = parse_time_or_die("force_key_frames", p, 1); ost->forced_kf_pts[i] = av_rescale_q(t, AV_TIME_BASE_Q, avctx->time_base); + + p = next; } } From 2696789c52cd4a8a2ff554d2329c60001459c21b Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Wed, 4 Jul 2012 11:35:18 +0200 Subject: [PATCH 11/16] avplay: update input filter pointer when the filtergraph is reset. Fixes an invalid read on size change. 
--- avplay.c | 1 + 1 file changed, 1 insertion(+) diff --git a/avplay.c b/avplay.c index a5d4e63417..1961f5fb8c 100644 --- a/avplay.c +++ b/avplay.c @@ -1644,6 +1644,7 @@ static int video_thread(void *arg) graph = avfilter_graph_alloc(); if ((ret = configure_video_filters(graph, is, vfilters)) < 0) goto the_end; + filt_in = is->in_video_filter; filt_out = is->out_video_filter; last_w = is->video_st->codec->width; last_h = is->video_st->codec->height; From d20f133ef962da71326bc3635e086696f45ab64e Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Wed, 4 Jul 2012 01:07:42 +0200 Subject: [PATCH 12/16] x86: h264_intrapred: port to cpuflag macros --- libavcodec/x86/h264_intrapred.asm | 290 ++++++++++++++------------- libavcodec/x86/h264_intrapred_init.c | 24 +-- 2 files changed, 168 insertions(+), 146 deletions(-) diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index 3beb3b9d6d..87d32c4ee3 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -87,23 +87,23 @@ cglobal pred16x16_vertical_sse, 2,3 ; void pred16x16_horizontal(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro PRED16x16_H 1 -cglobal pred16x16_horizontal_%1, 2,3 +%macro PRED16x16_H 0 +cglobal pred16x16_horizontal, 2,3 mov r2, 8 -%ifidn %1, ssse3 +%if cpuflag(ssse3) mova m2, [pb_3] %endif .loop: movd m0, [r0+r1*0-4] movd m1, [r0+r1*1-4] -%ifidn %1, ssse3 +%if cpuflag(ssse3) pshufb m0, m2 pshufb m1, m2 %else punpcklbw m0, m0 punpcklbw m1, m1 -%ifidn %1, mmxext +%if cpuflag(mmx2) pshufw m0, m0, 0xff pshufw m1, m1, 0xff %else @@ -124,18 +124,20 @@ cglobal pred16x16_horizontal_%1, 2,3 REP_RET %endmacro -INIT_MMX -PRED16x16_H mmx -PRED16x16_H mmxext +INIT_MMX mmx +PRED16x16_H +INIT_MMX mmx2 +PRED16x16_H +INIT_XMM ssse3 +PRED16x16_H INIT_XMM -PRED16x16_H ssse3 ;----------------------------------------------------------------------------- ; void pred16x16_dc(uint8_t *src, int stride) 
;----------------------------------------------------------------------------- -%macro PRED16x16_DC 1 -cglobal pred16x16_dc_%1, 2,7 +%macro PRED16x16_DC 0 +cglobal pred16x16_dc, 2,7 mov r4, r0 sub r0, r1 pxor mm0, mm0 @@ -158,19 +160,19 @@ cglobal pred16x16_dc_%1, 2,7 add r5d, r6d lea r2d, [r2+r5+16] shr r2d, 5 -%ifidn %1, mmxext +%if cpuflag(ssse3) + pxor m1, m1 movd m0, r2d - punpcklbw m0, m0 - pshufw m0, m0, 0 -%elifidn %1, sse2 + pshufb m0, m1 +%elif cpuflag(sse2) movd m0, r2d punpcklbw m0, m0 pshuflw m0, m0, 0 punpcklqdq m0, m0 -%elifidn %1, ssse3 - pxor m1, m1 +%elif cpuflag(mmx2) movd m0, r2d - pshufb m0, m1 + punpcklbw m0, m0 + pshufw m0, m0, 0 %endif %if mmsize==8 @@ -195,18 +197,20 @@ cglobal pred16x16_dc_%1, 2,7 REP_RET %endmacro -INIT_MMX -PRED16x16_DC mmxext +INIT_MMX mmx2 +PRED16x16_DC +INIT_XMM sse2 +PRED16x16_DC +INIT_XMM ssse3 +PRED16x16_DC INIT_XMM -PRED16x16_DC sse2 -PRED16x16_DC ssse3 ;----------------------------------------------------------------------------- ; void pred16x16_tm_vp8(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro PRED16x16_TM_MMX 1 -cglobal pred16x16_tm_vp8_%1, 2,5 +%macro PRED16x16_TM_MMX 0 +cglobal pred16x16_tm_vp8, 2,5 sub r0, r1 pxor mm7, mm7 movq mm0, [r0+0] @@ -223,11 +227,11 @@ cglobal pred16x16_tm_vp8_%1, 2,5 movzx r2d, byte [r0+r1-1] sub r2d, r3d movd mm4, r2d -%ifidn %1, mmx +%if cpuflag(mmx2) + pshufw mm4, mm4, 0 +%else punpcklwd mm4, mm4 punpckldq mm4, mm4 -%else - pshufw mm4, mm4, 0 %endif movq mm5, mm4 movq mm6, mm4 @@ -246,8 +250,11 @@ cglobal pred16x16_tm_vp8_%1, 2,5 REP_RET %endmacro -PRED16x16_TM_MMX mmx -PRED16x16_TM_MMX mmxext +INIT_MMX mmx +PRED16x16_TM_MMX +INIT_MMX mmx2 +PRED16x16_TM_MMX +INIT_MMX cglobal pred16x16_tm_vp8_sse2, 2,6,6 sub r0, r1 @@ -288,8 +295,8 @@ cglobal pred16x16_tm_vp8_sse2, 2,6,6 ; void pred16x16_plane(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro 
H264_PRED16x16_PLANE 3 -cglobal pred16x16_plane_%3_%1, 2, 9, %2 +%macro H264_PRED16x16_PLANE 1 +cglobal pred16x16_plane_%1, 2,9,7 mov r2, r1 ; +stride neg r1 ; -stride @@ -310,7 +317,10 @@ cglobal pred16x16_plane_%3_%1, 2, 9, %2 paddw m0, m2 paddw m1, m3 %else ; mmsize == 16 -%ifidn %1, sse2 +%if cpuflag(ssse3) + movhps m0, [r0+r1 +8] + pmaddubsw m0, [plane_shuf] ; H coefficients +%else ; sse2 pxor m2, m2 movh m1, [r0+r1 +8] punpcklbw m0, m2 @@ -318,29 +328,26 @@ cglobal pred16x16_plane_%3_%1, 2, 9, %2 pmullw m0, [pw_m8tom1] pmullw m1, [pw_1to8] paddw m0, m1 -%else ; ssse3 - movhps m0, [r0+r1 +8] - pmaddubsw m0, [plane_shuf] ; H coefficients %endif movhlps m1, m0 %endif paddw m0, m1 -%ifidn %1, mmx +%if cpuflag(sse2) + pshuflw m1, m0, 0xE +%elif cpuflag(mmx2) + pshufw m1, m0, 0xE +%elif cpuflag(mmx) mova m1, m0 psrlq m1, 32 -%elifidn %1, mmx2 - pshufw m1, m0, 0xE -%else ; mmsize == 16 - pshuflw m1, m0, 0xE %endif paddw m0, m1 -%ifidn %1, mmx +%if cpuflag(sse2) + pshuflw m1, m0, 0x1 +%elif cpuflag(mmx2) + pshufw m1, m0, 0x1 +%elif cpuflag(mmx) mova m1, m0 psrlq m1, 16 -%elifidn %1, mmx2 - pshufw m1, m0, 0x1 -%else - pshuflw m1, m0, 0x1 %endif paddw m0, m1 ; sum of H coefficients @@ -424,13 +431,13 @@ cglobal pred16x16_plane_%3_%1, 2, 9, %2 mov r0, r0m %endif -%ifidn %3, h264 +%ifidn %1, h264 lea r5, [r5*5+32] sar r5, 6 -%elifidn %3, rv40 +%elifidn %1, rv40 lea r5, [r5*5] sar r5, 6 -%elifidn %3, svq3 +%elifidn %1, svq3 test r5, r5 lea r6, [r5+3] cmovs r5, r6 @@ -449,8 +456,8 @@ cglobal pred16x16_plane_%3_%1, 2, 9, %2 movd r1d, m0 movsx r1d, r1w -%ifnidn %3, svq3 -%ifidn %3, h264 +%ifnidn %1, svq3 +%ifidn %1, h264 lea r1d, [r1d*5+32] %else ; rv40 lea r1d, [r1d*5] @@ -476,26 +483,26 @@ cglobal pred16x16_plane_%3_%1, 2, 9, %2 movd m1, r5d movd m3, r3d -%ifidn %1, mmx - punpcklwd m0, m0 - punpcklwd m1, m1 - punpcklwd m3, m3 - punpckldq m0, m0 - punpckldq m1, m1 - punpckldq m3, m3 -%elifidn %1, mmx2 - pshufw m0, m0, 0x0 - pshufw m1, m1, 0x0 - pshufw m3, m3, 0x0 -%else 
+%if cpuflag(sse2) pshuflw m0, m0, 0x0 pshuflw m1, m1, 0x0 pshuflw m3, m3, 0x0 punpcklqdq m0, m0 ; splat H (words) punpcklqdq m1, m1 ; splat V (words) punpcklqdq m3, m3 ; splat a (words) +%elif cpuflag(mmx2) + pshufw m0, m0, 0x0 + pshufw m1, m1, 0x0 + pshufw m3, m3, 0x0 +%elif cpuflag(mmx) + punpcklwd m0, m0 + punpcklwd m1, m1 + punpcklwd m3, m3 + punpckldq m0, m0 + punpckldq m1, m1 + punpckldq m3, m3 %endif -%ifidn %3, svq3 +%ifidn %1, svq3 SWAP 0, 1 %endif mova m2, m0 @@ -568,27 +575,30 @@ cglobal pred16x16_plane_%3_%1, 2, 9, %2 REP_RET %endmacro -INIT_MMX -H264_PRED16x16_PLANE mmx, 0, h264 -H264_PRED16x16_PLANE mmx, 0, rv40 -H264_PRED16x16_PLANE mmx, 0, svq3 -H264_PRED16x16_PLANE mmx2, 0, h264 -H264_PRED16x16_PLANE mmx2, 0, rv40 -H264_PRED16x16_PLANE mmx2, 0, svq3 +INIT_MMX mmx +H264_PRED16x16_PLANE h264 +H264_PRED16x16_PLANE rv40 +H264_PRED16x16_PLANE svq3 +INIT_MMX mmx2 +H264_PRED16x16_PLANE h264 +H264_PRED16x16_PLANE rv40 +H264_PRED16x16_PLANE svq3 +INIT_XMM sse2 +H264_PRED16x16_PLANE h264 +H264_PRED16x16_PLANE rv40 +H264_PRED16x16_PLANE svq3 +INIT_XMM ssse3 +H264_PRED16x16_PLANE h264 +H264_PRED16x16_PLANE rv40 +H264_PRED16x16_PLANE svq3 INIT_XMM -H264_PRED16x16_PLANE sse2, 8, h264 -H264_PRED16x16_PLANE sse2, 8, rv40 -H264_PRED16x16_PLANE sse2, 8, svq3 -H264_PRED16x16_PLANE ssse3, 8, h264 -H264_PRED16x16_PLANE ssse3, 8, rv40 -H264_PRED16x16_PLANE ssse3, 8, svq3 ;----------------------------------------------------------------------------- ; void pred8x8_plane(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro H264_PRED8x8_PLANE 2 -cglobal pred8x8_plane_%1, 2, 9, %2 +%macro H264_PRED8x8_PLANE 0 +cglobal pred8x8_plane, 2,9,7 mov r2, r1 ; +stride neg r1 ; -stride @@ -601,39 +611,39 @@ cglobal pred8x8_plane_%1, 2, 9, %2 pmullw m0, [pw_m4to4] pmullw m1, [pw_m4to4+8] %else ; mmsize == 16 -%ifidn %1, sse2 +%if cpuflag(ssse3) + movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary + pmaddubsw m0, 
[plane8_shuf] ; H coefficients +%else ; sse2 pxor m2, m2 movd m1, [r0+r1 +4] punpckldq m0, m1 punpcklbw m0, m2 pmullw m0, [pw_m4to4] -%else ; ssse3 - movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary - pmaddubsw m0, [plane8_shuf] ; H coefficients %endif movhlps m1, m0 %endif paddw m0, m1 -%ifnidn %1, ssse3 -%ifidn %1, mmx +%if notcpuflag(ssse3) +%if cpuflag(sse2) ; mmsize == 16 + pshuflw m1, m0, 0xE +%elif cpuflag(mmx2) + pshufw m1, m0, 0xE +%elif cpuflag(mmx) mova m1, m0 psrlq m1, 32 -%elifidn %1, mmx2 - pshufw m1, m0, 0xE -%else ; mmsize == 16 - pshuflw m1, m0, 0xE %endif paddw m0, m1 %endif ; !ssse3 -%ifidn %1, mmx +%if cpuflag(sse2) + pshuflw m1, m0, 0x1 +%elif cpuflag(mmx2) + pshufw m1, m0, 0x1 +%elif cpuflag(mmx) mova m1, m0 psrlq m1, 16 -%elifidn %1, mmx2 - pshufw m1, m0, 0x1 -%else - pshuflw m1, m0, 0x1 %endif paddw m0, m1 ; sum of H coefficients @@ -701,24 +711,24 @@ cglobal pred8x8_plane_%1, 2, 9, %2 movd m1, r5d movd m3, r3d -%ifidn %1, mmx - punpcklwd m0, m0 - punpcklwd m1, m1 - punpcklwd m3, m3 - punpckldq m0, m0 - punpckldq m1, m1 - punpckldq m3, m3 -%elifidn %1, mmx2 - pshufw m0, m0, 0x0 - pshufw m1, m1, 0x0 - pshufw m3, m3, 0x0 -%else +%if cpuflag(sse2) pshuflw m0, m0, 0x0 pshuflw m1, m1, 0x0 pshuflw m3, m3, 0x0 punpcklqdq m0, m0 ; splat H (words) punpcklqdq m1, m1 ; splat V (words) punpcklqdq m3, m3 ; splat a (words) +%elif cpuflag(mmx2) + pshufw m0, m0, 0x0 + pshufw m1, m1, 0x0 + pshufw m3, m3, 0x0 +%elif cpuflag(mmx) + punpcklwd m0, m0 + punpcklwd m1, m1 + punpcklwd m3, m3 + punpckldq m0, m0 + punpckldq m1, m1 + punpckldq m3, m3 %endif %if mmsize == 8 mova m2, m0 @@ -768,12 +778,15 @@ ALIGN 16 REP_RET %endmacro -INIT_MMX -H264_PRED8x8_PLANE mmx, 0 -H264_PRED8x8_PLANE mmx2, 0 +INIT_MMX mmx +H264_PRED8x8_PLANE +INIT_MMX mmx2 +H264_PRED8x8_PLANE +INIT_XMM sse2 +H264_PRED8x8_PLANE +INIT_XMM ssse3 +H264_PRED8x8_PLANE INIT_XMM -H264_PRED8x8_PLANE sse2, 8 -H264_PRED8x8_PLANE ssse3, 8 
;----------------------------------------------------------------------------- ; void pred8x8_vertical(uint8_t *src, int stride) @@ -795,22 +808,22 @@ cglobal pred8x8_vertical_mmx, 2,2 ; void pred8x8_horizontal(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro PRED8x8_H 1 -cglobal pred8x8_horizontal_%1, 2,3 +%macro PRED8x8_H 0 +cglobal pred8x8_horizontal, 2,3 mov r2, 4 -%ifidn %1, ssse3 +%if cpuflag(ssse3) mova m2, [pb_3] %endif .loop: movd m0, [r0+r1*0-4] movd m1, [r0+r1*1-4] -%ifidn %1, ssse3 +%if cpuflag(ssse3) pshufb m0, m2 pshufb m1, m2 %else punpcklbw m0, m0 punpcklbw m1, m1 -%ifidn %1, mmxext +%if cpuflag(mmx2) pshufw m0, m0, 0xff pshufw m1, m1, 0xff %else @@ -828,10 +841,13 @@ cglobal pred8x8_horizontal_%1, 2,3 REP_RET %endmacro +INIT_MMX mmx +PRED8x8_H +INIT_MMX mmx2 +PRED8x8_H +INIT_MMX ssse3 +PRED8x8_H INIT_MMX -PRED8x8_H mmx -PRED8x8_H mmxext -PRED8x8_H ssse3 ;----------------------------------------------------------------------------- ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride) @@ -967,8 +983,8 @@ cglobal pred8x8_dc_rv40_mmxext, 2,7 ; void pred8x8_tm_vp8(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro PRED8x8_TM_MMX 1 -cglobal pred8x8_tm_vp8_%1, 2,6 +%macro PRED8x8_TM_MMX 0 +cglobal pred8x8_tm_vp8, 2,6 sub r0, r1 pxor mm7, mm7 movq mm0, [r0] @@ -984,14 +1000,14 @@ cglobal pred8x8_tm_vp8_%1, 2,6 sub r3d, r4d movd mm2, r2d movd mm4, r3d -%ifidn %1, mmx +%if cpuflag(mmx2) + pshufw mm2, mm2, 0 + pshufw mm4, mm4, 0 +%else punpcklwd mm2, mm2 punpcklwd mm4, mm4 punpckldq mm2, mm2 punpckldq mm4, mm4 -%else - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 %endif movq mm3, mm2 movq mm5, mm4 @@ -1009,8 +1025,11 @@ cglobal pred8x8_tm_vp8_%1, 2,6 REP_RET %endmacro -PRED8x8_TM_MMX mmx -PRED8x8_TM_MMX mmxext +INIT_MMX mmx +PRED8x8_TM_MMX +INIT_MMX mmx2 +PRED8x8_TM_MMX +INIT_MMX cglobal pred8x8_tm_vp8_sse2, 2,6,4 sub r0, r1 @@ -2510,8 
+2529,8 @@ cglobal pred4x4_dc_mmxext, 3,5 ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride) ;----------------------------------------------------------------------------- -%macro PRED4x4_TM_MMX 1 -cglobal pred4x4_tm_vp8_%1, 3,6 +%macro PRED4x4_TM_MMX 0 +cglobal pred4x4_tm_vp8, 3,6 sub r0, r2 pxor mm7, mm7 movd mm0, [r0] @@ -2525,14 +2544,14 @@ cglobal pred4x4_tm_vp8_%1, 3,6 sub r3d, r4d movd mm2, r1d movd mm4, r3d -%ifidn %1, mmx +%if cpuflag(mmx2) + pshufw mm2, mm2, 0 + pshufw mm4, mm4, 0 +%else punpcklwd mm2, mm2 punpcklwd mm4, mm4 punpckldq mm2, mm2 punpckldq mm4, mm4 -%else - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 %endif paddw mm2, mm0 paddw mm4, mm0 @@ -2546,8 +2565,11 @@ cglobal pred4x4_tm_vp8_%1, 3,6 REP_RET %endmacro -PRED4x4_TM_MMX mmx -PRED4x4_TM_MMX mmxext +INIT_MMX mmx +PRED4x4_TM_MMX +INIT_MMX mmx2 +PRED4x4_TM_MMX +INIT_MMX cglobal pred4x4_tm_vp8_ssse3, 3,3 sub r0, r2 diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c index 6bad9696c7..c6d4c709ec 100644 --- a/libavcodec/x86/h264_intrapred_init.c +++ b/libavcodec/x86/h264_intrapred_init.c @@ -96,9 +96,9 @@ PRED16x16(horizontal, 10, sse2) void ff_pred16x16_vertical_mmx (uint8_t *src, int stride); void ff_pred16x16_vertical_sse (uint8_t *src, int stride); void ff_pred16x16_horizontal_mmx (uint8_t *src, int stride); -void ff_pred16x16_horizontal_mmxext(uint8_t *src, int stride); +void ff_pred16x16_horizontal_mmx2 (uint8_t *src, int stride); void ff_pred16x16_horizontal_ssse3 (uint8_t *src, int stride); -void ff_pred16x16_dc_mmxext (uint8_t *src, int stride); +void ff_pred16x16_dc_mmx2 (uint8_t *src, int stride); void ff_pred16x16_dc_sse2 (uint8_t *src, int stride); void ff_pred16x16_dc_ssse3 (uint8_t *src, int stride); void ff_pred16x16_plane_h264_mmx (uint8_t *src, int stride); @@ -114,21 +114,21 @@ void ff_pred16x16_plane_svq3_mmx2 (uint8_t *src, int stride); void ff_pred16x16_plane_svq3_sse2 (uint8_t *src, int stride); void 
ff_pred16x16_plane_svq3_ssse3 (uint8_t *src, int stride); void ff_pred16x16_tm_vp8_mmx (uint8_t *src, int stride); -void ff_pred16x16_tm_vp8_mmxext (uint8_t *src, int stride); +void ff_pred16x16_tm_vp8_mmx2 (uint8_t *src, int stride); void ff_pred16x16_tm_vp8_sse2 (uint8_t *src, int stride); void ff_pred8x8_top_dc_mmxext (uint8_t *src, int stride); void ff_pred8x8_dc_rv40_mmxext (uint8_t *src, int stride); void ff_pred8x8_dc_mmxext (uint8_t *src, int stride); void ff_pred8x8_vertical_mmx (uint8_t *src, int stride); void ff_pred8x8_horizontal_mmx (uint8_t *src, int stride); -void ff_pred8x8_horizontal_mmxext (uint8_t *src, int stride); +void ff_pred8x8_horizontal_mmx2 (uint8_t *src, int stride); void ff_pred8x8_horizontal_ssse3 (uint8_t *src, int stride); void ff_pred8x8_plane_mmx (uint8_t *src, int stride); void ff_pred8x8_plane_mmx2 (uint8_t *src, int stride); void ff_pred8x8_plane_sse2 (uint8_t *src, int stride); void ff_pred8x8_plane_ssse3 (uint8_t *src, int stride); void ff_pred8x8_tm_vp8_mmx (uint8_t *src, int stride); -void ff_pred8x8_tm_vp8_mmxext (uint8_t *src, int stride); +void ff_pred8x8_tm_vp8_mmx2 (uint8_t *src, int stride); void ff_pred8x8_tm_vp8_sse2 (uint8_t *src, int stride); void ff_pred8x8_tm_vp8_ssse3 (uint8_t *src, int stride); void ff_pred8x8l_top_dc_mmxext (uint8_t *src, int has_topleft, int has_topright, int stride); @@ -163,7 +163,7 @@ void ff_pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int void ff_pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride); void ff_pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride); void ff_pred4x4_tm_vp8_mmx (uint8_t *src, const uint8_t *topright, int stride); -void ff_pred4x4_tm_vp8_mmxext (uint8_t *src, const uint8_t *topright, int stride); +void ff_pred4x4_tm_vp8_mmx2 (uint8_t *src, const uint8_t *topright, int stride); void ff_pred4x4_tm_vp8_ssse3 (uint8_t *src, const uint8_t *topright, int stride); void 
ff_pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride); @@ -199,10 +199,10 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth } if (mm_flags & AV_CPU_FLAG_MMX2) { - h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmxext; - h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_mmxext; + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmx2; + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_mmx2; if (chroma_format_idc == 1) - h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmxext; + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmx2; h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_mmxext; h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_mmxext; h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_mmxext; @@ -232,10 +232,10 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth } } if (codec_id == CODEC_ID_VP8) { - h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_mmxext; + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_mmx2; h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_mmxext; - h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_mmxext; - h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_mmxext; + h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_mmx2; + h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_mmx2; h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_mmxext; } else { if (chroma_format_idc == 1) From 2cd1f5cadcab6c6c992c3d07575f76e3e6e59c0e Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Tue, 3 Jul 2012 17:51:43 +0000 Subject: [PATCH 13/16] x86inc: modify ALIGN to not generate long nops on i586 Signed-off-by: Diego Biurrun --- libavutil/x86/x86inc.asm | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index c167057921..42ba97ade5 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -571,6 +571,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ; All subsequent functions (up to the next INIT_CPUFLAGS) is 
built for the specified cpu. ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. %macro INIT_CPUFLAGS 0-2 + CPU amdnop %if %0 >= 1 %xdefine cpuname %1 %assign cpuflags cpuflags_%1 @@ -592,6 +593,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %elifidn %1, sse3 %define movu lddqu %endif + %if notcpuflag(mmx2) + CPU basicnop + %endif %else %xdefine SUFFIX %undef cpuname From 4d4752366f873fde444815b9a0a8f1077073d266 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Wed, 4 Jul 2012 11:07:49 +0000 Subject: [PATCH 14/16] x86inc: add SPLATB_LOAD, SPLATB_REG, PSHUFLW macros Signed-off-by: Diego Biurrun --- libavcodec/x86/vp8dsp.asm | 21 -------------------- libavutil/x86/x86util.asm | 41 +++++++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 82f21fefae..531b205b7b 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -1465,27 +1465,6 @@ VP8_DC_WHT %endif %endmacro -%macro SPLATB_REG 2-3 -%if cpuflag(ssse3) - movd %1, %2d - pshufb %1, %3 -%elif cpuflag(sse2) - movd %1, %2d - punpcklbw %1, %1 - pshuflw %1, %1, 0x0 - punpcklqdq %1, %1 -%elif cpuflag(mmx2) - movd %1, %2d - punpcklbw %1, %1 - pshufw %1, %1, 0x0 -%else - movd %1, %2d - punpcklbw %1, %1 - punpcklwd %1, %1 - punpckldq %1, %1 -%endif -%endmacro - %macro SIMPLE_LOOPFILTER 2 cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr %if mmsize == 8 ; mmx/mmxext diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index 066384b4c7..941ec76084 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -256,15 +256,26 @@ %define ABSB ABSB_MMX %define ABSB2 ABSB2_MMX -%macro SPLATB_MMX 3 +%macro SPLATB_LOAD 3 +%if cpuflag(ssse3) + movd %1, [%2-3] + pshufb %1, %3 +%else movd %1, [%2-3] ;to avoid crossing a cacheline punpcklbw %1, %1 SPLATW %1, %1, 3 +%endif %endmacro -%macro SPLATB_SSSE3 3 - movd %1, [%2-3] 
+%macro SPLATB_REG 3 +%if cpuflag(ssse3) + movd %1, %2d pshufb %1, %3 +%else + movd %1, %2d + punpcklbw %1, %1 + SPLATW %1, %1, 0 +%endif %endmacro %macro PALIGNR_MMX 4-5 ; [dst,] src1, src2, imm, tmp @@ -296,6 +307,14 @@ %endif %endmacro +%macro PSHUFLW 1+ + %if mmsize == 8 + pshufw %1 + %else + pshuflw %1 + %endif +%endmacro + %macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from %ifnum %5 pand m%3, m%5, m%4 ; src .. y6 .. y4 @@ -521,8 +540,22 @@ %if mmsize == 16 pshuflw %1, %2, (%3)*0x55 punpcklqdq %1, %1 -%else +%elif cpuflag(mmx2) pshufw %1, %2, (%3)*0x55 +%else + %ifnidn %1, %2 + mova %1, %2 + %endif + %if %3 & 2 + punpckhwd %1, %1 + %else + punpcklwd %1, %1 + %endif + %if %3 & 1 + punpckhwd %1, %1 + %else + punpcklwd %1, %1 + %endif %endif %endmacro From 878e669029e3c73f95008bc1db3a903a998034f5 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Wed, 4 Jul 2012 15:32:16 +0200 Subject: [PATCH 15/16] x86: h264_intrapred: use newly introduced SPLAT* and PSHUFLW macros --- libavcodec/x86/h264_intrapred.asm | 129 +++++------------------------- 1 file changed, 22 insertions(+), 107 deletions(-) diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index 87d32c4ee3..5984454824 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -103,15 +103,8 @@ cglobal pred16x16_horizontal, 2,3 %else punpcklbw m0, m0 punpcklbw m1, m1 -%if cpuflag(mmx2) - pshufw m0, m0, 0xff - pshufw m1, m1, 0xff -%else - punpckhwd m0, m0 - punpckhwd m1, m1 - punpckhdq m0, m0 - punpckhdq m1, m1 -%endif + SPLATW m0, m0, 3 + SPLATW m1, m1, 3 mova [r0+r1*0+8], m0 mova [r0+r1*1+8], m1 %endif @@ -162,18 +155,8 @@ cglobal pred16x16_dc, 2,7 shr r2d, 5 %if cpuflag(ssse3) pxor m1, m1 - movd m0, r2d - pshufb m0, m1 -%elif cpuflag(sse2) - movd m0, r2d - punpcklbw m0, m0 - pshuflw m0, m0, 0 - punpcklqdq m0, m0 -%elif cpuflag(mmx2) - movd m0, r2d - punpcklbw m0, m0 - pshufw m0, m0, 0 %endif + SPLATB_REG m0, r2d, m1 %if 
mmsize==8 mov r3d, 8 @@ -227,12 +210,7 @@ cglobal pred16x16_tm_vp8, 2,5 movzx r2d, byte [r0+r1-1] sub r2d, r3d movd mm4, r2d -%if cpuflag(mmx2) - pshufw mm4, mm4, 0 -%else - punpcklwd mm4, mm4 - punpckldq mm4, mm4 -%endif + SPLATW mm4, mm4, 0 movq mm5, mm4 movq mm6, mm4 movq mm7, mm4 @@ -332,19 +310,15 @@ cglobal pred16x16_plane_%1, 2,9,7 movhlps m1, m0 %endif paddw m0, m1 -%if cpuflag(sse2) - pshuflw m1, m0, 0xE -%elif cpuflag(mmx2) - pshufw m1, m0, 0xE +%if cpuflag(mmx2) + PSHUFLW m1, m0, 0xE %elif cpuflag(mmx) mova m1, m0 psrlq m1, 32 %endif paddw m0, m1 -%if cpuflag(sse2) - pshuflw m1, m0, 0x1 -%elif cpuflag(mmx2) - pshufw m1, m0, 0x1 +%if cpuflag(mmx2) + PSHUFLW m1, m0, 0x1 %elif cpuflag(mmx) mova m1, m0 psrlq m1, 16 @@ -483,25 +457,9 @@ cglobal pred16x16_plane_%1, 2,9,7 movd m1, r5d movd m3, r3d -%if cpuflag(sse2) - pshuflw m0, m0, 0x0 - pshuflw m1, m1, 0x0 - pshuflw m3, m3, 0x0 - punpcklqdq m0, m0 ; splat H (words) - punpcklqdq m1, m1 ; splat V (words) - punpcklqdq m3, m3 ; splat a (words) -%elif cpuflag(mmx2) - pshufw m0, m0, 0x0 - pshufw m1, m1, 0x0 - pshufw m3, m3, 0x0 -%elif cpuflag(mmx) - punpcklwd m0, m0 - punpcklwd m1, m1 - punpcklwd m3, m3 - punpckldq m0, m0 - punpckldq m1, m1 - punpckldq m3, m3 -%endif + SPLATW m0, m0, 0 ; H + SPLATW m1, m1, 0 ; V + SPLATW m3, m3, 0 ; a %ifidn %1, svq3 SWAP 0, 1 %endif @@ -626,10 +584,8 @@ cglobal pred8x8_plane, 2,9,7 paddw m0, m1 %if notcpuflag(ssse3) -%if cpuflag(sse2) ; mmsize == 16 - pshuflw m1, m0, 0xE -%elif cpuflag(mmx2) - pshufw m1, m0, 0xE +%if cpuflag(mmx2) + PSHUFLW m1, m0, 0xE %elif cpuflag(mmx) mova m1, m0 psrlq m1, 32 @@ -637,10 +593,8 @@ cglobal pred8x8_plane, 2,9,7 paddw m0, m1 %endif ; !ssse3 -%if cpuflag(sse2) - pshuflw m1, m0, 0x1 -%elif cpuflag(mmx2) - pshufw m1, m0, 0x1 +%if cpuflag(mmx2) + PSHUFLW m1, m0, 0x1 %elif cpuflag(mmx) mova m1, m0 psrlq m1, 16 @@ -711,25 +665,9 @@ cglobal pred8x8_plane, 2,9,7 movd m1, r5d movd m3, r3d -%if cpuflag(sse2) - pshuflw m0, m0, 0x0 - pshuflw m1, m1, 0x0 - 
pshuflw m3, m3, 0x0 - punpcklqdq m0, m0 ; splat H (words) - punpcklqdq m1, m1 ; splat V (words) - punpcklqdq m3, m3 ; splat a (words) -%elif cpuflag(mmx2) - pshufw m0, m0, 0x0 - pshufw m1, m1, 0x0 - pshufw m3, m3, 0x0 -%elif cpuflag(mmx) - punpcklwd m0, m0 - punpcklwd m1, m1 - punpcklwd m3, m3 - punpckldq m0, m0 - punpckldq m1, m1 - punpckldq m3, m3 -%endif + SPLATW m0, m0, 0 ; H + SPLATW m1, m1, 0 ; V + SPLATW m3, m3, 0 ; a %if mmsize == 8 mova m2, m0 %endif @@ -815,24 +753,8 @@ cglobal pred8x8_horizontal, 2,3 mova m2, [pb_3] %endif .loop: - movd m0, [r0+r1*0-4] - movd m1, [r0+r1*1-4] -%if cpuflag(ssse3) - pshufb m0, m2 - pshufb m1, m2 -%else - punpcklbw m0, m0 - punpcklbw m1, m1 -%if cpuflag(mmx2) - pshufw m0, m0, 0xff - pshufw m1, m1, 0xff -%else - punpckhwd m0, m0 - punpckhwd m1, m1 - punpckhdq m0, m0 - punpckhdq m1, m1 -%endif -%endif + SPLATB_LOAD m0, r0+r1*0-1, m2 + SPLATB_LOAD m1, r0+r1*1-1, m2 mova [r0+r1*0], m0 mova [r0+r1*1], m1 lea r0, [r0+r1*2] @@ -1000,15 +922,8 @@ cglobal pred8x8_tm_vp8, 2,6 sub r3d, r4d movd mm2, r2d movd mm4, r3d -%if cpuflag(mmx2) - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 -%else - punpcklwd mm2, mm2 - punpcklwd mm4, mm4 - punpckldq mm2, mm2 - punpckldq mm4, mm4 -%endif + SPLATW mm2, mm2, 0 + SPLATW mm4, mm4, 0 movq mm3, mm2 movq mm5, mm4 paddw mm2, mm0 From bb58c43c69078c6cf29a9efee12e14469e2c21f8 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Thu, 5 Jul 2012 13:20:57 +0200 Subject: [PATCH 16/16] qdm2: remove broken and disabled dump_context() debug function --- libavcodec/qdm2.c | 48 ----------------------------------------------- 1 file changed, 48 deletions(-) diff --git a/libavcodec/qdm2.c b/libavcodec/qdm2.c index 64a821acdc..4b5ef5cd00 100644 --- a/libavcodec/qdm2.c +++ b/libavcodec/qdm2.c @@ -1672,51 +1672,6 @@ static av_cold void qdm2_init(QDM2Context *q) { } -#if 0 -static void dump_context(QDM2Context *q) -{ - int i; -#define PRINT(a,b) av_log(NULL,AV_LOG_DEBUG," %s = %d\n", a, b); - 
PRINT("compressed_data",q->compressed_data); - PRINT("compressed_size",q->compressed_size); - PRINT("frame_size",q->frame_size); - PRINT("checksum_size",q->checksum_size); - PRINT("channels",q->channels); - PRINT("nb_channels",q->nb_channels); - PRINT("fft_size",q->fft_size); - PRINT("sub_sampling",q->sub_sampling); - PRINT("fft_order",q->fft_order); - PRINT("group_order",q->group_order); - PRINT("group_size",q->group_size); - PRINT("sub_packet",q->sub_packet); - PRINT("frequency_range",q->frequency_range); - PRINT("has_errors",q->has_errors); - PRINT("fft_tone_end",q->fft_tone_end); - PRINT("fft_tone_start",q->fft_tone_start); - PRINT("fft_coefs_index",q->fft_coefs_index); - PRINT("coeff_per_sb_select",q->coeff_per_sb_select); - PRINT("cm_table_select",q->cm_table_select); - PRINT("noise_idx",q->noise_idx); - - for (i = q->fft_tone_start; i < q->fft_tone_end; i++) - { - FFTTone *t = &q->fft_tones[i]; - - av_log(NULL,AV_LOG_DEBUG,"Tone (%d) dump:\n", i); - av_log(NULL,AV_LOG_DEBUG," level = %f\n", t->level); -// PRINT(" level", t->level); - PRINT(" phase", t->phase); - PRINT(" phase_shift", t->phase_shift); - PRINT(" duration", t->duration); - PRINT(" samples_im", t->samples_im); - PRINT(" samples_re", t->samples_re); - PRINT(" table", t->table); - } - -} -#endif - - /** * Init parameters from codec extradata */ @@ -1895,7 +1850,6 @@ static av_cold int qdm2_decode_init(AVCodecContext *avctx) avcodec_get_frame_defaults(&s->frame); avctx->coded_frame = &s->frame; -// dump_context(s); return 0; } @@ -1919,8 +1873,6 @@ static int qdm2_decode (QDM2Context *q, const uint8_t *in, int16_t *out) q->compressed_data = in; q->compressed_size = q->checksum_size; -// dump_context(q); - /* copy old block, clear new block of output samples */ memmove(q->output_buffer, &q->output_buffer[frame_size], frame_size * sizeof(float)); memset(&q->output_buffer[frame_size], 0, frame_size * sizeof(float));