From d837407ae0bcdfe676713c16871daadddc99649f Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 21 Apr 2015 20:54:51 -0400 Subject: [PATCH] vp9: add support for resolution changes in inter frames. Signed-off-by: Michael Niedermayer (cherry picked from commit e8b4f6d6befc5062db74916ea8a4d830e83022a8) Signed-off-by: Andreas Cadhalpun --- libavcodec/vp9.c | 316 ++++++++++++++++++----------------- libavcodec/vp9_mc_template.c | 171 +++++++++++++++++++ libavcodec/vp9_parser.c | 5 +- libavcodec/vp9dsp.c | 205 +++++++++++++++++++++-- libavcodec/vp9dsp.h | 9 + 5 files changed, 535 insertions(+), 171 deletions(-) create mode 100644 libavcodec/vp9_mc_template.c diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c index 8327134d37..dd75e9e79d 100644 --- a/libavcodec/vp9.c +++ b/libavcodec/vp9.c @@ -239,7 +239,7 @@ typedef struct VP9Context { // whole-frame cache uint8_t *intra_pred_data[3]; struct VP9Filter *lflvl; - DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80]; + DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135*144]; // block reconstruction intermediates int block_alloc_using_2pass; @@ -248,6 +248,8 @@ typedef struct VP9Context { struct { int x, y; } min_mv, max_mv; DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64]; DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32]; + uint16_t mvscale[3][2]; + uint8_t mvstep[3][2]; } VP9Context; static const uint8_t bwh_tab[2][N_BS_SIZES][2] = { @@ -582,6 +584,26 @@ static int decode_frame_header(AVCodecContext *ctx, s->varcompref[1] = 2; } } + + for (i = 0; i < 3; i++) { + AVFrame *ref = s->refs[s->refidx[i]].f; + int refw = ref->width, refh = ref->height; + + if (refw == w && refh == h) { + s->mvscale[i][0] = s->mvscale[i][1] = 0; + } else { + if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) { + av_log(ctx, AV_LOG_ERROR, + "Invalid ref frame dimensions %dx%d for frame size %dx%d\n", + refw, refh, w, h); + return AVERROR_INVALIDDATA; + } + s->mvscale[i][0] = (refw << 14) / w; + s->mvscale[i][1] = (refh << 14) / h; + 
s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14; + s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14; + } + } } } s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb); @@ -2524,12 +2546,118 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off) } } -static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2], - uint8_t *dst, ptrdiff_t dst_stride, - const uint8_t *ref, ptrdiff_t ref_stride, - ThreadFrame *ref_frame, - ptrdiff_t y, ptrdiff_t x, const VP56mv *mv, - int bw, int bh, int w, int h) +static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc, + uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *ref, ptrdiff_t ref_stride, + ThreadFrame *ref_frame, + ptrdiff_t y, ptrdiff_t x, const VP56mv *mv, + int bw, int bh, int w, int h, + const uint16_t *scale, const uint8_t *step) +{ +#define scale_mv(n, dim) (((int64_t)n * scale[dim]) >> 14) + // BUG libvpx seems to scale the two components separately. This introduces + // rounding errors but we have to reproduce them to be exactly compatible + // with the output from libvpx... 
+ int mx = scale_mv(mv->x * 2, 0) + scale_mv(x * 16, 0); + int my = scale_mv(mv->y * 2, 1) + scale_mv(y * 16, 1); + int refbw_m1, refbh_m1; + int th; + + y = my >> 4; + x = mx >> 4; + ref += y * ref_stride + x; + mx &= 15; + my &= 15; + refbw_m1 = ((bw - 1) * step[0] + mx) >> 4; + refbh_m1 = ((bh - 1) * step[1] + my) >> 4; + // FIXME bilinear filter only needs 0/1 pixels, not 3/4 + // we use +7 because the last 7 pixels of each sbrow can be changed in + // the longest loopfilter of the next sbrow + th = (y + refbh_m1 + 4 + 7) >> 6; + ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0); + if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) { + s->vdsp.emulated_edge_mc(s->edge_emu_buffer, + ref - 3 * ref_stride - 3, + 144, ref_stride, + refbw_m1 + 8, refbh_m1 + 8, + x - 3, y - 3, w, h); + ref = s->edge_emu_buffer + 3 * 144 + 3; + ref_stride = 144; + } + smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]); +} + +static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc, + uint8_t *dst_u, uint8_t *dst_v, + ptrdiff_t dst_stride, + const uint8_t *ref_u, ptrdiff_t src_stride_u, + const uint8_t *ref_v, ptrdiff_t src_stride_v, + ThreadFrame *ref_frame, + ptrdiff_t y, ptrdiff_t x, const VP56mv *mv, + int bw, int bh, int w, int h, + const uint16_t *scale, const uint8_t *step) +{ + // BUG https://code.google.com/p/webm/issues/detail?id=820 + int mx = scale_mv(mv->x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15); + int my = scale_mv(mv->y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15); +#undef scale_mv + int refbw_m1, refbh_m1; + int th; + + y = my >> 4; + x = mx >> 4; + ref_u += y * src_stride_u + x; + ref_v += y * src_stride_v + x; + mx &= 15; + my &= 15; + refbw_m1 = ((bw - 1) * step[0] + mx) >> 4; + refbh_m1 = ((bh - 1) * step[1] + my) >> 4; + // FIXME bilinear filter only needs 0/1 pixels, not 3/4 + // we use +7 because the last 7 pixels of each sbrow can be changed in + // the 
longest loopfilter of the next sbrow + th = (y + refbh_m1 + 4 + 7) >> 5; + ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0); + if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) { + s->vdsp.emulated_edge_mc(s->edge_emu_buffer, + ref_u - 3 * src_stride_u - 3, + 144, src_stride_u, + refbw_m1 + 8, refbh_m1 + 8, + x - 3, y - 3, w, h); + ref_u = s->edge_emu_buffer + 3 * 144 + 3; + smc(dst_u, dst_stride, ref_u, 144, bh, mx, my, step[0], step[1]); + + s->vdsp.emulated_edge_mc(s->edge_emu_buffer, + ref_v - 3 * src_stride_v - 3, + 144, src_stride_v, + refbw_m1 + 8, refbh_m1 + 8, + x - 3, y - 3, w, h); + ref_v = s->edge_emu_buffer + 3 * 144 + 3; + smc(dst_v, dst_stride, ref_v, 144, bh, mx, my, step[0], step[1]); + } else { + smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]); + smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]); + } +} + +#define FN(x) x##_scaled +#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, bw, bh, w, h, i) \ + mc_luma_scaled(s, s->dsp.s##mc, dst, dst_ls, src, src_ls, tref, row, col, \ + mv, bw, bh, w, h, s->mvscale[b->ref[i]], s->mvstep[b->ref[i]]) +#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \ + row, col, mv, bw, bh, w, h, i) \ + mc_chroma_scaled(s, s->dsp.s##mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \ + row, col, mv, bw, bh, w, h, s->mvscale[b->ref[i]], s->mvstep[b->ref[i]]) +#include "vp9_mc_template.c" +#undef mc_luma_dir +#undef mc_chroma_dir +#undef FN + +static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2], + uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *ref, ptrdiff_t ref_stride, + ThreadFrame *ref_frame, + ptrdiff_t y, ptrdiff_t x, const VP56mv *mv, + int bw, int bh, int w, int h) { int mx = mv->x, my = mv->y, th; @@ -2556,14 +2684,14 @@ static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2], mc[!!mx][!!my](dst, dst_stride, ref, 
ref_stride, bh, mx << 1, my << 1); } -static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2], - uint8_t *dst_u, uint8_t *dst_v, - ptrdiff_t dst_stride, - const uint8_t *ref_u, ptrdiff_t src_stride_u, - const uint8_t *ref_v, ptrdiff_t src_stride_v, - ThreadFrame *ref_frame, - ptrdiff_t y, ptrdiff_t x, const VP56mv *mv, - int bw, int bh, int w, int h) +static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2], + uint8_t *dst_u, uint8_t *dst_v, + ptrdiff_t dst_stride, + const uint8_t *ref_u, ptrdiff_t src_stride_u, + const uint8_t *ref_v, ptrdiff_t src_stride_v, + ThreadFrame *ref_frame, + ptrdiff_t y, ptrdiff_t x, const VP56mv *mv, + int bw, int bh, int w, int h) { int mx = mv->x, my = mv->y, th; @@ -2601,156 +2729,32 @@ static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2], } } +#define FN(x) x +#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, bw, bh, w, h, i) \ + mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \ + mv, bw, bh, w, h) +#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \ + row, col, mv, bw, bh, w, h, i) \ + mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \ + row, col, mv, bw, bh, w, h) +#include "vp9_mc_template.c" +#undef mc_luma_dir +#undef mc_chroma_dir +#undef FN + static void inter_recon(AVCodecContext *ctx) { - static const uint8_t bwlog_tab[2][N_BS_SIZES] = { - { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 }, - { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 }, - }; VP9Context *s = ctx->priv_data; VP9Block *b = s->b; int row = s->row, col = s->col; - ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2; - AVFrame *ref1 = tref1->f, *ref2; - int w1 = ref1->width, h1 = ref1->height, w2, h2; - ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride; - if (b->comp) { - tref2 = &s->refs[s->refidx[b->ref[1]]]; - ref2 = tref2->f; - w2 = ref2->width; - h2 = 
ref2->height; - } - - // y inter pred - if (b->bs > BS_8x8) { - if (b->bs == BS_8x4) { - mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y, - ref1->data[0], ref1->linesize[0], tref1, - row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1); - mc_luma_dir(s, s->dsp.mc[3][b->filter][0], - s->dst[0] + 4 * ls_y, ls_y, - ref1->data[0], ref1->linesize[0], tref1, - (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1); - - if (b->comp) { - mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y, - ref2->data[0], ref2->linesize[0], tref2, - row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2); - mc_luma_dir(s, s->dsp.mc[3][b->filter][1], - s->dst[0] + 4 * ls_y, ls_y, - ref2->data[0], ref2->linesize[0], tref2, - (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2); - } - } else if (b->bs == BS_4x8) { - mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y, - ref1->data[0], ref1->linesize[0], tref1, - row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1); - mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y, - ref1->data[0], ref1->linesize[0], tref1, - row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1); - - if (b->comp) { - mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y, - ref2->data[0], ref2->linesize[0], tref2, - row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2); - mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y, - ref2->data[0], ref2->linesize[0], tref2, - row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2); - } - } else { - av_assert2(b->bs == BS_4x4); - - // FIXME if two horizontally adjacent blocks have the same MV, - // do a w8 instead of a w4 call - mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y, - ref1->data[0], ref1->linesize[0], tref1, - row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1); - mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y, - ref1->data[0], ref1->linesize[0], tref1, - row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1); - mc_luma_dir(s, s->dsp.mc[4][b->filter][0], - s->dst[0] + 4 * 
ls_y, ls_y, - ref1->data[0], ref1->linesize[0], tref1, - (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1); - mc_luma_dir(s, s->dsp.mc[4][b->filter][0], - s->dst[0] + 4 * ls_y + 4, ls_y, - ref1->data[0], ref1->linesize[0], tref1, - (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1); - - if (b->comp) { - mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y, - ref2->data[0], ref2->linesize[0], tref2, - row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2); - mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y, - ref2->data[0], ref2->linesize[0], tref2, - row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2); - mc_luma_dir(s, s->dsp.mc[4][b->filter][1], - s->dst[0] + 4 * ls_y, ls_y, - ref2->data[0], ref2->linesize[0], tref2, - (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2); - mc_luma_dir(s, s->dsp.mc[4][b->filter][1], - s->dst[0] + 4 * ls_y + 4, ls_y, - ref2->data[0], ref2->linesize[0], tref2, - (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2); - } - } + if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) { + inter_pred_scaled(ctx); } else { - int bwl = bwlog_tab[0][b->bs]; - int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4; - - mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y, - ref1->data[0], ref1->linesize[0], tref1, - row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1); - - if (b->comp) - mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y, - ref2->data[0], ref2->linesize[0], tref2, - row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2); + inter_pred(ctx); } - - // uv inter pred - { - int bwl = bwlog_tab[1][b->bs]; - int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4; - VP56mv mvuv; - - w1 = (w1 + 1) >> 1; - h1 = (h1 + 1) >> 1; - if (b->comp) { - w2 = (w2 + 1) >> 1; - h2 = (h2 + 1) >> 1; - } - if (b->bs > BS_8x8) { - mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4); - mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + 
b->mv[2][0].y + b->mv[3][0].y, 4); - } else { - mvuv = b->mv[0][0]; - } - - mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0], - s->dst[1], s->dst[2], ls_uv, - ref1->data[1], ref1->linesize[1], - ref1->data[2], ref1->linesize[2], tref1, - row << 2, col << 2, &mvuv, bw, bh, w1, h1); - - if (b->comp) { - if (b->bs > BS_8x8) { - mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4); - mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4); - } else { - mvuv = b->mv[0][1]; - } - mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1], - s->dst[1], s->dst[2], ls_uv, - ref2->data[1], ref2->linesize[1], - ref2->data[2], ref2->linesize[2], tref2, - row << 2, col << 2, &mvuv, bw, bh, w2, h2); - } - } - if (!b->skip) { - /* mostly copied intra_reconn() */ + /* mostly copied intra_recon() */ int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n; int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2); diff --git a/libavcodec/vp9_mc_template.c b/libavcodec/vp9_mc_template.c new file mode 100644 index 0000000000..c6ae432e26 --- /dev/null +++ b/libavcodec/vp9_mc_template.c @@ -0,0 +1,171 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje + * Copyright (C) 2013 Clément Bœsch + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +static void FN(inter_pred)(AVCodecContext *ctx) +{ + static const uint8_t bwlog_tab[2][N_BS_SIZES] = { + { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 }, + { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 }, + }; + VP9Context *s = ctx->priv_data; + VP9Block *b = s->b; + int row = s->row, col = s->col; + ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2; + AVFrame *ref1 = tref1->f, *ref2; + int w1 = ref1->width, h1 = ref1->height, w2, h2; + ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride; + + if (b->comp) { + tref2 = &s->refs[s->refidx[b->ref[1]]]; + ref2 = tref2->f; + w2 = ref2->width; + h2 = ref2->height; + } + + // y inter pred + if (b->bs > BS_8x8) { + if (b->bs == BS_8x4) { + mc_luma_dir(s, mc[3][b->filter][0], s->dst[0], ls_y, + ref1->data[0], ref1->linesize[0], tref1, + row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1, 0); + mc_luma_dir(s, mc[3][b->filter][0], + s->dst[0] + 4 * ls_y, ls_y, + ref1->data[0], ref1->linesize[0], tref1, + (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1, 0); + + if (b->comp) { + mc_luma_dir(s, mc[3][b->filter][1], s->dst[0], ls_y, + ref2->data[0], ref2->linesize[0], tref2, + row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2, 1); + mc_luma_dir(s, mc[3][b->filter][1], + s->dst[0] + 4 * ls_y, ls_y, + ref2->data[0], ref2->linesize[0], tref2, + (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2, 1); + } + } else if (b->bs == BS_4x8) { + mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y, + ref1->data[0], ref1->linesize[0], tref1, + row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1, 0); + mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4, ls_y, + ref1->data[0], ref1->linesize[0], tref1, + row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1, 0); + + if (b->comp) { + mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], 
ls_y, + ref2->data[0], ref2->linesize[0], tref2, + row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2, 1); + mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4, ls_y, + ref2->data[0], ref2->linesize[0], tref2, + row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2, 1); + } + } else { + av_assert2(b->bs == BS_4x4); + + // FIXME if two horizontally adjacent blocks have the same MV, + // do a w8 instead of a w4 call + mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y, + ref1->data[0], ref1->linesize[0], tref1, + row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1, 0); + mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4, ls_y, + ref1->data[0], ref1->linesize[0], tref1, + row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1, 0); + mc_luma_dir(s, mc[4][b->filter][0], + s->dst[0] + 4 * ls_y, ls_y, + ref1->data[0], ref1->linesize[0], tref1, + (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1, 0); + mc_luma_dir(s, mc[4][b->filter][0], + s->dst[0] + 4 * ls_y + 4, ls_y, + ref1->data[0], ref1->linesize[0], tref1, + (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1, 0); + + if (b->comp) { + mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y, + ref2->data[0], ref2->linesize[0], tref2, + row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2, 1); + mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4, ls_y, + ref2->data[0], ref2->linesize[0], tref2, + row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2, 1); + mc_luma_dir(s, mc[4][b->filter][1], + s->dst[0] + 4 * ls_y, ls_y, + ref2->data[0], ref2->linesize[0], tref2, + (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2, 1); + mc_luma_dir(s, mc[4][b->filter][1], + s->dst[0] + 4 * ls_y + 4, ls_y, + ref2->data[0], ref2->linesize[0], tref2, + (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2, 1); + } + } + } else { + int bwl = bwlog_tab[0][b->bs]; + int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4; + + mc_luma_dir(s, mc[bwl][b->filter][0], s->dst[0], ls_y, + ref1->data[0], ref1->linesize[0], tref1, + row << 3, 
col << 3, &b->mv[0][0],bw, bh, w1, h1, 0); + + if (b->comp) + mc_luma_dir(s, mc[bwl][b->filter][1], s->dst[0], ls_y, + ref2->data[0], ref2->linesize[0], tref2, + row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2, 1); + } + + // uv inter pred + { + int bwl = bwlog_tab[1][b->bs]; + int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4; + VP56mv mvuv; + + w1 = (w1 + 1) >> 1; + h1 = (h1 + 1) >> 1; + if (b->comp) { + w2 = (w2 + 1) >> 1; + h2 = (h2 + 1) >> 1; + } + if (b->bs > BS_8x8) { + mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4); + mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4); + } else { + mvuv = b->mv[0][0]; + } + + mc_chroma_dir(s, mc[bwl][b->filter][0], + s->dst[1], s->dst[2], ls_uv, + ref1->data[1], ref1->linesize[1], + ref1->data[2], ref1->linesize[2], tref1, + row << 2, col << 2, &mvuv, bw, bh, w1, h1, 0); + + if (b->comp) { + if (b->bs > BS_8x8) { + mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4); + mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4); + } else { + mvuv = b->mv[0][1]; + } + mc_chroma_dir(s, mc[bwl][b->filter][1], + s->dst[1], s->dst[2], ls_uv, + ref2->data[1], ref2->linesize[1], + ref2->data[2], ref2->linesize[2], tref2, + row << 2, col << 2, &mvuv, bw, bh, w2, h2, 1); + } + } +} diff --git a/libavcodec/vp9_parser.c b/libavcodec/vp9_parser.c index b188785456..8e55d2593e 100644 --- a/libavcodec/vp9_parser.c +++ b/libavcodec/vp9_parser.c @@ -1,5 +1,8 @@ /* - * Copyright (C) 2008 Michael Niedermayer + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje + * Copyright (C) 2013 Clément Bœsch * * This file is part of FFmpeg. 
* diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c index 6356adde32..95b7eb5c71 100644 --- a/libavcodec/vp9dsp.c +++ b/libavcodec/vp9dsp.c @@ -1707,8 +1707,9 @@ copy_avg_fn(4) #undef fpel_fn #undef copy_avg_fn -static const int8_t vp9_subpel_filters[3][15][8] = { +static const int16_t vp9_subpel_filters[3][16][8] = { [FILTER_8TAP_REGULAR] = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 1, -5, 126, 8, -3, 1, 0 }, { -1, 3, -10, 122, 18, -6, 2, 0 }, { -1, 4, -13, 118, 27, -9, 3, -1 }, @@ -1725,6 +1726,7 @@ static const int8_t vp9_subpel_filters[3][15][8] = { { 0, 2, -6, 18, 122, -10, 3, -1 }, { 0, 1, -3, 8, 126, -5, 1, 0 }, }, [FILTER_8TAP_SHARP] = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { -1, 3, -7, 127, 8, -3, 1, 0 }, { -2, 5, -13, 125, 17, -6, 3, -1 }, { -3, 7, -17, 121, 27, -10, 5, -2 }, @@ -1741,6 +1743,7 @@ static const int8_t vp9_subpel_filters[3][15][8] = { { -1, 3, -6, 17, 125, -13, 5, -2 }, { 0, 1, -3, 8, 127, -7, 3, -1 }, }, [FILTER_8TAP_SMOOTH] = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { -3, -1, 32, 64, 38, 1, -3, 0 }, { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 26, 63, 43, 4, -4, 0 }, @@ -1772,7 +1775,7 @@ static const int8_t vp9_subpel_filters[3][15][8] = { static av_always_inline void do_8tap_1d_c(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int w, int h, ptrdiff_t ds, - const int8_t *filter, int avg) + const int16_t *filter, int avg) { do { int x; @@ -1792,7 +1795,7 @@ static av_always_inline void do_8tap_1d_c(uint8_t *dst, ptrdiff_t dst_stride, #define filter_8tap_1d_fn(opn, opa, dir, ds) \ static av_noinline void opn##_8tap_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \ const uint8_t *src, ptrdiff_t src_stride, \ - int w, int h, const int8_t *filter) \ + int w, int h, const int16_t *filter) \ { \ do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); \ } @@ -1806,8 +1809,8 @@ filter_8tap_1d_fn(avg, 1, h, 1) static av_always_inline void do_8tap_2d_c(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, 
ptrdiff_t src_stride, - int w, int h, const int8_t *filterx, - const int8_t *filtery, int avg) + int w, int h, const int16_t *filterx, + const int16_t *filtery, int avg) { int tmp_h = h + 7; uint8_t tmp[64 * 71], *tmp_ptr = tmp; @@ -1842,8 +1845,8 @@ static av_always_inline void do_8tap_2d_c(uint8_t *dst, ptrdiff_t dst_stride, #define filter_8tap_2d_fn(opn, opa) \ static av_noinline void opn##_8tap_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \ const uint8_t *src, ptrdiff_t src_stride, \ - int w, int h, const int8_t *filterx, \ - const int8_t *filtery) \ + int w, int h, const int16_t *filterx, \ + const int16_t *filtery) \ { \ do_8tap_2d_c(dst, dst_stride, src, src_stride, w, h, filterx, filtery, opa); \ } @@ -1853,15 +1856,13 @@ filter_8tap_2d_fn(avg, 1) #undef filter_8tap_2d_fn -#undef FILTER_8TAP - #define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \ static void avg##_8tap_##type##_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \ const uint8_t *src, ptrdiff_t src_stride, \ int h, int mx, int my) \ { \ avg##_8tap_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, \ - vp9_subpel_filters[type_idx][dir_m - 1]); \ + vp9_subpel_filters[type_idx][dir_m]); \ } #define filter_fn_2d(sz, type, type_idx, avg) \ @@ -1870,8 +1871,8 @@ static void avg##_8tap_##type##_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \ int h, int mx, int my) \ { \ avg##_8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, \ - vp9_subpel_filters[type_idx][mx - 1], \ - vp9_subpel_filters[type_idx][my - 1]); \ + vp9_subpel_filters[type_idx][mx], \ + vp9_subpel_filters[type_idx][my]); \ } #define FILTER_BILIN(src, x, mxy, stride) \ @@ -1957,8 +1958,6 @@ bilin_2d_fn(avg, 1) #undef bilin_2d_fn -#undef FILTER_BILIN - #define bilinf_fn_1d(sz, dir, dir_m, avg) \ static void avg##_bilin_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \ const uint8_t *src, ptrdiff_t src_stride, \ @@ -2053,12 +2052,190 @@ static av_cold void vp9dsp_mc_init(VP9DSPContext *dsp) #undef init_subpel3 } +static 
av_always_inline void do_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int w, int h, int mx, int my, + int dx, int dy, int avg, + const int16_t (*filters)[8]) +{ + int tmp_h = (((h - 1) * dy + my) >> 4) + 8; + uint8_t tmp[64 * 135], *tmp_ptr = tmp; + + src -= src_stride * 3; + do { + int x; + int imx = mx, ioff = 0; + + for (x = 0; x < w; x++) { + tmp_ptr[x] = FILTER_8TAP(src, ioff, filters[imx], 1); + imx += dx; + ioff += imx >> 4; + imx &= 0xf; + } + + tmp_ptr += 64; + src += src_stride; + } while (--tmp_h); + + tmp_ptr = tmp + 64 * 3; + do { + int x; + const int16_t *filter = filters[my]; + + for (x = 0; x < w; x++) + if (avg) { + dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filter, 64) + 1) >> 1; + } else { + dst[x] = FILTER_8TAP(tmp_ptr, x, filter, 64); + } + + my += dy; + tmp_ptr += (my >> 4) * 64; + my &= 0xf; + dst += dst_stride; + } while (--h); +} + +#define scaled_filter_8tap_fn(opn, opa) \ +static av_noinline void opn##_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int w, int h, int mx, int my, int dx, int dy, \ + const int16_t (*filters)[8]) \ +{ \ + do_scaled_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \ + opa, filters); \ +} + +scaled_filter_8tap_fn(put, 0) +scaled_filter_8tap_fn(avg, 1) + +#undef scaled_filter_8tap_fn + +#undef FILTER_8TAP + +#define scaled_filter_fn(sz, type, type_idx, avg) \ +static void avg##_scaled_##type##_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my, int dx, int dy) \ +{ \ + avg##_scaled_8tap_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy, \ + vp9_subpel_filters[type_idx]); \ +} + +static av_always_inline void do_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int w, int h, int mx, int my, + int dx, int dy, int avg) +{ + uint8_t tmp[64 * 129], *tmp_ptr = tmp; + int tmp_h = (((h 
- 1) * dy + my) >> 4) + 2; + + do { + int x; + int imx = mx, ioff = 0; + + for (x = 0; x < w; x++) { + tmp_ptr[x] = FILTER_BILIN(src, ioff, imx, 1); + imx += dx; + ioff += imx >> 4; + imx &= 0xf; + } + + tmp_ptr += 64; + src += src_stride; + } while (--tmp_h); + + tmp_ptr = tmp; + do { + int x; + + for (x = 0; x < w; x++) + if (avg) { + dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1; + } else { + dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64); + } + + my += dy; + tmp_ptr += (my >> 4) * 64; + my &= 0xf; + dst += dst_stride; + } while (--h); +} + +#define scaled_bilin_fn(opn, opa) \ +static av_noinline void opn##_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int w, int h, int mx, int my, int dx, int dy) \ +{ \ + do_scaled_bilin_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, opa); \ +} + +scaled_bilin_fn(put, 0) +scaled_bilin_fn(avg, 1) + +#undef scaled_bilin_fn + +#undef FILTER_BILIN + +#define scaled_bilinf_fn(sz, avg) \ +static void avg##_scaled_bilin_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my, int dx, int dy) \ +{ \ + avg##_scaled_bilin_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy); \ +} + +#define scaled_filter_fns(sz, avg) \ +scaled_filter_fn(sz, regular, FILTER_8TAP_REGULAR, avg) \ +scaled_filter_fn(sz, smooth, FILTER_8TAP_SMOOTH, avg) \ +scaled_filter_fn(sz, sharp, FILTER_8TAP_SHARP, avg) \ +scaled_bilinf_fn(sz, avg) + +#define scaled_filter_fn_set(avg) \ +scaled_filter_fns(64, avg) \ +scaled_filter_fns(32, avg) \ +scaled_filter_fns(16, avg) \ +scaled_filter_fns(8, avg) \ +scaled_filter_fns(4, avg) + +scaled_filter_fn_set(put) +scaled_filter_fn_set(avg) + +#undef scaled_filter_fns +#undef scaled_filter_fn_set +#undef scaled_filter_fn +#undef scaled_bilinf_fn + +static av_cold void vp9dsp_scaled_mc_init(VP9DSPContext *dsp) +{ +#define init_scaled(idx1, idx2, sz, type) \ + 
dsp->smc[idx1][FILTER_8TAP_SMOOTH ][idx2] = type##_scaled_smooth_##sz##_c; \ + dsp->smc[idx1][FILTER_8TAP_REGULAR][idx2] = type##_scaled_regular_##sz##_c; \ + dsp->smc[idx1][FILTER_8TAP_SHARP ][idx2] = type##_scaled_sharp_##sz##_c; \ + dsp->smc[idx1][FILTER_BILINEAR ][idx2] = type##_scaled_bilin_##sz##_c + +#define init_scaled_put_avg(idx, sz) \ + init_scaled(idx, 0, sz, put); \ + init_scaled(idx, 1, sz, avg) + + init_scaled_put_avg(0, 64); + init_scaled_put_avg(1, 32); + init_scaled_put_avg(2, 16); + init_scaled_put_avg(3, 8); + init_scaled_put_avg(4, 4); + +#undef init_scaled_put_avg +#undef init_scaled +} + av_cold void ff_vp9dsp_init(VP9DSPContext *dsp) { vp9dsp_intrapred_init(dsp); vp9dsp_itxfm_init(dsp); vp9dsp_loopfilter_init(dsp); vp9dsp_mc_init(dsp); + vp9dsp_scaled_mc_init(dsp); if (ARCH_X86) ff_vp9dsp_init_x86(dsp); } diff --git a/libavcodec/vp9dsp.h b/libavcodec/vp9dsp.h index db0a92e210..33dfc09acd 100644 --- a/libavcodec/vp9dsp.h +++ b/libavcodec/vp9dsp.h @@ -32,6 +32,9 @@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *ref, ptrdiff_t ref_stride, int h, int mx, int my); +typedef void (*vp9_scaled_mc_func)(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *ref, ptrdiff_t ref_stride, + int h, int mx, int my, int dx, int dy); typedef struct VP9DSPContext { /* @@ -109,6 +112,12 @@ typedef struct VP9DSPContext { * dst/stride are aligned by hsize */ vp9_mc_func mc[5][4][2][2][2]; + + /* + * for scalable MC, first 3 dimensions identical to above, the other two + * don't exist since it changes per stepsize. + */ + vp9_scaled_mc_func smc[5][4][2]; } VP9DSPContext; void ff_vp9dsp_init(VP9DSPContext *dsp);