vp9: do unscaled MC in scaled path if size of this reference matches.

This can happen if we do bidirectional MC, where one reference has the same size as the current frame, but the other one doesn't.
2015-09-04 15:03:30 -04:00 · 2015-09-04 15:03:30 -04:00 · 9cdeb105a6
parent 873dbc6758
commit 9cdeb105a6
1 changed files with 157 additions and 142 deletions
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@ -2766,146 +2766,6 @@ static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv
    intra_recon(ctx, y_off, uv_off, 2);
 }

-static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
-                                            uint8_t *dst, ptrdiff_t dst_stride,
-                                            const uint8_t *ref, ptrdiff_t ref_stride,
-                                            ThreadFrame *ref_frame,
-                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
-                                            int px, int py, int pw, int ph,
-                                            int bw, int bh, int w, int h, int bytesperpixel,
-                                            const uint16_t *scale, const uint8_t *step)
-{
-#define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
-    int mx, my;
-    int refbw_m1, refbh_m1;
-    int th;
-    VP56mv mv;
-
-    mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
-    mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
-    // BUG libvpx seems to scale the two components separately. This introduces
-    // rounding errors but we have to reproduce them to be exactly compatible
-    // with the output from libvpx...
-    mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
-    my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
-
-    y = my >> 4;
-    x = mx >> 4;
-    ref += y * ref_stride + x * bytesperpixel;
-    mx &= 15;
-    my &= 15;
-    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
-    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
-    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
-    // we use +7 because the last 7 pixels of each sbrow can be changed in
-    // the longest loopfilter of the next sbrow
-    th = (y + refbh_m1 + 4 + 7) >> 6;
-    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
-    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                                 ref - 3 * ref_stride - 3 * bytesperpixel,
-                                 288, ref_stride,
-                                 refbw_m1 + 8, refbh_m1 + 8,
-                                 x - 3, y - 3, w, h);
-        ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
-        ref_stride = 288;
-    }
-    smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
-}
-
-static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
-                                              uint8_t *dst_u, uint8_t *dst_v,
-                                              ptrdiff_t dst_stride,
-                                              const uint8_t *ref_u, ptrdiff_t src_stride_u,
-                                              const uint8_t *ref_v, ptrdiff_t src_stride_v,
-                                              ThreadFrame *ref_frame,
-                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
-                                              int px, int py, int pw, int ph,
-                                              int bw, int bh, int w, int h, int bytesperpixel,
-                                              const uint16_t *scale, const uint8_t *step)
-{
-    int mx, my;
-    int refbw_m1, refbh_m1;
-    int th;
-    VP56mv mv;
-
-    if (s->ss_h) {
-        // BUG https://code.google.com/p/webm/issues/detail?id=820
-        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
-        mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
-    } else {
-        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
-        mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
-    }
-    if (s->ss_v) {
-        // BUG https://code.google.com/p/webm/issues/detail?id=820
-        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
-        my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
-    } else {
-        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
-        my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
-    }
-#undef scale_mv
-    y = my >> 4;
-    x = mx >> 4;
-    ref_u += y * src_stride_u + x * bytesperpixel;
-    ref_v += y * src_stride_v + x * bytesperpixel;
-    mx &= 15;
-    my &= 15;
-    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
-    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
-    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
-    // we use +7 because the last 7 pixels of each sbrow can be changed in
-    // the longest loopfilter of the next sbrow
-    th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
-    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
-    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                                 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
-                                 288, src_stride_u,
-                                 refbw_m1 + 8, refbh_m1 + 8,
-                                 x - 3, y - 3, w, h);
-        ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
-        smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
-
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                                 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
-                                 288, src_stride_v,
-                                 refbw_m1 + 8, refbh_m1 + 8,
-                                 x - 3, y - 3, w, h);
-        ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
-        smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
-    } else {
-        smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
-        smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
-    }
-}
-
-#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
-                    px, py, pw, ph, bw, bh, w, h, i) \
-    mc_luma_scaled(s, s->dsp.s##mc, dst, dst_ls, src, src_ls, tref, row, col, \
-                   mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
-                   s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
-#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
-                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
-    mc_chroma_scaled(s, s->dsp.s##mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
-                     row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
-                     s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
-#define SCALED 1
-#define FN(x) x##_scaled_8bpp
-#define BYTES_PER_PIXEL 1
-#include "vp9_mc_template.c"
-#undef FN
-#undef BYTES_PER_PIXEL
-#define FN(x) x##_scaled_16bpp
-#define BYTES_PER_PIXEL 2
-#include "vp9_mc_template.c"
-#undef mc_luma_dir
-#undef mc_chroma_dir
-#undef FN
-#undef BYTES_PER_PIXEL
-#undef SCALED
-
 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
                                              uint8_t *dst, ptrdiff_t dst_stride,
                                              const uint8_t *ref, ptrdiff_t ref_stride,
@ -3000,8 +2860,163 @@ static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)
 #define FN(x) x##_16bpp
 #define BYTES_PER_PIXEL 2
 #include "vp9_mc_template.c"
-#undef mc_luma_dir_dir
-#undef mc_chroma_dir_dir
+#undef mc_luma_dir
+#undef mc_chroma_dir
+#undef FN
+#undef BYTES_PER_PIXEL
+#undef SCALED
+
+static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
+                                            vp9_mc_func (*mc)[2],
+                                            uint8_t *dst, ptrdiff_t dst_stride,
+                                            const uint8_t *ref, ptrdiff_t ref_stride,
+                                            ThreadFrame *ref_frame,
+                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
+                                            int px, int py, int pw, int ph,
+                                            int bw, int bh, int w, int h, int bytesperpixel,
+                                            const uint16_t *scale, const uint8_t *step)
+{
+    if (s->frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
+        s->frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
+        mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
+                         y, x, in_mv, bw, bh, w, h, bytesperpixel);
+    } else {
+#define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
+    int mx, my;
+    int refbw_m1, refbh_m1;
+    int th;
+    VP56mv mv;
+
+    mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
+    mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
+    // BUG libvpx seems to scale the two components separately. This introduces
+    // rounding errors but we have to reproduce them to be exactly compatible
+    // with the output from libvpx...
+    mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
+    my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
+
+    y = my >> 4;
+    x = mx >> 4;
+    ref += y * ref_stride + x * bytesperpixel;
+    mx &= 15;
+    my &= 15;
+    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
+    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + refbh_m1 + 4 + 7) >> 6;
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref - 3 * ref_stride - 3 * bytesperpixel,
+                                 288, ref_stride,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
+        ref_stride = 288;
+    }
+    smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
+    }
+}
+
+static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
+                                              vp9_mc_func (*mc)[2],
+                                              uint8_t *dst_u, uint8_t *dst_v,
+                                              ptrdiff_t dst_stride,
+                                              const uint8_t *ref_u, ptrdiff_t src_stride_u,
+                                              const uint8_t *ref_v, ptrdiff_t src_stride_v,
+                                              ThreadFrame *ref_frame,
+                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
+                                              int px, int py, int pw, int ph,
+                                              int bw, int bh, int w, int h, int bytesperpixel,
+                                              const uint16_t *scale, const uint8_t *step)
+{
+    if (s->frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
+        s->frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
+        mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
+                           ref_v, src_stride_v, ref_frame,
+                           y, x, in_mv, bw, bh, w, h, bytesperpixel);
+    } else {
+    int mx, my;
+    int refbw_m1, refbh_m1;
+    int th;
+    VP56mv mv;
+
+    if (s->ss_h) {
+        // BUG https://code.google.com/p/webm/issues/detail?id=820
+        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
+        mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
+    } else {
+        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
+        mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
+    }
+    if (s->ss_v) {
+        // BUG https://code.google.com/p/webm/issues/detail?id=820
+        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
+        my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
+    } else {
+        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
+        my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
+    }
+#undef scale_mv
+    y = my >> 4;
+    x = mx >> 4;
+    ref_u += y * src_stride_u + x * bytesperpixel;
+    ref_v += y * src_stride_v + x * bytesperpixel;
+    mx &= 15;
+    my &= 15;
+    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
+    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
+                                 288, src_stride_u,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
+        smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
+
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
+                                 288, src_stride_v,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
+        smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
+    } else {
+        smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
+        smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
+    }
+    }
+}
+
+#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
+                    px, py, pw, ph, bw, bh, w, h, i) \
+    mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
+                   mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
+                   s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
+    mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                     row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
+                     s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#define SCALED 1
+#define FN(x) x##_scaled_8bpp
+#define BYTES_PER_PIXEL 1
+#include "vp9_mc_template.c"
+#undef FN
+#undef BYTES_PER_PIXEL
+#define FN(x) x##_scaled_16bpp
+#define BYTES_PER_PIXEL 2
+#include "vp9_mc_template.c"
+#undef mc_luma_dir
+#undef mc_chroma_dir
 #undef FN
 #undef BYTES_PER_PIXEL
 #undef SCALED