From 9227bd8ac2f22cfad2ee5bc3122d407196a0ba15 Mon Sep 17 00:00:00 2001 From: Paul B Mahol Date: Fri, 23 Dec 2016 15:41:51 +0100 Subject: [PATCH] utvideodec: Reuse the huffyuv add_left ~10% faster when simd is available. Signed-off-by: Paul B Mahol Signed-off-by: Luca Barbato --- configure | 2 +- libavcodec/utvideo.h | 2 + libavcodec/utvideodec.c | 164 ++++++++++++++++++++++++++++++++-------- 3 files changed, 137 insertions(+), 31 deletions(-) diff --git a/configure b/configure index ecc77842ff..e83a4cf697 100755 --- a/configure +++ b/configure @@ -2129,7 +2129,7 @@ truespeech_decoder_select="bswapdsp" tscc_decoder_deps="zlib" txd_decoder_select="texturedsp" twinvq_decoder_select="mdct lsp sinewin" -utvideo_decoder_select="bswapdsp" +utvideo_decoder_select="bswapdsp huffyuvdsp" utvideo_encoder_select="bswapdsp huffman huffyuvencdsp" vble_decoder_select="huffyuvdsp" vc1_decoder_select="blockdsp error_resilience h263_decoder h264qpel intrax8 mpeg_er mpegvideo vc1dsp" diff --git a/libavcodec/utvideo.h b/libavcodec/utvideo.h index bc83a28d43..2fa2b7cb93 100644 --- a/libavcodec/utvideo.h +++ b/libavcodec/utvideo.h @@ -30,6 +30,7 @@ #include "libavutil/common.h" #include "avcodec.h" #include "bswapdsp.h" +#include "huffyuvdsp.h" #include "huffyuvencdsp.h" enum { @@ -68,6 +69,7 @@ typedef struct UtvideoContext { const AVClass *class; AVCodecContext *avctx; BswapDSPContext bdsp; + HuffYUVDSPContext hdspdec; HuffYUVEncDSPContext hdsp; uint32_t frame_info_size, flags, frame_info; diff --git a/libavcodec/utvideodec.c b/libavcodec/utvideodec.c index f5fae8ba44..67ffe6f253 100644 --- a/libavcodec/utvideodec.c +++ b/libavcodec/utvideodec.c @@ -373,8 +373,110 @@ static void restore_rgb_planes10(AVFrame *frame, int width, int height) } } -static void restore_median(uint8_t *src, int step, ptrdiff_t stride, - int width, int height, int slices, int rmode) +static void restore_median_planar(UtvideoContext *c, uint8_t *src, + ptrdiff_t stride, int width, int height, + int slices, int rmode) +{ + int i, j, slice; + int A, B, C; + uint8_t *bsrc; + int slice_start, slice_height; + const int cmask = ~rmode; + + for (slice = 0; slice < slices; slice++) { + slice_start = ((slice * height) / slices) & cmask; + slice_height = ((((slice + 1) * height) / slices) & cmask) - + slice_start; + + if (!slice_height) + continue; + bsrc = src + slice_start * stride; + + // first line - left neighbour prediction + bsrc[0] += 0x80; + c->hdspdec.add_hfyu_left_pred(bsrc, bsrc, width, 0); + bsrc += stride; + if (slice_height <= 1) + continue; + // second line - first element has top prediction, the rest uses median + C = bsrc[-stride]; + bsrc[0] += C; + A = bsrc[0]; + for (i = 1; i < width; i++) { + B = bsrc[i - stride]; + bsrc[i] += mid_pred(A, B, (uint8_t)(A + B - C)); + C = B; + A = bsrc[i]; + } + bsrc += stride; + // the rest of lines use continuous median prediction + for (j = 2; j < slice_height; j++) { + c->hdspdec.add_hfyu_median_pred(bsrc, bsrc - stride, + bsrc, width, &A, &B); + bsrc += stride; + } + } +} + +/* UtVideo interlaced mode treats every two lines as a single one, + * so restoring function should take care of possible padding between + * two parts of the same "line". + */ +static void restore_median_planar_il(UtvideoContext *c, uint8_t *src, + ptrdiff_t stride, int width, int height, + int slices, int rmode) +{ + int i, j, slice; + int A, B, C; + uint8_t *bsrc; + int slice_start, slice_height; + const int cmask = ~(rmode ? 
3 : 1); + const int stride2 = stride << 1; + + for (slice = 0; slice < slices; slice++) { + slice_start = ((slice * height) / slices) & cmask; + slice_height = ((((slice + 1) * height) / slices) & cmask) - + slice_start; + slice_height >>= 1; + if (!slice_height) + continue; + + bsrc = src + slice_start * stride; + + // first line - left neighbour prediction + bsrc[0] += 0x80; + A = c->hdspdec.add_hfyu_left_pred(bsrc, bsrc, width, 0); + c->hdspdec.add_hfyu_left_pred(bsrc + stride, bsrc + stride, width, A); + bsrc += stride2; + if (slice_height <= 1) + continue; + // second line - first element has top prediction, the rest uses median + C = bsrc[-stride2]; + bsrc[0] += C; + A = bsrc[0]; + for (i = 1; i < width; i++) { + B = bsrc[i - stride2]; + bsrc[i] += mid_pred(A, B, (uint8_t)(A + B - C)); + C = B; + A = bsrc[i]; + } + c->hdspdec.add_hfyu_median_pred(bsrc + stride, bsrc - stride, + bsrc + stride, width, &A, &B); + bsrc += stride2; + // the rest of lines use continuous median prediction + for (j = 2; j < slice_height; j++) { + c->hdspdec.add_hfyu_median_pred(bsrc, bsrc - stride2, + bsrc, width, &A, &B); + c->hdspdec.add_hfyu_median_pred(bsrc + stride, bsrc - stride, + bsrc + stride, width, &A, &B); + bsrc += stride2; + } + } +} + +static void restore_median_packed(uint8_t *src, int step, ptrdiff_t stride, + int width, int height, + int slices, int rmode) { int i, j, slice; int A, B, C; @@ -429,8 +531,9 @@ static void restore_median(uint8_t *src, int step, ptrdiff_t stride, * so restoring function should take care of possible padding between * two parts of the same "line". */ -static void restore_median_il(uint8_t *src, int step, ptrdiff_t stride, - int width, int height, int slices, int rmode) +static void restore_median_packed_il(uint8_t *src, int step, ptrdiff_t stride, + int width, int height, + int slices, int rmode) { int i, j, slice; int A, B, C; @@ -613,14 +716,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, return ret; if (c->frame_pred == PRED_MEDIAN) { if (!c->interlaced) { - restore_median(frame.f->data[0] + ff_ut_rgb_order[i], - c->planes, frame.f->linesize[0], avctx->width, - avctx->height, c->slices, 0); + restore_median_packed(frame.f->data[0] + ff_ut_rgb_order[i], + c->planes, frame.f->linesize[0], avctx->width, + avctx->height, c->slices, 0); } else { - restore_median_il(frame.f->data[0] + ff_ut_rgb_order[i], - c->planes, frame.f->linesize[0], - avctx->width, avctx->height, c->slices, - 0); + restore_median_packed_il(frame.f->data[0] + ff_ut_rgb_order[i], + c->planes, frame.f->linesize[0], + avctx->width, avctx->height, c->slices, + 0); } } } @@ -649,14 +752,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, return ret; if (c->frame_pred == PRED_MEDIAN) { if (!c->interlaced) { - restore_median(frame.f->data[i], 1, frame.f->linesize[i], - avctx->width >> !!i, avctx->height >> !!i, - c->slices, !i); + restore_median_planar(c, frame.f->data[i], frame.f->linesize[i], + avctx->width >> !!i, avctx->height >> !!i, + c->slices, !i); } else { - restore_median_il(frame.f->data[i], 1, frame.f->linesize[i], - avctx->width >> !!i, - avctx->height >> !!i, - c->slices, !i); + restore_median_planar_il(c, frame.f->data[i], frame.f->linesize[i], + avctx->width >> !!i, + avctx->height >> !!i, + c->slices, !i); } } } @@ -670,13 +773,13 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, return ret; if (c->frame_pred == PRED_MEDIAN) { if (!c->interlaced) { - restore_median(frame.f->data[i], 1, 
frame.f->linesize[i], - avctx->width >> !!i, avctx->height, - c->slices, 0); + restore_median_planar(c, frame.f->data[i], frame.f->linesize[i], + avctx->width >> !!i, avctx->height, + c->slices, 0); } else { - restore_median_il(frame.f->data[i], 1, frame.f->linesize[i], - avctx->width >> !!i, avctx->height, - c->slices, 0); + restore_median_planar_il(c, frame.f->data[i], frame.f->linesize[i], + avctx->width >> !!i, avctx->height, + c->slices, 0); } } } @@ -690,13 +793,13 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, return ret; if (c->frame_pred == PRED_MEDIAN) { if (!c->interlaced) { - restore_median(frame.f->data[i], 1, frame.f->linesize[i], - avctx->width, avctx->height, - c->slices, 0); + restore_median_planar(c, frame.f->data[i], frame.f->linesize[i], + avctx->width, avctx->height, + c->slices, 0); } else { - restore_median_il(frame.f->data[i], 1, frame.f->linesize[i], - avctx->width, avctx->height, - c->slices, 0); + restore_median_planar_il(c, frame.f->data[i], frame.f->linesize[i], + avctx->width, avctx->height, + c->slices, 0); } } } @@ -729,6 +832,7 @@ static av_cold int decode_init(AVCodecContext *avctx) c->avctx = avctx; ff_bswapdsp_init(&c->bdsp); + ff_huffyuvdsp_init(&c->hdspdec); if (avctx->extradata_size >= 16) { av_log(avctx, AV_LOG_DEBUG, "Encoder version %d.%d.%d.%d\n",
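
Note (illustration, not part of the patch): the two huffyuvdsp hooks the new planar paths call, add_hfyu_left_pred() and add_hfyu_median_pred(), compute plain left prediction and median prediction. A minimal scalar sketch of that behaviour is given below, modelled on the reference C code and on the inline median loop kept in the patch above; the helper names (add_left_pred_c, add_median_pred_c, mid_pred3) are stand-ins chosen for this sketch, not library identifiers, and the library's SIMD versions selected by ff_huffyuvdsp_init() are where the ~10% speedup comes from.

    #include <stdint.h>
    #include <stddef.h>

    /* stand-in for libavutil's mid_pred(): median of three values */
    static int mid_pred3(int a, int b, int c)
    {
        if (a > b) { int t = a; a = b; b = t; }  /* now a <= b        */
        if (b > c) b = c;                        /* b = min(b, c)     */
        return a > b ? a : b;                    /* median = max(a,b) */
    }

    /* Left prediction: each output byte is the running 8-bit sum of the
     * stored differences, starting from 'left'.  The final accumulator is
     * returned so the caller can carry it into the next line, as the
     * interlaced path above does with A. */
    static int add_left_pred_c(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t w, int left)
    {
        ptrdiff_t i;
        for (i = 0; i < w; i++) {
            left   = (left + src[i]) & 0xFF;
            dst[i] = left;
        }
        return left;
    }

    /* Median prediction: each output byte adds the stored difference to
     * mid_pred(left, top, left + top - topleft), exactly like the inline
     * second-line loop in restore_median_planar().  'left' and 'left_top'
     * are carried from line to line, which is why the decoder keeps
     * passing &A and &B to consecutive calls. */
    static void add_median_pred_c(uint8_t *dst, const uint8_t *top,
                                  const uint8_t *diff, ptrdiff_t w,
                                  int *left, int *left_top)
    {
        int l = *left, lt = *left_top;
        ptrdiff_t i;

        for (i = 0; i < w; i++) {
            l      = (mid_pred3(l, top[i], (l + top[i] - lt) & 0xFF)
                      + diff[i]) & 0xFF;
            lt     = top[i];
            dst[i] = l;
        }
        *left     = l;
        *left_top = lt;
    }

Because these per-byte loops carry a serial dependency on 'left', the scalar versions are memory- and latency-bound; the shared huffyuv SIMD implementations amortise that cost, which is the motivation for reusing them here instead of the decoder-local loops that the patch removes.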