From 52b52800ce083364a9c0baa1d6e0564eaae3b4d7 Mon Sep 17 00:00:00 2001
From: James Ross-Gowan
Date: Sun, 26 Oct 2014 10:43:15 +1100
Subject: [PATCH] dxva2: use optimized memcpy

At least on my machine, reading the frame back with the plain system
memcpy is so slow that dxva2-copy ends up slower than software decoding.
Use the optimized gpu_memcpy from LAV to speed things up.
---
 DOCS/man/options.rst           |   2 +-
 video/decode/dxva2.c           |  46 +++++++++---
 video/decode/gpu_memcpy_sse4.h | 131 +++++++++++++++++++++++++++++++++
 3 files changed, 167 insertions(+), 12 deletions(-)
 create mode 100644 video/decode/gpu_memcpy_sse4.h

diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 0db418a25e..d25a6a2363 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -492,7 +492,7 @@ Video
     :vaapi: requires ``--vo=opengl`` or ``--vo=vaapi`` (Linux with Intel GPUs only)
     :vaapi-copy: copies video back into system RAM (Linux with Intel GPUs only)
     :vda: requires ``--vo=opengl`` (OS X only)
-    :dxva2-copy: copies video back to system RAM (win32 only) (doesn't work)
+    :dxva2-copy: copies video back to system RAM (Windows only) (experimental)
 
     ``auto`` tries to automatically enable hardware decoding using the first
     available method. This still depends what VO you are using. For example,
diff --git a/video/decode/dxva2.c b/video/decode/dxva2.c
index 21f9fcd633..e03014438a 100644
--- a/video/decode/dxva2.c
+++ b/video/decode/dxva2.c
@@ -41,6 +41,7 @@
 #include "video/fmt-conversion.h"
 #include "video/mp_image_pool.h"
 #include "video/hwdec.h"
+#include "gpu_memcpy_sse4.h" // A minor evil.
 
 #ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO
@@ -98,6 +99,9 @@ typedef struct surface_info {
 typedef struct DXVA2Context {
     struct mp_log *log;
 
+    void (*copy_nv12)(struct mp_image *dest, uint8_t *src_bits,
+                      unsigned src_pitch, unsigned surf_height);
+
     HMODULE d3dlib;
     HMODULE dxva2lib;
@@ -241,6 +245,26 @@ static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s, int fmt,
     return mp_image_new_custom_ref(&mpi, w, dxva2_release_img);
 }
 
+static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
+                               unsigned src_pitch, unsigned surf_height)
+{
+    unsigned luma_size = dest->h * src_pitch;
+    memcpy(dest->planes[0], src_bits, luma_size);
+    dest->stride[0] = src_pitch;
+    memcpy(dest->planes[1], src_bits + src_pitch * surf_height, luma_size / 2);
+    dest->stride[1] = src_pitch;
+}
+
+static void copy_nv12_gpu_sse4(struct mp_image *dest, uint8_t *src_bits,
+                               unsigned src_pitch, unsigned surf_height)
+{
+    unsigned luma_size = dest->h * src_pitch;
+    gpu_memcpy(dest->planes[0], src_bits, luma_size);
+    dest->stride[0] = src_pitch;
+    gpu_memcpy(dest->planes[1], src_bits + src_pitch * surf_height, luma_size / 2);
+    dest->stride[1] = src_pitch;
+}
+
 static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
                                              struct mp_image *img)
 {
@@ -265,17 +289,7 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
         return img;
     }
 
-    struct mp_image buf = {0};
-    mp_image_setfmt(&buf, IMGFMT_NV12);
-    mp_image_set_size(&buf, img->w, img->h);
-
-    buf.planes[0] = LockedRect.pBits;
-    buf.stride[0] = LockedRect.Pitch;
-    buf.planes[1] = (char *)LockedRect.pBits + LockedRect.Pitch * surfaceDesc.Height;
-    buf.stride[1] = LockedRect.Pitch;
-
-    // This should probably use some sort of "special" memcpy-like function.
-    mp_image_copy(sw_img, &buf);
+    ctx->copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
 
     IDirect3DSurface9_UnlockRect(surface);
 
@@ -302,6 +316,16 @@ static int dxva2_init(struct lavc_ctx *s)
     ctx->log = mp_log_new(s, s->log, "dxva2");
     ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17));
 
+    if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) {
+        // Use a memcpy implementation optimised for copying from GPU memory
+        MP_DBG(ctx, "Using SSE4 memcpy\n");
+        ctx->copy_nv12 = copy_nv12_gpu_sse4;
+    } else {
+        // Use the CRT memcpy. This can be slower than software decoding.
+        MP_WARN(ctx, "Using fallback memcpy (slow)\n");
+        ctx->copy_nv12 = copy_nv12_fallback;
+    }
+
     ctx->deviceHandle = INVALID_HANDLE_VALUE;
 
     ctx->d3dlib = LoadLibrary("d3d9.dll");
diff --git a/video/decode/gpu_memcpy_sse4.h b/video/decode/gpu_memcpy_sse4.h
new file mode 100644
index 0000000000..c441ff38fe
--- /dev/null
+++ b/video/decode/gpu_memcpy_sse4.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2011-2014 Hendrik Leppkes
+ * http://www.1f0.de
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Taken from the QuickSync decoder by Eric Gur
+ */
+
+#include <stdbool.h>
+#include <string.h>
+#include <smmintrin.h>
+
+// gpu_memcpy is a memcpy-style function that copies data very quickly from
+// GPU tiled memory (write-back).
+// Performance tip: the page offset (12 LSBs) of both addresses should be
+// different; optimally, use a 2K offset between them.
+__attribute__((target("sse4"))) static inline void *
+gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
+{
+    static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16
+
+    if (d == NULL || s == NULL) return NULL;
+
+    // If either address is not 16-byte aligned, fall back to the CRT memcpy
+    bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0;
+    if (!isAligned)
+    {
+        return memcpy(d, s, size);
+    }
+
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+#ifdef __x86_64__
+    __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+#endif
+
+    size_t remainder = size & (regsInLoop * sizeof(xmm0) - 1); // the loop copies 128 or 256 bytes per iteration
+    size_t end = 0;
+
+    __m128i *pTrg = (__m128i *)d;
+    __m128i *pTrgEnd = pTrg + ((size - remainder) >> 4);
+    __m128i *pSrc = (__m128i *)s;
+
+    // Make sure the source is synced - doesn't hurt if it isn't needed.
+    _mm_sfence();
+
+    while (pTrg < pTrgEnd)
+    {
+        // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA
+        // Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad)
+        xmm0 = _mm_stream_load_si128(pSrc);
+        xmm1 = _mm_stream_load_si128(pSrc + 1);
+        xmm2 = _mm_stream_load_si128(pSrc + 2);
+        xmm3 = _mm_stream_load_si128(pSrc + 3);
+        xmm4 = _mm_stream_load_si128(pSrc + 4);
+        xmm5 = _mm_stream_load_si128(pSrc + 5);
+        xmm6 = _mm_stream_load_si128(pSrc + 6);
+        xmm7 = _mm_stream_load_si128(pSrc + 7);
+#ifdef __x86_64__ // Use all 16 xmm registers
+        xmm8 = _mm_stream_load_si128(pSrc + 8);
+        xmm9 = _mm_stream_load_si128(pSrc + 9);
+        xmm10 = _mm_stream_load_si128(pSrc + 10);
+        xmm11 = _mm_stream_load_si128(pSrc + 11);
+        xmm12 = _mm_stream_load_si128(pSrc + 12);
+        xmm13 = _mm_stream_load_si128(pSrc + 13);
+        xmm14 = _mm_stream_load_si128(pSrc + 14);
+        xmm15 = _mm_stream_load_si128(pSrc + 15);
+#endif
+        pSrc += regsInLoop;
+        // _mm_store_si128 emits the SSE2 instruction MOVDQA (aligned store)
+        _mm_store_si128(pTrg,      xmm0);
+        _mm_store_si128(pTrg + 1,  xmm1);
+        _mm_store_si128(pTrg + 2,  xmm2);
+        _mm_store_si128(pTrg + 3,  xmm3);
+        _mm_store_si128(pTrg + 4,  xmm4);
+        _mm_store_si128(pTrg + 5,  xmm5);
+        _mm_store_si128(pTrg + 6,  xmm6);
+        _mm_store_si128(pTrg + 7,  xmm7);
+#ifdef __x86_64__ // Use all 16 xmm registers
+        _mm_store_si128(pTrg + 8,  xmm8);
+        _mm_store_si128(pTrg + 9,  xmm9);
+        _mm_store_si128(pTrg + 10, xmm10);
+        _mm_store_si128(pTrg + 11, xmm11);
+        _mm_store_si128(pTrg + 12, xmm12);
+        _mm_store_si128(pTrg + 13, xmm13);
+        _mm_store_si128(pTrg + 14, xmm14);
+        _mm_store_si128(pTrg + 15, xmm15);
+#endif
+        pTrg += regsInLoop;
+    }
+
+    // Copy the remaining data in 16-byte steps
+    if (remainder >= 16)
+    {
+        size = remainder;
+        remainder = size & 15;
+        end = size >> 4;
+        for (size_t i = 0; i < end; ++i)
+        {
+            pTrg[i] = _mm_stream_load_si128(pSrc + i);
+        }
+    }
+
+    // Copy the last few bytes - shouldn't happen, as strides are a multiple of 16
+    if (remainder)
+    {
+        __m128i temp = _mm_stream_load_si128(pSrc + end);
+
+        char *ps = (char *)(&temp);
+        char *pt = (char *)(pTrg + end);
+
+        for (size_t i = 0; i < remainder; ++i)
+        {
+            pt[i] = ps[i];
+        }
+    }
+
+    return d;
+}
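
For reference, below is a minimal standalone micro-benchmark sketch for the new
copy path. It is hypothetical and not part of the patch: the file name, frame
size, iteration count, and the bench() helper are made up, and it assumes a C11
compiler with aligned_alloc plus an SSE4.1-capable CPU (mpv itself gates the
fast path on av_get_cpu_flags()). Also note that an ordinary heap buffer is
cacheable write-back RAM, so this mostly sanity-checks correctness and cached
throughput; the large speedup from MOVNTDQA only shows up when the source is
the write-combined memory returned by IDirect3DSurface9::LockRect().

// bench_gpu_memcpy.c - hypothetical benchmark sketch, not part of this patch
// Build: gcc -std=c11 -O2 -o bench_gpu_memcpy bench_gpu_memcpy.c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include "gpu_memcpy_sse4.h"

#define FRAME_SIZE (1920 * 1088 * 3 / 2) // one NV12 frame; a multiple of 16
#define RUNS       200

// Time RUNS copies of one frame with the given memcpy-compatible function
static double bench(void *(*copy)(void *restrict, const void *restrict, size_t),
                    void *dst, const void *src)
{
    clock_t t0 = clock();
    for (int i = 0; i < RUNS; i++)
        copy(dst, src, FRAME_SIZE);
    return (double)(clock() - t0) / CLOCKS_PER_SEC;
}

int main(void)
{
    // gpu_memcpy only takes its fast path when both buffers are 16-byte
    // aligned; on MinGW-w64 without C11 support, use _aligned_malloc instead
    uint8_t *src = aligned_alloc(16, FRAME_SIZE);
    uint8_t *dst = aligned_alloc(16, FRAME_SIZE);
    if (!src || !dst)
        return 1;
    memset(src, 0x80, FRAME_SIZE); // fake grey NV12 frame

    printf("memcpy:     %.3f s\n", bench(memcpy, dst, src));
    printf("gpu_memcpy: %.3f s\n", bench(gpu_memcpy, dst, src));

    free(src);
    free(dst);
    return 0;
}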