From 52b52800ce083364a9c0baa1d6e0564eaae3b4d7 Mon Sep 17 00:00:00 2001
From: James Ross-Gowan
Date: Sun, 26 Oct 2014 10:43:15 +1100
Subject: [PATCH] dxva2: use optimized memcpy

At least on my machine, reading the frame back with the plain system
memcpy is so slow that dxva2-copy ends up slower than software decoding.
Use the optimized gpu_memcpy from LAV to speed things up.
---
 DOCS/man/options.rst           |   2 +-
 video/decode/dxva2.c           |  46 +++++++++---
 video/decode/gpu_memcpy_sse4.h | 131 +++++++++++++++++++++++++++++++++
 3 files changed, 167 insertions(+), 12 deletions(-)
 create mode 100644 video/decode/gpu_memcpy_sse4.h

diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 0db418a25e..d25a6a2363 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -492,7 +492,7 @@ Video
     :vaapi: requires ``--vo=opengl`` or ``--vo=vaapi`` (Linux with Intel GPUs only)
     :vaapi-copy: copies video back into system RAM (Linux with Intel GPUs only)
     :vda: requires ``--vo=opengl`` (OS X only)
-    :dxva2-copy: copies video back to system RAM (win32 only) (doesn't work)
+    :dxva2-copy: copies video back to system RAM (Windows only) (experimental)
 
     ``auto`` tries to automatically enable hardware decoding using the first
     available method. This still depends what VO you are using. For example,
diff --git a/video/decode/dxva2.c b/video/decode/dxva2.c
index 21f9fcd633..e03014438a 100644
--- a/video/decode/dxva2.c
+++ b/video/decode/dxva2.c
@@ -41,6 +41,7 @@
 #include "video/fmt-conversion.h"
 #include "video/mp_image_pool.h"
 #include "video/hwdec.h"
+#include "gpu_memcpy_sse4.h" // A minor evil.
 
 #ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO
@@ -98,6 +99,9 @@ typedef struct surface_info {
 typedef struct DXVA2Context {
     struct mp_log *log;
 
+    void (*copy_nv12)(struct mp_image *dest, uint8_t *src_bits,
+                      unsigned src_pitch, unsigned surf_height);
+
     HMODULE d3dlib;
     HMODULE dxva2lib;
@@ -241,6 +245,26 @@ static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s, int fmt,
     return mp_image_new_custom_ref(&mpi, w, dxva2_release_img);
 }
 
+static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
+                               unsigned src_pitch, unsigned surf_height)
+{
+    unsigned luma_size = dest->h * src_pitch;
+    memcpy(dest->planes[0], src_bits, luma_size);
+    dest->stride[0] = src_pitch;
+    memcpy(dest->planes[1], src_bits + src_pitch * surf_height, luma_size / 2);
+    dest->stride[1] = src_pitch;
+}
+
+static void copy_nv12_gpu_sse4(struct mp_image *dest, uint8_t *src_bits,
+                               unsigned src_pitch, unsigned surf_height)
+{
+    unsigned luma_size = dest->h * src_pitch;
+    gpu_memcpy(dest->planes[0], src_bits, luma_size);
+    dest->stride[0] = src_pitch;
+    gpu_memcpy(dest->planes[1], src_bits + src_pitch * surf_height, luma_size / 2);
+    dest->stride[1] = src_pitch;
+}
+
 static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
                                              struct mp_image *img)
 {
@@ -265,17 +289,7 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
         return img;
     }
 
-    struct mp_image buf = {0};
-    mp_image_setfmt(&buf, IMGFMT_NV12);
-    mp_image_set_size(&buf, img->w, img->h);
-
-    buf.planes[0] = LockedRect.pBits;
-    buf.stride[0] = LockedRect.Pitch;
-    buf.planes[1] = (char *)LockedRect.pBits + LockedRect.Pitch * surfaceDesc.Height;
-    buf.stride[1] = LockedRect.Pitch;
-
-    // This should probably use some sort of "special" memcpy-like function.
-    mp_image_copy(sw_img, &buf);
+    ctx->copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
 
     IDirect3DSurface9_UnlockRect(surface);
 
@@ -302,6 +316,16 @@ static int dxva2_init(struct lavc_ctx *s)
     ctx->log = mp_log_new(s, s->log, "dxva2");
     ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17));
 
+    if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) {
+        // Use a memcpy implementation optimised for copying from GPU memory
+        MP_DBG(ctx, "Using SSE4 memcpy\n");
+        ctx->copy_nv12 = copy_nv12_gpu_sse4;
+    } else {
+        // Use the CRT memcpy. This can be slower than software decoding.
+        MP_WARN(ctx, "Using fallback memcpy (slow)\n");
+        ctx->copy_nv12 = copy_nv12_fallback;
+    }
+
     ctx->deviceHandle = INVALID_HANDLE_VALUE;
 
     ctx->d3dlib = LoadLibrary("d3d9.dll");
diff --git a/video/decode/gpu_memcpy_sse4.h b/video/decode/gpu_memcpy_sse4.h
new file mode 100644
index 0000000000..c441ff38fe
--- /dev/null
+++ b/video/decode/gpu_memcpy_sse4.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2011-2014 Hendrik Leppkes
+ * http://www.1f0.de
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Taken from the QuickSync decoder by Eric Gur
+ */
+
+#include <stdbool.h>
+#include <string.h>
+#include <smmintrin.h>
+
+// gpu_memcpy is a memcpy-style function that copies data very quickly from
+// GPU tiled memory (write-back).
+// Performance tip: the page offset (12 LSBs) of both addresses should be
+// different; optimally, use a 2K offset between them.
+__attribute__((target("sse4"))) static inline void *
+gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
+{
+    static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16
+
+    if (d == NULL || s == NULL) return NULL;
+
+    // If either address is not 16-byte aligned, fall back to the CRT memcpy
+    bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0;
+    if (!isAligned)
+    {
+        return memcpy(d, s, size);
+    }
+
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+#ifdef __x86_64__
+    __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+#endif
+
+    size_t remainder = size & (regsInLoop * sizeof(xmm0) - 1); // the loop copies 128 or 256 bytes per iteration
+    size_t end = 0;
+
+    __m128i *pTrg = (__m128i *)d;
+    __m128i *pTrgEnd = pTrg + ((size - remainder) >> 4);
+    __m128i *pSrc = (__m128i *)s;
+
+    // Make sure the source is synced - doesn't hurt if it isn't needed.
+    _mm_sfence();
+
+    while (pTrg < pTrgEnd)
+    {
+        // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA
+        // Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad)
+        xmm0 = _mm_stream_load_si128(pSrc);
+        xmm1 = _mm_stream_load_si128(pSrc + 1);
+        xmm2 = _mm_stream_load_si128(pSrc + 2);
+        xmm3 = _mm_stream_load_si128(pSrc + 3);
+        xmm4 = _mm_stream_load_si128(pSrc + 4);
+        xmm5 = _mm_stream_load_si128(pSrc + 5);
+        xmm6 = _mm_stream_load_si128(pSrc + 6);
+        xmm7 = _mm_stream_load_si128(pSrc + 7);
+#ifdef __x86_64__ // Use all 16 xmm registers
+        xmm8 = _mm_stream_load_si128(pSrc + 8);
+        xmm9 = _mm_stream_load_si128(pSrc + 9);
+        xmm10 = _mm_stream_load_si128(pSrc + 10);
+        xmm11 = _mm_stream_load_si128(pSrc + 11);
+        xmm12 = _mm_stream_load_si128(pSrc + 12);
+        xmm13 = _mm_stream_load_si128(pSrc + 13);
+        xmm14 = _mm_stream_load_si128(pSrc + 14);
+        xmm15 = _mm_stream_load_si128(pSrc + 15);
+#endif
+        pSrc += regsInLoop;
+        // _mm_store_si128 emits the SSE2 instruction MOVDQA (aligned store)
+        _mm_store_si128(pTrg,      xmm0);
+        _mm_store_si128(pTrg + 1,  xmm1);
+        _mm_store_si128(pTrg + 2,  xmm2);
+        _mm_store_si128(pTrg + 3,  xmm3);
+        _mm_store_si128(pTrg + 4,  xmm4);
+        _mm_store_si128(pTrg + 5,  xmm5);
+        _mm_store_si128(pTrg + 6,  xmm6);
+        _mm_store_si128(pTrg + 7,  xmm7);
+#ifdef __x86_64__ // Use all 16 xmm registers
+        _mm_store_si128(pTrg + 8,  xmm8);
+        _mm_store_si128(pTrg + 9,  xmm9);
+        _mm_store_si128(pTrg + 10, xmm10);
+        _mm_store_si128(pTrg + 11, xmm11);
+        _mm_store_si128(pTrg + 12, xmm12);
+        _mm_store_si128(pTrg + 13, xmm13);
+        _mm_store_si128(pTrg + 14, xmm14);
+        _mm_store_si128(pTrg + 15, xmm15);
+#endif
+        pTrg += regsInLoop;
+    }
+
+    // Copy the remaining data in 16-byte steps
+    if (remainder >= 16)
+    {
+        size = remainder;
+        remainder = size & 15;
+        end = size >> 4;
+        for (size_t i = 0; i < end; ++i)
+        {
+            pTrg[i] = _mm_stream_load_si128(pSrc + i);
+        }
+    }
+
+    // Copy the last few bytes - shouldn't happen, as strides are a multiple of 16
+    if (remainder)
+    {
+        __m128i temp = _mm_stream_load_si128(pSrc + end);
+
+        char *ps = (char *)(&temp);
+        char *pt = (char *)(pTrg + end);
+
+        for (size_t i = 0; i < remainder; ++i)
+        {
+            pt[i] = ps[i];
+        }
+    }
+
+    return d;
+}
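
For reference, below is a minimal standalone micro-benchmark sketch for the new
copy path. It is hypothetical and not part of the patch: the file name, frame
size, iteration count, and the bench() helper are made up, and it assumes a C11
compiler with aligned_alloc plus an SSE4.1-capable CPU (mpv itself gates the
fast path on av_get_cpu_flags()). Also note that an ordinary heap buffer is
cacheable write-back RAM, so this mostly sanity-checks correctness and cached
throughput; the large speedup from MOVNTDQA only shows up when the source is
the write-combined memory returned by IDirect3DSurface9::LockRect().

// bench_gpu_memcpy.c - hypothetical benchmark sketch, not part of this patch
// Build: gcc -std=c11 -O2 -o bench_gpu_memcpy bench_gpu_memcpy.c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include "gpu_memcpy_sse4.h"

#define FRAME_SIZE (1920 * 1088 * 3 / 2) // one NV12 frame; a multiple of 16
#define RUNS       200

// Time RUNS copies of one frame with the given memcpy-compatible function
static double bench(void *(*copy)(void *restrict, const void *restrict, size_t),
                    void *dst, const void *src)
{
    clock_t t0 = clock();
    for (int i = 0; i < RUNS; i++)
        copy(dst, src, FRAME_SIZE);
    return (double)(clock() - t0) / CLOCKS_PER_SEC;
}

int main(void)
{
    // gpu_memcpy only takes its fast path when both buffers are 16-byte
    // aligned; on MinGW-w64 without C11 support, use _aligned_malloc instead
    uint8_t *src = aligned_alloc(16, FRAME_SIZE);
    uint8_t *dst = aligned_alloc(16, FRAME_SIZE);
    if (!src || !dst)
        return 1;
    memset(src, 0x80, FRAME_SIZE); // fake grey NV12 frame

    printf("memcpy:     %.3f s\n", bench(memcpy, dst, src));
    printf("gpu_memcpy: %.3f s\n", bench(gpu_memcpy, dst, src));

    free(src);
    free(dst);
    return 0;
}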