1
0
mirror of https://github.com/mpv-player/mpv synced 2025-01-26 01:23:17 +00:00

dxva2: use optimized memcpy

At least on my machine, reading back the frame with system memcpy is
slower than just using software rendering. Use the optimized gpu_memcpy
from LAV to speed things up.
This commit is contained in:
James Ross-Gowan 2014-10-26 10:43:15 +11:00 committed by wm4
parent f2d51171f7
commit 52b52800ce
3 changed files with 165 additions and 12 deletions

View File

@ -492,7 +492,7 @@ Video
:vaapi: requires ``--vo=opengl`` or ``--vo=vaapi`` (Linux with Intel GPUs only) :vaapi: requires ``--vo=opengl`` or ``--vo=vaapi`` (Linux with Intel GPUs only)
:vaapi-copy: copies video back into system RAM (Linux with Intel GPUs only) :vaapi-copy: copies video back into system RAM (Linux with Intel GPUs only)
:vda: requires ``--vo=opengl`` (OS X only) :vda: requires ``--vo=opengl`` (OS X only)
:dxva2-copy: copies video back to system RAM (win32 only) (doesn't work) :dxva2-copy: copies video back to system RAM (Windows only) (experimental)
``auto`` tries to automatically enable hardware decoding using the first ``auto`` tries to automatically enable hardware decoding using the first
available method. This still depends what VO you are using. For example, available method. This still depends what VO you are using. For example,

View File

@ -41,6 +41,7 @@
#include "video/fmt-conversion.h" #include "video/fmt-conversion.h"
#include "video/mp_image_pool.h" #include "video/mp_image_pool.h"
#include "video/hwdec.h" #include "video/hwdec.h"
#include "gpu_memcpy_sse4.h"
// A minor evil. // A minor evil.
#ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO #ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO
@ -98,6 +99,9 @@ typedef struct surface_info {
typedef struct DXVA2Context { typedef struct DXVA2Context {
struct mp_log *log; struct mp_log *log;
void (*copy_nv12)(struct mp_image *dest, uint8_t *src_bits,
unsigned src_pitch, unsigned surf_height);
HMODULE d3dlib; HMODULE d3dlib;
HMODULE dxva2lib; HMODULE dxva2lib;
@ -241,6 +245,26 @@ static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s, int fmt,
return mp_image_new_custom_ref(&mpi, w, dxva2_release_img); return mp_image_new_custom_ref(&mpi, w, dxva2_release_img);
} }
// Copy a locked NV12 surface into dest using the C library memcpy.
//
// dest:        image that receives the data; its stride[0]/stride[1] are
//              overwritten with src_pitch.
// src_bits:    start of the source luma plane.
// src_pitch:   bytes per row in the source surface.
// surf_height: number of rows between the start of the source luma plane and
//              the start of the source chroma plane.
//
// Each plane is transferred as a single contiguous run of bytes rather than
// row by row.
// NOTE(review): this presumes dest->planes[] have room for dest->h rows of
// src_pitch bytes each - confirm against how the destination is allocated.
static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
                               unsigned src_pitch, unsigned surf_height)
{
    // Byte count of the luma plane; the interleaved chroma plane is half that.
    const unsigned luma_bytes = dest->h * src_pitch;
    uint8_t *src_chroma = src_bits + src_pitch * surf_height;

    // The destination adopts the source row pitch for both planes.
    dest->stride[0] = src_pitch;
    dest->stride[1] = src_pitch;

    memcpy(dest->planes[0], src_bits, luma_bytes);
    memcpy(dest->planes[1], src_chroma, luma_bytes / 2);
}
// Copy a locked NV12 surface into dest using gpu_memcpy instead of the C
// library memcpy.
//
// dest:        image that receives the data; its stride[0]/stride[1] are
//              overwritten with src_pitch.
// src_bits:    start of the source luma plane.
// src_pitch:   bytes per row in the source surface.
// surf_height: number of rows between the start of the source luma plane and
//              the start of the source chroma plane.
//
// Each plane is transferred as a single contiguous run of bytes rather than
// row by row.
// NOTE(review): this presumes dest->planes[] have room for dest->h rows of
// src_pitch bytes each - confirm against how the destination is allocated.
static void copy_nv12_gpu_sse4(struct mp_image *dest, uint8_t *src_bits,
                               unsigned src_pitch, unsigned surf_height)
{
    // Byte count of the luma plane; the interleaved chroma plane is half that.
    const unsigned luma_bytes = dest->h * src_pitch;
    uint8_t *src_chroma = src_bits + src_pitch * surf_height;

    // The destination adopts the source row pitch for both planes.
    dest->stride[0] = src_pitch;
    dest->stride[1] = src_pitch;

    gpu_memcpy(dest->planes[0], src_bits, luma_bytes);
    gpu_memcpy(dest->planes[1], src_chroma, luma_bytes / 2);
}
static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s, static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
struct mp_image *img) struct mp_image *img)
{ {
@ -265,17 +289,7 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
return img; return img;
} }
struct mp_image buf = {0}; ctx->copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
mp_image_setfmt(&buf, IMGFMT_NV12);
mp_image_set_size(&buf, img->w, img->h);
buf.planes[0] = LockedRect.pBits;
buf.stride[0] = LockedRect.Pitch;
buf.planes[1] = (char *)LockedRect.pBits + LockedRect.Pitch * surfaceDesc.Height;
buf.stride[1] = LockedRect.Pitch;
// This should probably use some sort of "special" memcpy-like function.
mp_image_copy(sw_img, &buf);
IDirect3DSurface9_UnlockRect(surface); IDirect3DSurface9_UnlockRect(surface);
@ -302,6 +316,16 @@ static int dxva2_init(struct lavc_ctx *s)
ctx->log = mp_log_new(s, s->log, "dxva2"); ctx->log = mp_log_new(s, s->log, "dxva2");
ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17)); ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17));
if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) {
// Use a memcpy implementation optimised for copying from GPU memory
MP_DBG(ctx, "Using SSE4 memcpy\n");
ctx->copy_nv12 = copy_nv12_gpu_sse4;
} else {
// Use the CRT memcpy. This can be slower than software decoding.
MP_WARN(ctx, "Using fallback memcpy (slow)\n");
ctx->copy_nv12 = copy_nv12_fallback;
}
ctx->deviceHandle = INVALID_HANDLE_VALUE; ctx->deviceHandle = INVALID_HANDLE_VALUE;
ctx->d3dlib = LoadLibrary("d3d9.dll"); ctx->d3dlib = LoadLibrary("d3d9.dll");

View File

@ -0,0 +1,129 @@
/*
* Copyright (C) 2011-2014 Hendrik Leppkes
* http://www.1f0.de
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Taken from the QuickSync decoder by Eric Gur
*/
#include <emmintrin.h>
// gpu_memcpy is a memcpy style function that copies data very fast from a
// GPU tiled memory (write back).
// Performance tip: page offset (12 lsb) of both addresses should be different;
// optimally use a 2K offset between them.
//
// d, s: destination and source. They are declared restrict, so they must not
//       overlap. If either one is NULL nothing is copied and NULL is returned.
// size: number of bytes to copy.
// Returns d (or NULL, see above).
//
// The streaming-load path is only used when both d and s are 16-byte aligned;
// otherwise the call is forwarded to plain memcpy.
//
// Only SSE4.1 is required by the intrinsics used here (MOVNTDQA), so that is
// the only extension enabled for this function.
__attribute__((target("sse4.1"))) static inline void
*gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
{
    // Number of 16-byte registers moved per main-loop iteration. This is keyed
    // off the same macro that guards the xmm8..xmm15 code below, so the
    // pointer advance can never disagree with the amount of data moved.
#ifdef __x86_64__
    enum { regsInLoop = 16 };
#else
    enum { regsInLoop = 8 };
#endif

    if (d == NULL || s == NULL)
        return NULL;

    // If memory is not aligned, use memcpy
    if ((((size_t)s | (size_t)d) & 0xF) != 0)
        return memcpy(d, s, size);

    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
#ifdef __x86_64__
    __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
#endif

    // Copy 128 or 256 bytes every loop; whatever does not fill a whole loop
    // iteration is handled after the loop.
    const size_t blockBytes = regsInLoop * sizeof(__m128i);
    size_t remainder = size & (blockBytes - 1);

    __m128i *pTrg = (__m128i *)d;
    __m128i *pTrgEnd = pTrg + ((size - remainder) >> 4);
    // NOTE(review): const is cast away only because the streaming-load
    // intrinsic is presumably declared with a non-const pointer parameter in
    // some compilers' headers; the source is never written through pSrc.
    __m128i *pSrc = (__m128i *)s;

    // Make sure source is synced - doesn't hurt if not needed.
    _mm_sfence();

    while (pTrg < pTrgEnd) {
        // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1)
        // instruction MOVNTDQA.
        // Fastest method for copying GPU RAM. Available since Penryn (45nm
        // Core 2 Duo/Quad)
        xmm0  = _mm_stream_load_si128(pSrc);
        xmm1  = _mm_stream_load_si128(pSrc + 1);
        xmm2  = _mm_stream_load_si128(pSrc + 2);
        xmm3  = _mm_stream_load_si128(pSrc + 3);
        xmm4  = _mm_stream_load_si128(pSrc + 4);
        xmm5  = _mm_stream_load_si128(pSrc + 5);
        xmm6  = _mm_stream_load_si128(pSrc + 6);
        xmm7  = _mm_stream_load_si128(pSrc + 7);
#ifdef __x86_64__ // Use all 16 xmm registers
        xmm8  = _mm_stream_load_si128(pSrc + 8);
        xmm9  = _mm_stream_load_si128(pSrc + 9);
        xmm10 = _mm_stream_load_si128(pSrc + 10);
        xmm11 = _mm_stream_load_si128(pSrc + 11);
        xmm12 = _mm_stream_load_si128(pSrc + 12);
        xmm13 = _mm_stream_load_si128(pSrc + 13);
        xmm14 = _mm_stream_load_si128(pSrc + 14);
        xmm15 = _mm_stream_load_si128(pSrc + 15);
#endif
        pSrc += regsInLoop;

        // _mm_store_si128 emits the SSE2 instruction MOVDQA (aligned store)
        _mm_store_si128(pTrg     , xmm0);
        _mm_store_si128(pTrg +  1, xmm1);
        _mm_store_si128(pTrg +  2, xmm2);
        _mm_store_si128(pTrg +  3, xmm3);
        _mm_store_si128(pTrg +  4, xmm4);
        _mm_store_si128(pTrg +  5, xmm5);
        _mm_store_si128(pTrg +  6, xmm6);
        _mm_store_si128(pTrg +  7, xmm7);
#ifdef __x86_64__ // Use all 16 xmm registers
        _mm_store_si128(pTrg +  8, xmm8);
        _mm_store_si128(pTrg +  9, xmm9);
        _mm_store_si128(pTrg + 10, xmm10);
        _mm_store_si128(pTrg + 11, xmm11);
        _mm_store_si128(pTrg + 12, xmm12);
        _mm_store_si128(pTrg + 13, xmm13);
        _mm_store_si128(pTrg + 14, xmm14);
        _mm_store_si128(pTrg + 15, xmm15);
#endif
        pTrg += regsInLoop;
    }

    // Copy the remaining whole 16-byte units one register at a time.
    const size_t end = remainder >> 4;
    for (size_t i = 0; i < end; ++i)
        pTrg[i] = _mm_stream_load_si128(pSrc + i);
    remainder &= 15;

    // Copy last bytes - shouldn't happen as strides are modulo 16.
    // Use a byte-exact memcpy here: a 16-byte streaming load would read past
    // the end of the source when fewer than 16 bytes are left.
    if (remainder)
        memcpy(pTrg + end, pSrc + end, remainder);

    return d;
}