mirror of
https://github.com/mpv-player/mpv
synced 2025-01-26 01:23:17 +00:00
dxva2: use optimized memcpy
At least on my machine, reading back the frame with system memcpy is slower than just using software rendering. Use the optimized gpu_memcpy from LAV to speed things up.
This commit is contained in:
parent
f2d51171f7
commit
52b52800ce
@ -492,7 +492,7 @@ Video
|
|||||||
:vaapi: requires ``--vo=opengl`` or ``--vo=vaapi`` (Linux with Intel GPUs only)
|
:vaapi: requires ``--vo=opengl`` or ``--vo=vaapi`` (Linux with Intel GPUs only)
|
||||||
:vaapi-copy: copies video back into system RAM (Linux with Intel GPUs only)
|
:vaapi-copy: copies video back into system RAM (Linux with Intel GPUs only)
|
||||||
:vda: requires ``--vo=opengl`` (OS X only)
|
:vda: requires ``--vo=opengl`` (OS X only)
|
||||||
:dxva2-copy: copies video back to system RAM (win32 only) (doesn't work)
|
:dxva2-copy: copies video back to system RAM (Windows only) (experimental)
|
||||||
|
|
||||||
``auto`` tries to automatically enable hardware decoding using the first
|
``auto`` tries to automatically enable hardware decoding using the first
|
||||||
available method. This still depends what VO you are using. For example,
|
available method. This still depends what VO you are using. For example,
|
||||||
|
@ -41,6 +41,7 @@
|
|||||||
#include "video/fmt-conversion.h"
|
#include "video/fmt-conversion.h"
|
||||||
#include "video/mp_image_pool.h"
|
#include "video/mp_image_pool.h"
|
||||||
#include "video/hwdec.h"
|
#include "video/hwdec.h"
|
||||||
|
#include "gpu_memcpy_sse4.h"
|
||||||
|
|
||||||
// A minor evil.
|
// A minor evil.
|
||||||
#ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO
|
#ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO
|
||||||
@ -98,6 +99,9 @@ typedef struct surface_info {
|
|||||||
typedef struct DXVA2Context {
|
typedef struct DXVA2Context {
|
||||||
struct mp_log *log;
|
struct mp_log *log;
|
||||||
|
|
||||||
|
void (*copy_nv12)(struct mp_image *dest, uint8_t *src_bits,
|
||||||
|
unsigned src_pitch, unsigned surf_height);
|
||||||
|
|
||||||
HMODULE d3dlib;
|
HMODULE d3dlib;
|
||||||
HMODULE dxva2lib;
|
HMODULE dxva2lib;
|
||||||
|
|
||||||
@ -241,6 +245,26 @@ static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s, int fmt,
|
|||||||
return mp_image_new_custom_ref(&mpi, w, dxva2_release_img);
|
return mp_image_new_custom_ref(&mpi, w, dxva2_release_img);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
|
||||||
|
unsigned src_pitch, unsigned surf_height)
|
||||||
|
{
|
||||||
|
unsigned height = dest->h * src_pitch;
|
||||||
|
memcpy(dest->planes[0], src_bits, height);
|
||||||
|
dest->stride[0] = src_pitch;
|
||||||
|
memcpy(dest->planes[1], src_bits + src_pitch * surf_height, height / 2);
|
||||||
|
dest->stride[1] = src_pitch;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void copy_nv12_gpu_sse4(struct mp_image *dest, uint8_t *src_bits,
|
||||||
|
unsigned src_pitch, unsigned surf_height)
|
||||||
|
{
|
||||||
|
unsigned height = dest->h * src_pitch;
|
||||||
|
gpu_memcpy(dest->planes[0], src_bits, height);
|
||||||
|
dest->stride[0] = src_pitch;
|
||||||
|
gpu_memcpy(dest->planes[1], src_bits + src_pitch * surf_height, height / 2);
|
||||||
|
dest->stride[1] = src_pitch;
|
||||||
|
}
|
||||||
|
|
||||||
static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
|
static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
|
||||||
struct mp_image *img)
|
struct mp_image *img)
|
||||||
{
|
{
|
||||||
@ -265,17 +289,7 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
|
|||||||
return img;
|
return img;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct mp_image buf = {0};
|
ctx->copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
|
||||||
mp_image_setfmt(&buf, IMGFMT_NV12);
|
|
||||||
mp_image_set_size(&buf, img->w, img->h);
|
|
||||||
|
|
||||||
buf.planes[0] = LockedRect.pBits;
|
|
||||||
buf.stride[0] = LockedRect.Pitch;
|
|
||||||
buf.planes[1] = (char *)LockedRect.pBits + LockedRect.Pitch * surfaceDesc.Height;
|
|
||||||
buf.stride[1] = LockedRect.Pitch;
|
|
||||||
|
|
||||||
// This should probably use some sort of "special" memcpy-like function.
|
|
||||||
mp_image_copy(sw_img, &buf);
|
|
||||||
|
|
||||||
IDirect3DSurface9_UnlockRect(surface);
|
IDirect3DSurface9_UnlockRect(surface);
|
||||||
|
|
||||||
@ -302,6 +316,16 @@ static int dxva2_init(struct lavc_ctx *s)
|
|||||||
ctx->log = mp_log_new(s, s->log, "dxva2");
|
ctx->log = mp_log_new(s, s->log, "dxva2");
|
||||||
ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17));
|
ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17));
|
||||||
|
|
||||||
|
if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) {
|
||||||
|
// Use a memcpy implementation optimised for copying from GPU memory
|
||||||
|
MP_DBG(ctx, "Using SSE4 memcpy\n");
|
||||||
|
ctx->copy_nv12 = copy_nv12_gpu_sse4;
|
||||||
|
} else {
|
||||||
|
// Use the CRT memcpy. This can be slower than software decoding.
|
||||||
|
MP_WARN(ctx, "Using fallback memcpy (slow)\n");
|
||||||
|
ctx->copy_nv12 = copy_nv12_fallback;
|
||||||
|
}
|
||||||
|
|
||||||
ctx->deviceHandle = INVALID_HANDLE_VALUE;
|
ctx->deviceHandle = INVALID_HANDLE_VALUE;
|
||||||
|
|
||||||
ctx->d3dlib = LoadLibrary("d3d9.dll");
|
ctx->d3dlib = LoadLibrary("d3d9.dll");
|
||||||
|
129
video/decode/gpu_memcpy_sse4.h
Normal file
129
video/decode/gpu_memcpy_sse4.h
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2011-2014 Hendrik Leppkes
|
||||||
|
* http://www.1f0.de
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License along
|
||||||
|
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Taken from the QuickSync decoder by Eric Gur
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <emmintrin.h>
#include <smmintrin.h>
|
||||||
|
|
||||||
|
// gpu_memcpy is a memcpy-style function that copies data very fast from
// GPU tiled memory (write back).
//
//   d:    destination buffer, at least size bytes.
//   s:    source buffer, at least size bytes; must not overlap d.
//   size: number of bytes to copy.
//
// Returns d, or NULL if either pointer is NULL. If either pointer is not
// 16-byte aligned the copy is delegated to plain memcpy.
//
// Performance tip: page offset (12 lsb) of both addresses should be different,
// optimally use a 2K offset between them.
__attribute__((target("sse4"))) static inline void
*gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
{
    // Number of 16-byte registers moved per main-loop iteration. This is tied
    // to the same #ifdef that selects the register set below, so the pointer
    // increments always match the number of loads/stores actually emitted.
#ifdef __x86_64__
    static const size_t regsInLoop = 16;
#else
    static const size_t regsInLoop = 8;
#endif

    if (d == NULL || s == NULL) return NULL;

    // If memory is not aligned, use memcpy
    bool isAligned = (((uintptr_t)s | (uintptr_t)d) & 0xF) == 0;
    if (!isAligned)
    {
        return memcpy(d, s, size);
    }

    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
#ifdef __x86_64__
    __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
#endif

    // Bytes left over after the main loop, which copies 128 or 256 bytes per
    // iteration.
    size_t reminder = size & (regsInLoop * sizeof(xmm0) - 1);

    __m128i* pTrg = (__m128i*)d;
    __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4);
    // The const qualifier is cast away only because some intrinsic headers
    // declare _mm_stream_load_si128 with a non-const pointer; the source is
    // never written.
    __m128i* pSrc = (__m128i*)s;

    // Make sure source is synced - doesn't hurt if not needed.
    _mm_sfence();

    while (pTrg < pTrgEnd)
    {
        // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA
        // Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad)
        xmm0  = _mm_stream_load_si128(pSrc);
        xmm1  = _mm_stream_load_si128(pSrc + 1);
        xmm2  = _mm_stream_load_si128(pSrc + 2);
        xmm3  = _mm_stream_load_si128(pSrc + 3);
        xmm4  = _mm_stream_load_si128(pSrc + 4);
        xmm5  = _mm_stream_load_si128(pSrc + 5);
        xmm6  = _mm_stream_load_si128(pSrc + 6);
        xmm7  = _mm_stream_load_si128(pSrc + 7);
#ifdef __x86_64__ // Use all 16 xmm registers
        xmm8  = _mm_stream_load_si128(pSrc + 8);
        xmm9  = _mm_stream_load_si128(pSrc + 9);
        xmm10 = _mm_stream_load_si128(pSrc + 10);
        xmm11 = _mm_stream_load_si128(pSrc + 11);
        xmm12 = _mm_stream_load_si128(pSrc + 12);
        xmm13 = _mm_stream_load_si128(pSrc + 13);
        xmm14 = _mm_stream_load_si128(pSrc + 14);
        xmm15 = _mm_stream_load_si128(pSrc + 15);
#endif
        pSrc += regsInLoop;
        // _mm_store_si128 emits the SSE2 instruction MOVDQA (aligned store)
        _mm_store_si128(pTrg     , xmm0);
        _mm_store_si128(pTrg +  1, xmm1);
        _mm_store_si128(pTrg +  2, xmm2);
        _mm_store_si128(pTrg +  3, xmm3);
        _mm_store_si128(pTrg +  4, xmm4);
        _mm_store_si128(pTrg +  5, xmm5);
        _mm_store_si128(pTrg +  6, xmm6);
        _mm_store_si128(pTrg +  7, xmm7);
#ifdef __x86_64__ // Use all 16 xmm registers
        _mm_store_si128(pTrg +  8, xmm8);
        _mm_store_si128(pTrg +  9, xmm9);
        _mm_store_si128(pTrg + 10, xmm10);
        _mm_store_si128(pTrg + 11, xmm11);
        _mm_store_si128(pTrg + 12, xmm12);
        _mm_store_si128(pTrg + 13, xmm13);
        _mm_store_si128(pTrg + 14, xmm14);
        _mm_store_si128(pTrg + 15, xmm15);
#endif
        pTrg += regsInLoop;
    }

    // Copy the remaining whole 16-byte blocks one register at a time.
    size_t blocks = reminder >> 4;
    for (size_t i = 0; i < blocks; ++i)
    {
        _mm_store_si128(pTrg + i, _mm_stream_load_si128(pSrc + i));
    }

    // Copy last bytes - shouldn't happen as strides are modulo 16. A plain
    // memcpy is used so that no byte beyond s + size is ever read; a full
    // 16-byte load here would run past the end of the source buffer.
    reminder &= 15;
    if (reminder)
    {
        memcpy(pTrg + blocks, pSrc + blocks, reminder);
    }

    return d;
}
|
Loading…
Reference in New Issue
Block a user