vo_opengl: hwdec_cuda: Support separate decode and display devices

In a multi-GPU scenario, it may be desirable to use different GPUs
for decode and display responsibilities, for example when a secondary
GPU has better video decoding capabilities.

In such a scenario, we need to initialise a separate context for each
GPU, and use the display context in hwdec_cuda, while passing the
decode context to avcodec.

Once that's done, the actual hand-off between the two GPUs is
transparent to us (it happens during the cuMemcpy2D operation, which
copies the decoded frame from a CUDA buffer to the OpenGL texture).

In the end, the bulk of the work is around introducing a new
configuration option to specify the decode device.
This commit is contained in:
Philip Langdale 2017-05-29 09:48:10 -07:00 committed by wm4
parent 7e889e5e63
commit 7424651b96
5 changed files with 66 additions and 12 deletions

View File

@ -4802,6 +4802,16 @@ The following video options are currently all specific to ``--vo=opengl`` and
This option might be silently removed in the future, if ANGLE fixes shader
compilation speed.
``--cuda-decode-device=<auto|0..>``
Choose the GPU device used for decoding when using the ``cuda`` hwdec.
By default (``auto``), the device that is being used to provide OpenGL
output will also be used for decoding (and in the vast majority of cases,
only one GPU will be present).
Note that when using the ``cuda-copy`` hwdec, a different option must be
passed: ``--vd-lavc-o=gpu=<0..>``.
Miscellaneous
-------------

View File

@ -728,6 +728,11 @@ const m_option_t mp_opts[] = {
({"no", -1}, {"auto", 0}, {"windowed", 1}, {"yes", 2})),
#endif
#if HAVE_CUDA_HWACCEL
OPT_CHOICE_OR_INT("cuda-decode-device", cuda_device, 0,
0, INT_MAX, ({"auto", -1})),
#endif
#if HAVE_ENCODING
OPT_SUBSTRUCT("", encode_opts, encode_config, 0),
#endif
@ -973,6 +978,8 @@ const struct MPOpts mp_default_opts = {
"Performer", "Title", "Track", "icy-title", "service_name",
NULL
},
.cuda_device = -1,
};
#endif /* MPLAYER_CFG_MPLAYER_H */

View File

@ -336,6 +336,8 @@ typedef struct MPOpts {
struct angle_opts *angle_opts;
struct cocoa_opts *cocoa_opts;
struct dvd_opts *dvd_opts;
int cuda_device;
} MPOpts;
struct dvd_opts {

View File

@ -94,6 +94,7 @@ typedef CUresult CUDAAPI tcuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CU
typedef CUresult CUDAAPI tcuCtxPushCurrent_v2(CUcontext *pctx);
typedef CUresult CUDAAPI tcuCtxPopCurrent_v2(CUcontext *pctx);
typedef CUresult CUDAAPI tcuCtxDestroy_v2(CUcontext ctx);
typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *pdevice, int ordinal);
typedef CUresult CUDAAPI tcuMemcpy2D_v2(const CUDA_MEMCPY2D *pcopy);
typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char** pstr);
typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char** pstr);
@ -110,6 +111,7 @@ typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray* pArray, C
FN(cuCtxPushCurrent_v2, tcuCtxPushCurrent_v2) \
FN(cuCtxPopCurrent_v2, tcuCtxPopCurrent_v2) \
FN(cuCtxDestroy_v2, tcuCtxDestroy_v2) \
FN(cuDeviceGet, tcuDeviceGet) \
FN(cuMemcpy2D_v2, tcuMemcpy2D_v2) \
FN(cuGetErrorName, tcuGetErrorName) \
FN(cuGetErrorString, tcuGetErrorString) \
@ -130,6 +132,7 @@ CUDA_FNS(CUDA_EXT_DECL)
#define cuCtxPushCurrent mpv_cuCtxPushCurrent_v2
#define cuCtxPopCurrent mpv_cuCtxPopCurrent_v2
#define cuCtxDestroy mpv_cuCtxDestroy_v2
#define cuDeviceGet mpv_cuDeviceGet
#define cuMemcpy2D mpv_cuMemcpy2D_v2
#define cuGetErrorName mpv_cuGetErrorName
#define cuGetErrorString mpv_cuGetErrorString

View File

@ -34,6 +34,7 @@
#include "formats.h"
#include "hwdec.h"
#include "options/m_config.h"
#include "video.h"
struct priv {
@ -44,7 +45,8 @@ struct priv {
CUarray cu_array[4];
int plane_bytes[4];
CUcontext cuda_ctx;
CUcontext display_ctx;
CUcontext decode_ctx;
};
static int check_cu(struct gl_hwdec *hw, CUresult err, const char *func)
@ -72,8 +74,7 @@ static int check_cu(struct gl_hwdec *hw, CUresult err, const char *func)
static int cuda_create(struct gl_hwdec *hw)
{
CUdevice device;
CUcontext cuda_ctx = NULL;
CUdevice display_dev;
AVBufferRef *hw_device_ctx = NULL;
CUcontext dummy;
unsigned int device_count;
@ -97,16 +98,43 @@ static int cuda_create(struct gl_hwdec *hw)
if (ret < 0)
goto error;
ret = CHECK_CU(cuGLGetDevices(&device_count, &device, 1,
// Allocate display context
ret = CHECK_CU(cuGLGetDevices(&device_count, &display_dev, 1,
CU_GL_DEVICE_LIST_ALL));
if (ret < 0)
goto error;
ret = CHECK_CU(cuCtxCreate(&cuda_ctx, CU_CTX_SCHED_BLOCKING_SYNC, device));
ret = CHECK_CU(cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
display_dev));
if (ret < 0)
goto error;
p->cuda_ctx = cuda_ctx;
p->decode_ctx = p->display_ctx;
int decode_dev_idx = -1;
mp_read_option_raw(hw->global, "cuda-decode-device", &m_option_type_choice,
&decode_dev_idx);
if (decode_dev_idx > -1) {
CUdevice decode_dev;
ret = CHECK_CU(cuDeviceGet(&decode_dev, decode_dev_idx));
if (ret < 0)
goto error;
if (decode_dev != display_dev) {
MP_INFO(hw, "Using separate decoder and display devices\n");
// Pop the display context. We won't use it again during init()
ret = CHECK_CU(cuCtxPopCurrent(&dummy));
if (ret < 0)
goto error;
ret = CHECK_CU(cuCtxCreate(&p->decode_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
decode_dev));
if (ret < 0)
goto error;
}
}
hw_device_ctx = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_CUDA);
if (!hw_device_ctx)
@ -115,7 +143,7 @@ static int cuda_create(struct gl_hwdec *hw)
AVHWDeviceContext *device_ctx = (void *)hw_device_ctx->data;
AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
device_hwctx->cuda_ctx = cuda_ctx;
device_hwctx->cuda_ctx = p->decode_ctx;
ret = av_hwdevice_ctx_init(hw_device_ctx);
if (ret < 0) {
@ -129,7 +157,7 @@ static int cuda_create(struct gl_hwdec *hw)
p->hwctx = (struct mp_hwdec_ctx) {
.type = HWDEC_CUDA,
.ctx = cuda_ctx,
.ctx = p->decode_ctx,
.av_device_ref = hw_device_ctx,
};
p->hwctx.driver_name = hw->driver->name;
@ -162,7 +190,7 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
return -1;
}
ret = CHECK_CU(cuCtxPushCurrent(p->cuda_ctx));
ret = CHECK_CU(cuCtxPushCurrent(p->display_ctx));
if (ret < 0)
return ret;
@ -219,7 +247,7 @@ static void destroy(struct gl_hwdec *hw)
CUcontext dummy;
// Don't bail if any CUDA calls fail. This is all best effort.
CHECK_CU(cuCtxPushCurrent(p->cuda_ctx));
CHECK_CU(cuCtxPushCurrent(p->display_ctx));
for (int n = 0; n < 4; n++) {
if (p->cu_res[n] > 0)
CHECK_CU(cuGraphicsUnregisterResource(p->cu_res[n]));
@ -227,7 +255,11 @@ static void destroy(struct gl_hwdec *hw)
}
CHECK_CU(cuCtxPopCurrent(&dummy));
CHECK_CU(cuCtxDestroy(p->cuda_ctx));
if (p->decode_ctx != p->display_ctx) {
CHECK_CU(cuCtxDestroy(p->decode_ctx));
}
CHECK_CU(cuCtxDestroy(p->display_ctx));
gl->DeleteTextures(4, p->gl_textures);
@ -242,7 +274,7 @@ static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
CUcontext dummy;
int ret = 0, eret = 0;
ret = CHECK_CU(cuCtxPushCurrent(p->cuda_ctx));
ret = CHECK_CU(cuCtxPushCurrent(p->display_ctx));
if (ret < 0)
return ret;