diff --git a/options/options.c b/options/options.c
index 20153405cd..1d2203726d 100644
--- a/options/options.c
+++ b/options/options.c
@@ -98,6 +98,7 @@ const struct m_opt_choice_alternatives mp_hwdec_names[] = {
     {"d3d11va-copy",HWDEC_D3D11VA_COPY},
     {"rpi",         HWDEC_RPI},
     {"mediacodec",  HWDEC_MEDIACODEC},
+    {"cuda",        HWDEC_CUDA},
     {0}
 };
 
diff --git a/video/decode/cuda.c b/video/decode/cuda.c
new file mode 100644
index 0000000000..f41aed2e7c
--- /dev/null
+++ b/video/decode/cuda.c
@@ -0,0 +1,135 @@
+/*
+ * This file is part of mpv.
+ *
+ * Copyright (c) 2016 Philip Langdale
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <libavutil/hwcontext.h>
+#include <libavutil/hwcontext_cuda.h>
+
+#include "common/av_common.h"
+#include "video/decode/lavc.h"
+
+// Private decoder state. cuda_ctx is borrowed from the HWDEC_CUDA device
+// found in ctx->hwdec_devs; this file neither creates nor destroys it.
+typedef struct CUVIDContext {
+    CUcontext cuda_ctx;
+} CUVIDContext;
+
+// Returns 0 if a HWDEC_CUDA device can be loaded, HWDEC_ERR_NO_CTX otherwise.
+static int probe(struct lavc_ctx *ctx, struct vd_lavc_hwdec *hwdec,
+                 const char *codec)
+{
+    if (!hwdec_devices_load(ctx->hwdec_devs, HWDEC_CUDA))
+        return HWDEC_ERR_NO_CTX;
+    return 0;
+}
+
+// Allocates the private state and records the CUDA context of the
+// HWDEC_CUDA device. Returns 0 on success, -1 if no such device exists.
+static int init(struct lavc_ctx *ctx)
+{
+    struct mp_hwdec_ctx *hwctx =
+        hwdec_devices_get(ctx->hwdec_devs, HWDEC_CUDA);
+    if (!hwctx || !hwctx->ctx) {
+        MP_ERR(ctx, "no CUDA device context available\n");
+        return -1;
+    }
+
+    struct CUVIDContext *p = talloc_ptrtype(NULL, p);
+    *p = (struct CUVIDContext) {
+        .cuda_ctx = hwctx->ctx,
+    };
+    ctx->hwdec_priv = p;
+    return 0;
+}
+
+// Wraps the borrowed CUDA context in an AVHWDeviceContext and stores a new
+// frames context for it in avctx->hw_frames_ctx. w and h are not used.
+// Returns 0 on success, -1 on failure.
+static int init_decoder(struct lavc_ctx *ctx, int w, int h)
+{
+    AVCodecContext *avctx = ctx->avctx;
+    CUVIDContext *priv = ctx->hwdec_priv;
+    int ret = -1;
+
+    if (avctx->hw_frames_ctx) {
+        MP_ERR(ctx, "hw_frames_ctx already initialised!\n");
+        return -1;
+    }
+
+    AVBufferRef *hw_device_ctx = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_CUDA);
+    if (!hw_device_ctx) {
+        MP_WARN(ctx, "av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_CUDA) failed\n");
+        return -1;
+    }
+
+    // Deliberately no AVHWDeviceContext.free callback: the CUDA context is
+    // shared with the HWDEC_CUDA device it came from, so tearing down the
+    // decoder's device wrapper must not destroy it.
+    AVHWDeviceContext *device_ctx = (AVHWDeviceContext *)hw_device_ctx->data;
+    AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
+    device_hwctx->cuda_ctx = priv->cuda_ctx;
+
+    if (av_hwdevice_ctx_init(hw_device_ctx) < 0) {
+        MP_ERR(ctx, "av_hwdevice_ctx_init failed\n");
+        goto done;
+    }
+
+    avctx->hw_frames_ctx = av_hwframe_ctx_alloc(hw_device_ctx);
+    if (!avctx->hw_frames_ctx) {
+        MP_ERR(ctx, "av_hwframe_ctx_alloc failed\n");
+        goto done;
+    }
+    ret = 0;
+
+ done:
+    // av_hwframe_ctx_alloc() keeps its own reference to the device context,
+    // so the local reference is released on success and failure alike.
+    av_buffer_unref(&hw_device_ctx);
+    return ret;
+}
+
+// Frees the private state allocated by init(); a no-op if there is none.
+static void uninit(struct lavc_ctx *ctx)
+{
+    struct CUVIDContext *p = ctx->hwdec_priv;
+    if (!p)
+        return;
+
+    talloc_free(p);
+    ctx->hwdec_priv = NULL;
+}
+
+// Tags IMGFMT_CUDA images with IMGFMT_NV12 as hw_subfmt; any other image is
+// returned unchanged.
+static struct mp_image *process_image(struct lavc_ctx *ctx, struct mp_image *img)
+{
+    if (img->imgfmt == IMGFMT_CUDA)
+        img->params.hw_subfmt = IMGFMT_NV12;
+    return img;
+}
+
+const struct vd_lavc_hwdec mp_vd_lavc_cuda = {
+    .type = HWDEC_CUDA,
+    .image_format = IMGFMT_CUDA,
+    .lavc_suffix = "_cuvid",
+    .probe = probe,
+    .init = init,
+    .uninit = uninit,
+    .init_decoder = init_decoder,
+    .process_image = process_image,
+};
diff --git a/video/decode/vd_lavc.c b/video/decode/vd_lavc.c
index 4b8528fc3a..11978b3ded 100644
--- a/video/decode/vd_lavc.c
+++ b/video/decode/vd_lavc.c
@@ -133,6 +133,7 @@ extern const struct vd_lavc_hwdec mp_vd_lavc_dxva2;
 extern const struct vd_lavc_hwdec mp_vd_lavc_dxva2_copy;
 extern const struct vd_lavc_hwdec mp_vd_lavc_d3d11va;
 extern const struct vd_lavc_hwdec mp_vd_lavc_d3d11va_copy;
+extern const struct vd_lavc_hwdec mp_vd_lavc_cuda;
 
 #if HAVE_RPI
 static const struct vd_lavc_hwdec mp_vd_lavc_rpi = {
@@ -173,6 +174,9 @@ static const struct vd_lavc_hwdec *const hwdec_list[] = {
 #endif
 #if HAVE_ANDROID
     &mp_vd_lavc_mediacodec,
+#endif
+#if HAVE_CUDA_GL
+    &mp_vd_lavc_cuda,
 #endif
     NULL
 };
diff --git a/video/fmt-conversion.c b/video/fmt-conversion.c
index 1fca8bfacf..2d0425f26a 100644
--- a/video/fmt-conversion.c
+++ b/video/fmt-conversion.c
@@ -115,7 +115,9 @@ static const struct {
 #if HAVE_AV_PIX_FMT_MMAL
     {IMGFMT_MMAL, AV_PIX_FMT_MMAL},
 #endif
-
+#if HAVE_CUDA_GL
+    {IMGFMT_CUDA, AV_PIX_FMT_CUDA},
+#endif
 #ifdef AV_PIX_FMT_P010
     {IMGFMT_P010, AV_PIX_FMT_P010},
 #endif
diff --git a/video/hwdec.h b/video/hwdec.h
index 4d99076f16..473c02b5f5 100644
--- a/video/hwdec.h
+++ b/video/hwdec.h
@@ -21,6 +21,7 @@ enum hwdec_type {
     HWDEC_D3D11VA_COPY,
     HWDEC_RPI,
     HWDEC_MEDIACODEC,
+    HWDEC_CUDA,
 };
 
 #define HWDEC_IS_AUTO(x) ((x) == HWDEC_AUTO || (x) == HWDEC_AUTO_COPY)
diff --git a/video/img_format.h b/video/img_format.h
index b6f5830b8a..41463195ac 100644
--- a/video/img_format.h
+++ b/video/img_format.h
@@ -213,6 +213,7 @@ enum mp_imgfmt {
IMGFMT_MMAL, // MMAL_BUFFER_HEADER_T IMGFMT_VIDEOTOOLBOX, // CVPixelBufferRef + IMGFMT_CUDA, // CUDA Buffer // Generic pass-through of AV_PIX_FMT_*. Used for formats which don't have // a corresponding IMGFMT_ value. IMGFMT_AVPIXFMT_START, diff --git a/video/out/opengl/hwdec.c b/video/out/opengl/hwdec.c index 8c8286119d..b6c2ba1070 100644 --- a/video/out/opengl/hwdec.c +++ b/video/out/opengl/hwdec.c @@ -33,6 +33,7 @@ extern const struct gl_hwdec_driver gl_hwdec_d3d11egl; extern const struct gl_hwdec_driver gl_hwdec_d3d11eglrgb; extern const struct gl_hwdec_driver gl_hwdec_dxva2gldx; extern const struct gl_hwdec_driver gl_hwdec_dxva2; +extern const struct gl_hwdec_driver gl_hwdec_cuda; static const struct gl_hwdec_driver *const mpgl_hwdec_drivers[] = { #if HAVE_VAAPI_EGL @@ -57,6 +58,9 @@ static const struct gl_hwdec_driver *const mpgl_hwdec_drivers[] = { &gl_hwdec_dxva2gldx, #endif &gl_hwdec_dxva2, +#endif +#if HAVE_CUDA_GL + &gl_hwdec_cuda, #endif NULL }; diff --git a/video/out/opengl/hwdec_cuda.c b/video/out/opengl/hwdec_cuda.c new file mode 100644 index 0000000000..8b8b099103 --- /dev/null +++ b/video/out/opengl/hwdec_cuda.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2016 Philip Langdale + * + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . 
+ */ + +/* + * This hwdec implements an optimized output path using CUDA->OpenGL + * interop for frame data that is stored in CUDA device memory. + * Although it is not explicit in the code here, the only practical way + * to get data in this form is from the 'cuvid' decoder (aka NvDecode). + * + * For now, cuvid/NvDecode will always return images in NV12 format, even + * when decoding 10bit streams (there is some hardware dithering going on). + */ + +#include +#include + +#include "hwdec.h" +#include "video.h" + +#include + +struct priv { + struct mp_hwdec_ctx hwctx; + struct mp_image layout; + GLuint gl_textures[2]; + GLuint gl_pbos[2]; + bool mapped; + + CUcontext cuda_ctx; +}; + +static int check_cu(struct gl_hwdec *hw, CUresult err, const char *func) +{ + const char *err_name; + const char *err_string; + + MP_TRACE(hw, "Calling %s\n", func); + + if (err == CUDA_SUCCESS) + return 0; + + cuGetErrorName(err, &err_name); + cuGetErrorString(err, &err_string); + + MP_ERR(hw, "%s failed", func); + if (err_name && err_string) + MP_ERR(hw, " -> %s: %s", err_name, err_string); + MP_ERR(hw, "\n"); + + return -1; +} + +#define CHECK_CU(x) check_cu(hw, (x), #x) + +static int cuda_create(struct gl_hwdec *hw) +{ + CUdevice device; + CUcontext cuda_ctx = NULL; + CUcontext dummy; + int ret = 0, eret = 0; + + // PBO Requirements + if (hw->gl->version < 210 && hw->gl->es < 300) { + MP_ERR(hw, "need OpenGL >= 2.1 or OpenGL-ES >= 3.0\n"); + return -1; + } + + struct priv *p = talloc_zero(hw, struct priv); + hw->priv = p; + + ret = CHECK_CU(cuInit(0)); + if (ret < 0) + goto error; + + ///TODO: Make device index configurable + ret = CHECK_CU(cuDeviceGet(&device, 0)); + if (ret < 0) + goto error; + + ret = CHECK_CU(cuCtxCreate(&cuda_ctx, CU_CTX_SCHED_BLOCKING_SYNC, device)); + if (ret < 0) + goto error; + + p->cuda_ctx = cuda_ctx; + + p->hwctx = (struct mp_hwdec_ctx) { + .type = HWDEC_CUDA, + .ctx = cuda_ctx, + }; + p->hwctx.driver_name = hw->driver->name; + 
hwdec_devices_add(hw->devs, &p->hwctx); + + error: + eret = CHECK_CU(cuCtxPopCurrent(&dummy)); + if (eret < 0) + return eret; + + return ret; +} + +static int reinit(struct gl_hwdec *hw, struct mp_image_params *params) +{ + struct priv *p = hw->priv; + GL *gl = hw->gl; + CUcontext dummy; + int ret = 0, eret = 0; + + assert(params->imgfmt == hw->driver->imgfmt); + params->imgfmt = IMGFMT_NV12; + params->hw_subfmt = 0; + + mp_image_set_params(&p->layout, params); + + ret = CHECK_CU(cuCtxPushCurrent(p->cuda_ctx)); + if (ret < 0) + return ret; + + gl->GenTextures(2, p->gl_textures); + for (int n = 0; n < 2; n++) { + gl->BindTexture(GL_TEXTURE_2D, p->gl_textures[n]); + GLenum filter = GL_NEAREST; + gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, filter); + gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, filter); + gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + gl->TexImage2D(GL_TEXTURE_2D, 0, n == 0 ? GL_R8 : GL_RG8, + mp_image_plane_w(&p->layout, n), + mp_image_plane_h(&p->layout, n), + 0, n == 0 ? GL_RED : GL_RG, GL_UNSIGNED_BYTE, NULL); + } + gl->BindTexture(GL_TEXTURE_2D, 0); + + gl->GenBuffers(2, p->gl_pbos); + for (int n = 0; n < 2; n++) { + gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, p->gl_pbos[n]); + // Chroma plane is two bytes per pixel + gl->BufferData(GL_PIXEL_UNPACK_BUFFER, + mp_image_plane_w(&p->layout, n) * + mp_image_plane_h(&p->layout, n) * (n + 1), + NULL, GL_STREAM_DRAW); + gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + ret = CHECK_CU(cuGLRegisterBufferObject(p->gl_pbos[n])); + if (ret < 0) + goto error; + } + + error: + eret = CHECK_CU(cuCtxPopCurrent(&dummy)); + if (eret < 0) + return eret; + + return ret; +} + +static void destroy(struct gl_hwdec *hw) +{ + struct priv *p = hw->priv; + GL *gl = hw->gl; + CUcontext dummy; + + // Don't bail if any CUDA calls fail. This is all best effort. 
+ CHECK_CU(cuCtxPushCurrent(p->cuda_ctx)); + for (int n = 0; n < 2; n++) { + if (p->gl_pbos[n] > 0) + CHECK_CU(cuGLUnregisterBufferObject(p->gl_pbos[n])); + } + CHECK_CU(cuCtxPopCurrent(&dummy)); + + gl->DeleteBuffers(2, p->gl_pbos); + gl->DeleteTextures(2, p->gl_textures); + + hwdec_devices_remove(hw->devs, &p->hwctx); +} + +static int get_alignment(int stride) +{ + if (stride % 8 == 0) + return 8; + if (stride % 4 == 0) + return 4; + if (stride % 2 == 0) + return 2; + return 1; +} + +static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image, + struct gl_hwdec_frame *out_frame) +{ + struct priv *p = hw->priv; + GL *gl = hw->gl; + CUcontext dummy; + CUdeviceptr cuda_data; + size_t cuda_size; + int ret = 0, eret = 0; + + ret = CHECK_CU(cuCtxPushCurrent(p->cuda_ctx)); + if (ret < 0) + return ret; + + *out_frame = (struct gl_hwdec_frame) { 0, }; + + for (int n = 0; n < 2; n++) { + gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, p->gl_pbos[n]); + ret = CHECK_CU(cuGLMapBufferObject(&cuda_data, &cuda_size, p->gl_pbos[n])); + if (ret < 0) + goto error; + + // dstPitch and widthInBytes must account for the chroma plane + // elements being two bytes wide. + CUDA_MEMCPY2D cpy = { + .srcMemoryType = CU_MEMORYTYPE_DEVICE, + .dstMemoryType = CU_MEMORYTYPE_DEVICE, + .srcDevice = (CUdeviceptr)hw_image->planes[n], + .dstDevice = cuda_data, + .srcPitch = hw_image->stride[n], + .dstPitch = mp_image_plane_w(&p->layout, n) * (n + 1), + .srcY = 0, + .WidthInBytes = mp_image_plane_w(&p->layout, n) * (n + 1), + .Height = mp_image_plane_h(&p->layout, n), + }; + ret = CHECK_CU(cuMemcpy2D(&cpy)); + if (ret < 0) + goto error; + + gl->BindTexture(GL_TEXTURE_2D, p->gl_textures[n]); + gl->PixelStorei(GL_UNPACK_ALIGNMENT, + get_alignment(mp_image_plane_w(&p->layout, n))); + gl->TexSubImage2D(GL_TEXTURE_2D, 0, + 0, 0, + mp_image_plane_w(&p->layout, n), + mp_image_plane_h(&p->layout, n), + n == 0 ? 
GL_RED : GL_RG, GL_UNSIGNED_BYTE, NULL); + gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4); + + ret = CHECK_CU(cuGLUnmapBufferObject(p->gl_pbos[n])); + if (ret < 0) + goto error; + + gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + gl->BindTexture(GL_TEXTURE_2D, 0); + + out_frame->planes[n] = (struct gl_hwdec_plane){ + .gl_texture = p->gl_textures[n], + .gl_target = GL_TEXTURE_2D, + .tex_w = mp_image_plane_w(&p->layout, n), + .tex_h = mp_image_plane_h(&p->layout, n), + }; + } + + error: + eret = CHECK_CU(cuCtxPopCurrent(&dummy)); + if (eret < 0) + return eret; + + return ret; +} + +const struct gl_hwdec_driver gl_hwdec_cuda = { + .name = "cuda", + .api = HWDEC_CUDA, + .imgfmt = IMGFMT_CUDA, + .create = cuda_create, + .reinit = reinit, + .map_frame = map_frame, + .destroy = destroy, +}; diff --git a/wscript b/wscript index 7eed072bb6..033c8f5fa2 100644 --- a/wscript +++ b/wscript @@ -874,6 +874,12 @@ hwaccel_features = [ 'func': compose_checks( check_headers('libavcodec/dxva2.h', use='libav'), check_headers('libavcodec/d3d11va.h', use='libav')), + }, { + 'name': '--cuda-gl', + 'desc': 'CUDA with OpenGL', + 'func': compose_checks( + check_cc(lib="cuda"), + check_headers('libavutil/hwcontext_cuda.h', use='libav')), }, { 'name': 'sse4-intrinsics', 'desc': 'GCC SSE4 intrinsics for GPU memcpy', diff --git a/wscript_build.py b/wscript_build.py index f246ae4a33..23f61c0d2d 100644 --- a/wscript_build.py +++ b/wscript_build.py @@ -286,6 +286,7 @@ def build(ctx): ( "video/vdpau.c", "vdpau" ), ( "video/vdpau_mixer.c", "vdpau" ), ( "video/decode/dec_video.c"), + ( "video/decode/cuda.c", "cuda-gl" ), ( "video/decode/dxva2.c", "d3d-hwaccel" ), ( "video/decode/d3d11va.c", "d3d-hwaccel" ), ( "video/decode/d3d.c", "win32" ), @@ -341,6 +342,7 @@ def build(ctx): ( "video/out/opengl/egl_helpers.c", "egl-helpers" ), ( "video/out/opengl/formats.c", "gl" ), ( "video/out/opengl/hwdec.c", "gl" ), + ( "video/out/opengl/hwdec_cuda.c", "cuda-gl" ), ( "video/out/opengl/hwdec_d3d11egl.c", "egl-angle" ), ( 
"video/out/opengl/hwdec_d3d11eglrgb.c","egl-angle" ), ( "video/out/opengl/hwdec_dxva2.c", "gl-win32" ),