ffmpeg/libavutil/hwcontext_cuda.c
Lynne 2e08b39444
hwcontext: add av_hwdevice_ctx_create_derived_opts
This allows for users who derive devices to set options for the
new device context they derive.
The main use case of this is to allow users to enable extensions
(such as surface drawing extensions) in Vulkan while deriving from
the device their frames are on. That way, users don't need to write
any initialization code themselves, since the Vulkan spec invalidates
mixing instances, physical devices and active devices.
Apart from Vulkan, other hwcontexts ignore the opts argument since they
don't support options at all (or in VAAPI and OpenCL's case, options are
currently only used for device selection, which device_derive overrides).
2020-05-23 19:07:26 +01:00

502 lines
14 KiB
C

/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "buffer.h"
#include "common.h"
#include "hwcontext.h"
#include "hwcontext_internal.h"
#include "hwcontext_cuda_internal.h"
#if CONFIG_VULKAN
#include "hwcontext_vulkan.h"
#endif
#include "cuda_check.h"
#include "mem.h"
#include "pixdesc.h"
#include "pixfmt.h"
#include "imgutils.h"
/* Alignment value handed to the av_image_* helpers when sizing and laying out frame buffers. */
#define CUDA_FRAME_ALIGNMENT 256
/* Private per-frames-context state; its size is registered through frames_priv_size. */
typedef struct CUDAFramesContext {
/* Chroma subsampling shifts of the software format, filled in by cuda_frames_init(). */
int shift_width, shift_height;
} CUDAFramesContext;
/*
 * Software pixel formats accepted as sw_format of a CUDA frames context.
 * cuda_frames_init() rejects any format that is not listed here and
 * cuda_frames_get_constraints() reports this list to callers.
 * AV_PIX_FMT_VULKAN is only part of the list when built with CONFIG_VULKAN.
 */
static const enum AVPixelFormat supported_formats[] = {
AV_PIX_FMT_NV12,
AV_PIX_FMT_YUV420P,
AV_PIX_FMT_YUVA420P,
AV_PIX_FMT_YUV444P,
AV_PIX_FMT_P010,
AV_PIX_FMT_P016,
AV_PIX_FMT_YUV444P16,
AV_PIX_FMT_0RGB32,
AV_PIX_FMT_0BGR32,
#if CONFIG_VULKAN
AV_PIX_FMT_VULKAN,
#endif
};
/*
 * Shorthand for FF_CUDA_CHECK_DL on a CUDA driver call. It expands using the
 * identifiers `device_ctx` and `cu`, so both must be in scope at the point of use.
 */
#define CHECK_CU(x) FF_CUDA_CHECK_DL(device_ctx, cu, x)
/**
 * Report the pixel formats usable with CUDA frames.
 *
 * valid_sw_formats receives a copy of supported_formats terminated by
 * AV_PIX_FMT_NONE; valid_hw_formats receives { AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE }.
 *
 * @param ctx         device context (not used here)
 * @param hwconfig    hardware configuration (not used here)
 * @param constraints structure whose format lists are filled in
 * @return 0 on success, AVERROR(ENOMEM) if either list cannot be allocated.
 *         If the second allocation fails, the first list is not freed here
 *         and stays attached to constraints.
 */
static int cuda_frames_get_constraints(AVHWDeviceContext *ctx,
                                       const void *hwconfig,
                                       AVHWFramesConstraints *constraints)
{
    int n = 0;

    constraints->valid_sw_formats =
        av_malloc_array(FF_ARRAY_ELEMS(supported_formats) + 1,
                        sizeof(*constraints->valid_sw_formats));
    if (!constraints->valid_sw_formats)
        return AVERROR(ENOMEM);

    /* Copy the table, then append the list terminator. */
    while (n < FF_ARRAY_ELEMS(supported_formats)) {
        constraints->valid_sw_formats[n] = supported_formats[n];
        n++;
    }
    constraints->valid_sw_formats[FF_ARRAY_ELEMS(supported_formats)] = AV_PIX_FMT_NONE;

    constraints->valid_hw_formats =
        av_malloc_array(2, sizeof(*constraints->valid_hw_formats));
    if (!constraints->valid_hw_formats)
        return AVERROR(ENOMEM);

    constraints->valid_hw_formats[0] = AV_PIX_FMT_CUDA;
    constraints->valid_hw_formats[1] = AV_PIX_FMT_NONE;

    return 0;
}
/**
 * AVBuffer free callback for buffers created by cuda_pool_alloc().
 *
 * @param opaque the AVHWFramesContext the buffer was created with
 * @param data   the device pointer to release, stored as a byte pointer
 *
 * Failures of the individual CUDA calls are checked by CHECK_CU but not
 * acted upon; there is nothing to return from a free callback.
 */
static void cuda_buffer_free(void *opaque, uint8_t *data)
{
    AVHWFramesContext   *frames_ctx = opaque;
    AVHWDeviceContext   *device_ctx = frames_ctx->device_ctx;
    AVCUDADeviceContext *cuda_hwctx = device_ctx->hwctx;
    CudaFunctions       *cu         = cuda_hwctx->internal->cuda_dl;
    CUcontext            popped;

    /* Make the device's context current around the free, then restore. */
    CHECK_CU(cu->cuCtxPushCurrent(cuda_hwctx->cuda_ctx));
    CHECK_CU(cu->cuMemFree((CUdeviceptr)data));
    CHECK_CU(cu->cuCtxPopCurrent(&popped));
}
/**
 * Buffer pool allocation callback: allocate `size` bytes with cuMemAlloc and
 * wrap the device pointer in an AVBufferRef freed by cuda_buffer_free().
 *
 * @param opaque the AVHWFramesContext that owns the pool
 * @param size   number of bytes to allocate
 * @return the new buffer reference, or NULL if pushing the context, the
 *         allocation, or creating the AVBufferRef fails
 */
static AVBufferRef *cuda_pool_alloc(void *opaque, int size)
{
    AVHWFramesContext   *ctx        = opaque;
    AVHWDeviceContext   *device_ctx = ctx->device_ctx;
    AVCUDADeviceContext *hwctx      = device_ctx->hwctx;
    CudaFunctions       *cu         = hwctx->internal->cuda_dl;

    AVBufferRef *buf    = NULL;
    CUcontext    popped = NULL;
    CUdeviceptr  devptr;

    if (CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx)) < 0)
        return NULL;

    if (CHECK_CU(cu->cuMemAlloc(&devptr, size)) >= 0) {
        buf = av_buffer_create((uint8_t*)devptr, size, cuda_buffer_free, ctx, 0);
        /* Without a wrapper nobody would ever free the device memory. */
        if (!buf)
            CHECK_CU(cu->cuMemFree(devptr));
    }

    /* The context was pushed successfully, so it is popped on every path. */
    CHECK_CU(cu->cuCtxPopCurrent(&popped));

    return buf;
}
/**
 * Validate the software format of a frames context and, when the user did
 * not supply a buffer pool, create an internal one.
 *
 * @param ctx frames context being initialised
 * @return 0 on success; AVERROR(ENOSYS) if sw_format is not in
 *         supported_formats; a negative value from
 *         av_image_get_buffer_size(); AVERROR(ENOMEM) if the pool cannot
 *         be created
 */
static int cuda_frames_init(AVHWFramesContext *ctx)
{
    CUDAFramesContext *priv = ctx->internal->priv;
    int is_supported = 0;
    int buf_size;
    int n;

    for (n = 0; n < FF_ARRAY_ELEMS(supported_formats) && !is_supported; n++)
        is_supported = (ctx->sw_format == supported_formats[n]);

    if (!is_supported) {
        av_log(ctx, AV_LOG_ERROR, "Pixel format '%s' is not supported\n",
               av_get_pix_fmt_name(ctx->sw_format));
        return AVERROR(ENOSYS);
    }

    /* Remember the chroma shifts; cuda_transfer_data() needs the vertical one. */
    av_pix_fmt_get_chroma_sub_sample(ctx->sw_format, &priv->shift_width, &priv->shift_height);

    /* A user-provided pool is used as is. */
    if (ctx->pool)
        return 0;

    buf_size = av_image_get_buffer_size(ctx->sw_format, ctx->width, ctx->height, CUDA_FRAME_ALIGNMENT);
    if (buf_size < 0)
        return buf_size;

    ctx->internal->pool_internal = av_buffer_pool_init2(buf_size, ctx, cuda_pool_alloc, NULL);
    if (!ctx->internal->pool_internal)
        return AVERROR(ENOMEM);

    return 0;
}
/**
 * Take a buffer from the frames context's pool and set up `frame` to
 * describe it as an AV_PIX_FMT_CUDA frame.
 *
 * @param ctx   frames context providing the pool, sw_format and dimensions
 * @param frame frame whose buf[0], data, linesize, format, width and height
 *              are filled in
 * @return 0 on success, AVERROR(ENOMEM) if no buffer could be obtained, or
 *         the negative value returned by av_image_fill_arrays()
 */
static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame)
{
    AVBufferRef *pooled = av_buffer_pool_get(ctx->pool);
    int filled;

    frame->buf[0] = pooled;
    if (!pooled)
        return AVERROR(ENOMEM);

    filled = av_image_fill_arrays(frame->data, frame->linesize, frame->buf[0]->data,
                                  ctx->sw_format, ctx->width, ctx->height, CUDA_FRAME_ALIGNMENT);
    if (filled < 0)
        return filled;

    // YUV420P is a special case.
    // Nvenc expects the U/V planes in swapped order from how ffmpeg expects them, also chroma is half-aligned
    if (ctx->sw_format == AV_PIX_FMT_YUV420P) {
        const int chroma_pitch  = frame->linesize[0] / 2;
        uint8_t  *chroma_start  = frame->data[1];

        frame->linesize[1] = chroma_pitch;
        frame->linesize[2] = chroma_pitch;
        /* Plane 2 takes the first chroma slot, plane 1 follows right after it. */
        frame->data[2]     = chroma_start;
        frame->data[1]     = chroma_start + chroma_pitch * ctx->height / 2;
    }

    frame->format = AV_PIX_FMT_CUDA;
    frame->width  = ctx->width;
    frame->height = ctx->height;

    return 0;
}
/**
 * List the pixel formats usable for transfers to or from this frames
 * context: only its own sw_format, regardless of direction.
 *
 * @param ctx     frames context
 * @param dir     transfer direction (not used here)
 * @param formats receives a newly allocated list terminated by AV_PIX_FMT_NONE
 * @return 0 on success, AVERROR(ENOMEM) on allocation failure
 */
static int cuda_transfer_get_formats(AVHWFramesContext *ctx,
                                     enum AVHWFrameTransferDirection dir,
                                     enum AVPixelFormat **formats)
{
    enum AVPixelFormat *list = av_malloc_array(2, sizeof(*list));

    if (!list)
        return AVERROR(ENOMEM);

    list[0] = ctx->sw_format;
    list[1] = AV_PIX_FMT_NONE;

    *formats = list;
    return 0;
}
/**
 * Copy frame data plane by plane between a CUDA frame and system memory, or
 * between two CUDA frames. Used for both transfer directions.
 *
 * A frame with hw_frames_ctx set is treated as device memory, one without
 * as host memory. Copies are queued on hwctx->stream; when the destination
 * is host memory the stream is synchronised before returning.
 *
 * @param ctx frames context supplying the CUDA device and chroma shift
 * @param dst destination frame
 * @param src source frame
 * @return 0 on success, AVERROR(ENOSYS) if either frame belongs to a
 *         non-CUDA frames context, or the negative error code of the first
 *         CUDA call that failed
 */
static int cuda_transfer_data(AVHWFramesContext *ctx, AVFrame *dst,
                              const AVFrame *src)
{
    CUDAFramesContext   *priv       = ctx->internal->priv;
    AVHWDeviceContext   *device_ctx = ctx->device_ctx;
    AVCUDADeviceContext *hwctx      = device_ctx->hwctx;
    CudaFunctions       *cu         = hwctx->internal->cuda_dl;

    CUcontext dummy;
    int i, ret;

    /* Only CUDA <-> CUDA and CUDA <-> host transfers are handled here. */
    if ((src->hw_frames_ctx && ((AVHWFramesContext*)src->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA) ||
        (dst->hw_frames_ctx && ((AVHWFramesContext*)dst->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA))
        return AVERROR(ENOSYS);

    ret = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
    if (ret < 0)
        return ret;

    for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) {
        CUDA_MEMCPY2D cpy = {
            .srcPitch      = src->linesize[i],
            .dstPitch      = dst->linesize[i],
            .WidthInBytes  = FFMIN(src->linesize[i], dst->linesize[i]),
            /* Planes 0 and 3 are full height, the others are shifted by the chroma shift. */
            .Height        = src->height >> ((i == 0 || i == 3) ? 0 : priv->shift_height),
        };

        if (src->hw_frames_ctx) {
            cpy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
            cpy.srcDevice     = (CUdeviceptr)src->data[i];
        } else {
            cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
            cpy.srcHost       = src->data[i];
        }

        if (dst->hw_frames_ctx) {
            cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
            cpy.dstDevice     = (CUdeviceptr)dst->data[i];
        } else {
            cpy.dstMemoryType = CU_MEMORYTYPE_HOST;
            cpy.dstHost       = dst->data[i];
        }

        ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, hwctx->stream));
        if (ret < 0)
            goto exit;
    }

    /* Host destinations must be complete before the caller reads them. */
    if (!dst->hw_frames_ctx) {
        ret = CHECK_CU(cu->cuStreamSynchronize(hwctx->stream));
        if (ret < 0)
            goto exit;
    }

exit:
    /* Always restore the previous context; a pop failure does not override ret. */
    CHECK_CU(cu->cuCtxPopCurrent(&dummy));

    /*
     * Return the recorded status rather than an unconditional 0, so that a
     * failed copy or synchronisation is not reported as success. ret is 0
     * here whenever every call above succeeded.
     */
    return ret;
}
/**
 * Tear down the CUDA device context: release or destroy the CUDA context if
 * this code allocated it, unload the dynamically loaded function table, and
 * free the internal state.
 *
 * Does nothing beyond the final av_freep() when hwctx->internal is NULL.
 *
 * @param device_ctx device context to clean up
 */
static void cuda_device_uninit(AVHWDeviceContext *device_ctx)
{
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;

    if (hwctx->internal) {
        CudaFunctions *cu       = hwctx->internal->cuda_dl;
        const int      owns_ctx = hwctx->internal->is_allocated && hwctx->cuda_ctx;

        if (owns_ctx) {
            const int is_primary = hwctx->internal->flags & AV_CUDA_USE_PRIMARY_CONTEXT;

            /* A retained primary context is released, a created one destroyed. */
            if (!is_primary)
                CHECK_CU(cu->cuCtxDestroy(hwctx->cuda_ctx));
            else
                CHECK_CU(cu->cuDevicePrimaryCtxRelease(hwctx->internal->cuda_device));

            hwctx->cuda_ctx = NULL;
        }

        cuda_free_functions(&hwctx->internal->cuda_dl);
    }

    av_freep(&hwctx->internal);
}
/**
 * Ensure the device context has its internal state allocated and the CUDA
 * function table loaded. Parts that already exist are left untouched.
 *
 * @param ctx device context to initialise
 * @return 0 on success, AVERROR(ENOMEM) if the internal state cannot be
 *         allocated, or the negative value from cuda_load_functions(); in
 *         the latter case the context is uninitialised again before returning
 */
static int cuda_device_init(AVHWDeviceContext *ctx)
{
    AVCUDADeviceContext *hwctx = ctx->hwctx;
    int load_err;

    if (!hwctx->internal) {
        hwctx->internal = av_mallocz(sizeof(*hwctx->internal));
        if (!hwctx->internal)
            return AVERROR(ENOMEM);
    }

    /* Function table already present: nothing left to do. */
    if (hwctx->internal->cuda_dl)
        return 0;

    load_err = cuda_load_functions(&hwctx->internal->cuda_dl, ctx);
    if (load_err >= 0)
        return 0;

    av_log(ctx, AV_LOG_ERROR, "Could not dynamically load CUDA\n");
    cuda_device_uninit(ctx);
    return load_err;
}
/**
 * Obtain a CUDA context for hwctx->internal->cuda_device.
 *
 * With AV_CUDA_USE_PRIMARY_CONTEXT set in flags, the device's primary
 * context is retained, after setting its flags to
 * CU_CTX_SCHED_BLOCKING_SYNC if it is inactive and has different flags.
 * Otherwise a new context is created with those flags and popped from the
 * current thread.
 *
 * @param device_ctx device context; its function table and cuda_device
 *                   must already be set up
 * @param flags      AV_CUDA_* flags, stored in hwctx->internal->flags
 * @return 0 on success, AVERROR(ENOTSUP) if the primary context is already
 *         active with other flags, or a negative error code from a failed
 *         CUDA call
 */
static int cuda_context_init(AVHWDeviceContext *device_ctx, int flags)
{
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
    CudaFunctions *cu = hwctx->internal->cuda_dl;
    const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC;
    unsigned int current_flags = 0;
    int is_active = 0;
    CUcontext popped;
    int err;

    hwctx->internal->flags = flags;

    if (!(flags & AV_CUDA_USE_PRIMARY_CONTEXT)) {
        err = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags,
                                       hwctx->internal->cuda_device));
        if (err < 0)
            return err;

        /* Pop the freshly created context off the current thread again. */
        CHECK_CU(cu->cuCtxPopCurrent(&popped));
    } else {
        err = CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device,
                                                      &current_flags, &is_active));
        if (err < 0)
            return err;

        if (current_flags != desired_flags) {
            /* The flags are only set here while the primary context is inactive. */
            if (is_active) {
                av_log(device_ctx, AV_LOG_ERROR, "Primary context already active with incompatible flags.\n");
                return AVERROR(ENOTSUP);
            }

            err = CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device,
                                                          desired_flags));
            if (err < 0)
                return err;
        }

        err = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx,
                                                    hwctx->internal->cuda_device));
        if (err < 0)
            return err;
    }

    /* Mark the context as ours so cuda_device_uninit() releases it. */
    hwctx->internal->is_allocated = 1;

    // Setting stream to NULL will make functions automatically use the default CUstream
    hwctx->stream = NULL;

    return 0;
}
/**
 * Create a CUDA device context for the device with the given index.
 *
 * @param device_ctx device context to set up
 * @param device     device index as a string parsed with strtol() (base
 *                   auto-detected), or NULL for index 0
 * @param opts       options (not used here)
 * @param flags      AV_CUDA_* flags forwarded to cuda_context_init()
 * @return 0 on success. Every failure is reported as AVERROR_UNKNOWN after
 *         the context has been uninitialised; the specific error code of
 *         the failing step is not propagated.
 */
static int cuda_device_create(AVHWDeviceContext *device_ctx,
                              const char *device,
                              AVDictionary *opts, int flags)
{
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
    CudaFunctions *cu;
    int device_idx = 0;

    if (device)
        device_idx = strtol(device, NULL, 0);

    if (cuda_device_init(device_ctx) >= 0) {
        cu = hwctx->internal->cuda_dl;

        /* Steps run in order; the first failure short-circuits the rest. */
        if (CHECK_CU(cu->cuInit(0)) >= 0 &&
            CHECK_CU(cu->cuDeviceGet(&hwctx->internal->cuda_device, device_idx)) >= 0 &&
            cuda_context_init(device_ctx, flags) >= 0)
            return 0;
    }

    cuda_device_uninit(device_ctx);
    return AVERROR_UNKNOWN;
}
/**
 * Derive a CUDA device context from another hardware device context by
 * finding the CUDA device whose UUID matches that of the source device.
 *
 * Only Vulkan sources are handled, and only when built with CONFIG_VULKAN;
 * any other source type returns AVERROR(ENOSYS). Without CONFIG_VULKAN the
 * switch has only the default case, so the function always returns
 * AVERROR(ENOSYS).
 *
 * @param device_ctx CUDA device context to set up
 * @param src_ctx    device context to derive from
 * @param opts       options (not used here)
 * @param flags      AV_CUDA_* flags forwarded to cuda_context_init()
 * @return 0 on success, AVERROR(ENOSYS) for unsupported source types, and
 *         AVERROR_UNKNOWN for every other failure (the specific error code
 *         of the failing step is not propagated)
 */
static int cuda_device_derive(AVHWDeviceContext *device_ctx,
AVHWDeviceContext *src_ctx, AVDictionary *opts,
int flags) {
AVCUDADeviceContext *hwctx = device_ctx->hwctx;
CudaFunctions *cu;
const char *src_uuid = NULL;
int ret, i, device_count;
#if CONFIG_VULKAN
/* Receives the device UUID; chained into the properties query below via pNext. */
VkPhysicalDeviceIDProperties vk_idp = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES,
};
#endif
switch (src_ctx->type) {
#if CONFIG_VULKAN
case AV_HWDEVICE_TYPE_VULKAN: {
AVVulkanDeviceContext *vkctx = src_ctx->hwctx;
VkPhysicalDeviceProperties2 vk_dev_props = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
.pNext = &vk_idp,
};
vkGetPhysicalDeviceProperties2(vkctx->phys_dev, &vk_dev_props);
/* vk_idp lives at function scope, so this pointer stays valid after the case block. */
src_uuid = vk_idp.deviceUUID;
break;
}
#endif
default:
return AVERROR(ENOSYS);
}
if (!src_uuid) {
av_log(device_ctx, AV_LOG_ERROR,
"Failed to get UUID of source device.\n");
goto error;
}
if (cuda_device_init(device_ctx) < 0)
goto error;
cu = hwctx->internal->cuda_dl;
ret = CHECK_CU(cu->cuInit(0));
if (ret < 0)
goto error;
ret = CHECK_CU(cu->cuDeviceGetCount(&device_count));
if (ret < 0)
goto error;
/* -1 marks "no matching device found yet"; checked after the loop. */
hwctx->internal->cuda_device = -1;
/* Walk all CUDA devices and pick the first one whose UUID equals the source's. */
for (i = 0; i < device_count; i++) {
CUdevice dev;
CUuuid uuid;
ret = CHECK_CU(cu->cuDeviceGet(&dev, i));
if (ret < 0)
goto error;
ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev));
if (ret < 0)
goto error;
if (memcmp(src_uuid, uuid.bytes, sizeof (uuid.bytes)) == 0) {
hwctx->internal->cuda_device = dev;
break;
}
}
if (hwctx->internal->cuda_device == -1) {
av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA device.\n");
goto error;
}
ret = cuda_context_init(device_ctx, flags);
if (ret < 0)
goto error;
return 0;
error:
/* Shared cleanup for every failure after the switch. */
cuda_device_uninit(device_ctx);
return AVERROR_UNKNOWN;
}
/*
 * Hardware context type descriptor for CUDA: ties the static functions of
 * this file to the generic hwcontext callback slots.
 */
const HWContextType ff_hwcontext_type_cuda = {
.type = AV_HWDEVICE_TYPE_CUDA,
.name = "CUDA",
.device_hwctx_size = sizeof(AVCUDADeviceContext),
.frames_priv_size = sizeof(CUDAFramesContext),
.device_create = cuda_device_create,
.device_derive = cuda_device_derive,
.device_init = cuda_device_init,
.device_uninit = cuda_device_uninit,
.frames_get_constraints = cuda_frames_get_constraints,
.frames_init = cuda_frames_init,
.frames_get_buffer = cuda_get_buffer,
.transfer_get_formats = cuda_transfer_get_formats,
/* The same routine serves both directions; it derives the direction from the frames. */
.transfer_data_to = cuda_transfer_data,
.transfer_data_from = cuda_transfer_data,
/* The only hardware pixel format exposed by this context type. */
.pix_fmts = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE },
};