From 46a77c6496a54d0079e6a2108a0dfffead861ac8 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Tue, 15 Mar 2022 23:00:32 +0100
Subject: [PATCH] hwcontext_vulkan: support threadsafe queue and frame operations

---
 libavutil/hwcontext_vulkan.c | 191 ++++++++++++++++++++++++++---------
 libavutil/hwcontext_vulkan.h |  38 ++++++-
 2 files changed, 180 insertions(+), 49 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index 2434a80090..b511836b7b 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -27,6 +27,7 @@
 #include <dlfcn.h>
 #endif
 
+#include "thread.h"
 #include <unistd.h>
 
 #include "config.h"
@@ -92,8 +93,10 @@ typedef struct VulkanDevicePriv {
     VkPhysicalDeviceVulkan13Features device_features_1_3;
 
     /* Queues */
-    uint32_t qfs[5];
-    int num_qfs;
+    pthread_mutex_t **qf_mutex;
+    uint32_t nb_tot_qfs;
+    uint32_t img_qfs[5];
+    uint32_t nb_img_qfs;
 
     /* Debug callback */
     VkDebugUtilsMessengerEXT debug_ctx;
@@ -127,6 +130,8 @@ typedef struct VulkanFramesPriv {
 } VulkanFramesPriv;
 
 typedef struct AVVkFrameInternal {
+    pthread_mutex_t update_mutex;
+
 #if CONFIG_CUDA
     /* Importing external memory into cuda is really expensive so we keep the
      * memory imported all the time */
@@ -1305,6 +1310,12 @@ static void vulkan_device_free(AVHWDeviceContext *ctx)
     if (p->libvulkan)
         dlclose(p->libvulkan);
 
+    for (uint32_t i = 0; i < p->nb_tot_qfs; i++) {
+        pthread_mutex_destroy(p->qf_mutex[i]);
+        av_freep(&p->qf_mutex[i]);
+    }
+    av_freep(&p->qf_mutex);
+
     RELEASE_PROPS(hwctx->enabled_inst_extensions, hwctx->nb_enabled_inst_extensions);
     RELEASE_PROPS(hwctx->enabled_dev_extensions, hwctx->nb_enabled_dev_extensions);
 }
@@ -1437,13 +1448,26 @@ end:
     return err;
 }
 
+static void lock_queue(AVHWDeviceContext *ctx, uint32_t queue_family, uint32_t index)
+{
+    VulkanDevicePriv *p = ctx->internal->priv;
+    pthread_mutex_lock(&p->qf_mutex[queue_family][index]);
+}
+
+static void unlock_queue(AVHWDeviceContext *ctx, uint32_t queue_family, uint32_t index)
+{
+    VulkanDevicePriv *p = ctx->internal->priv;
+    pthread_mutex_unlock(&p->qf_mutex[queue_family][index]);
+}
+
 static int vulkan_device_init(AVHWDeviceContext *ctx)
 {
     int err;
-    uint32_t queue_num;
+    uint32_t qf_num;
     AVVulkanDeviceContext *hwctx = ctx->hwctx;
     VulkanDevicePriv *p = ctx->internal->priv;
     FFVulkanFunctions *vk = &p->vkfn;
+    VkQueueFamilyProperties *qf;
     int graph_index, comp_index, tx_index, enc_index, dec_index;
 
     /* Set device extension flags */
@@ -1482,12 +1506,37 @@ static int vulkan_device_init(AVHWDeviceContext *ctx)
     p->dev_is_nvidia = (p->props.properties.vendorID == 0x10de);
     p->dev_is_intel  = (p->props.properties.vendorID == 0x8086);
 
-    vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev, &queue_num, NULL);
-    if (!queue_num) {
+    vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev, &qf_num, NULL);
+    if (!qf_num) {
         av_log(ctx, AV_LOG_ERROR, "Failed to get queues!\n");
         return AVERROR_EXTERNAL;
     }
 
+    qf = av_malloc_array(qf_num, sizeof(VkQueueFamilyProperties));
+    if (!qf)
+        return AVERROR(ENOMEM);
+
+    vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev, &qf_num, qf);
+
+    p->qf_mutex = av_calloc(qf_num, sizeof(*p->qf_mutex));
+    if (!p->qf_mutex)
+        return AVERROR(ENOMEM);
+    p->nb_tot_qfs = qf_num;
+
+    for (uint32_t i = 0; i < qf_num; i++) {
+        p->qf_mutex[i] = av_calloc(qf[i].queueCount, sizeof(**p->qf_mutex));
+        if (!p->qf_mutex[i])
+            return AVERROR(ENOMEM);
+        for (uint32_t j = 0; j < qf[i].queueCount; j++) {
+            err = pthread_mutex_init(&p->qf_mutex[i][j], NULL);
+            if (err != 0) {
+                av_log(ctx, AV_LOG_ERROR, "pthread_mutex_init failed : %s\n",
+                       av_err2str(err));
+                return AVERROR(err);
+            }
+        }
+    }
+
     graph_index = hwctx->queue_family_index;
     comp_index  = hwctx->queue_family_comp_index;
     tx_index    = hwctx->queue_family_tx_index;
@@ -1502,9 +1551,9 @@ static int vulkan_device_init(AVHWDeviceContext *ctx)
             return AVERROR(EINVAL);                                                     \
         } else if (fidx < 0 || ctx_qf < 0) {                                            \
             break;                                                                      \
-        } else if (ctx_qf >= queue_num) {                                               \
+        } else if (ctx_qf >= qf_num) {                                                  \
             av_log(ctx, AV_LOG_ERROR, "Invalid %s family index %i (device has %i families)!\n", \
-                   type, ctx_qf, queue_num);                                            \
+                   type, ctx_qf, qf_num);                                               \
             return AVERROR(EINVAL);                                                     \
         }                                                                               \
                                                                                         \
@@ -1521,7 +1570,7 @@ static int vulkan_device_init(AVHWDeviceContext *ctx)
         tx_index  = (ctx_qf == tx_index)  ? -1 : tx_index;                              \
         enc_index = (ctx_qf == enc_index) ? -1 : enc_index;                             \
         dec_index = (ctx_qf == dec_index) ? -1 : dec_index;                             \
-        p->qfs[p->num_qfs++] = ctx_qf;                                                  \
+        p->img_qfs[p->nb_img_qfs++] = ctx_qf;                                           \
     } while (0)
 
     CHECK_QUEUE("graphics", 0, graph_index, hwctx->queue_family_index, hwctx->nb_graphics_queues);
@@ -1532,6 +1581,11 @@ static int vulkan_device_init(AVHWDeviceContext *ctx)
 
 #undef CHECK_QUEUE
 
+    if (!hwctx->lock_queue)
+        hwctx->lock_queue = lock_queue;
+    if (!hwctx->unlock_queue)
+        hwctx->unlock_queue = unlock_queue;
+
     /* Get device capabilities */
     vk->GetPhysicalDeviceMemoryProperties(hwctx->phys_dev, &p->mprops);
 
@@ -1733,9 +1787,6 @@ static void vulkan_free_internal(AVVkFrame *f)
 {
     AVVkFrameInternal *internal = f->internal;
 
-    if (!internal)
-        return;
-
 #if CONFIG_CUDA
     if (internal->cuda_fc_ref) {
         AVHWFramesContext *cuda_fc = (AVHWFramesContext *)internal->cuda_fc_ref->data;
@@ -1764,6 +1815,7 @@ static void vulkan_free_internal(AVVkFrame *f)
     }
 #endif
 
+    pthread_mutex_destroy(&internal->update_mutex);
     av_freep(&f->internal);
 }
 
@@ -1924,9 +1976,11 @@ static int prepare_frame(AVHWFramesContext *hwfc, VulkanExecCtx *ectx,
     uint32_t src_qf, dst_qf;
     VkImageLayout new_layout;
     VkAccessFlags new_access;
+    AVVulkanFramesContext *vkfc = hwfc->hwctx;
     const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
     VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
     FFVulkanFunctions *vk = &p->vkfn;
+    AVFrame tmp = { .data[0] = (uint8_t *)frame };
     uint64_t sem_sig_val[AV_NUM_DATA_POINTERS];
 
     VkImageMemoryBarrier img_bar[AV_NUM_DATA_POINTERS] = { 0 };
@@ -1945,6 +1999,12 @@ static int prepare_frame(AVHWFramesContext *hwfc, VulkanExecCtx *ectx,
     };
 
     VkPipelineStageFlagBits wait_st[AV_NUM_DATA_POINTERS];
+
+    if ((err = wait_start_exec_ctx(hwfc, ectx)))
+        return err;
+
+    vkfc->lock_frame(hwfc, frame);
+
     for (int i = 0; i < planes; i++) {
         wait_st[i] = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
         sem_sig_val[i] = frame->sem_value[i] + 1;
@@ -1981,9 +2041,6 @@ static int prepare_frame(AVHWFramesContext *hwfc, VulkanExecCtx *ectx,
         break;
     }
 
-    if ((err = wait_start_exec_ctx(hwfc, ectx)))
-        return err;
-
     /* Change the image layout to something more optimal for writes.
      * This also signals the newly created semaphore, making it usable
      * for synchronization */
@@ -2009,7 +2066,10 @@ static int prepare_frame(AVHWFramesContext *hwfc, VulkanExecCtx *ectx,
                            VK_PIPELINE_STAGE_TRANSFER_BIT,
                            0, 0, NULL, 0, NULL, planes, img_bar);
 
-    return submit_exec_ctx(hwfc, ectx, &s_info, frame, 0);
+    err = submit_exec_ctx(hwfc, ectx, &s_info, frame, 0);
+    vkfc->unlock_frame(hwfc, frame);
+
+    return err;
 }
 
 static inline void get_plane_wh(int *w, int *h, enum AVPixelFormat format,
@@ -2091,10 +2151,10 @@ static int create_frame(AVHWFramesContext *hwfc, AVVkFrame **frame,
         .initialLayout         = VK_IMAGE_LAYOUT_UNDEFINED,
         .usage                 = usage,
         .samples               = VK_SAMPLE_COUNT_1_BIT,
-        .pQueueFamilyIndices   = p->qfs,
-        .queueFamilyIndexCount = p->num_qfs,
-        .sharingMode           = p->num_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
-                                                  VK_SHARING_MODE_EXCLUSIVE,
+        .pQueueFamilyIndices   = p->img_qfs,
+        .queueFamilyIndexCount = p->nb_img_qfs,
+        .sharingMode           = p->nb_img_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
+                                                     VK_SHARING_MODE_EXCLUSIVE,
     };
 
     get_plane_wh(&create_info.extent.width, &create_info.extent.height,
@@ -2118,6 +2178,7 @@ static int create_frame(AVHWFramesContext *hwfc, AVVkFrame **frame,
             return AVERROR_EXTERNAL;
         }
 
+        f->queue_family[i] = p->nb_img_qfs > 1 ? VK_QUEUE_FAMILY_IGNORED : p->img_qfs[0];
         f->layout[i] = create_info.initialLayout;
         f->access[i] = 0x0;
         f->sem_value[i] = 0;
@@ -2162,10 +2223,10 @@ static void try_export_flags(AVHWFramesContext *hwfc,
     VkPhysicalDeviceImageDrmFormatModifierInfoEXT phy_dev_mod_info = {
         .sType                 = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT,
         .pNext                 = NULL,
-        .pQueueFamilyIndices   = p->qfs,
-        .queueFamilyIndexCount = p->num_qfs,
-        .sharingMode           = p->num_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
-                                                  VK_SHARING_MODE_EXCLUSIVE,
+        .pQueueFamilyIndices   = p->img_qfs,
+        .queueFamilyIndexCount = p->nb_img_qfs,
+        .sharingMode           = p->nb_img_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
+                                                     VK_SHARING_MODE_EXCLUSIVE,
     };
     VkPhysicalDeviceExternalImageFormatInfo enext = {
         .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO,
@@ -2260,6 +2321,16 @@ fail:
     return NULL;
 }
 
+static void lock_frame(AVHWFramesContext *fc, AVVkFrame *vkf)
+{
+    pthread_mutex_lock(&vkf->internal->update_mutex);
+}
+
+static void unlock_frame(AVHWFramesContext *fc, AVVkFrame *vkf)
+{
+    pthread_mutex_unlock(&vkf->internal->update_mutex);
+}
+
 static void vulkan_frames_uninit(AVHWFramesContext *hwfc)
 {
     VulkanFramesPriv *fp = hwfc->internal->priv;
@@ -2422,6 +2493,11 @@ static int vulkan_frames_init(AVHWFramesContext *hwfc)
             return AVERROR(ENOMEM);
     }
 
+    if (!hwctx->lock_frame)
+        hwctx->lock_frame = lock_frame;
+    if (!hwctx->unlock_frame)
+        hwctx->unlock_frame = unlock_frame;
+
     return 0;
 }
 
@@ -2728,10 +2804,10 @@ static int vulkan_map_from_drm_frame_desc(AVHWFramesContext *hwfc, AVVkFrame **f
         .usage                 = VK_IMAGE_USAGE_SAMPLED_BIT |
                                  VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
         .samples               = VK_SAMPLE_COUNT_1_BIT,
-        .pQueueFamilyIndices   = p->qfs,
-        .queueFamilyIndexCount = p->num_qfs,
-        .sharingMode           = p->num_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
-                                                  VK_SHARING_MODE_EXCLUSIVE,
+        .pQueueFamilyIndices   = p->img_qfs,
+        .queueFamilyIndexCount = p->nb_img_qfs,
+        .sharingMode           = p->nb_img_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
+                                                     VK_SHARING_MODE_EXCLUSIVE,
     };
 
     /* Image format verification */
@@ -2810,6 +2886,7 @@ static int vulkan_map_from_drm_frame_desc(AVHWFramesContext *hwfc, AVVkFrame **f
          * offer us anything we could import and sync with, so instead
          * just signal the semaphore we created. */
 
+        f->queue_family[i] = p->nb_img_qfs > 1 ? VK_QUEUE_FAMILY_IGNORED : p->img_qfs[0];
         f->layout[i] = create_info.initialLayout;
         f->access[i] = 0x0;
         f->sem_value[i] = 0;
@@ -3018,20 +3095,12 @@ static int vulkan_export_to_cuda(AVHWFramesContext *hwfc,
                                                      CU_AD_FORMAT_UNSIGNED_INT8;
 
     dst_f = (AVVkFrame *)frame->data[0];
     dst_int = dst_f->internal;
-    if (!dst_int || !dst_int->cuda_fc_ref) {
-        if (!dst_f->internal)
-            dst_f->internal = dst_int = av_mallocz(sizeof(*dst_f->internal));
-
-        if (!dst_int)
-            return AVERROR(ENOMEM);
 
+    if (!dst_int->cuda_fc_ref) {
         dst_int->cuda_fc_ref = av_buffer_ref(cuda_hwfc);
-        if (!dst_int->cuda_fc_ref) {
-            av_freep(&dst_f->internal);
+        if (!dst_int->cuda_fc_ref)
             return AVERROR(ENOMEM);
-        }
 
         for (int i = 0; i < planes; i++) {
             CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC tex_desc = {
@@ -3705,13 +3774,14 @@ static int unmap_buffers(AVHWDeviceContext *ctx, AVBufferRef **bufs,
     return err;
 }
 
-static int transfer_image_buf(AVHWFramesContext *hwfc, const AVFrame *f,
+static int transfer_image_buf(AVHWFramesContext *hwfc, AVFrame *f,
                               AVBufferRef **bufs, size_t *buf_offsets,
                               const int *buf_stride, int w,
                               int h, enum AVPixelFormat pix_fmt, int to_buf)
 {
     int err;
     AVVkFrame *frame = (AVVkFrame *)f->data[0];
+    AVVulkanFramesContext *vkfc = hwfc->hwctx;
     VulkanFramesPriv *fp = hwfc->internal->priv;
     VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
     FFVulkanFunctions *vk = &p->vkfn;
@@ -3746,11 +3816,13 @@ static int transfer_image_buf(AVHWFramesContext *hwfc, const AVFrame *f,
         .waitSemaphoreCount = planes,
     };
 
-    for (int i = 0; i < planes; i++)
-        sem_signal_values[i] = frame->sem_value[i] + 1;
+    vkfc->lock_frame(hwfc, frame);
 
     if ((err = wait_start_exec_ctx(hwfc, ectx)))
-        return err;
+        goto end;
+
+    for (int i = 0; i < planes; i++)
+        sem_signal_values[i] = frame->sem_value[i] + 1;
 
     /* Change the image layout to something more optimal for transfers */
     for (int i = 0; i < planes; i++) {
@@ -3825,14 +3897,18 @@ static int transfer_image_buf(AVHWFramesContext *hwfc, const AVFrame *f,
             if (!f->buf[ref])
                 break;
            if ((err = add_buf_dep_exec_ctx(hwfc, ectx, &f->buf[ref], 1)))
-                return err;
+                goto end;
         }
         if (ref && (err = add_buf_dep_exec_ctx(hwfc, ectx, bufs, planes)))
-            return err;
-        return submit_exec_ctx(hwfc, ectx, &s_info, frame, !ref);
+            goto end;
+        err = submit_exec_ctx(hwfc, ectx, &s_info, frame, !ref);
     } else {
-        return submit_exec_ctx(hwfc, ectx, &s_info, frame, 1);
+        err = submit_exec_ctx(hwfc, ectx, &s_info, frame, 1);
     }
+
+end:
+    vkfc->unlock_frame(hwfc, frame);
+    return err;
 }
 
 static int vulkan_transfer_data(AVHWFramesContext *hwfc, const AVFrame *vkf,
@@ -3961,8 +4037,9 @@ static int vulkan_transfer_data(AVHWFramesContext *hwfc, const AVFrame *vkf,
     }
 
     /* Copy buffers into/from image */
-    err = transfer_image_buf(hwfc, vkf, bufs, buf_offsets, tmp.linesize,
-                             swf->width, swf->height, swf->format, from);
+    err = transfer_image_buf(hwfc, (AVFrame *)vkf, bufs, buf_offsets,
+                             tmp.linesize, swf->width, swf->height, swf->format,
+                             from);
 
     if (from) {
         /* Map, copy buffer (which came FROM the VkImage) to the frame, unmap */
@@ -4143,7 +4220,25 @@ static int vulkan_frames_derive_to(AVHWFramesContext *dst_fc,
 
 AVVkFrame *av_vk_frame_alloc(void)
 {
-    return av_mallocz(sizeof(AVVkFrame));
+    int err;
+    AVVkFrame *f = av_mallocz(sizeof(AVVkFrame));
+    if (!f)
+        return NULL;
+
+    f->internal = av_mallocz(sizeof(*f->internal));
+    if (!f->internal) {
+        av_free(f);
+        return NULL;
+    }
+
+    err = pthread_mutex_init(&f->internal->update_mutex, NULL);
+    if (err != 0) {
+        av_free(f->internal);
+        av_free(f);
+        return NULL;
+    }
+
+    return f;
 }
 
 const HWContextType ff_hwcontext_type_vulkan = {
diff --git a/libavutil/hwcontext_vulkan.h b/libavutil/hwcontext_vulkan.h
index 70c8379dc3..9fbf4b2160 100644
--- a/libavutil/hwcontext_vulkan.h
+++ b/libavutil/hwcontext_vulkan.h
@@ -27,6 +27,8 @@
 #include "pixfmt.h"
 #include "frame.h"
 
+typedef struct AVVkFrame AVVkFrame;
+
 /**
  * @file
  * API-specific header for AV_HWDEVICE_TYPE_VULKAN.
@@ -135,6 +137,19 @@ typedef struct AVVulkanDeviceContext {
      */
     int queue_family_decode_index;
     int nb_decode_queues;
+
+    /**
+     * Locks a queue, preventing other threads from submitting any command
+     * buffers to this queue.
+     * If set to NULL, will be set to lavu-internal functions that utilize a
+     * mutex.
+     */
+    void (*lock_queue)(struct AVHWDeviceContext *ctx, uint32_t queue_family, uint32_t index);
+
+    /**
+     * Similar to lock_queue(), unlocks a queue. Must only be called after locking.
+     */
+    void (*unlock_queue)(struct AVHWDeviceContext *ctx, uint32_t queue_family, uint32_t index);
 } AVVulkanDeviceContext;
 
 /**
@@ -195,6 +210,21 @@ typedef struct AVVulkanFramesContext {
      * av_hwframe_ctx_init().
     */
     AVVkFrameFlags flags;
+
+    /**
+     * Locks a frame, preventing other threads from changing frame properties.
+     * Users SHOULD only ever lock just before command submission in order
+     * to get accurate frame properties, and unlock immediately after command
+     * submission without waiting for it to finish.
+     *
+     * If unset, will be set to lavu-internal functions that utilize a mutex.
+     */
+    void (*lock_frame)(struct AVHWFramesContext *fc, AVVkFrame *vkf);
+
+    /**
+     * Similar to lock_frame(), unlocks a frame. Must only be called after locking.
+     */
+    void (*unlock_frame)(struct AVHWFramesContext *fc, AVVkFrame *vkf);
 } AVVulkanFramesContext;
 
 /*
@@ -210,7 +240,7 @@ typedef struct AVVulkanFramesContext {
  * @note the size of this structure is not part of the ABI, to allocate
  * you must use @av_vk_frame_alloc().
  */
-typedef struct AVVkFrame {
+struct AVVkFrame {
     /**
      * Vulkan images to which the memory is bound to.
      */
     VkImage img[AV_NUM_DATA_POINTERS];
@@ -264,6 +294,12 @@ typedef struct AVVkFrame {
      * Describes the binding offset of each plane to the VkDeviceMemory.
      */
     ptrdiff_t offset[AV_NUM_DATA_POINTERS];
+
+    /**
+     * Queue family of the images. Must be VK_QUEUE_FAMILY_IGNORED if
+     * the image was allocated with the CONCURRENT concurrency option.
+     */
+    uint32_t queue_family[AV_NUM_DATA_POINTERS];
-} AVVkFrame;
+};
 
 /**
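Usage note (not part of the patch): the lock_queue()/unlock_queue() callbacks added above are meant to be overridable by the API user before device init; anything left NULL is filled in by vulkan_device_init() with the lavu-internal mutex-based defaults. A minimal sketch of an override, assuming a device context freshly created with av_hwdevice_ctx_alloc(); the app_* names and the fixed-size lock table are hypothetical:

#include <pthread.h>
#include <libavutil/hwcontext.h>
#include <libavutil/hwcontext_vulkan.h>

/* Hypothetical app-side lock table; a real application would size it from
 * VkQueueFamilyProperties instead of hardcoding the bounds. */
static pthread_mutex_t app_queue_locks[8][4];

static void app_lock_queue(struct AVHWDeviceContext *ctx,
                           uint32_t queue_family, uint32_t index)
{
    pthread_mutex_lock(&app_queue_locks[queue_family][index]);
}

static void app_unlock_queue(struct AVHWDeviceContext *ctx,
                             uint32_t queue_family, uint32_t index)
{
    pthread_mutex_unlock(&app_queue_locks[queue_family][index]);
}

static int app_init_vulkan_device(AVBufferRef *dev_ref)
{
    AVHWDeviceContext *ctx = (AVHWDeviceContext *)dev_ref->data;
    AVVulkanDeviceContext *hwctx = ctx->hwctx;

    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 4; j++)
            pthread_mutex_init(&app_queue_locks[i][j], NULL);

    /* Must be set before av_hwdevice_ctx_init(); fields left NULL get the
     * mutex-based defaults installed by this patch. */
    hwctx->lock_queue   = app_lock_queue;
    hwctx->unlock_queue = app_unlock_queue;

    return av_hwdevice_ctx_init(dev_ref);
}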
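A complementary sketch of the consumer side, for an application that records and submits its own command buffers touching an AVVkFrame. Per the doxygen above, the frame lock is taken just before submission and released immediately after, without waiting for execution. This is illustrative only: the queue_family/index pair is assumed to match the queue being submitted to, s_info is assumed to be a fully prepared VkSubmitInfo, and vkQueueSubmit() is called directly on the assumption that the application links against the Vulkan loader:

#include <vulkan/vulkan.h>
#include <libavutil/hwcontext.h>
#include <libavutil/hwcontext_vulkan.h>

static VkResult app_submit_touching_frame(AVHWFramesContext *hwfc, AVVkFrame *vkf,
                                          VkQueue queue, uint32_t queue_family,
                                          uint32_t index, const VkSubmitInfo *s_info)
{
    AVVulkanFramesContext *vkfc      = hwfc->hwctx;
    AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
    VkResult ret;

    /* Serialize with lavu's own submissions on the same queue... */
    dev_hwctx->lock_queue(hwfc->device_ctx, queue_family, index);
    /* ...and hold the frame lock only around the submission itself, so the
     * layout[]/access[]/sem_value[] fields read while recording stay valid. */
    vkfc->lock_frame(hwfc, vkf);

    ret = vkQueueSubmit(queue, 1, s_info, VK_NULL_HANDLE);

    vkfc->unlock_frame(hwfc, vkf);
    dev_hwctx->unlock_queue(hwfc->device_ctx, queue_family, index);

    return ret;
}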