diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c index 2f4fe6a797..800db74f74 100644 --- a/libavutil/hwcontext_vulkan.c +++ b/libavutil/hwcontext_vulkan.c @@ -41,11 +41,23 @@ #define CHECK_CU(x) FF_CUDA_CHECK_DL(cuda_cu, cu, x) #endif +typedef struct VulkanQueueCtx { + VkFence fence; + VkQueue queue; + int was_synchronous; + + /* Buffer dependencies */ + AVBufferRef **buf_deps; + int nb_buf_deps; + int buf_deps_alloc_size; +} VulkanQueueCtx; + typedef struct VulkanExecCtx { VkCommandPool pool; - VkCommandBuffer buf; - VkQueue queue; - VkFence fence; + VkCommandBuffer *bufs; + VulkanQueueCtx *queues; + int nb_queues; + int cur_queue_idx; } VulkanExecCtx; typedef struct VulkanDevicePriv { @@ -60,8 +72,9 @@ typedef struct VulkanDevicePriv { /* Debug callback */ VkDebugUtilsMessengerEXT debug_ctx; - /* Image uploading */ - VulkanExecCtx cmd; + /* Image transfers */ + VulkanExecCtx upload_ctx; + VulkanExecCtx download_ctx; /* Extensions */ uint64_t extensions; @@ -89,6 +102,16 @@ typedef struct AVVkFrameInternal { #endif } AVVkFrameInternal; +#define GET_QUEUE_COUNT(hwctx, graph, comp, tx) ( \ + graph ? hwctx->nb_graphics_queues : \ + comp ? (hwctx->nb_comp_queues ? \ + hwctx->nb_comp_queues : hwctx->nb_graphics_queues) : \ + tx ? (hwctx->nb_tx_queues ? hwctx->nb_tx_queues : \ + (hwctx->nb_comp_queues ? \ + hwctx->nb_comp_queues : hwctx->nb_graphics_queues)) : \ + 0 \ +) + #define VK_LOAD_PFN(inst, name) PFN_##name pfn_##name = (PFN_##name) \ vkGetInstanceProcAddr(inst, #name) @@ -709,7 +732,7 @@ fail: } static int create_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd, - int queue_family_index) + int queue_family_index, int num_queues) { VkResult ret; AVVulkanDeviceContext *hwctx = ctx->hwctx; @@ -722,21 +745,20 @@ static int create_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd, VkCommandBufferAllocateInfo cbuf_create = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, - .commandBufferCount = 1, + .commandBufferCount = num_queues, }; - VkFenceCreateInfo fence_spawn = { - .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, - }; + cmd->nb_queues = num_queues; - ret = vkCreateFence(hwctx->act_dev, &fence_spawn, - hwctx->alloc, &cmd->fence); - if (ret != VK_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Failed to create frame fence: %s\n", - vk_ret2str(ret)); - return AVERROR_EXTERNAL; - } + cmd->queues = av_mallocz(num_queues * sizeof(*cmd->queues)); + if (!cmd->queues) + return AVERROR(ENOMEM); + cmd->bufs = av_mallocz(num_queues * sizeof(*cmd->bufs)); + if (!cmd->bufs) + return AVERROR(ENOMEM); + + /* Create command pool */ ret = vkCreateCommandPool(hwctx->act_dev, &cqueue_create, hwctx->alloc, &cmd->pool); if (ret != VK_SUCCESS) { @@ -747,15 +769,19 @@ static int create_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd, cbuf_create.commandPool = cmd->pool; - ret = vkAllocateCommandBuffers(hwctx->act_dev, &cbuf_create, &cmd->buf); + /* Allocate command buffer */ + ret = vkAllocateCommandBuffers(hwctx->act_dev, &cbuf_create, cmd->bufs); if (ret != VK_SUCCESS) { av_log(ctx, AV_LOG_ERROR, "Command buffer alloc failure: %s\n", vk_ret2str(ret)); return AVERROR_EXTERNAL; } - vkGetDeviceQueue(hwctx->act_dev, cqueue_create.queueFamilyIndex, 0, - &cmd->queue); + for (int i = 0; i < num_queues; i++) { + VulkanQueueCtx *q = &cmd->queues[i]; + vkGetDeviceQueue(hwctx->act_dev, queue_family_index, i, &q->queue); + q->was_synchronous = 1; + } return 0; } @@ -764,12 +790,154 @@ static void free_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd) { AVVulkanDeviceContext *hwctx = ctx->hwctx; - if (cmd->fence) - vkDestroyFence(hwctx->act_dev, cmd->fence, hwctx->alloc); - if (cmd->buf) - vkFreeCommandBuffers(hwctx->act_dev, cmd->pool, 1, &cmd->buf); + /* Make sure all queues have finished executing */ + for (int i = 0; i < cmd->nb_queues; i++) { + VulkanQueueCtx *q = &cmd->queues[i]; + + if (q->fence && !q->was_synchronous) { + vkWaitForFences(hwctx->act_dev, 1, &q->fence, VK_TRUE, UINT64_MAX); + vkResetFences(hwctx->act_dev, 1, &q->fence); + } + + /* Free the fence */ + if (q->fence) + vkDestroyFence(hwctx->act_dev, q->fence, hwctx->alloc); + + /* Free buffer dependencies */ + for (int j = 0; j < q->nb_buf_deps; j++) + av_buffer_unref(&q->buf_deps[j]); + av_free(q->buf_deps); + } + + if (cmd->bufs) + vkFreeCommandBuffers(hwctx->act_dev, cmd->pool, cmd->nb_queues, cmd->bufs); if (cmd->pool) vkDestroyCommandPool(hwctx->act_dev, cmd->pool, hwctx->alloc); + + av_freep(&cmd->bufs); + av_freep(&cmd->queues); +} + +static VkCommandBuffer get_buf_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd) +{ + return cmd->bufs[cmd->cur_queue_idx]; +} + +static void unref_exec_ctx_deps(AVHWDeviceContext *ctx, VulkanExecCtx *cmd) +{ + VulkanQueueCtx *q = &cmd->queues[cmd->cur_queue_idx]; + + for (int j = 0; j < q->nb_buf_deps; j++) + av_buffer_unref(&q->buf_deps[j]); + q->nb_buf_deps = 0; +} + +static int wait_start_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd) +{ + VkResult ret; + AVVulkanDeviceContext *hwctx = ctx->hwctx; + VulkanQueueCtx *q = &cmd->queues[cmd->cur_queue_idx]; + + VkCommandBufferBeginInfo cmd_start = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + }; + + /* Create the fence and don't wait for it initially */ + if (!q->fence) { + VkFenceCreateInfo fence_spawn = { + .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, + }; + ret = vkCreateFence(hwctx->act_dev, &fence_spawn, hwctx->alloc, + &q->fence); + if (ret != VK_SUCCESS) { + av_log(ctx, AV_LOG_ERROR, "Failed to queue frame fence: %s\n", + vk_ret2str(ret)); + return AVERROR_EXTERNAL; + } + } else if (!q->was_synchronous) { + vkWaitForFences(hwctx->act_dev, 1, &q->fence, VK_TRUE, UINT64_MAX); + vkResetFences(hwctx->act_dev, 1, &q->fence); + } + + /* Discard queue dependencies */ + unref_exec_ctx_deps(ctx, cmd); + + ret = vkBeginCommandBuffer(cmd->bufs[cmd->cur_queue_idx], &cmd_start); + if (ret != VK_SUCCESS) { + av_log(ctx, AV_LOG_ERROR, "Unable to init command buffer: %s\n", + vk_ret2str(ret)); + return AVERROR_EXTERNAL; + } + + return 0; +} + +static int add_buf_dep_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd, + AVBufferRef * const *deps, int nb_deps) +{ + AVBufferRef **dst; + VulkanQueueCtx *q = &cmd->queues[cmd->cur_queue_idx]; + + if (!deps || !nb_deps) + return 0; + + dst = av_fast_realloc(q->buf_deps, &q->buf_deps_alloc_size, + (q->nb_buf_deps + nb_deps) * sizeof(*dst)); + if (!dst) + goto err; + + q->buf_deps = dst; + + for (int i = 0; i < nb_deps; i++) { + q->buf_deps[q->nb_buf_deps] = av_buffer_ref(deps[i]); + if (!q->buf_deps[q->nb_buf_deps]) + goto err; + q->nb_buf_deps++; + } + + return 0; + +err: + unref_exec_ctx_deps(ctx, cmd); + return AVERROR(ENOMEM); +} + +static int submit_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd, + VkSubmitInfo *s_info, int synchronous) +{ + VkResult ret; + VulkanQueueCtx *q = &cmd->queues[cmd->cur_queue_idx]; + + ret = vkEndCommandBuffer(cmd->bufs[cmd->cur_queue_idx]); + if (ret != VK_SUCCESS) { + av_log(ctx, AV_LOG_ERROR, "Unable to finish command buffer: %s\n", + vk_ret2str(ret)); + unref_exec_ctx_deps(ctx, cmd); + return AVERROR_EXTERNAL; + } + + s_info->pCommandBuffers = &cmd->bufs[cmd->cur_queue_idx]; + s_info->commandBufferCount = 1; + + ret = vkQueueSubmit(q->queue, 1, s_info, q->fence); + if (ret != VK_SUCCESS) { + unref_exec_ctx_deps(ctx, cmd); + return AVERROR_EXTERNAL; + } + + q->was_synchronous = synchronous; + + if (synchronous) { + AVVulkanDeviceContext *hwctx = ctx->hwctx; + vkWaitForFences(hwctx->act_dev, 1, &q->fence, VK_TRUE, UINT64_MAX); + vkResetFences(hwctx->act_dev, 1, &q->fence); + unref_exec_ctx_deps(ctx, cmd); + } else { /* Rotate queues */ + cmd->cur_queue_idx = (cmd->cur_queue_idx + 1) % cmd->nb_queues; + } + + return 0; } static void vulkan_device_free(AVHWDeviceContext *ctx) @@ -937,7 +1105,8 @@ if (n >= queue_num) { p->qfs[p->num_qfs++] = hwctx->queue_family_comp_index; /* Create exec context - if there's something invalid this will error out */ - err = create_exec_ctx(ctx, &p->cmd, hwctx->queue_family_tx_index); + err = create_exec_ctx(ctx, &p->cmd, hwctx->queue_family_tx_index, + GET_QUEUE_COUNT(hwctx, 0, 0, 1)); if (err) return err; @@ -1257,26 +1426,17 @@ enum PrepMode { static int prepare_frame(AVHWFramesContext *hwfc, VulkanExecCtx *ectx, AVVkFrame *frame, enum PrepMode pmode) { - VkResult ret; + int err; uint32_t dst_qf; VkImageLayout new_layout; VkAccessFlags new_access; AVHWDeviceContext *ctx = hwfc->device_ctx; - AVVulkanDeviceContext *hwctx = ctx->hwctx; const int planes = av_pix_fmt_count_planes(hwfc->sw_format); VkImageMemoryBarrier img_bar[AV_NUM_DATA_POINTERS] = { 0 }; - VkCommandBufferBeginInfo cmd_start = { - .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, - .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, - }; - VkSubmitInfo s_info = { .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, - .commandBufferCount = 1, - .pCommandBuffers = &ectx->buf, - .pSignalSemaphores = frame->sem, .signalSemaphoreCount = planes, }; @@ -1306,9 +1466,8 @@ static int prepare_frame(AVHWFramesContext *hwfc, VulkanExecCtx *ectx, break; } - ret = vkBeginCommandBuffer(ectx->buf, &cmd_start); - if (ret != VK_SUCCESS) - return AVERROR_EXTERNAL; + if ((err = wait_start_exec_ctx(ctx, ectx))) + return err; /* Change the image layout to something more optimal for writes. * This also signals the newly created semaphore, making it usable @@ -1330,23 +1489,12 @@ static int prepare_frame(AVHWFramesContext *hwfc, VulkanExecCtx *ectx, frame->access[i] = img_bar[i].dstAccessMask; } - vkCmdPipelineBarrier(ectx->buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT, 0, - 0, NULL, 0, NULL, planes, img_bar); + vkCmdPipelineBarrier(get_buf_exec_ctx(ctx, ectx), + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, 0, NULL, 0, NULL, planes, img_bar); - ret = vkEndCommandBuffer(ectx->buf); - if (ret != VK_SUCCESS) - return AVERROR_EXTERNAL; - - ret = vkQueueSubmit(ectx->queue, 1, &s_info, ectx->fence); - if (ret != VK_SUCCESS) { - return AVERROR_EXTERNAL; - } else { - vkWaitForFences(hwctx->act_dev, 1, &ectx->fence, VK_TRUE, UINT64_MAX); - vkResetFences(hwctx->act_dev, 1, &ectx->fence); - } - - return 0; + return submit_exec_ctx(ctx, ectx, &s_info, 0); } static int create_frame(AVHWFramesContext *hwfc, AVVkFrame **frame, @@ -1559,7 +1707,8 @@ static int vulkan_frames_init(AVHWFramesContext *hwfc) hwctx->usage |= DEFAULT_USAGE_FLAGS; err = create_exec_ctx(hwfc->device_ctx, &fp->cmd, - dev_hwctx->queue_family_tx_index); + dev_hwctx->queue_family_tx_index, + GET_QUEUE_COUNT(dev_hwctx, 0, 0, 1)); if (err) return err; @@ -2620,12 +2769,12 @@ static int unmap_buffers(AVHWDeviceContext *ctx, AVBufferRef **bufs, return err; } -static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame, +static int transfer_image_buf(AVHWDeviceContext *ctx, const AVFrame *f, AVBufferRef **bufs, const int *buf_stride, int w, int h, enum AVPixelFormat pix_fmt, int to_buf) { - VkResult ret; - AVVulkanDeviceContext *hwctx = ctx->hwctx; + int err; + AVVkFrame *frame = (AVVkFrame *)f->data[0]; VulkanDevicePriv *s = ctx->internal->priv; int bar_num = 0; @@ -2634,17 +2783,11 @@ static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame, const int planes = av_pix_fmt_count_planes(pix_fmt); const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt); - VkCommandBufferBeginInfo cmd_start = { - .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, - .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, - }; - VkImageMemoryBarrier img_bar[AV_NUM_DATA_POINTERS] = { 0 }; + VkCommandBuffer cmd_buf = get_buf_exec_ctx(ctx, &s->cmd); VkSubmitInfo s_info = { .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, - .commandBufferCount = 1, - .pCommandBuffers = &s->cmd.buf, .pSignalSemaphores = frame->sem, .pWaitSemaphores = frame->sem, .pWaitDstStageMask = sem_wait_dst, @@ -2652,12 +2795,8 @@ static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame, .waitSemaphoreCount = planes, }; - ret = vkBeginCommandBuffer(s->cmd.buf, &cmd_start); - if (ret != VK_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Unable to init command buffer: %s\n", - vk_ret2str(ret)); - return AVERROR_EXTERNAL; - } + if ((err = wait_start_exec_ctx(ctx, &s->cmd))) + return err; /* Change the image layout to something more optimal for transfers */ for (int i = 0; i < planes; i++) { @@ -2691,7 +2830,7 @@ static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame, } if (bar_num) - vkCmdPipelineBarrier(s->cmd.buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + vkCmdPipelineBarrier(cmd_buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, NULL, 0, NULL, bar_num, img_bar); @@ -2715,33 +2854,33 @@ static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame, }; if (to_buf) - vkCmdCopyImageToBuffer(s->cmd.buf, frame->img[i], frame->layout[i], + vkCmdCopyImageToBuffer(cmd_buf, frame->img[i], frame->layout[i], vkbuf->buf, 1, &buf_reg); else - vkCmdCopyBufferToImage(s->cmd.buf, vkbuf->buf, frame->img[i], + vkCmdCopyBufferToImage(cmd_buf, vkbuf->buf, frame->img[i], frame->layout[i], 1, &buf_reg); } - ret = vkEndCommandBuffer(s->cmd.buf); - if (ret != VK_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Unable to finish command buffer: %s\n", - vk_ret2str(ret)); - return AVERROR_EXTERNAL; - } - - /* Wait for the download/upload to finish if uploading, otherwise the - * semaphore will take care of synchronization when uploading */ - ret = vkQueueSubmit(s->cmd.queue, 1, &s_info, s->cmd.fence); - if (ret != VK_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Unable to submit command buffer: %s\n", - vk_ret2str(ret)); - return AVERROR_EXTERNAL; + /* When uploading, do this asynchronously if the source is refcounted by + * keeping the buffers as a submission dependency. + * The hwcontext is guaranteed to not be freed until all frames are freed + * in the frames_unint function. + * When downloading to buffer, do this synchronously and wait for the + * queue submission to finish executing */ + if (!to_buf) { + int ref; + for (ref = 0; ref < AV_NUM_DATA_POINTERS; ref++) { + if (!f->buf[ref]) + break; + if ((err = add_buf_dep_exec_ctx(hwfc, &s->cmd, &f->buf[ref], 1))) + return err; + } + if (ref && (err = add_buf_dep_exec_ctx(hwfc, &s->cmd, bufs, planes))) + return err; + return submit_exec_ctx(hwfc, &s->cmd, &s_info, !ref); } else { - vkWaitForFences(hwctx->act_dev, 1, &s->cmd.fence, VK_TRUE, UINT64_MAX); - vkResetFences(hwctx->act_dev, 1, &s->cmd.fence); + return submit_exec_ctx(hwfc, &s->cmd, &s_info, 1); } - - return 0; } /* Technically we can use VK_EXT_external_memory_host to upload and download, @@ -2778,11 +2917,11 @@ static int vulkan_transfer_data_from_mem(AVHWFramesContext *hwfc, AVFrame *dst, err = vulkan_map_frame_to_mem(hwfc, map, dst, AV_HWFRAME_MAP_WRITE); if (err) - goto end; + return err; err = av_frame_copy(map, src); av_frame_free(&map); - goto end; + return err; } /* Create buffers */ @@ -2809,7 +2948,7 @@ static int vulkan_transfer_data_from_mem(AVHWFramesContext *hwfc, AVFrame *dst, goto end; /* Copy buffers to image */ - err = transfer_image_buf(dev_ctx, f, bufs, tmp.linesize, + err = transfer_image_buf(dev_ctx, dst, bufs, tmp.linesize, src->width, src->height, src->format, 0); end: @@ -2949,10 +3088,12 @@ static int vulkan_transfer_data_to_mem(AVHWFramesContext *hwfc, AVFrame *dst, err = create_buf(dev_ctx, &bufs[i], p_height, &tmp.linesize[i], VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, NULL, NULL); + if (err) + goto end; } /* Copy image to buffer */ - if ((err = transfer_image_buf(dev_ctx, f, bufs, tmp.linesize, + if ((err = transfer_image_buf(dev_ctx, src, bufs, tmp.linesize, dst->width, dst->height, dst->format, 1))) goto end;