diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index b08150c6bb..80e7350292 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -4103,10 +4103,6 @@ The following video options are currently all specific to ``--vo=gpu`` and
     the video along the temporal axis. The filter used can be controlled using
     the ``--tscale`` setting.
 
-    Note that this relies on vsync to work, see ``--opengl-swapinterval`` for
-    more information. It should also only be used with an ``--fbo-format``
-    that has at least 16 bit precision.
-
 ``--interpolation-threshold=<0..1,-1>``
     Threshold below which frame ratio interpolation gets disabled (default:
     ``0.0001``). This is calculated as ``abs(disphz/vfps - 1) < threshold``,
@@ -4184,6 +4180,31 @@ The following video options are currently all specific to ``--vo=gpu`` and
     results, as can missing or incorrect display FPS information (see
     ``--display-fps``).
 
+``--vulkan-swap-mode=``
+    Controls the presentation mode of the vulkan swapchain. This is similar
+    to the ``--opengl-swapinterval`` option.
+
+    auto
+        Use the preferred swapchain mode for the vulkan context. (Default)
+    fifo
+        Non-tearing, vsync blocked. Similar to "VSync on".
+    fifo-relaxed
+        Tearing, vsync blocked. Late frames will tear instead of stuttering.
+    mailbox
+        Non-tearing, not vsync blocked. Similar to "triple buffering".
+    immediate
+        Tearing, not vsync blocked. Similar to "VSync off".
+
+``--vulkan-queue-count=<1..8>``
+    Controls the number of VkQueues used for rendering (limited by how many
+    your device supports). In theory, using more queues could enable some
+    parallelism between frames (when using a ``--swapchain-depth`` higher than
+    1). (Default: 1)
+
+    NOTE: Setting this to a value higher than 1 may cause graphical corruption,
+    as mpv's vulkan implementation currently does not try and protect textures
+    against concurrent access.
+
 ``--glsl-shaders=``
     Custom GLSL hooks. These are a flexible way to add custom fragment shaders,
     which can be injected at almost arbitrary points in the rendering pipeline,
@@ -4590,7 +4611,7 @@ The following video options are currently all specific to ``--vo=gpu`` and
         on Nvidia and AMD. Newer Intel chips with the latest drivers may also
         work.
     x11
-        X11/GLX
+        X11/GLX, VK_KHR_xlib_surface
     x11probe
         For internal autoprobing, equivalent to ``x11`` otherwise. Don't use
         directly, it could be removed without warning as autoprobing is changed.
@@ -5020,7 +5041,10 @@ Miscellaneous
     Media files must use constant framerate. Section-wise VFR might work as
     well with some container formats (but not e.g. mkv). If the sync code
     detects severe A/V desync, or the framerate cannot be detected, the player
-    automatically reverts to ``audio`` mode for some time or permanently.
+    automatically reverts to ``audio`` mode for some time or permanently. These
+    modes also require a vsync blocked presentation mode. For OpenGL, this
+    translates to ``--opengl-swapinterval=1``. For Vulkan, it translates to
+    ``--vulkan-swap-mode=fifo`` (or ``fifo-relaxed``).
 
     The modes with ``desync`` in their names do not attempt to keep audio/video
     in sync. They will slowly (or quickly) desync, until e.g.
the next seek diff --git a/options/options.c b/options/options.c index 1168cc196b..6467468691 100644 --- a/options/options.c +++ b/options/options.c @@ -89,6 +89,7 @@ extern const struct m_obj_list vo_obj_list; extern const struct m_obj_list ao_obj_list; extern const struct m_sub_options opengl_conf; +extern const struct m_sub_options vulkan_conf; extern const struct m_sub_options angle_conf; extern const struct m_sub_options cocoa_conf; @@ -690,6 +691,10 @@ const m_option_t mp_opts[] = { OPT_SUBSTRUCT("", opengl_opts, opengl_conf, 0), #endif +#if HAVE_VULKAN + OPT_SUBSTRUCT("", vulkan_opts, vulkan_conf, 0), +#endif + #if HAVE_EGL_ANGLE_WIN32 OPT_SUBSTRUCT("", angle_opts, angle_conf, 0), #endif diff --git a/options/options.h b/options/options.h index c02b7a34ca..63dee03612 100644 --- a/options/options.h +++ b/options/options.h @@ -329,6 +329,7 @@ typedef struct MPOpts { struct gl_video_opts *gl_video_opts; struct angle_opts *angle_opts; struct opengl_opts *opengl_opts; + struct vulkan_opts *vulkan_opts; struct cocoa_opts *cocoa_opts; struct dvd_opts *dvd_opts; diff --git a/video/out/gpu/context.c b/video/out/gpu/context.c index c5721c73b4..25e2a754bf 100644 --- a/video/out/gpu/context.c +++ b/video/out/gpu/context.c @@ -44,6 +44,7 @@ extern const struct ra_ctx_fns ra_ctx_dxgl; extern const struct ra_ctx_fns ra_ctx_rpi; extern const struct ra_ctx_fns ra_ctx_mali; extern const struct ra_ctx_fns ra_ctx_vdpauglx; +extern const struct ra_ctx_fns ra_ctx_vulkan_xlib; static const struct ra_ctx_fns *contexts[] = { // OpenGL contexts: @@ -83,6 +84,13 @@ static const struct ra_ctx_fns *contexts[] = { #if HAVE_VDPAU_GL_X11 &ra_ctx_vdpauglx, #endif + +// Vulkan contexts: +#if HAVE_VULKAN +#if HAVE_X11 + &ra_ctx_vulkan_xlib, +#endif +#endif }; static bool get_help(struct mp_log *log, struct bstr param) diff --git a/video/out/gpu/ra.h b/video/out/gpu/ra.h index 10245b250e..7a2fa0e11c 100644 --- a/video/out/gpu/ra.h +++ b/video/out/gpu/ra.h @@ -146,6 +146,7 @@ enum ra_buf_type { RA_BUF_TYPE_TEX_UPLOAD, // texture upload buffer (pixel buffer object) RA_BUF_TYPE_SHADER_STORAGE, // shader buffer (SSBO), for RA_VARTYPE_BUF_RW RA_BUF_TYPE_UNIFORM, // uniform buffer (UBO), for RA_VARTYPE_BUF_RO + RA_BUF_TYPE_VERTEX, // not publicly usable (RA-internal usage) }; struct ra_buf_params { @@ -369,10 +370,10 @@ struct ra_fns { void (*buf_destroy)(struct ra *ra, struct ra_buf *buf); - // Update the contents of a buffer, starting at a given offset and up to a - // given size, with the contents of *data. This is an extremely common - // operation. Calling this while the buffer is considered "in use" is an - // error. (See: buf_poll) + // Update the contents of a buffer, starting at a given offset (*must* be a + // multiple of 4) and up to a given size, with the contents of *data. This + // is an extremely common operation. Calling this while the buffer is + // considered "in use" is an error. (See: buf_poll) void (*buf_update)(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, const void *data, size_t size); diff --git a/video/out/vo_gpu.c b/video/out/vo_gpu.c index bd245de05b..a26912e0d8 100644 --- a/video/out/vo_gpu.c +++ b/video/out/vo_gpu.c @@ -60,6 +60,7 @@ struct gpu_priv { static void resize(struct gpu_priv *p) { struct vo *vo = p->vo; + struct ra_swapchain *sw = p->ctx->swapchain; MP_VERBOSE(vo, "Resize: %dx%d\n", vo->dwidth, vo->dheight); @@ -69,6 +70,11 @@ static void resize(struct gpu_priv *p) gl_video_resize(p->renderer, &src, &dst, &osd); + int fb_depth = sw->fns->color_depth ? 
sw->fns->color_depth(sw) : 0; + if (fb_depth) + MP_VERBOSE(p, "Reported display depth: %d\n", fb_depth); + gl_video_set_fb_depth(p->renderer, fb_depth); + vo->want_redraw = true; } @@ -289,7 +295,6 @@ static int preinit(struct vo *vo) goto err_out; assert(p->ctx->ra); assert(p->ctx->swapchain); - struct ra_swapchain *sw = p->ctx->swapchain; p->renderer = gl_video_init(p->ctx->ra, vo->log, vo->global); gl_video_set_osd_source(p->renderer, vo->osd); @@ -305,11 +310,6 @@ static int preinit(struct vo *vo) vo->hwdec_devs, vo->opts->gl_hwdec_interop); gl_video_set_hwdec(p->renderer, p->hwdec); - int fb_depth = sw->fns->color_depth ? sw->fns->color_depth(sw) : 0; - if (fb_depth) - MP_VERBOSE(p, "Reported display depth: %d\n", fb_depth); - gl_video_set_fb_depth(p->renderer, fb_depth); - return 0; err_out: diff --git a/video/out/vulkan/common.h b/video/out/vulkan/common.h new file mode 100644 index 0000000000..4c0e783f0e --- /dev/null +++ b/video/out/vulkan/common.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "config.h" + +#include "common/common.h" +#include "common/msg.h" + +// We need to define all platforms we want to support. Since we have +// our own mechanism for checking this, we re-define the right symbols +#if HAVE_X11 +#define VK_USE_PLATFORM_XLIB_KHR +#endif + +#include + +// Vulkan allows the optional use of a custom allocator. We don't need one but +// mark this parameter with a better name in case we ever decide to change this +// in the future. (And to make the code more readable) +#define MPVK_ALLOCATOR NULL + +// A lot of things depend on streaming resources across frames. Depending on +// how many frames we render ahead of time, we need to pick enough to avoid +// any conflicts, so make all of these tunable relative to this constant in +// order to centralize them. +#define MPVK_MAX_STREAMING_DEPTH 8 + +// Shared struct used to hold vulkan context information +struct mpvk_ctx { + struct mp_log *log; + VkInstance inst; + VkPhysicalDevice physd; + VkDebugReportCallbackEXT dbg; + VkDevice dev; + + // Surface, must be initialized fter the context itself + VkSurfaceKHR surf; + VkSurfaceFormatKHR surf_format; // picked at surface initialization time + + struct vk_malloc *alloc; // memory allocator for this device + struct vk_cmdpool *pool; // primary command pool for this device + struct vk_cmd *last_cmd; // most recently submitted command + + // Cached capabilities + VkPhysicalDeviceLimits limits; +}; diff --git a/video/out/vulkan/context.c b/video/out/vulkan/context.c new file mode 100644 index 0000000000..bd456d214c --- /dev/null +++ b/video/out/vulkan/context.c @@ -0,0 +1,501 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . 
+ */ + +#include "options/m_config.h" +#include "context.h" +#include "ra_vk.h" +#include "utils.h" + +enum { + SWAP_AUTO = 0, + SWAP_FIFO, + SWAP_FIFO_RELAXED, + SWAP_MAILBOX, + SWAP_IMMEDIATE, + SWAP_COUNT, +}; + +struct vulkan_opts { + struct mpvk_device_opts dev_opts; // logical device options + char *device; // force a specific GPU + int swap_mode; +}; + +static int vk_validate_dev(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param) +{ + int ret = M_OPT_INVALID; + VkResult res; + + // Create a dummy instance to validate/list the devices + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + }; + + VkInstance inst; + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + + res = vkCreateInstance(&info, MPVK_ALLOCATOR, &inst); + if (res != VK_SUCCESS) + goto error; + + res = vkEnumeratePhysicalDevices(inst, &num, NULL); + if (res != VK_SUCCESS) + goto error; + + devices = talloc_array(NULL, VkPhysicalDevice, num); + vkEnumeratePhysicalDevices(inst, &num, devices); + if (res != VK_SUCCESS) + goto error; + + bool help = bstr_equals0(param, "help"); + if (help) { + mp_info(log, "Available vulkan devices:\n"); + ret = M_OPT_EXIT; + } + + for (int i = 0; i < num; i++) { + VkPhysicalDeviceProperties prop; + vkGetPhysicalDeviceProperties(devices[i], &prop); + + if (help) { + mp_info(log, " '%s' (GPU %d, ID %x:%x)\n", prop.deviceName, i, + (unsigned)prop.vendorID, (unsigned)prop.deviceID); + } else if (bstr_equals0(param, prop.deviceName)) { + ret = 0; + break; + } + } + + if (!help) + mp_err(log, "No device with name '%.*s'!\n", BSTR_P(param)); + +error: + talloc_free(devices); + return ret; +} + +#define OPT_BASE_STRUCT struct vulkan_opts +const struct m_sub_options vulkan_conf = { + .opts = (const struct m_option[]) { + OPT_STRING_VALIDATE("vulkan-device", device, 0, vk_validate_dev), + OPT_CHOICE("vulkan-swap-mode", swap_mode, 0, + ({"auto", SWAP_AUTO}, + {"fifo", SWAP_FIFO}, + {"fifo-relaxed", SWAP_FIFO_RELAXED}, + {"mailbox", SWAP_MAILBOX}, + {"immediate", SWAP_IMMEDIATE})), + OPT_INTRANGE("vulkan-queue-count", dev_opts.queue_count, 0, 1, + MPVK_MAX_QUEUES, OPTDEF_INT(1)), + {0} + }, + .size = sizeof(struct vulkan_opts) +}; + +struct priv { + struct mpvk_ctx *vk; + struct vulkan_opts *opts; + // Swapchain metadata: + int w, h; // current size + VkSwapchainCreateInfoKHR protoInfo; // partially filled-in prototype + VkSwapchainKHR swapchain; + VkSwapchainKHR old_swapchain; + int frames_in_flight; + // state of the images: + struct ra_tex **images; // ra_tex wrappers for the vkimages + int num_images; // size of images + VkSemaphore *acquired; // pool of semaphores used to synchronize images + int num_acquired; // size of this pool + int idx_acquired; // index of next free semaphore within this pool + int last_imgidx; // the image index last acquired (for submit) +}; + +static bool update_swapchain_info(struct priv *p, + VkSwapchainCreateInfoKHR *info) +{ + struct mpvk_ctx *vk = p->vk; + + // Query the supported capabilities and update this struct as needed + VkSurfaceCapabilitiesKHR caps; + VK(vkGetPhysicalDeviceSurfaceCapabilitiesKHR(vk->physd, vk->surf, &caps)); + + // Sorted by preference + static const VkCompositeAlphaFlagBitsKHR alphaModes[] = { + VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR, + VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, + }; + + for (int i = 0; i < MP_ARRAY_SIZE(alphaModes); i++) { + if (caps.supportedCompositeAlpha & alphaModes[i]) { + info->compositeAlpha = alphaModes[i]; + break; + } + } + + if 
(!info->compositeAlpha) { + MP_ERR(vk, "Failed picking alpha compositing mode (caps: 0x%x)\n", + caps.supportedCompositeAlpha); + goto error; + } + + static const VkSurfaceTransformFlagBitsKHR rotModes[] = { + VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR, + VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR, + }; + + for (int i = 0; i < MP_ARRAY_SIZE(rotModes); i++) { + if (caps.supportedTransforms & rotModes[i]) { + info->preTransform = rotModes[i]; + break; + } + } + + if (!info->preTransform) { + MP_ERR(vk, "Failed picking surface transform mode (caps: 0x%x)\n", + caps.supportedTransforms); + goto error; + } + + // Image count as required + MP_VERBOSE(vk, "Requested image count: %d (min %d max %d)\n", + (int)info->minImageCount, (int)caps.minImageCount, + (int)caps.maxImageCount); + + info->minImageCount = MPMAX(info->minImageCount, caps.minImageCount); + if (caps.maxImageCount) + info->minImageCount = MPMIN(info->minImageCount, caps.maxImageCount); + + // Check the extent against the allowed parameters + if (caps.currentExtent.width != info->imageExtent.width && + caps.currentExtent.width != 0xFFFFFFFF) + { + MP_WARN(vk, "Requested width %d does not match current width %d\n", + (int)info->imageExtent.width, (int)caps.currentExtent.width); + info->imageExtent.width = caps.currentExtent.width; + } + + if (caps.currentExtent.height != info->imageExtent.height && + caps.currentExtent.height != 0xFFFFFFFF) + { + MP_WARN(vk, "Requested height %d does not match current height %d\n", + (int)info->imageExtent.height, (int)caps.currentExtent.height); + info->imageExtent.height = caps.currentExtent.height; + } + + if (caps.minImageExtent.width > info->imageExtent.width || + caps.minImageExtent.height > info->imageExtent.height) + { + MP_ERR(vk, "Requested size %dx%d smaller than device minimum %d%d\n", + (int)info->imageExtent.width, (int)info->imageExtent.height, + (int)caps.minImageExtent.width, (int)caps.minImageExtent.height); + goto error; + } + + if (caps.maxImageExtent.width < info->imageExtent.width || + caps.maxImageExtent.height < info->imageExtent.height) + { + MP_ERR(vk, "Requested size %dx%d larger than device maximum %d%d\n", + (int)info->imageExtent.width, (int)info->imageExtent.height, + (int)caps.maxImageExtent.width, (int)caps.maxImageExtent.height); + goto error; + } + + // We just request whatever usage we can, and let the ra_vk decide what + // ra_tex_params that translates to. This makes the images as flexible + // as possible. 
+ info->imageUsage = caps.supportedUsageFlags; + return true; + +error: + return false; +} + +void ra_vk_ctx_uninit(struct ra_ctx *ctx) +{ + if (ctx->ra) { + struct priv *p = ctx->swapchain->priv; + struct mpvk_ctx *vk = p->vk; + + mpvk_pool_wait_idle(vk, vk->pool); + + for (int i = 0; i < p->num_images; i++) + ra_tex_free(ctx->ra, &p->images[i]); + for (int i = 0; i < p->num_acquired; i++) + vkDestroySemaphore(vk->dev, p->acquired[i], MPVK_ALLOCATOR); + + vkDestroySwapchainKHR(vk->dev, p->swapchain, MPVK_ALLOCATOR); + + talloc_free(p->images); + talloc_free(p->acquired); + ctx->ra->fns->destroy(ctx->ra); + ctx->ra = NULL; + } + + talloc_free(ctx->swapchain); + ctx->swapchain = NULL; +} + +static const struct ra_swapchain_fns vulkan_swapchain; + +bool ra_vk_ctx_init(struct ra_ctx *ctx, struct mpvk_ctx *vk, + VkPresentModeKHR preferred_mode) +{ + struct ra_swapchain *sw = ctx->swapchain = talloc_zero(NULL, struct ra_swapchain); + sw->ctx = ctx; + sw->fns = &vulkan_swapchain; + + struct priv *p = sw->priv = talloc_zero(sw, struct priv); + p->vk = vk; + p->opts = mp_get_config_group(p, ctx->global, &vulkan_conf); + + if (!mpvk_find_phys_device(vk, p->opts->device, ctx->opts.allow_sw)) + goto error; + if (!mpvk_pick_surface_format(vk)) + goto error; + if (!mpvk_device_init(vk, p->opts->dev_opts)) + goto error; + + ctx->ra = ra_create_vk(vk, ctx->log); + if (!ctx->ra) + goto error; + + static const VkPresentModeKHR present_modes[SWAP_COUNT] = { + [SWAP_FIFO] = VK_PRESENT_MODE_FIFO_KHR, + [SWAP_FIFO_RELAXED] = VK_PRESENT_MODE_FIFO_RELAXED_KHR, + [SWAP_MAILBOX] = VK_PRESENT_MODE_MAILBOX_KHR, + [SWAP_IMMEDIATE] = VK_PRESENT_MODE_IMMEDIATE_KHR, + }; + + p->protoInfo = (VkSwapchainCreateInfoKHR) { + .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR, + .surface = vk->surf, + .imageFormat = vk->surf_format.format, + .imageColorSpace = vk->surf_format.colorSpace, + .imageArrayLayers = 1, // non-stereoscopic + .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE, + .minImageCount = ctx->opts.swapchain_depth + 1, // +1 for FB + .presentMode = p->opts->swap_mode ? present_modes[p->opts->swap_mode] + : preferred_mode, + .clipped = true, + }; + + // Make sure the swapchain present mode is supported + int num_modes; + VK(vkGetPhysicalDeviceSurfacePresentModesKHR(vk->physd, vk->surf, + &num_modes, NULL)); + VkPresentModeKHR *modes = talloc_array(NULL, VkPresentModeKHR, num_modes); + VK(vkGetPhysicalDeviceSurfacePresentModesKHR(vk->physd, vk->surf, + &num_modes, modes)); + bool supported = false; + for (int i = 0; i < num_modes; i++) + supported |= (modes[i] == p->protoInfo.presentMode); + talloc_free(modes); + + if (!supported) { + MP_ERR(ctx, "Requested swap mode unsupported by this device!\n"); + goto error; + } + + return true; + +error: + ra_vk_ctx_uninit(ctx); + return false; +} + +static void destroy_swapchain(struct mpvk_ctx *vk, struct priv *p) +{ + assert(p->old_swapchain); + vkDestroySwapchainKHR(vk->dev, p->old_swapchain, MPVK_ALLOCATOR); + p->old_swapchain = NULL; +} + +bool ra_vk_ctx_resize(struct ra_swapchain *sw, int w, int h) +{ + struct priv *p = sw->priv; + if (w == p->w && h == p->h) + return true; + + struct ra *ra = sw->ctx->ra; + struct mpvk_ctx *vk = p->vk; + VkImage *vkimages = NULL; + + // It's invalid to trigger another swapchain recreation while there's + // more than one swapchain already active, so we need to flush any pending + // asynchronous swapchain release operations that may be ongoing. 
+ while (p->old_swapchain) + mpvk_dev_poll_cmds(vk, 100000); // 100μs + + VkSwapchainCreateInfoKHR sinfo = p->protoInfo; + sinfo.imageExtent = (VkExtent2D){ w, h }; + sinfo.oldSwapchain = p->swapchain; + + if (!update_swapchain_info(p, &sinfo)) + goto error; + + VK(vkCreateSwapchainKHR(vk->dev, &sinfo, MPVK_ALLOCATOR, &p->swapchain)); + p->w = w; + p->h = h; + + // Freeing the old swapchain while it's still in use is an error, so do + // it asynchronously once the device is idle. + if (sinfo.oldSwapchain) { + p->old_swapchain = sinfo.oldSwapchain; + vk_dev_callback(vk, (vk_cb) destroy_swapchain, vk, p); + } + + // Get the new swapchain images + int num; + VK(vkGetSwapchainImagesKHR(vk->dev, p->swapchain, &num, NULL)); + vkimages = talloc_array(NULL, VkImage, num); + VK(vkGetSwapchainImagesKHR(vk->dev, p->swapchain, &num, vkimages)); + + // If needed, allocate some more semaphores + while (num > p->num_acquired) { + VkSemaphore sem; + static const VkSemaphoreCreateInfo seminfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + VK(vkCreateSemaphore(vk->dev, &seminfo, MPVK_ALLOCATOR, &sem)); + MP_TARRAY_APPEND(NULL, p->acquired, p->num_acquired, sem); + } + + // Recreate the ra_tex wrappers + for (int i = 0; i < p->num_images; i++) + ra_tex_free(ra, &p->images[i]); + + p->num_images = num; + MP_TARRAY_GROW(NULL, p->images, p->num_images); + for (int i = 0; i < num; i++) { + p->images[i] = ra_vk_wrap_swapchain_img(ra, vkimages[i], sinfo); + if (!p->images[i]) + goto error; + } + + talloc_free(vkimages); + return true; + +error: + talloc_free(vkimages); + vkDestroySwapchainKHR(vk->dev, p->swapchain, MPVK_ALLOCATOR); + p->swapchain = NULL; + return false; +} + +static int color_depth(struct ra_swapchain *sw) +{ + struct priv *p = sw->priv; + int bits = 0; + + if (!p->num_images) + return bits; + + // The channel with the most bits is probably the most authoritative about + // the actual color information (consider e.g. a2bgr10). Slight downside + // in that it results in rounding r/b for e.g. rgb565, but we don't pick + // surfaces with fewer than 8 bits anyway. + const struct ra_format *fmt = p->images[0]->params.format; + for (int i = 0; i < fmt->num_components; i++) { + int depth = fmt->component_depth[i]; + bits = MPMAX(bits, depth ? 
depth : fmt->component_size[i]); + } + + return bits; +} + +static bool start_frame(struct ra_swapchain *sw, struct ra_fbo *out_fbo) +{ + struct priv *p = sw->priv; + struct mpvk_ctx *vk = p->vk; + if (!p->swapchain) + goto error; + + uint32_t imgidx = 0; + MP_TRACE(vk, "vkAcquireNextImageKHR\n"); + VkResult res = vkAcquireNextImageKHR(vk->dev, p->swapchain, UINT64_MAX, + p->acquired[p->idx_acquired], NULL, + &imgidx); + if (res == VK_ERROR_OUT_OF_DATE_KHR) + goto error; // just return in this case + VK_ASSERT(res, "Failed acquiring swapchain image"); + + p->last_imgidx = imgidx; + *out_fbo = (struct ra_fbo) { + .tex = p->images[imgidx], + .flip = false, + }; + return true; + +error: + return false; +} + +static bool submit_frame(struct ra_swapchain *sw, const struct vo_frame *frame) +{ + struct priv *p = sw->priv; + struct ra *ra = sw->ctx->ra; + struct mpvk_ctx *vk = p->vk; + if (!p->swapchain) + goto error; + + VkSemaphore acquired = p->acquired[p->idx_acquired++]; + p->idx_acquired %= p->num_acquired; + + VkSemaphore done; + if (!ra_vk_submit(ra, p->images[p->last_imgidx], acquired, &done, + &p->frames_in_flight)) + goto error; + + // For some reason, nvidia absolutely shits itself when presenting from a + // full queue - so advance all of the cmdpool indices first and then do the + // present on an "empty" queue + vk_cmd_cycle_queues(vk); + struct vk_cmdpool *pool = vk->pool; + VkQueue queue = pool->queues[pool->qindex]; + + VkPresentInfoKHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &done, + .swapchainCount = 1, + .pSwapchains = &p->swapchain, + .pImageIndices = &p->last_imgidx, + }; + + VK(vkQueuePresentKHR(queue, &pinfo)); + return true; + +error: + return false; +} + +static void swap_buffers(struct ra_swapchain *sw) +{ + struct priv *p = sw->priv; + + while (p->frames_in_flight >= sw->ctx->opts.swapchain_depth) + mpvk_dev_poll_cmds(p->vk, 100000); // 100μs +} + +static const struct ra_swapchain_fns vulkan_swapchain = { + // .screenshot is not currently supported + .color_depth = color_depth, + .start_frame = start_frame, + .submit_frame = submit_frame, + .swap_buffers = swap_buffers, +}; diff --git a/video/out/vulkan/context.h b/video/out/vulkan/context.h new file mode 100644 index 0000000000..3f630bc10e --- /dev/null +++ b/video/out/vulkan/context.h @@ -0,0 +1,10 @@ +#pragma once + +#include "video/out/gpu/context.h" +#include "common.h" + +// Helpers for ra_ctx based on ra_vk. These initialize ctx->ra and ctx->swchain. +void ra_vk_ctx_uninit(struct ra_ctx *ctx); +bool ra_vk_ctx_init(struct ra_ctx *ctx, struct mpvk_ctx *vk, + VkPresentModeKHR preferred_mode); +bool ra_vk_ctx_resize(struct ra_swapchain *sw, int w, int h); diff --git a/video/out/vulkan/context_xlib.c b/video/out/vulkan/context_xlib.c new file mode 100644 index 0000000000..2611fbb706 --- /dev/null +++ b/video/out/vulkan/context_xlib.c @@ -0,0 +1,116 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include "video/out/gpu/context.h" +#include "video/out/x11_common.h" + +#include "common.h" +#include "context.h" +#include "utils.h" + +struct priv { + struct mpvk_ctx vk; +}; + +static void xlib_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + + ra_vk_ctx_uninit(ctx); + mpvk_uninit(&p->vk); + vo_x11_uninit(ctx->vo); +} + +static bool xlib_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); + struct mpvk_ctx *vk = &p->vk; + int msgl = ctx->opts.probing ? MSGL_V : MSGL_ERR; + + if (!vo_x11_init(ctx->vo)) + goto error; + + if (!vo_x11_create_vo_window(ctx->vo, NULL, "mpvk")) + goto error; + + if (!mpvk_instance_init(vk, ctx->log, ctx->opts.debug)) + goto error; + + VkXlibSurfaceCreateInfoKHR xinfo = { + .sType = VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR, + .dpy = ctx->vo->x11->display, + .window = ctx->vo->x11->window, + }; + + VkResult res = vkCreateXlibSurfaceKHR(vk->inst, &xinfo, MPVK_ALLOCATOR, + &vk->surf); + if (res != VK_SUCCESS) { + MP_MSG(ctx, msgl, "Failed creating Xlib surface: %s\n", vk_err(res)); + goto error; + } + + if (!ra_vk_ctx_init(ctx, vk, VK_PRESENT_MODE_FIFO_KHR)) + goto error; + + return true; + +error: + xlib_uninit(ctx); + return false; +} + +static bool resize(struct ra_ctx *ctx) +{ + return ra_vk_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight); +} + +static bool xlib_reconfig(struct ra_ctx *ctx) +{ + vo_x11_config_vo_window(ctx->vo); + return resize(ctx); +} + +static int xlib_control(struct ra_ctx *ctx, int *events, int request, void *arg) +{ + int ret = vo_x11_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) { + if (!resize(ctx)) + return VO_ERROR; + } + return ret; +} + +static void xlib_wakeup(struct ra_ctx *ctx) +{ + vo_x11_wakeup(ctx->vo); +} + +static void xlib_wait_events(struct ra_ctx *ctx, int64_t until_time_us) +{ + vo_x11_wait_events(ctx->vo, until_time_us); +} + +const struct ra_ctx_fns ra_ctx_vulkan_xlib = { + .type = "vulkan", + .name = "x11", + .reconfig = xlib_reconfig, + .control = xlib_control, + .wakeup = xlib_wakeup, + .wait_events = xlib_wait_events, + .init = xlib_init, + .uninit = xlib_uninit, +}; diff --git a/video/out/vulkan/formats.c b/video/out/vulkan/formats.c new file mode 100644 index 0000000000..b44bead99c --- /dev/null +++ b/video/out/vulkan/formats.c @@ -0,0 +1,55 @@ +#include "formats.h" + +const struct vk_format vk_formats[] = { + // Regular, byte-aligned integer formats + {"r8", VK_FORMAT_R8_UNORM, 1, 1, {8 }, RA_CTYPE_UNORM }, + {"rg8", VK_FORMAT_R8G8_UNORM, 2, 2, {8, 8 }, RA_CTYPE_UNORM }, + {"rgb8", VK_FORMAT_R8G8B8_UNORM, 3, 3, {8, 8, 8 }, RA_CTYPE_UNORM }, + {"rgba8", VK_FORMAT_R8G8B8A8_UNORM, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM }, + {"r16", VK_FORMAT_R16_UNORM, 1, 2, {16 }, RA_CTYPE_UNORM }, + {"rg16", VK_FORMAT_R16G16_UNORM, 2, 4, {16, 16 }, RA_CTYPE_UNORM }, + {"rgb16", VK_FORMAT_R16G16B16_UNORM, 3, 6, {16, 16, 16 }, RA_CTYPE_UNORM }, + {"rgba16", VK_FORMAT_R16G16B16A16_UNORM, 4, 8, {16, 16, 16, 16}, RA_CTYPE_UNORM }, + + // Special, integer-only formats + {"r32ui", VK_FORMAT_R32_UINT, 1, 4, {32 }, RA_CTYPE_UINT }, + {"rg32ui", VK_FORMAT_R32G32_UINT, 2, 8, {32, 32 }, RA_CTYPE_UINT }, + {"rgb32ui", VK_FORMAT_R32G32B32_UINT, 3, 12, {32, 32, 32 }, RA_CTYPE_UINT }, + {"rgba32ui", VK_FORMAT_R32G32B32A32_UINT, 4, 16, {32, 32, 32, 32}, RA_CTYPE_UINT }, + {"r64ui", VK_FORMAT_R64_UINT, 1, 8, {64 }, RA_CTYPE_UINT }, 
+ {"rg64ui", VK_FORMAT_R64G64_UINT, 2, 16, {64, 64 }, RA_CTYPE_UINT }, + {"rgb64ui", VK_FORMAT_R64G64B64_UINT, 3, 24, {64, 64, 64 }, RA_CTYPE_UINT }, + {"rgba64ui", VK_FORMAT_R64G64B64A64_UINT, 4, 32, {64, 64, 64, 64}, RA_CTYPE_UINT }, + + // Packed integer formats + {"rg4", VK_FORMAT_R4G4_UNORM_PACK8, 2, 1, {4, 4 }, RA_CTYPE_UNORM }, + {"rgba4", VK_FORMAT_R4G4B4A4_UNORM_PACK16, 4, 2, {4, 4, 4, 4 }, RA_CTYPE_UNORM }, + {"rgb565", VK_FORMAT_R5G6B5_UNORM_PACK16, 3, 2, {5, 6, 5 }, RA_CTYPE_UNORM }, + {"rgb565a1", VK_FORMAT_R5G5B5A1_UNORM_PACK16, 4, 2, {5, 5, 5, 1 }, RA_CTYPE_UNORM }, + + // Float formats (native formats, hf = half float, df = double float) + {"r16hf", VK_FORMAT_R16_SFLOAT, 1, 2, {16 }, RA_CTYPE_FLOAT }, + {"rg16hf", VK_FORMAT_R16G16_SFLOAT, 2, 4, {16, 16 }, RA_CTYPE_FLOAT }, + {"rgb16hf", VK_FORMAT_R16G16B16_SFLOAT, 3, 6, {16, 16, 16 }, RA_CTYPE_FLOAT }, + {"rgba16hf", VK_FORMAT_R16G16B16A16_SFLOAT, 4, 8, {16, 16, 16, 16}, RA_CTYPE_FLOAT }, + {"r32f", VK_FORMAT_R32_SFLOAT, 1, 4, {32 }, RA_CTYPE_FLOAT }, + {"rg32f", VK_FORMAT_R32G32_SFLOAT, 2, 8, {32, 32 }, RA_CTYPE_FLOAT }, + {"rgb32f", VK_FORMAT_R32G32B32_SFLOAT, 3, 12, {32, 32, 32 }, RA_CTYPE_FLOAT }, + {"rgba32f", VK_FORMAT_R32G32B32A32_SFLOAT, 4, 16, {32, 32, 32, 32}, RA_CTYPE_FLOAT }, + {"r64df", VK_FORMAT_R64_SFLOAT, 1, 8, {64 }, RA_CTYPE_FLOAT }, + {"rg64df", VK_FORMAT_R64G64_SFLOAT, 2, 16, {64, 64 }, RA_CTYPE_FLOAT }, + {"rgb64df", VK_FORMAT_R64G64B64_SFLOAT, 3, 24, {64, 64, 64 }, RA_CTYPE_FLOAT }, + {"rgba64df", VK_FORMAT_R64G64B64A64_SFLOAT, 4, 32, {64, 64, 64, 64}, RA_CTYPE_FLOAT }, + + // "Swapped" component order images + {"bgr8", VK_FORMAT_B8G8R8_UNORM, 3, 3, {8, 8, 8 }, RA_CTYPE_UNORM, true }, + {"bgra8", VK_FORMAT_B8G8R8A8_UNORM, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM, true }, + {"bgra4", VK_FORMAT_B4G4R4A4_UNORM_PACK16, 4, 2, {4, 4, 4, 4 }, RA_CTYPE_UNORM, true }, + {"bgr565", VK_FORMAT_B5G6R5_UNORM_PACK16, 3, 2, {5, 6, 5 }, RA_CTYPE_UNORM, true }, + {"bgr565a1", VK_FORMAT_B5G5R5A1_UNORM_PACK16, 4, 2, {5, 5, 5, 1 }, RA_CTYPE_UNORM, true }, + {"a1rgb5", VK_FORMAT_A1R5G5B5_UNORM_PACK16, 4, 2, {1, 5, 5, 5 }, RA_CTYPE_UNORM, true }, + {"a2rgb10", VK_FORMAT_A2R10G10B10_UNORM_PACK32, 4, 4, {2, 10, 10, 10}, RA_CTYPE_UNORM, true }, + {"a2bgr10", VK_FORMAT_A2B10G10R10_UNORM_PACK32, 4, 4, {2, 10, 10, 10}, RA_CTYPE_UNORM, true }, + {"abgr8", VK_FORMAT_A8B8G8R8_UNORM_PACK32, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM, true }, + {0} +}; diff --git a/video/out/vulkan/formats.h b/video/out/vulkan/formats.h new file mode 100644 index 0000000000..22782a6958 --- /dev/null +++ b/video/out/vulkan/formats.h @@ -0,0 +1,16 @@ +#pragma once + +#include "video/out/gpu/ra.h" +#include "common.h" + +struct vk_format { + const char *name; + VkFormat iformat; // vulkan format enum + int components; // how many components are there + int bytes; // how many bytes is a texel + int bits[4]; // how many bits per component + enum ra_ctype ctype; // format representation type + bool fucked_order; // used for formats which are not simply rgba +}; + +extern const struct vk_format vk_formats[]; diff --git a/video/out/vulkan/malloc.c b/video/out/vulkan/malloc.c new file mode 100644 index 0000000000..31fcd36ddb --- /dev/null +++ b/video/out/vulkan/malloc.c @@ -0,0 +1,424 @@ +#include "malloc.h" +#include "utils.h" +#include "osdep/timer.h" + +// Controls the multiplication factor for new slab allocations. The new slab +// will always be allocated such that the size of the slab is this factor times +// the previous slab. Higher values make it grow faster. 
+#define MPVK_HEAP_SLAB_GROWTH_RATE 4 + +// Controls the minimum slab size, to reduce the frequency at which very small +// slabs would need to get allocated when allocating the first few buffers. +// (Default: 1 MB) +#define MPVK_HEAP_MINIMUM_SLAB_SIZE (1 << 20) + +// Controls the maximum slab size, to reduce the effect of unbounded slab +// growth exhausting memory. If the application needs a single allocation +// that's bigger than this value, it will be allocated directly from the +// device. (Default: 512 MB) +#define MPVK_HEAP_MAXIMUM_SLAB_SIZE (1 << 29) + +// Controls the minimum free region size, to reduce thrashing the free space +// map with lots of small buffers during uninit. (Default: 1 KB) +#define MPVK_HEAP_MINIMUM_REGION_SIZE (1 << 10) + +// Represents a region of available memory +struct vk_region { + size_t start; // first offset in region + size_t end; // first offset *not* in region +}; + +static inline size_t region_len(struct vk_region r) +{ + return r.end - r.start; +} + +// A single slab represents a contiguous region of allocated memory. Actual +// allocations are served as slices of this. Slabs are organized into linked +// lists, which represent individual heaps. +struct vk_slab { + VkDeviceMemory mem; // underlying device allocation + size_t size; // total size of `slab` + size_t used; // number of bytes actually in use (for GC accounting) + bool dedicated; // slab is allocated specifically for one object + // free space map: a sorted list of memory regions that are available + struct vk_region *regions; + int num_regions; + // optional, depends on the memory type: + VkBuffer buffer; // buffer spanning the entire slab + void *data; // mapped memory corresponding to `mem` +}; + +// Represents a single memory heap. We keep track of a vk_heap for each +// combination of buffer type and memory selection parameters. This shouldn't +// actually be that many in practice, because some combinations simply never +// occur, and others will generally be the same for the same objects. +struct vk_heap { + VkBufferUsageFlagBits usage; // the buffer usage type (or 0) + VkMemoryPropertyFlagBits flags; // the memory type flags (or 0) + uint32_t typeBits; // the memory type index requirements (or 0) + struct vk_slab **slabs; // array of slabs sorted by size + int num_slabs; +}; + +// The overall state of the allocator, which keeps track of a vk_heap for each +// memory type. +struct vk_malloc { + VkPhysicalDeviceMemoryProperties props; + struct vk_heap *heaps; + int num_heaps; +}; + +static void slab_free(struct mpvk_ctx *vk, struct vk_slab *slab) +{ + if (!slab) + return; + + assert(slab->used == 0); + + int64_t start = mp_time_us(); + vkDestroyBuffer(vk->dev, slab->buffer, MPVK_ALLOCATOR); + // also implicitly unmaps the memory if needed + vkFreeMemory(vk->dev, slab->mem, MPVK_ALLOCATOR); + int64_t stop = mp_time_us(); + + MP_VERBOSE(vk, "Freeing slab of size %zu took %lld μs.\n", + slab->size, (long long)(stop - start)); + + talloc_free(slab); +} + +static bool find_best_memtype(struct mpvk_ctx *vk, uint32_t typeBits, + VkMemoryPropertyFlagBits flags, + VkMemoryType *out_type, int *out_index) +{ + struct vk_malloc *ma = vk->alloc; + + // The vulkan spec requires memory types to be sorted in the "optimal" + // order, so the first matching type we find will be the best/fastest one. 
+ for (int i = 0; i < ma->props.memoryTypeCount; i++) { + // The memory type flags must include our properties + if ((ma->props.memoryTypes[i].propertyFlags & flags) != flags) + continue; + // The memory type must be supported by the requirements (bitfield) + if (typeBits && !(typeBits & (1 << i))) + continue; + *out_type = ma->props.memoryTypes[i]; + *out_index = i; + return true; + } + + MP_ERR(vk, "Found no memory type matching property flags 0x%x and type " + "bits 0x%x!\n", flags, (unsigned)typeBits); + return false; +} + +static struct vk_slab *slab_alloc(struct mpvk_ctx *vk, struct vk_heap *heap, + size_t size) +{ + struct vk_slab *slab = talloc_ptrtype(NULL, slab); + *slab = (struct vk_slab) { + .size = size, + }; + + MP_TARRAY_APPEND(slab, slab->regions, slab->num_regions, (struct vk_region) { + .start = 0, + .end = slab->size, + }); + + VkMemoryAllocateInfo minfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = slab->size, + }; + + uint32_t typeBits = heap->typeBits ? heap->typeBits : UINT32_MAX; + if (heap->usage) { + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = slab->size, + .usage = heap->usage, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + + VK(vkCreateBuffer(vk->dev, &binfo, MPVK_ALLOCATOR, &slab->buffer)); + + VkMemoryRequirements reqs; + vkGetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs); + minfo.allocationSize = reqs.size; // this can be larger than slab->size + typeBits &= reqs.memoryTypeBits; // this can restrict the types + } + + VkMemoryType type; + int index; + if (!find_best_memtype(vk, typeBits, heap->flags, &type, &index)) + goto error; + + MP_VERBOSE(vk, "Allocating %zu memory of type 0x%x (id %d) in heap %d.\n", + slab->size, type.propertyFlags, index, (int)type.heapIndex); + + minfo.memoryTypeIndex = index; + VK(vkAllocateMemory(vk->dev, &minfo, MPVK_ALLOCATOR, &slab->mem)); + + if (heap->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) + VK(vkMapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + + if (slab->buffer) + VK(vkBindBufferMemory(vk->dev, slab->buffer, slab->mem, 0)); + + return slab; + +error: + slab_free(vk, slab); + return NULL; +} + +static void insert_region(struct vk_slab *slab, struct vk_region region) +{ + if (region.start == region.end) + return; + + bool big_enough = region_len(region) >= MPVK_HEAP_MINIMUM_REGION_SIZE; + + // Find the index of the first region that comes after this + for (int i = 0; i < slab->num_regions; i++) { + struct vk_region *r = &slab->regions[i]; + + // Check for a few special cases which can be coalesced + if (r->end == region.start) { + // The new region is at the tail of this region. In addition to + // modifying this region, we also need to coalesce all the following + // regions for as long as possible + r->end = region.end; + + struct vk_region *next = &slab->regions[i+1]; + while (i+1 < slab->num_regions && r->end == next->start) { + r->end = next->end; + MP_TARRAY_REMOVE_AT(slab->regions, slab->num_regions, i+1); + } + return; + } + + if (r->start == region.end) { + // The new region is at the head of this region. We don't need to + // do anything special here - because if this could be further + // coalesced backwards, the previous loop iteration would already + // have caught it. + r->start = region.start; + return; + } + + if (r->start > region.start) { + // The new region comes somewhere before this region, so insert + // it into this index in the array. 
+ if (big_enough) { + MP_TARRAY_INSERT_AT(slab, slab->regions, slab->num_regions, + i, region); + } + return; + } + } + + // If we've reached the end of this loop, then all of the regions + // come before the new region, and are disconnected - so append it + if (big_enough) + MP_TARRAY_APPEND(slab, slab->regions, slab->num_regions, region); +} + +static void heap_uninit(struct mpvk_ctx *vk, struct vk_heap *heap) +{ + for (int i = 0; i < heap->num_slabs; i++) + slab_free(vk, heap->slabs[i]); + + talloc_free(heap->slabs); + *heap = (struct vk_heap){0}; +} + +void vk_malloc_init(struct mpvk_ctx *vk) +{ + assert(vk->physd); + vk->alloc = talloc_zero(NULL, struct vk_malloc); + vkGetPhysicalDeviceMemoryProperties(vk->physd, &vk->alloc->props); +} + +void vk_malloc_uninit(struct mpvk_ctx *vk) +{ + struct vk_malloc *ma = vk->alloc; + if (!ma) + return; + + for (int i = 0; i < ma->num_heaps; i++) + heap_uninit(vk, &ma->heaps[i]); + + talloc_free(ma); + vk->alloc = NULL; +} + +void vk_free_memslice(struct mpvk_ctx *vk, struct vk_memslice slice) +{ + struct vk_slab *slab = slice.priv; + if (!slab) + return; + + assert(slab->used >= slice.size); + slab->used -= slice.size; + + MP_DBG(vk, "Freeing slice %zu + %zu from slab with size %zu\n", + slice.offset, slice.size, slab->size); + + if (slab->dedicated) { + // If the slab was purpose-allocated for this memslice, we can just + // free it here + slab_free(vk, slab); + } else { + // Return the allocation to the free space map + insert_region(slab, (struct vk_region) { + .start = slice.offset, + .end = slice.offset + slice.size, + }); + } +} + +// reqs: can be NULL +static struct vk_heap *find_heap(struct mpvk_ctx *vk, + VkBufferUsageFlagBits usage, + VkMemoryPropertyFlagBits flags, + VkMemoryRequirements *reqs) +{ + struct vk_malloc *ma = vk->alloc; + int typeBits = reqs ? reqs->memoryTypeBits : 0; + + for (int i = 0; i < ma->num_heaps; i++) { + if (ma->heaps[i].usage != usage) + continue; + if (ma->heaps[i].flags != flags) + continue; + if (ma->heaps[i].typeBits != typeBits) + continue; + return &ma->heaps[i]; + } + + // Not found => add it + MP_TARRAY_GROW(ma, ma->heaps, ma->num_heaps + 1); + struct vk_heap *heap = &ma->heaps[ma->num_heaps++]; + *heap = (struct vk_heap) { + .usage = usage, + .flags = flags, + .typeBits = typeBits, + }; + return heap; +} + +static inline bool region_fits(struct vk_region r, size_t size, size_t align) +{ + return MP_ALIGN_UP(r.start, align) + size <= r.end; +} + +// Finds the best-fitting region in a heap. If the heap is too small or too +// fragmented, a new slab will be allocated under the hood. 
+static bool heap_get_region(struct mpvk_ctx *vk, struct vk_heap *heap, + size_t size, size_t align, + struct vk_slab **out_slab, int *out_index) +{ + struct vk_slab *slab = NULL; + + // If the allocation is very big, serve it directly instead of bothering + // with the heap + if (size > MPVK_HEAP_MAXIMUM_SLAB_SIZE) { + slab = slab_alloc(vk, heap, size); + *out_slab = slab; + *out_index = 0; + return !!slab; + } + + for (int i = 0; i < heap->num_slabs; i++) { + slab = heap->slabs[i]; + if (slab->size < size) + continue; + + // Attempt a best fit search + int best = -1; + for (int n = 0; n < slab->num_regions; n++) { + struct vk_region r = slab->regions[n]; + if (!region_fits(r, size, align)) + continue; + if (best >= 0 && region_len(r) > region_len(slab->regions[best])) + continue; + best = n; + } + + if (best >= 0) { + *out_slab = slab; + *out_index = best; + return true; + } + } + + // Otherwise, allocate a new vk_slab and append it to the list. + size_t cur_size = MPMAX(size, slab ? slab->size : 0); + size_t slab_size = MPVK_HEAP_SLAB_GROWTH_RATE * cur_size; + slab_size = MPMAX(MPVK_HEAP_MINIMUM_SLAB_SIZE, slab_size); + slab_size = MPMIN(MPVK_HEAP_MAXIMUM_SLAB_SIZE, slab_size); + assert(slab_size >= size); + slab = slab_alloc(vk, heap, slab_size); + if (!slab) + return false; + MP_TARRAY_APPEND(NULL, heap->slabs, heap->num_slabs, slab); + + // Return the only region there is in a newly allocated slab + assert(slab->num_regions == 1); + *out_slab = slab; + *out_index = 0; + return true; +} + +static bool slice_heap(struct mpvk_ctx *vk, struct vk_heap *heap, size_t size, + size_t alignment, struct vk_memslice *out) +{ + struct vk_slab *slab; + int index; + alignment = MP_ALIGN_UP(alignment, vk->limits.bufferImageGranularity); + if (!heap_get_region(vk, heap, size, alignment, &slab, &index)) + return false; + + struct vk_region reg = slab->regions[index]; + MP_TARRAY_REMOVE_AT(slab->regions, slab->num_regions, index); + *out = (struct vk_memslice) { + .vkmem = slab->mem, + .offset = MP_ALIGN_UP(reg.start, alignment), + .size = size, + .priv = slab, + }; + + MP_DBG(vk, "Sub-allocating slice %zu + %zu from slab with size %zu\n", + out->offset, out->size, slab->size); + + size_t out_end = out->offset + out->size; + insert_region(slab, (struct vk_region) { reg.start, out->offset }); + insert_region(slab, (struct vk_region) { out_end, reg.end }); + + slab->used += size; + return true; +} + +bool vk_malloc_generic(struct mpvk_ctx *vk, VkMemoryRequirements reqs, + VkMemoryPropertyFlagBits flags, struct vk_memslice *out) +{ + struct vk_heap *heap = find_heap(vk, 0, flags, &reqs); + return slice_heap(vk, heap, reqs.size, reqs.alignment, out); +} + +bool vk_malloc_buffer(struct mpvk_ctx *vk, VkBufferUsageFlagBits bufFlags, + VkMemoryPropertyFlagBits memFlags, VkDeviceSize size, + VkDeviceSize alignment, struct vk_bufslice *out) +{ + struct vk_heap *heap = find_heap(vk, bufFlags, memFlags, NULL); + if (!slice_heap(vk, heap, size, alignment, &out->mem)) + return false; + + struct vk_slab *slab = out->mem.priv; + out->buf = slab->buffer; + if (slab->data) + out->data = (void *)((uintptr_t)slab->data + (ptrdiff_t)out->mem.offset); + + return true; +} diff --git a/video/out/vulkan/malloc.h b/video/out/vulkan/malloc.h new file mode 100644 index 0000000000..65c1036929 --- /dev/null +++ b/video/out/vulkan/malloc.h @@ -0,0 +1,35 @@ +#pragma once + +#include "common.h" + +void vk_malloc_init(struct mpvk_ctx *vk); +void vk_malloc_uninit(struct mpvk_ctx *vk); + +// Represents a single "slice" of generic 
(non-buffer) memory, plus some +// metadata for accounting. This struct is essentially read-only. +struct vk_memslice { + VkDeviceMemory vkmem; + size_t offset; + size_t size; + void *priv; +}; + +void vk_free_memslice(struct mpvk_ctx *vk, struct vk_memslice slice); +bool vk_malloc_generic(struct mpvk_ctx *vk, VkMemoryRequirements reqs, + VkMemoryPropertyFlagBits flags, struct vk_memslice *out); + +// Represents a single "slice" of a larger buffer +struct vk_bufslice { + struct vk_memslice mem; // must be freed by the user when done + VkBuffer buf; // the buffer this memory was sliced from + // For persistently mapped buffers, this points to the first usable byte of + // this slice. + void *data; +}; + +// Allocate a buffer slice. This is more efficient than vk_malloc_generic for +// when the user needs lots of buffers, since it doesn't require +// creating/destroying lots of (little) VkBuffers. +bool vk_malloc_buffer(struct mpvk_ctx *vk, VkBufferUsageFlagBits bufFlags, + VkMemoryPropertyFlagBits memFlags, VkDeviceSize size, + VkDeviceSize alignment, struct vk_bufslice *out); diff --git a/video/out/vulkan/ra_vk.c b/video/out/vulkan/ra_vk.c new file mode 100644 index 0000000000..ce0cbc66e9 --- /dev/null +++ b/video/out/vulkan/ra_vk.c @@ -0,0 +1,1590 @@ +#include "ra_vk.h" +#include "malloc.h" +#include "video/out/opengl/utils.h" + +static struct ra_fns ra_fns_vk; + +// For ra.priv +struct ra_vk { + struct mpvk_ctx *vk; + struct ra_tex *clear_tex; // stupid hack for clear() + struct vk_cmd *cmd; // currently recording cmd +}; + +struct mpvk_ctx *ra_vk_get(struct ra *ra) +{ + if (ra->fns != &ra_fns_vk) + return NULL; + + struct ra_vk *p = ra->priv; + return p->vk; +} + +// Returns a command buffer, or NULL on error +static struct vk_cmd *vk_require_cmd(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = ra_vk_get(ra); + + if (!p->cmd) + p->cmd = vk_cmd_begin(vk, vk->pool); + + return p->cmd; +} + +// Note: This technically follows the flush() API, but we don't need +// to expose that (and in fact, it's a bad idea) since we control flushing +// behavior with ra_vk_present_frame already. 
+static bool vk_flush(struct ra *ra, VkSemaphore *done) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = ra_vk_get(ra); + + if (p->cmd) { + if (!vk_cmd_submit(vk, p->cmd, done)) + return false; + p->cmd = NULL; + } + + return true; +} + +// The callback's *priv will always be set to `ra` +static void vk_callback(struct ra *ra, vk_cb callback, void *arg) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = ra_vk_get(ra); + + if (p->cmd) { + vk_cmd_callback(p->cmd, callback, ra, arg); + } else { + vk_dev_callback(vk, callback, ra, arg); + } +} + +#define MAKE_LAZY_DESTRUCTOR(fun, argtype) \ + static void fun##_lazy(struct ra *ra, argtype *arg) { \ + vk_callback(ra, (vk_cb) fun, arg); \ + } + +static void vk_destroy_ra(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = ra_vk_get(ra); + + vk_flush(ra, NULL); + mpvk_dev_wait_idle(vk); + ra_tex_free(ra, &p->clear_tex); + + talloc_free(ra); +} + +static bool vk_setup_formats(struct ra *ra) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + for (const struct vk_format *vk_fmt = vk_formats; vk_fmt->name; vk_fmt++) { + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->iformat, &prop); + + // As a bare minimum, we need to sample from an allocated image + VkFormatFeatureFlags flags = prop.optimalTilingFeatures; + if (!(flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) + continue; + + VkFormatFeatureFlags linear_bits, render_bits; + linear_bits = VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + render_bits = VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; + + struct ra_format *fmt = talloc_zero(ra, struct ra_format); + *fmt = (struct ra_format) { + .name = vk_fmt->name, + .priv = (void *)vk_fmt, + .ctype = vk_fmt->ctype, + .ordered = !vk_fmt->fucked_order, + .num_components = vk_fmt->components, + .pixel_size = vk_fmt->bytes, + .linear_filter = !!(flags & linear_bits), + .renderable = !!(flags & render_bits), + }; + + for (int i = 0; i < 4; i++) + fmt->component_size[i] = fmt->component_depth[i] = vk_fmt->bits[i]; + + MP_TARRAY_APPEND(ra, ra->formats, ra->num_formats, fmt); + } + + // Populate some other capabilities related to formats while we're at it + VkImageType imgType[3] = { + VK_IMAGE_TYPE_1D, + VK_IMAGE_TYPE_2D, + VK_IMAGE_TYPE_3D + }; + + // R8_UNORM is supported on literally every single vulkan implementation + const VkFormat testfmt = VK_FORMAT_R8_UNORM; + + for (int d = 0; d < 3; d++) { + VkImageFormatProperties iprop; + VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd, + testfmt, imgType[d], VK_IMAGE_TILING_OPTIMAL, + VK_IMAGE_USAGE_SAMPLED_BIT, 0, &iprop); + + switch (imgType[d]) { + case VK_IMAGE_TYPE_1D: + if (res == VK_SUCCESS) + ra->caps |= RA_CAP_TEX_1D; + break; + case VK_IMAGE_TYPE_2D: + // 2D formats must be supported by RA, so ensure this is the case + VK_ASSERT(res, "Querying 2D format limits"); + ra->max_texture_wh = MPMIN(iprop.maxExtent.width, iprop.maxExtent.height); + break; + case VK_IMAGE_TYPE_3D: + if (res == VK_SUCCESS) + ra->caps |= RA_CAP_TEX_3D; + break; + } + } + + // RA_CAP_BLIT implies both blitting between images as well as blitting + // directly to the swapchain image, so check for all three operations + bool blittable = true; + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, testfmt, &prop); + if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_SRC_BIT)) + blittable = false; + if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT)) + blittable = 
false; + + vkGetPhysicalDeviceFormatProperties(vk->physd, vk->surf_format.format, &prop); + if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT)) + blittable = false; + + if (blittable) + ra->caps |= RA_CAP_BLIT; + + return true; + +error: + return false; +} + +static struct ra_fns ra_fns_vk; + +struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log) +{ + assert(vk->dev); + assert(vk->alloc); + + struct ra *ra = talloc_zero(NULL, struct ra); + ra->log = log; + ra->fns = &ra_fns_vk; + + struct ra_vk *p = ra->priv = talloc_zero(ra, struct ra_vk); + p->vk = vk; + + // There's no way to query the supported GLSL version from VK_NV_glsl_shader + // (thanks nvidia), so just pick the GL version that modern nvidia devices + // support.. + ra->glsl_version = 450; + ra->glsl_vulkan = true; + ra->max_shmem = vk->limits.maxComputeSharedMemorySize; + ra->caps = RA_CAP_NESTED_ARRAY; + + if (vk->pool->props.queueFlags & VK_QUEUE_COMPUTE_BIT) + ra->caps |= RA_CAP_COMPUTE; + + if (!vk_setup_formats(ra)) + goto error; + + // UBO support is required + ra->caps |= RA_CAP_BUF_RO; + + // Try creating a shader storage buffer + struct ra_buf_params ssbo_params = { + .type = RA_BUF_TYPE_SHADER_STORAGE, + .size = 16, + }; + + struct ra_buf *ssbo = ra_buf_create(ra, &ssbo_params); + if (ssbo) { + ra->caps |= RA_CAP_BUF_RW; + ra_buf_free(ra, &ssbo); + } + + // To support clear() by region, we need to allocate a dummy 1x1 image that + // will be used as the source of blit operations + struct ra_tex_params clear_params = { + .dimensions = 1, // no point in using a 2D image if height = 1 + .w = 1, + .h = 1, + .d = 1, + .format = ra_find_float16_format(ra, 4), + .blit_src = 1, + .host_mutable = 1, + }; + + p->clear_tex = ra_tex_create(ra, &clear_params); + if (!p->clear_tex) { + MP_ERR(ra, "Failed creating 1x1 dummy texture for clear()!\n"); + goto error; + } + + return ra; + +error: + vk_destroy_ra(ra); + return NULL; +} + +// Boilerplate wrapper around vkCreateRenderPass to ensure passes remain +// compatible +static VkResult vk_create_render_pass(VkDevice dev, const struct ra_format *fmt, + bool load_fbo, VkRenderPass *out) +{ + struct vk_format *vk_fmt = fmt->priv; + assert(fmt->renderable); + + VkRenderPassCreateInfo rinfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = vk_fmt->iformat, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = load_fbo ? 
VK_ATTACHMENT_LOAD_OP_LOAD + : VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .colorAttachmentCount = 1, + .pColorAttachments = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + }, + }; + + return vkCreateRenderPass(dev, &rinfo, MPVK_ALLOCATOR, out); +} + +// For ra_tex.priv +struct ra_tex_vk { + bool external_img; + VkImageType type; + VkImage img; + struct vk_memslice mem; + // for sampling + VkImageView view; + VkSampler sampler; + // for rendering + VkFramebuffer framebuffer; + VkRenderPass dummyPass; + // for uploading + struct ra_buf_pool pbo; + // "current" metadata, can change during the course of execution + VkImageLayout current_layout; + VkPipelineStageFlagBits current_stage; + VkAccessFlagBits current_access; +}; + +// Small helper to ease image barrier creation. if `discard` is set, the contents +// of the image will be undefined after the barrier +static void tex_barrier(struct vk_cmd *cmd, struct ra_tex_vk *tex_vk, + VkPipelineStageFlagBits newStage, + VkAccessFlagBits newAccess, VkImageLayout newLayout, + bool discard) +{ + VkImageMemoryBarrier imgBarrier = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .oldLayout = tex_vk->current_layout, + .newLayout = newLayout, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .srcAccessMask = tex_vk->current_access, + .dstAccessMask = newAccess, + .image = tex_vk->img, + .subresourceRange = vk_range, + }; + + if (discard) { + imgBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imgBarrier.srcAccessMask = 0; + } + + if (imgBarrier.oldLayout != imgBarrier.newLayout || + imgBarrier.srcAccessMask != imgBarrier.dstAccessMask) + { + vkCmdPipelineBarrier(cmd->buf, tex_vk->current_stage, newStage, 0, + 0, NULL, 0, NULL, 1, &imgBarrier); + } + + tex_vk->current_stage = newStage; + tex_vk->current_layout = newLayout; + tex_vk->current_access = newAccess; +} + +static void vk_tex_destroy(struct ra *ra, struct ra_tex *tex) +{ + if (!tex) + return; + + struct mpvk_ctx *vk = ra_vk_get(ra); + struct ra_tex_vk *tex_vk = tex->priv; + + ra_buf_pool_uninit(ra, &tex_vk->pbo); + vkDestroyFramebuffer(vk->dev, tex_vk->framebuffer, MPVK_ALLOCATOR); + vkDestroyRenderPass(vk->dev, tex_vk->dummyPass, MPVK_ALLOCATOR); + vkDestroySampler(vk->dev, tex_vk->sampler, MPVK_ALLOCATOR); + vkDestroyImageView(vk->dev, tex_vk->view, MPVK_ALLOCATOR); + if (!tex_vk->external_img) { + vkDestroyImage(vk->dev, tex_vk->img, MPVK_ALLOCATOR); + vk_free_memslice(vk, tex_vk->mem); + } + + talloc_free(tex); +} + +MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, struct ra_tex); + +// Initializes non-VkImage values like the image view, samplers, etc. 
+static bool vk_init_image(struct ra *ra, struct ra_tex *tex) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + struct ra_tex_params *params = &tex->params; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex_vk->img); + + tex_vk->current_layout = VK_IMAGE_LAYOUT_UNDEFINED; + tex_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + tex_vk->current_access = 0; + + if (params->render_src || params->render_dst) { + static const VkImageViewType viewType[] = { + [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D, + [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D, + [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D, + }; + + const struct vk_format *fmt = params->format->priv; + VkImageViewCreateInfo vinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = tex_vk->img, + .viewType = viewType[tex_vk->type], + .format = fmt->iformat, + .subresourceRange = vk_range, + }; + + VK(vkCreateImageView(vk->dev, &vinfo, MPVK_ALLOCATOR, &tex_vk->view)); + } + + if (params->render_src) { + assert(params->format->linear_filter || !params->src_linear); + VkFilter filter = params->src_linear + ? VK_FILTER_LINEAR + : VK_FILTER_NEAREST; + VkSamplerAddressMode wrap = params->src_repeat + ? VK_SAMPLER_ADDRESS_MODE_REPEAT + : VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; + VkSamplerCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .magFilter = filter, + .minFilter = filter, + .addressModeU = wrap, + .addressModeV = wrap, + .addressModeW = wrap, + .maxAnisotropy = 1.0, + }; + + VK(vkCreateSampler(vk->dev, &sinfo, MPVK_ALLOCATOR, &tex_vk->sampler)); + } + + if (params->render_dst) { + // Framebuffers need to be created against a specific render pass + // layout, so we need to temporarily create a skeleton/dummy render + // pass for vulkan to figure out the compatibility + VK(vk_create_render_pass(vk->dev, params->format, false, &tex_vk->dummyPass)); + + VkFramebufferCreateInfo finfo = { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .renderPass = tex_vk->dummyPass, + .attachmentCount = 1, + .pAttachments = &tex_vk->view, + .width = tex->params.w, + .height = tex->params.h, + .layers = 1, + }; + + VK(vkCreateFramebuffer(vk->dev, &finfo, MPVK_ALLOCATOR, + &tex_vk->framebuffer)); + + // NOTE: Normally we would free the dummyPass again here, but a bug + // in the nvidia vulkan driver causes a segfault if you do. 
+ } + + return true; + +error: + return false; +} + +static struct ra_tex *vk_tex_create(struct ra *ra, + const struct ra_tex_params *params) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + struct ra_tex *tex = talloc_zero(NULL, struct ra_tex); + tex->params = *params; + tex->params.initial_data = NULL; + + struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk); + + const struct vk_format *fmt = params->format->priv; + switch (params->dimensions) { + case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break; + case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break; + case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break; + default: abort(); + } + + VkImageUsageFlags usage = 0; + if (params->render_src) + usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (params->render_dst) + usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + if (params->storage_dst) + usage |= VK_IMAGE_USAGE_STORAGE_BIT; + if (params->blit_src) + usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + if (params->host_mutable || params->blit_dst || params->initial_data) + usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT; + + // Double-check image usage support and fail immediately if invalid + VkImageFormatProperties iprop; + VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd, + fmt->iformat, tex_vk->type, VK_IMAGE_TILING_OPTIMAL, usage, 0, + &iprop); + if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) { + return NULL; + } else { + VK_ASSERT(res, "Querying image format properties"); + } + + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop); + VkFormatFeatureFlags flags = prop.optimalTilingFeatures; + + bool has_blit_src = flags & VK_FORMAT_FEATURE_BLIT_SRC_BIT, + has_src_linear = flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + + if (params->w > iprop.maxExtent.width || + params->h > iprop.maxExtent.height || + params->d > iprop.maxExtent.depth || + (params->blit_src && !has_blit_src) || + (params->src_linear && !has_src_linear)) + { + return NULL; + } + + VkImageCreateInfo iinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = tex_vk->type, + .format = fmt->iformat, + .extent = (VkExtent3D) { params->w, params->h, params->d }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = usage, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 1, + .pQueueFamilyIndices = &vk->pool->qf, + }; + + VK(vkCreateImage(vk->dev, &iinfo, MPVK_ALLOCATOR, &tex_vk->img)); + + VkMemoryPropertyFlagBits memFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + VkMemoryRequirements reqs; + vkGetImageMemoryRequirements(vk->dev, tex_vk->img, &reqs); + + struct vk_memslice *mem = &tex_vk->mem; + if (!vk_malloc_generic(vk, reqs, memFlags, mem)) + goto error; + + VK(vkBindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset)); + + if (!vk_init_image(ra, tex)) + goto error; + + if (params->initial_data) { + struct ra_tex_upload_params ul_params = { + .tex = tex, + .invalidate = true, + .src = params->initial_data, + .stride = params->w * fmt->bytes, + }; + if (!ra->fns->tex_upload(ra, &ul_params)) + goto error; + } + + return tex; + +error: + vk_tex_destroy(ra, tex); + return NULL; +} + +struct ra_tex *ra_vk_wrap_swapchain_img(struct ra *ra, VkImage vkimg, + VkSwapchainCreateInfoKHR info) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + struct ra_tex *tex = NULL; + + const struct ra_format *format = NULL; + for (int i = 0; i < ra->num_formats; i++) { + const struct vk_format *fmt = 
ra->formats[i]->priv; + if (fmt->iformat == vk->surf_format.format) { + format = ra->formats[i]; + break; + } + } + + if (!format) { + MP_ERR(ra, "Could not find ra_format suitable for wrapped swchain image " + "with surface format 0x%x\n", vk->surf_format.format); + goto error; + } + + tex = talloc_zero(NULL, struct ra_tex); + tex->params = (struct ra_tex_params) { + .format = format, + .dimensions = 2, + .w = info.imageExtent.width, + .h = info.imageExtent.height, + .d = 1, + .blit_src = !!(info.imageUsage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT), + .blit_dst = !!(info.imageUsage & VK_IMAGE_USAGE_TRANSFER_DST_BIT), + .render_src = !!(info.imageUsage & VK_IMAGE_USAGE_SAMPLED_BIT), + .render_dst = !!(info.imageUsage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT), + .storage_dst = !!(info.imageUsage & VK_IMAGE_USAGE_STORAGE_BIT), + }; + + struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk); + tex_vk->type = VK_IMAGE_TYPE_2D; + tex_vk->external_img = true; + tex_vk->img = vkimg; + + if (!vk_init_image(ra, tex)) + goto error; + + return tex; + +error: + vk_tex_destroy(ra, tex); + return NULL; +} + +// For ra_buf.priv +struct ra_buf_vk { + struct vk_bufslice slice; + int refcount; // 1 = object allocated but not in use, > 1 = in use + bool needsflush; + // "current" metadata, can change during course of execution + VkPipelineStageFlagBits current_stage; + VkAccessFlagBits current_access; +}; + +static void vk_buf_deref(struct ra *ra, struct ra_buf *buf) +{ + if (!buf) + return; + + struct mpvk_ctx *vk = ra_vk_get(ra); + struct ra_buf_vk *buf_vk = buf->priv; + + if (--buf_vk->refcount == 0) { + vk_free_memslice(vk, buf_vk->slice.mem); + talloc_free(buf); + } +} + +static void buf_barrier(struct ra *ra, struct vk_cmd *cmd, struct ra_buf *buf, + VkPipelineStageFlagBits newStage, + VkAccessFlagBits newAccess, int offset, size_t size) +{ + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferMemoryBarrier buffBarrier = { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .srcAccessMask = buf_vk->current_access, + .dstAccessMask = newAccess, + .buffer = buf_vk->slice.buf, + .offset = offset, + .size = size, + }; + + if (buf_vk->needsflush || buf->params.host_mapped) { + buffBarrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + buf_vk->current_stage = VK_PIPELINE_STAGE_HOST_BIT; + buf_vk->needsflush = false; + } + + if (buffBarrier.srcAccessMask != buffBarrier.dstAccessMask) { + vkCmdPipelineBarrier(cmd->buf, buf_vk->current_stage, newStage, 0, + 0, NULL, 1, &buffBarrier, 0, NULL); + } + + buf_vk->current_stage = newStage; + buf_vk->current_access = newAccess; + buf_vk->refcount++; + vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, ra, buf); +} + +#define vk_buf_destroy vk_buf_deref +MAKE_LAZY_DESTRUCTOR(vk_buf_destroy, struct ra_buf); + +static void vk_buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, + const void *data, size_t size) +{ + assert(buf->params.host_mutable || buf->params.initial_data); + struct ra_buf_vk *buf_vk = buf->priv; + + // For host-mapped buffers, we can just directly memcpy the buffer contents. + // Otherwise, we can update the buffer from the GPU using a command buffer. 
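+    // Note: vkCmdUpdateBuffer requires the destination offset (and size) to
+    // be a multiple of 4, which is why vk_buf_create always allocates slices
+    // with at least 4-byte alignment. The host-mapped path instead defers
+    // visibility to the next buf_barrier, via the needsflush flag.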
+ if (buf_vk->slice.data) { + assert(offset + size <= buf->params.size); + uintptr_t addr = (uintptr_t)buf_vk->slice.data + offset; + memcpy((void *)addr, data, size); + buf_vk->needsflush = true; + } else { + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) { + MP_ERR(ra, "Failed updating buffer!\n"); + return; + } + + buf_barrier(ra, cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, offset, size); + + VkDeviceSize bufOffset = buf_vk->slice.mem.offset + offset; + assert(bufOffset == MP_ALIGN_UP(bufOffset, 4)); + vkCmdUpdateBuffer(cmd->buf, buf_vk->slice.buf, bufOffset, size, data); + } +} + +static struct ra_buf *vk_buf_create(struct ra *ra, + const struct ra_buf_params *params) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + struct ra_buf *buf = talloc_zero(NULL, struct ra_buf); + buf->params = *params; + + struct ra_buf_vk *buf_vk = buf->priv = talloc_zero(buf, struct ra_buf_vk); + buf_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + buf_vk->current_access = 0; + buf_vk->refcount = 1; + + VkBufferUsageFlagBits bufFlags = 0; + VkMemoryPropertyFlagBits memFlags = 0; + VkDeviceSize align = 4; // alignment 4 is needed for buf_update + + switch (params->type) { + case RA_BUF_TYPE_TEX_UPLOAD: + bufFlags |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + break; + case RA_BUF_TYPE_UNIFORM: + bufFlags |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; + memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + align = MP_ALIGN_UP(align, vk->limits.minUniformBufferOffsetAlignment); + break; + case RA_BUF_TYPE_SHADER_STORAGE: + bufFlags |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + align = MP_ALIGN_UP(align, vk->limits.minStorageBufferOffsetAlignment); + break; + case RA_BUF_TYPE_VERTEX: + bufFlags |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT; + memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; + default: abort(); + } + + if (params->host_mutable || params->initial_data) { + bufFlags |= VK_BUFFER_USAGE_TRANSFER_DST_BIT; + align = MP_ALIGN_UP(align, vk->limits.optimalBufferCopyOffsetAlignment); + } + + if (params->host_mapped) { + memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + } + + if (!vk_malloc_buffer(vk, bufFlags, memFlags, params->size, align, + &buf_vk->slice)) + { + goto error; + } + + if (params->host_mapped) + buf->data = buf_vk->slice.data; + + if (params->initial_data) + vk_buf_update(ra, buf, 0, params->initial_data, params->size); + + buf->params.initial_data = NULL; // do this after vk_buf_update + return buf; + +error: + vk_buf_destroy(ra, buf); + return NULL; +} + +static bool vk_buf_poll(struct ra *ra, struct ra_buf *buf) +{ + struct ra_buf_vk *buf_vk = buf->priv; + return buf_vk->refcount == 1; +} + +static bool vk_tex_upload(struct ra *ra, + const struct ra_tex_upload_params *params) +{ + struct ra_tex *tex = params->tex; + struct ra_tex_vk *tex_vk = tex->priv; + + if (!params->buf) + return ra_tex_upload_pbo(ra, &tex_vk->pbo, params); + + assert(!params->src); + assert(params->buf); + struct ra_buf *buf = params->buf; + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferImageCopy region = { + .bufferOffset = buf_vk->slice.mem.offset + params->buf_offset, + .bufferRowLength = tex->params.w, + .bufferImageHeight = tex->params.h, + .imageSubresource = vk_layers, + .imageExtent = (VkExtent3D){tex->params.w, tex->params.h, tex->params.d}, + }; + + if (tex->params.dimensions == 2) { + int pix_size = 
tex->params.format->pixel_size; + region.bufferRowLength = params->stride / pix_size; + if (region.bufferRowLength * pix_size != params->stride) { + MP_ERR(ra, "Texture upload strides must be a multiple of the texel " + "size!\n"); + goto error; + } + + if (params->rc) { + struct mp_rect *rc = params->rc; + region.imageOffset = (VkOffset3D){rc->x0, rc->y0, 0}; + region.imageExtent = (VkExtent3D){mp_rect_w(*rc), mp_rect_h(*rc), 1}; + } + } + + uint64_t size = region.bufferRowLength * region.bufferImageHeight * + region.imageExtent.depth; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + goto error; + + buf_barrier(ra, cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_READ_BIT, region.bufferOffset, size); + + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + params->invalidate); + + vkCmdCopyBufferToImage(cmd->buf, buf_vk->slice.buf, tex_vk->img, + tex_vk->current_layout, 1, ®ion); + + return true; + +error: + return false; +} + +#define MPVK_NUM_DS MPVK_MAX_STREAMING_DEPTH + +// For ra_renderpass.priv +struct ra_renderpass_vk { + // Compiled shaders + VkShaderModule vert; + VkShaderModule frag; + VkShaderModule comp; + // Pipeline / render pass + VkPipeline pipe; + VkPipelineLayout pipeLayout; + VkPipelineCache pipeCache; + VkRenderPass renderPass; + // Descriptor set (bindings) + VkDescriptorSetLayout dsLayout; + VkDescriptorPool dsPool; + VkDescriptorSet dss[MPVK_NUM_DS]; + int dindex; + // Vertex buffers (vertices) + struct ra_buf_pool vbo; + + // For updating + VkWriteDescriptorSet *dswrite; + VkDescriptorImageInfo *dsiinfo; + VkDescriptorBufferInfo *dsbinfo; +}; + +static void vk_renderpass_destroy(struct ra *ra, struct ra_renderpass *pass) +{ + if (!pass) + return; + + struct mpvk_ctx *vk = ra_vk_get(ra); + struct ra_renderpass_vk *pass_vk = pass->priv; + + ra_buf_pool_uninit(ra, &pass_vk->vbo); + vkDestroyPipeline(vk->dev, pass_vk->pipe, MPVK_ALLOCATOR); + vkDestroyPipelineCache(vk->dev, pass_vk->pipeCache, MPVK_ALLOCATOR); + vkDestroyRenderPass(vk->dev, pass_vk->renderPass, MPVK_ALLOCATOR); + vkDestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, MPVK_ALLOCATOR); + vkDestroyDescriptorPool(vk->dev, pass_vk->dsPool, MPVK_ALLOCATOR); + vkDestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, pass_vk->vert, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, pass_vk->frag, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, pass_vk->comp, MPVK_ALLOCATOR); + + talloc_free(pass); +} + +MAKE_LAZY_DESTRUCTOR(vk_renderpass_destroy, struct ra_renderpass); + +static const VkDescriptorType dsType[] = { + [RA_VARTYPE_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + [RA_VARTYPE_IMG_W] = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + [RA_VARTYPE_BUF_RO] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + [RA_VARTYPE_BUF_RW] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, +}; + +static bool vk_get_input_format(struct ra *ra, struct ra_renderpass_input *inp, + VkFormat *out_fmt) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + enum ra_ctype ctype; + switch (inp->type) { + case RA_VARTYPE_FLOAT: ctype = RA_CTYPE_FLOAT; break; + case RA_VARTYPE_BYTE_UNORM: ctype = RA_CTYPE_UNORM; break; + default: abort(); + } + + assert(inp->dim_m == 1); + for (const struct vk_format *fmt = vk_formats; fmt->name; fmt++) { + if (fmt->ctype != ctype) + continue; + if (fmt->components != inp->dim_v) + continue; + if (fmt->bytes != ra_renderpass_input_layout(inp).size) + continue; + + // Ensure this format is 
valid for vertex attributes + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop); + if (!(prop.bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT)) + continue; + + *out_fmt = fmt->iformat; + return true; + } + + return false; +} + +static const VkPipelineStageFlagBits stageFlags[] = { + [RA_RENDERPASS_TYPE_RASTER] = VK_SHADER_STAGE_FRAGMENT_BIT, + [RA_RENDERPASS_TYPE_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT, +}; + +static struct ra_renderpass *vk_renderpass_create(struct ra *ra, + const struct ra_renderpass_params *params) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + struct ra_renderpass *pass = talloc_zero(NULL, struct ra_renderpass); + pass->params = *ra_renderpass_params_copy(pass, params); + pass->params.cached_program = (bstr){0}; + struct ra_renderpass_vk *pass_vk = pass->priv = + talloc_zero(pass, struct ra_renderpass_vk); + + static int dsCount[RA_VARTYPE_COUNT] = {0}; + VkDescriptorSetLayoutBinding *bindings = NULL; + int num_bindings = 0; + + for (int i = 0; i < params->num_inputs; i++) { + struct ra_renderpass_input *inp = ¶ms->inputs[i]; + switch (inp->type) { + case RA_VARTYPE_TEX: + case RA_VARTYPE_IMG_W: + case RA_VARTYPE_BUF_RO: + case RA_VARTYPE_BUF_RW: { + VkDescriptorSetLayoutBinding desc = { + .binding = inp->binding, + .descriptorType = dsType[inp->type], + .descriptorCount = 1, + .stageFlags = stageFlags[params->type], + }; + + MP_TARRAY_APPEND(pass, bindings, num_bindings, desc); + dsCount[inp->type]++; + break; + } + default: abort(); + } + } + + VkDescriptorPoolSize *dsPoolSizes = NULL; + int poolSizeCount = 0; + for (enum ra_vartype t = 0; t < RA_VARTYPE_COUNT; t++) { + if (dsCount[t] > 0) { + VkDescriptorPoolSize dssize = { + .type = dsType[t], + .descriptorCount = dsCount[t] * MPVK_NUM_DS, + }; + + MP_TARRAY_APPEND(pass, dsPoolSizes, poolSizeCount, dssize); + } + } + + VkDescriptorPoolCreateInfo pinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .maxSets = MPVK_NUM_DS, + .pPoolSizes = dsPoolSizes, + .poolSizeCount = poolSizeCount, + }; + + VK(vkCreateDescriptorPool(vk->dev, &pinfo, MPVK_ALLOCATOR, &pass_vk->dsPool)); + talloc_free(dsPoolSizes); + + pass_vk->dswrite = talloc_array(pass, VkWriteDescriptorSet, num_bindings); + pass_vk->dsiinfo = talloc_array(pass, VkDescriptorImageInfo, num_bindings); + pass_vk->dsbinfo = talloc_array(pass, VkDescriptorBufferInfo, num_bindings); + + VkDescriptorSetLayoutCreateInfo dinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pBindings = bindings, + .bindingCount = num_bindings, + }; + + VK(vkCreateDescriptorSetLayout(vk->dev, &dinfo, MPVK_ALLOCATOR, + &pass_vk->dsLayout)); + + VkDescriptorSetLayout layouts[MPVK_NUM_DS]; + for (int i = 0; i < MPVK_NUM_DS; i++) + layouts[i] = pass_vk->dsLayout; + + VkDescriptorSetAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = pass_vk->dsPool, + .descriptorSetCount = MPVK_NUM_DS, + .pSetLayouts = layouts, + }; + + VK(vkAllocateDescriptorSets(vk->dev, &ainfo, pass_vk->dss)); + + VkPipelineLayoutCreateInfo linfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &pass_vk->dsLayout, + }; + + VK(vkCreatePipelineLayout(vk->dev, &linfo, MPVK_ALLOCATOR, + &pass_vk->pipeLayout)); + + VkPipelineCacheCreateInfo pcinfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, + .pInitialData = params->cached_program.start, + .initialDataSize = params->cached_program.len, + }; + + 
VK(vkCreatePipelineCache(vk->dev, &pcinfo, MPVK_ALLOCATOR, &pass_vk->pipeCache)); + + VkShaderModuleCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + }; + + switch (params->type) { + case RA_RENDERPASS_TYPE_RASTER: { + sinfo.pCode = (uint32_t *)params->vertex_shader; + sinfo.codeSize = strlen(params->vertex_shader); + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &pass_vk->vert)); + + sinfo.pCode = (uint32_t *)params->frag_shader; + sinfo.codeSize = strlen(params->frag_shader); + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &pass_vk->frag)); + + VK(vk_create_render_pass(vk->dev, params->target_format, + params->enable_blend, &pass_vk->renderPass)); + + VkPipelineShaderStageCreateInfo stages[] = { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = pass_vk->vert, + .pName = "main", + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = pass_vk->frag, + .pName = "main", + } + }; + + VkVertexInputAttributeDescription *attrs = talloc_array(pass, + VkVertexInputAttributeDescription, params->num_vertex_attribs); + + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct ra_renderpass_input *inp = ¶ms->vertex_attribs[i]; + attrs[i] = (VkVertexInputAttributeDescription) { + .location = i, + .binding = 0, + .offset = inp->offset, + }; + + if (!vk_get_input_format(ra, inp, &attrs[i].format)) { + MP_ERR(ra, "No suitable VkFormat for vertex attrib '%s'!\n", + inp->name); + goto error; + } + } + + static const VkBlendFactor blendFactors[] = { + [RA_BLEND_ZERO] = VK_BLEND_FACTOR_ZERO, + [RA_BLEND_ONE] = VK_BLEND_FACTOR_ONE, + [RA_BLEND_SRC_ALPHA] = VK_BLEND_FACTOR_SRC_ALPHA, + [RA_BLEND_ONE_MINUS_SRC_ALPHA] = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + }; + + VkPipelineColorBlendAttachmentState binfo = { + .blendEnable = params->enable_blend, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcColorBlendFactor = blendFactors[params->blend_src_rgb], + .dstColorBlendFactor = blendFactors[params->blend_dst_rgb], + .alphaBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = blendFactors[params->blend_src_alpha], + .dstAlphaBlendFactor = blendFactors[params->blend_dst_alpha], + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | + VK_COLOR_COMPONENT_A_BIT, + }; + + VkGraphicsPipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = MP_ARRAY_SIZE(stages), + .pStages = &stages[0], + .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &(VkVertexInputBindingDescription) { + .binding = 0, + .stride = params->vertex_stride, + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX, + }, + .vertexAttributeDescriptionCount = params->num_vertex_attribs, + .pVertexAttributeDescriptions = attrs, + }, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 1, + .scissorCount = 1, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + 
.polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .lineWidth = 1.0f, + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &binfo, + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 2, + .pDynamicStates = (VkDynamicState[]){ + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + }, + }, + .layout = pass_vk->pipeLayout, + .renderPass = pass_vk->renderPass, + }; + + VK(vkCreateGraphicsPipelines(vk->dev, pass_vk->pipeCache, 1, &cinfo, + MPVK_ALLOCATOR, &pass_vk->pipe)); + break; + } + case RA_RENDERPASS_TYPE_COMPUTE: { + sinfo.pCode = (uint32_t *)params->compute_shader; + sinfo.codeSize = strlen(params->compute_shader); + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &pass_vk->comp)); + + VkComputePipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = pass_vk->comp, + .pName = "main", + }, + .layout = pass_vk->pipeLayout, + }; + + VK(vkCreateComputePipelines(vk->dev, pass_vk->pipeCache, 1, &cinfo, + MPVK_ALLOCATOR, &pass_vk->pipe)); + break; + } + } + + // Update cached program + bstr *prog = &pass->params.cached_program; + VK(vkGetPipelineCacheData(vk->dev, pass_vk->pipeCache, &prog->len, NULL)); + prog->start = talloc_size(pass, prog->len); + VK(vkGetPipelineCacheData(vk->dev, pass_vk->pipeCache, &prog->len, prog->start)); + + return pass; + +error: + vk_renderpass_destroy(ra, pass); + return NULL; +} + +static void vk_update_descriptor(struct ra *ra, struct vk_cmd *cmd, + struct ra_renderpass *pass, + struct ra_renderpass_input_val val, + VkDescriptorSet ds, int idx) +{ + struct ra_renderpass_vk *pass_vk = pass->priv; + struct ra_renderpass_input *inp = &pass->params.inputs[val.index]; + + VkWriteDescriptorSet *wds = &pass_vk->dswrite[idx]; + *wds = (VkWriteDescriptorSet) { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = ds, + .dstBinding = inp->binding, + .descriptorCount = 1, + .descriptorType = dsType[inp->type], + }; + + static const VkPipelineStageFlags passStages[] = { + [RA_RENDERPASS_TYPE_RASTER] = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + [RA_RENDERPASS_TYPE_COMPUTE] = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + }; + + switch (inp->type) { + case RA_VARTYPE_TEX: { + struct ra_tex *tex = *(struct ra_tex **)val.data; + struct ra_tex_vk *tex_vk = tex->priv; + + assert(tex->params.render_src); + tex_barrier(cmd, tex_vk, passStages[pass->params.type], + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, false); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .sampler = tex_vk->sampler, + .imageView = tex_vk->view, + .imageLayout = tex_vk->current_layout, + }; + + wds->pImageInfo = iinfo; + break; + } + case RA_VARTYPE_IMG_W: { + struct ra_tex *tex = *(struct ra_tex **)val.data; + struct ra_tex_vk *tex_vk = tex->priv; + + assert(tex->params.storage_dst); + tex_barrier(cmd, tex_vk, passStages[pass->params.type], + VK_ACCESS_SHADER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, false); + + VkDescriptorImageInfo *iinfo = 
&pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .imageView = tex_vk->view, + .imageLayout = tex_vk->current_layout, + }; + + wds->pImageInfo = iinfo; + break; + } + case RA_VARTYPE_BUF_RO: + case RA_VARTYPE_BUF_RW: { + struct ra_buf *buf = *(struct ra_buf **)val.data; + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferUsageFlags access = VK_ACCESS_SHADER_READ_BIT; + if (inp->type == RA_VARTYPE_BUF_RW) + access |= VK_ACCESS_SHADER_WRITE_BIT; + + buf_barrier(ra, cmd, buf, passStages[pass->params.type], + access, buf_vk->slice.mem.offset, buf->params.size); + + VkDescriptorBufferInfo *binfo = &pass_vk->dsbinfo[idx]; + *binfo = (VkDescriptorBufferInfo) { + .buffer = buf_vk->slice.buf, + .offset = buf_vk->slice.mem.offset, + .range = buf->params.size, + }; + + wds->pBufferInfo = binfo; + break; + } + } +} + +static void vk_renderpass_run(struct ra *ra, + const struct ra_renderpass_run_params *params) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + struct ra_renderpass *pass = params->pass; + struct ra_renderpass_vk *pass_vk = pass->priv; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + goto error; + + static const VkPipelineBindPoint bindPoint[] = { + [RA_RENDERPASS_TYPE_RASTER] = VK_PIPELINE_BIND_POINT_GRAPHICS, + [RA_RENDERPASS_TYPE_COMPUTE] = VK_PIPELINE_BIND_POINT_COMPUTE, + }; + + vkCmdBindPipeline(cmd->buf, bindPoint[pass->params.type], pass_vk->pipe); + + VkDescriptorSet ds = pass_vk->dss[pass_vk->dindex++]; + pass_vk->dindex %= MPVK_NUM_DS; + + for (int i = 0; i < params->num_values; i++) + vk_update_descriptor(ra, cmd, pass, params->values[i], ds, i); + + if (params->num_values > 0) { + vkUpdateDescriptorSets(vk->dev, params->num_values, pass_vk->dswrite, + 0, NULL); + } + + vkCmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type], + pass_vk->pipeLayout, 0, 1, &ds, 0, NULL); + + switch (pass->params.type) { + case RA_RENDERPASS_TYPE_COMPUTE: + vkCmdDispatch(cmd->buf, params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + break; + case RA_RENDERPASS_TYPE_RASTER: { + struct ra_tex *tex = params->target; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex->params.render_dst); + + struct ra_buf_params buf_params = { + .type = RA_BUF_TYPE_VERTEX, + .size = params->vertex_count * pass->params.vertex_stride, + .host_mutable = true, + }; + + struct ra_buf *buf = ra_buf_pool_get(ra, &pass_vk->vbo, &buf_params); + if (!buf) { + MP_ERR(ra, "Failed allocating vertex buffer!\n"); + goto error; + } + struct ra_buf_vk *buf_vk = buf->priv; + + vk_buf_update(ra, buf, 0, params->vertex_data, buf_params.size); + + buf_barrier(ra, cmd, buf, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, + buf_vk->slice.mem.offset, buf_params.size); + + vkCmdBindVertexBuffers(cmd->buf, 0, 1, &buf_vk->slice.buf, + &buf_vk->slice.mem.offset); + + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, false); + + VkViewport viewport = { + .x = params->viewport.x0, + .y = params->viewport.y0, + .width = mp_rect_w(params->viewport), + .height = mp_rect_h(params->viewport), + }; + + VkRect2D scissor = { + .offset = {params->scissors.x0, params->scissors.y0}, + .extent = {mp_rect_w(params->scissors), mp_rect_h(params->scissors)}, + }; + + vkCmdSetViewport(cmd->buf, 0, 1, &viewport); + vkCmdSetScissor(cmd->buf, 0, 1, &scissor); + + VkRenderPassBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = 
pass_vk->renderPass, + .framebuffer = tex_vk->framebuffer, + .renderArea = (VkRect2D){{0, 0}, {tex->params.w, tex->params.h}}, + }; + + vkCmdBeginRenderPass(cmd->buf, &binfo, VK_SUBPASS_CONTENTS_INLINE); + vkCmdDraw(cmd->buf, params->vertex_count, 1, 0, 0); + vkCmdEndRenderPass(cmd->buf); + break; + } + default: abort(); + }; + +error: + return; +} + +static void vk_blit(struct ra *ra, struct ra_tex *dst, struct ra_tex *src, + struct mp_rect *dst_rc, struct mp_rect *src_rc) +{ + assert(src->params.blit_src); + assert(dst->params.blit_dst); + + struct ra_tex_vk *src_vk = src->priv; + struct ra_tex_vk *dst_vk = dst->priv; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + tex_barrier(cmd, src_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + false); + + bool discard = dst_rc->x0 == 0 && + dst_rc->y0 == 0 && + dst_rc->x1 == dst->params.w && + dst_rc->y1 == dst->params.h; + + tex_barrier(cmd, dst_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + discard); + + VkImageBlit region = { + .srcSubresource = vk_layers, + .srcOffsets = {{src_rc->x0, src_rc->y0, 0}, {src_rc->x1, src_rc->y1, 1}}, + .dstSubresource = vk_layers, + .dstOffsets = {{dst_rc->x0, dst_rc->y0, 0}, {dst_rc->x1, dst_rc->y1, 1}}, + }; + + vkCmdBlitImage(cmd->buf, src_vk->img, src_vk->current_layout, dst_vk->img, + dst_vk->current_layout, 1, ®ion, VK_FILTER_NEAREST); +} + +static void vk_clear(struct ra *ra, struct ra_tex *tex, float color[4], + struct mp_rect *rc) +{ + struct ra_vk *p = ra->priv; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex->params.blit_dst); + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + struct mp_rect full = {0, 0, tex->params.w, tex->params.h}; + if (!rc || mp_rect_equals(rc, &full)) { + // To clear the entire image, we can use the efficient clear command + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, true); + + VkClearColorValue clearColor = {0}; + for (int c = 0; c < 4; c++) + clearColor.float32[c] = color[c]; + + vkCmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->current_layout, + &clearColor, 1, &vk_range); + } else { + // To simulate per-region clearing, we blit from a 1x1 texture instead + struct ra_tex_upload_params ul_params = { + .tex = p->clear_tex, + .invalidate = true, + .src = &color[0], + }; + vk_tex_upload(ra, &ul_params); + vk_blit(ra, tex, p->clear_tex, rc, &(struct mp_rect){0, 0, 1, 1}); + } +} + +#define VK_QUERY_POOL_SIZE (MPVK_MAX_STREAMING_DEPTH * 4) + +struct vk_timer { + VkQueryPool pool; + int index; + uint64_t result; +}; + +static void vk_timer_destroy(struct ra *ra, ra_timer *ratimer) +{ + if (!ratimer) + return; + + struct mpvk_ctx *vk = ra_vk_get(ra); + struct vk_timer *timer = ratimer; + + vkDestroyQueryPool(vk->dev, timer->pool, MPVK_ALLOCATOR); + + talloc_free(timer); +} + +MAKE_LAZY_DESTRUCTOR(vk_timer_destroy, ra_timer); + +static ra_timer *vk_timer_create(struct ra *ra) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + struct vk_timer *timer = talloc_zero(NULL, struct vk_timer); + + struct VkQueryPoolCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = VK_QUERY_POOL_SIZE, + }; + + VK(vkCreateQueryPool(vk->dev, &qinfo, MPVK_ALLOCATOR, &timer->pool)); + + return (ra_timer *)timer; + +error: + vk_timer_destroy(ra, timer); + return NULL; +} + +static void 
vk_timer_record(struct ra *ra, VkQueryPool pool, int index,
+                            VkPipelineStageFlags stage)
+{
+    struct vk_cmd *cmd = vk_require_cmd(ra);
+    if (!cmd)
+        return;
+
+    vkCmdWriteTimestamp(cmd->buf, stage, pool, index);
+}
+
+static void vk_timer_start(struct ra *ra, ra_timer *ratimer)
+{
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+    struct vk_timer *timer = ratimer;
+
+    timer->index = (timer->index + 2) % VK_QUERY_POOL_SIZE;
+
+    uint64_t out[2];
+    VkResult res = vkGetQueryPoolResults(vk->dev, timer->pool, timer->index, 2,
+                                         sizeof(out), &out[0], sizeof(uint64_t),
+                                         VK_QUERY_RESULT_64_BIT);
+    switch (res) {
+    case VK_SUCCESS:
+        timer->result = (out[1] - out[0]) * vk->limits.timestampPeriod;
+        break;
+    case VK_NOT_READY:
+        timer->result = 0;
+        break;
+    default:
+        MP_WARN(vk, "Failed reading timer query result: %s\n", vk_err(res));
+        return;
+    };
+
+    vk_timer_record(ra, timer->pool, timer->index,
+                    VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
+}
+
+static uint64_t vk_timer_stop(struct ra *ra, ra_timer *ratimer)
+{
+    struct vk_timer *timer = ratimer;
+    vk_timer_record(ra, timer->pool, timer->index + 1,
+                    VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
+
+    return timer->result;
+}
+
+static struct ra_fns ra_fns_vk = {
+    .destroy = vk_destroy_ra,
+    .tex_create = vk_tex_create,
+    .tex_destroy = vk_tex_destroy_lazy,
+    .tex_upload = vk_tex_upload,
+    .buf_create = vk_buf_create,
+    .buf_destroy = vk_buf_destroy_lazy,
+    .buf_update = vk_buf_update,
+    .buf_poll = vk_buf_poll,
+    .clear = vk_clear,
+    .blit = vk_blit,
+    .uniform_layout = std140_layout,
+    .renderpass_create = vk_renderpass_create,
+    .renderpass_destroy = vk_renderpass_destroy_lazy,
+    .renderpass_run = vk_renderpass_run,
+    .timer_create = vk_timer_create,
+    .timer_destroy = vk_timer_destroy_lazy,
+    .timer_start = vk_timer_start,
+    .timer_stop = vk_timer_stop,
+};
+
+static void present_cb(void *priv, int *inflight)
+{
+    *inflight -= 1;
+}
+
+bool ra_vk_submit(struct ra *ra, struct ra_tex *tex, VkSemaphore acquired,
+                  VkSemaphore *done, int *inflight)
+{
+    struct vk_cmd *cmd = vk_require_cmd(ra);
+    if (!cmd)
+        goto error;
+
+    if (inflight) {
+        *inflight += 1;
+        vk_cmd_callback(cmd, (vk_cb)present_cb, NULL, inflight);
+    }
+
+    struct ra_tex_vk *tex_vk = tex->priv;
+    assert(tex_vk->external_img);
+    tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0,
+                VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, false);
+
+    // These are the only two stages that we use/support for actually
+    // outputting to swapchain images, so just add a dependency on both of
+    // them. In theory, we could maybe come up with some more advanced
+    // mechanism of tracking dynamic dependencies, but that seems like
+    // overkill.
+    vk_cmd_dep(cmd, acquired,
+               VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
+               VK_PIPELINE_STAGE_TRANSFER_BIT);
+
+    return vk_flush(ra, done);
+
+error:
+    return false;
+}
diff --git a/video/out/vulkan/ra_vk.h b/video/out/vulkan/ra_vk.h
new file mode 100644
index 0000000000..893421bc59
--- /dev/null
+++ b/video/out/vulkan/ra_vk.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "video/out/gpu/ra.h"
+
+#include "common.h"
+#include "utils.h"
+
+struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log);
+
+// Access to the VkDevice is needed for swapchain creation
+VkDevice ra_vk_get_dev(struct ra *ra);
+
+// Allocates a ra_tex that wraps a swapchain image. The contents of the image
+// will be invalidated, and access to it will only be internally synchronized.
+// So the calling code should not do anything else with the VkImage.
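+// The wrapped ra_tex does not take ownership of the VkImage: vk_tex_destroy
+// checks external_img and leaves the image and its memory alone.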
+struct ra_tex *ra_vk_wrap_swapchain_img(struct ra *ra, VkImage vkimg, + VkSwapchainCreateInfoKHR info); + +// This function flushes the command buffers, transitions `tex` (which must be +// a wrapped swapchain image) into a format suitable for presentation, and +// submits the current rendering commands. The indicated semaphore must fire +// before the submitted command can run. If `done` is non-NULL, it will be +// set to a semaphore that fires once the command completes. If `inflight` +// is non-NULL, it will be incremented when the command starts and decremented +// when it completes. +bool ra_vk_submit(struct ra *ra, struct ra_tex *tex, VkSemaphore acquired, + VkSemaphore *done, int *inflight); + +// May be called on a struct ra of any type. Returns NULL if the ra is not +// a vulkan ra. +struct mpvk_ctx *ra_vk_get(struct ra *ra); diff --git a/video/out/vulkan/utils.c b/video/out/vulkan/utils.c new file mode 100644 index 0000000000..43e446bc36 --- /dev/null +++ b/video/out/vulkan/utils.c @@ -0,0 +1,726 @@ +#include + +#include "utils.h" +#include "malloc.h" + +const char* vk_err(VkResult res) +{ + switch (res) { + // These are technically success codes, but include them nonetheless + case VK_SUCCESS: return "VK_SUCCESS"; + case VK_NOT_READY: return "VK_NOT_READY"; + case VK_TIMEOUT: return "VK_TIMEOUT"; + case VK_EVENT_SET: return "VK_EVENT_SET"; + case VK_EVENT_RESET: return "VK_EVENT_RESET"; + case VK_INCOMPLETE: return "VK_INCOMPLETE"; + case VK_SUBOPTIMAL_KHR: return "VK_SUBOPTIMAL_KHR"; + + // Actual error codes + case VK_ERROR_OUT_OF_HOST_MEMORY: return "VK_ERROR_OUT_OF_HOST_MEMORY"; + case VK_ERROR_OUT_OF_DEVICE_MEMORY: return "VK_ERROR_OUT_OF_DEVICE_MEMORY"; + case VK_ERROR_INITIALIZATION_FAILED: return "VK_ERROR_INITIALIZATION_FAILED"; + case VK_ERROR_DEVICE_LOST: return "VK_ERROR_DEVICE_LOST"; + case VK_ERROR_MEMORY_MAP_FAILED: return "VK_ERROR_MEMORY_MAP_FAILED"; + case VK_ERROR_LAYER_NOT_PRESENT: return "VK_ERROR_LAYER_NOT_PRESENT"; + case VK_ERROR_EXTENSION_NOT_PRESENT: return "VK_ERROR_EXTENSION_NOT_PRESENT"; + case VK_ERROR_FEATURE_NOT_PRESENT: return "VK_ERROR_FEATURE_NOT_PRESENT"; + case VK_ERROR_INCOMPATIBLE_DRIVER: return "VK_ERROR_INCOMPATIBLE_DRIVER"; + case VK_ERROR_TOO_MANY_OBJECTS: return "VK_ERROR_TOO_MANY_OBJECTS"; + case VK_ERROR_FORMAT_NOT_SUPPORTED: return "VK_ERROR_FORMAT_NOT_SUPPORTED"; + case VK_ERROR_FRAGMENTED_POOL: return "VK_ERROR_FRAGMENTED_POOL"; + case VK_ERROR_INVALID_SHADER_NV: return "VK_ERROR_INVALID_SHADER_NV"; + case VK_ERROR_OUT_OF_DATE_KHR: return "VK_ERROR_OUT_OF_DATE_KHR"; + case VK_ERROR_SURFACE_LOST_KHR: return "VK_ERROR_SURFACE_LOST_KHR"; + } + + return "Unknown error!"; +} + +static const char* vk_dbg_type(VkDebugReportObjectTypeEXT type) +{ + switch (type) { + case VK_DEBUG_REPORT_OBJECT_TYPE_INSTANCE_EXT: + return "VkInstance"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PHYSICAL_DEVICE_EXT: + return "VkPhysicalDevice"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT: + return "VkDevice"; + case VK_DEBUG_REPORT_OBJECT_TYPE_QUEUE_EXT: + return "VkQueue"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SEMAPHORE_EXT: + return "VkSemaphore"; + case VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_BUFFER_EXT: + return "VkCommandBuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_FENCE_EXT: + return "VkFence"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_MEMORY_EXT: + return "VkDeviceMemory"; + case VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_EXT: + return "VkBuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT: + return "VkImage"; + case VK_DEBUG_REPORT_OBJECT_TYPE_EVENT_EXT: + return 
"VkEvent"; + case VK_DEBUG_REPORT_OBJECT_TYPE_QUERY_POOL_EXT: + return "VkQueryPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_VIEW_EXT: + return "VkBufferView"; + case VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_VIEW_EXT: + return "VkImageView"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT: + return "VkShaderModule"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_CACHE_EXT: + return "VkPipelineCache"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_LAYOUT_EXT: + return "VkPipelineLayout"; + case VK_DEBUG_REPORT_OBJECT_TYPE_RENDER_PASS_EXT: + return "VkRenderPass"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_EXT: + return "VkPipeline"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT_EXT: + return "VkDescriptorSetLayout"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SAMPLER_EXT: + return "VkSampler"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_POOL_EXT: + return "VkDescriptorPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_EXT: + return "VkDescriptorSet"; + case VK_DEBUG_REPORT_OBJECT_TYPE_FRAMEBUFFER_EXT: + return "VkFramebuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_POOL_EXT: + return "VkCommandPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SURFACE_KHR_EXT: + return "VkSurfaceKHR"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SWAPCHAIN_KHR_EXT: + return "VkSwapchainKHR"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_EXT: + return "VkDebugReportCallbackEXT"; + case VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT: + default: + return "unknown object"; + } +} + +static VkBool32 vk_dbg_callback(VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objType, + uint64_t obj, size_t loc, int32_t msgCode, + const char *layer, const char *msg, void *priv) +{ + struct mpvk_ctx *vk = priv; + int lev = MSGL_V; + + switch (flags) { + case VK_DEBUG_REPORT_ERROR_BIT_EXT: lev = MSGL_ERR; break; + case VK_DEBUG_REPORT_WARNING_BIT_EXT: lev = MSGL_WARN; break; + case VK_DEBUG_REPORT_INFORMATION_BIT_EXT: lev = MSGL_TRACE; break; + case VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT: lev = MSGL_WARN; break; + case VK_DEBUG_REPORT_DEBUG_BIT_EXT: lev = MSGL_DEBUG; break; + }; + + MP_MSG(vk, lev, "vk [%s] %d: %s (obj 0x%llx (%s), loc 0x%zx)\n", + layer, (int)msgCode, msg, (unsigned long long)obj, + vk_dbg_type(objType), loc); + + // The return value of this function determines whether the call will + // be explicitly aborted (to prevent GPU errors) or not. In this case, + // we generally want this to be on for the errors. + return (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT); +} + +static void vk_cmdpool_uninit(struct mpvk_ctx *vk, struct vk_cmdpool *pool) +{ + if (!pool) + return; + + // also frees associated command buffers + vkDestroyCommandPool(vk->dev, pool->pool, MPVK_ALLOCATOR); + for (int n = 0; n < MPVK_MAX_CMDS; n++) { + vkDestroyFence(vk->dev, pool->cmds[n].fence, MPVK_ALLOCATOR); + vkDestroySemaphore(vk->dev, pool->cmds[n].done, MPVK_ALLOCATOR); + talloc_free(pool->cmds[n].callbacks); + } + talloc_free(pool); +} + +void mpvk_uninit(struct mpvk_ctx *vk) +{ + if (!vk->inst) + return; + + if (vk->dev) { + vk_cmdpool_uninit(vk, vk->pool); + vk_malloc_uninit(vk); + vkDestroyDevice(vk->dev, MPVK_ALLOCATOR); + } + + if (vk->dbg) { + // Same deal as creating the debug callback, we need to load this + // first. 
+ VK_LOAD_PFN(vkDestroyDebugReportCallbackEXT) + pfn_vkDestroyDebugReportCallbackEXT(vk->inst, vk->dbg, MPVK_ALLOCATOR); + } + + vkDestroySurfaceKHR(vk->inst, vk->surf, MPVK_ALLOCATOR); + vkDestroyInstance(vk->inst, MPVK_ALLOCATOR); + + *vk = (struct mpvk_ctx){0}; +} + +bool mpvk_instance_init(struct mpvk_ctx *vk, struct mp_log *log, bool debug) +{ + *vk = (struct mpvk_ctx) { + .log = log, + }; + + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + }; + + if (debug) { + // Enables the LunarG standard validation layer, which + // is a meta-layer that loads lots of other validators + static const char* layers[] = { + "VK_LAYER_LUNARG_standard_validation", + }; + + info.ppEnabledLayerNames = layers; + info.enabledLayerCount = MP_ARRAY_SIZE(layers); + } + + // Enable whatever extensions were compiled in. + static const char *extensions[] = { + VK_KHR_SURFACE_EXTENSION_NAME, +#if HAVE_X11 + VK_KHR_XLIB_SURFACE_EXTENSION_NAME, +#endif + + // Extra extensions only used for debugging. These are toggled by + // decreasing the enabledExtensionCount, so the number needs to be + // synchronized with the code below. + VK_EXT_DEBUG_REPORT_EXTENSION_NAME, + }; + + const int debugExtensionCount = 1; + + info.ppEnabledExtensionNames = extensions; + info.enabledExtensionCount = MP_ARRAY_SIZE(extensions); + + if (!debug) + info.enabledExtensionCount -= debugExtensionCount; + + MP_VERBOSE(vk, "Creating instance with extensions:\n"); + for (int i = 0; i < info.enabledExtensionCount; i++) + MP_VERBOSE(vk, " %s\n", info.ppEnabledExtensionNames[i]); + + VkResult res = vkCreateInstance(&info, MPVK_ALLOCATOR, &vk->inst); + if (res != VK_SUCCESS) { + MP_VERBOSE(vk, "Failed creating instance: %s\n", vk_err(res)); + return false; + } + + if (debug) { + // Set up a debug callback to catch validation messages + VkDebugReportCallbackCreateInfoEXT dinfo = { + .sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, + .flags = VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_WARNING_BIT_EXT | + VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT | + VK_DEBUG_REPORT_DEBUG_BIT_EXT, + .pfnCallback = vk_dbg_callback, + .pUserData = vk, + }; + + // Since this is not part of the core spec, we need to load it. This + // can't fail because we've already successfully created an instance + // with this extension enabled. + VK_LOAD_PFN(vkCreateDebugReportCallbackEXT) + pfn_vkCreateDebugReportCallbackEXT(vk->inst, &dinfo, MPVK_ALLOCATOR, + &vk->dbg); + } + + return true; +} + +#define MPVK_MAX_DEVICES 16 + +static bool physd_supports_surface(struct mpvk_ctx *vk, VkPhysicalDevice physd) +{ + uint32_t qfnum; + vkGetPhysicalDeviceQueueFamilyProperties(physd, &qfnum, NULL); + + for (int i = 0; i < qfnum; i++) { + VkBool32 sup; + VK(vkGetPhysicalDeviceSurfaceSupportKHR(physd, i, vk->surf, &sup)); + if (sup) + return true; + } + +error: + return false; +} + +bool mpvk_find_phys_device(struct mpvk_ctx *vk, const char *name, bool sw) +{ + assert(vk->surf); + + MP_VERBOSE(vk, "Probing for vulkan devices:\n"); + + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + VK(vkEnumeratePhysicalDevices(vk->inst, &num, NULL)); + devices = talloc_array(NULL, VkPhysicalDevice, num); + VK(vkEnumeratePhysicalDevices(vk->inst, &num, devices)); + + // Sorted by "priority". 
Reuses some m_opt code for convenience + static const struct m_opt_choice_alternatives types[] = { + {"discrete", VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU}, + {"integrated", VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU}, + {"virtual", VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU}, + {"software", VK_PHYSICAL_DEVICE_TYPE_CPU}, + {"unknown", VK_PHYSICAL_DEVICE_TYPE_OTHER}, + {0} + }; + + VkPhysicalDeviceProperties props[MPVK_MAX_DEVICES]; + for (int i = 0; i < num; i++) { + vkGetPhysicalDeviceProperties(devices[i], &props[i]); + MP_VERBOSE(vk, " GPU %d: %s (%s)\n", i, props[i].deviceName, + m_opt_choice_str(types, props[i].deviceType)); + } + + // Iterate through each type in order of decreasing preference + for (int t = 0; types[t].name; t++) { + // Disallow SW rendering unless explicitly enabled + if (types[t].value == VK_PHYSICAL_DEVICE_TYPE_CPU && !sw) + continue; + + for (int i = 0; i < num; i++) { + VkPhysicalDeviceProperties prop = props[i]; + if (prop.deviceType != types[t].value) + continue; + if (name && strcmp(name, prop.deviceName) != 0) + continue; + if (!physd_supports_surface(vk, devices[i])) + continue; + + MP_VERBOSE(vk, "Chose device:\n"); + MP_VERBOSE(vk, " Device Name: %s\n", prop.deviceName); + MP_VERBOSE(vk, " Device ID: %x:%x\n", + (unsigned)prop.vendorID, (unsigned)prop.deviceID); + MP_VERBOSE(vk, " Driver version: %d\n", (int)prop.driverVersion); + MP_VERBOSE(vk, " API version: %d.%d.%d\n", + (int)VK_VERSION_MAJOR(prop.apiVersion), + (int)VK_VERSION_MINOR(prop.apiVersion), + (int)VK_VERSION_PATCH(prop.apiVersion)); + vk->physd = devices[i]; + vk->limits = prop.limits; + talloc_free(devices); + return true; + } + } + +error: + MP_VERBOSE(vk, "Found no suitable device, giving up.\n"); + talloc_free(devices); + return false; +} + +bool mpvk_pick_surface_format(struct mpvk_ctx *vk) +{ + assert(vk->physd); + + VkSurfaceFormatKHR *formats = NULL; + int num; + + // Enumerate through the surface formats and find one that we can map to + // a ra_format + VK(vkGetPhysicalDeviceSurfaceFormatsKHR(vk->physd, vk->surf, &num, NULL)); + formats = talloc_array(NULL, VkSurfaceFormatKHR, num); + VK(vkGetPhysicalDeviceSurfaceFormatsKHR(vk->physd, vk->surf, &num, formats)); + + for (int i = 0; i < num; i++) { + // A value of VK_FORMAT_UNDEFINED means we can pick anything we want + if (formats[i].format == VK_FORMAT_UNDEFINED) { + vk->surf_format = (VkSurfaceFormatKHR) { + .colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, + .format = VK_FORMAT_R16G16B16A16_UNORM, + }; + break; + } + + if (formats[i].colorSpace != VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) + continue; + + // Format whitelist, since we want only >= 8 bit _UNORM formats + switch (formats[i].format) { + case VK_FORMAT_R8G8B8_UNORM: + case VK_FORMAT_B8G8R8_UNORM: + case VK_FORMAT_R8G8B8A8_UNORM: + case VK_FORMAT_B8G8R8A8_UNORM: + case VK_FORMAT_A8B8G8R8_UNORM_PACK32: + case VK_FORMAT_A2R10G10B10_UNORM_PACK32: + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + case VK_FORMAT_R16G16B16_UNORM: + case VK_FORMAT_R16G16B16A16_UNORM: + break; // accept + default: continue; + } + + vk->surf_format = formats[i]; + break; + } + + talloc_free(formats); + + if (!vk->surf_format.format) + goto error; + + return true; + +error: + MP_ERR(vk, "Failed picking surface format!\n"); + talloc_free(formats); + return false; +} + +static bool vk_cmdpool_init(struct mpvk_ctx *vk, VkDeviceQueueCreateInfo qinfo, + VkQueueFamilyProperties props, + struct vk_cmdpool **out) +{ + struct vk_cmdpool *pool = *out = talloc_ptrtype(NULL, pool); + *pool = (struct vk_cmdpool) { + .qf = 
qinfo.queueFamilyIndex, + .props = props, + .qcount = qinfo.queueCount, + }; + + for (int n = 0; n < pool->qcount; n++) + vkGetDeviceQueue(vk->dev, pool->qf, n, &pool->queues[n]); + + VkCommandPoolCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | + VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = pool->qf, + }; + + VK(vkCreateCommandPool(vk->dev, &cinfo, MPVK_ALLOCATOR, &pool->pool)); + + VkCommandBufferAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = pool->pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = MPVK_MAX_CMDS, + }; + + VkCommandBuffer cmdbufs[MPVK_MAX_CMDS]; + VK(vkAllocateCommandBuffers(vk->dev, &ainfo, cmdbufs)); + + for (int n = 0; n < MPVK_MAX_CMDS; n++) { + struct vk_cmd *cmd = &pool->cmds[n]; + cmd->pool = pool; + cmd->buf = cmdbufs[n]; + + VkFenceCreateInfo finfo = { + .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, + .flags = VK_FENCE_CREATE_SIGNALED_BIT, + }; + + VK(vkCreateFence(vk->dev, &finfo, MPVK_ALLOCATOR, &cmd->fence)); + + VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + + VK(vkCreateSemaphore(vk->dev, &sinfo, MPVK_ALLOCATOR, &cmd->done)); + } + + return true; + +error: + return false; +} + +bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts) +{ + assert(vk->physd); + + VkQueueFamilyProperties *qfs = NULL; + int qfnum; + + // Enumerate the queue families and find suitable families for each task + vkGetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL); + qfs = talloc_array(NULL, VkQueueFamilyProperties, qfnum); + vkGetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs); + + MP_VERBOSE(vk, "Queue families supported by device:\n"); + + for (int i = 0; i < qfnum; i++) { + MP_VERBOSE(vk, "QF %d: flags 0x%x num %d\n", i, + (unsigned)qfs[i].queueFlags, (int)qfs[i].queueCount); + } + + // For most of our rendering operations, we want to use one "primary" pool, + // so just pick the queue family with the most features. + int idx = -1; + for (int i = 0; i < qfnum; i++) { + if (!(qfs[i].queueFlags & VK_QUEUE_GRAPHICS_BIT)) + continue; + + // QF supports more features + if (idx < 0 || qfs[i].queueFlags > qfs[idx].queueFlags) + idx = i; + + // QF supports more queues (at the same specialization level) + if (qfs[i].queueFlags == qfs[idx].queueFlags && + qfs[i].queueCount > qfs[idx].queueCount) + { + idx = i; + } + } + + // Vulkan requires at least one GRAPHICS queue, so if this fails something + // is horribly wrong. 
+ assert(idx >= 0); + + // Ensure we can actually present to the surface using this queue + VkBool32 sup; + VK(vkGetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx, vk->surf, &sup)); + if (!sup) { + MP_ERR(vk, "Queue family does not support surface presentation!\n"); + goto error; + } + + // Now that we know which queue families we want, we can create the logical + // device + assert(opts.queue_count <= MPVK_MAX_QUEUES); + static const float priorities[MPVK_MAX_QUEUES] = {0}; + VkDeviceQueueCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .queueFamilyIndex = idx, + .queueCount = MPMIN(qfs[idx].queueCount, opts.queue_count), + .pQueuePriorities = priorities, + }; + + static const char *exts[] = { + VK_KHR_SWAPCHAIN_EXTENSION_NAME, + VK_NV_GLSL_SHADER_EXTENSION_NAME, + }; + + VkDeviceCreateInfo dinfo = { + .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .queueCreateInfoCount = 1, + .pQueueCreateInfos = &qinfo, + .ppEnabledExtensionNames = exts, + .enabledExtensionCount = MP_ARRAY_SIZE(exts), + }; + + MP_VERBOSE(vk, "Creating vulkan device...\n"); + VK(vkCreateDevice(vk->physd, &dinfo, MPVK_ALLOCATOR, &vk->dev)); + + vk_malloc_init(vk); + + // Create the vk_cmdpools and all required queues / synchronization objects + if (!vk_cmdpool_init(vk, qinfo, qfs[idx], &vk->pool)) + goto error; + + talloc_free(qfs); + return true; + +error: + MP_ERR(vk, "Failed creating logical device!\n"); + talloc_free(qfs); + return false; +} + +static void run_callbacks(struct mpvk_ctx *vk, struct vk_cmd *cmd) +{ + for (int i = 0; i < cmd->num_callbacks; i++) { + struct vk_callback *cb = &cmd->callbacks[i]; + cb->run(cb->priv, cb->arg); + *cb = (struct vk_callback){0}; + } + + cmd->num_callbacks = 0; + + // Also reset vk->last_cmd in case this was the last command to run + if (vk->last_cmd == cmd) + vk->last_cmd = NULL; +} + +static void wait_for_cmds(struct mpvk_ctx *vk, struct vk_cmd cmds[], int num) +{ + if (!num) + return; + + VkFence fences[MPVK_MAX_CMDS]; + for (int i = 0; i < num; i++) + fences[i] = cmds[i].fence; + + vkWaitForFences(vk->dev, num, fences, true, UINT64_MAX); + + for (int i = 0; i < num; i++) + run_callbacks(vk, &cmds[i]); +} + +void mpvk_pool_wait_idle(struct mpvk_ctx *vk, struct vk_cmdpool *pool) +{ + if (!pool) + return; + + int idx = pool->cindex, pidx = pool->cindex_pending; + if (pidx < idx) { // range doesn't wrap + wait_for_cmds(vk, &pool->cmds[pidx], idx - pidx); + } else if (pidx > idx) { // range wraps + wait_for_cmds(vk, &pool->cmds[pidx], MPVK_MAX_CMDS - pidx); + wait_for_cmds(vk, &pool->cmds[0], idx); + } + pool->cindex_pending = pool->cindex; +} + +void mpvk_dev_wait_idle(struct mpvk_ctx *vk) +{ + mpvk_pool_wait_idle(vk, vk->pool); +} + +void mpvk_pool_poll_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool, + uint64_t timeout) +{ + if (!pool) + return; + + // If requested, hard block until at least one command completes + if (timeout > 0 && pool->cindex_pending != pool->cindex) { + vkWaitForFences(vk->dev, 1, &pool->cmds[pool->cindex_pending].fence, + true, timeout); + } + + // Lazily garbage collect the commands based on their status + while (pool->cindex_pending != pool->cindex) { + struct vk_cmd *cmd = &pool->cmds[pool->cindex_pending]; + VkResult res = vkGetFenceStatus(vk->dev, cmd->fence); + if (res != VK_SUCCESS) + break; + run_callbacks(vk, cmd); + pool->cindex_pending++; + pool->cindex_pending %= MPVK_MAX_CMDS; + } +} + +void mpvk_dev_poll_cmds(struct mpvk_ctx *vk, uint32_t timeout) +{ + mpvk_pool_poll_cmds(vk, vk->pool, timeout); +} + +void 
vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg)
+{
+    if (vk->last_cmd) {
+        vk_cmd_callback(vk->last_cmd, callback, p, arg);
+    } else {
+        // The device was already idle, so we can just immediately call it
+        callback(p, arg);
+    }
+}
+
+void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg)
+{
+    MP_TARRAY_GROW(NULL, cmd->callbacks, cmd->num_callbacks);
+    cmd->callbacks[cmd->num_callbacks++] = (struct vk_callback) {
+        .run  = callback,
+        .priv = p,
+        .arg  = arg,
+    };
+}
+
+void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep,
+                VkPipelineStageFlagBits depstage)
+{
+    assert(cmd->num_deps < MPVK_MAX_CMD_DEPS);
+    cmd->deps[cmd->num_deps] = dep;
+    cmd->depstages[cmd->num_deps++] = depstage;
+}
+
+struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool)
+{
+    // Garbage collect the cmdpool first
+    mpvk_pool_poll_cmds(vk, pool, 0);
+
+    int next = (pool->cindex + 1) % MPVK_MAX_CMDS;
+    if (next == pool->cindex_pending) {
+        MP_ERR(vk, "No free command buffers!\n");
+        goto error;
+    }
+
+    struct vk_cmd *cmd = &pool->cmds[pool->cindex];
+    pool->cindex = next;
+
+    VK(vkResetCommandBuffer(cmd->buf, 0));
+
+    VkCommandBufferBeginInfo binfo = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+        .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
+    };
+
+    VK(vkBeginCommandBuffer(cmd->buf, &binfo));
+
+    return cmd;
+
+error:
+    return NULL;
+}
+
+bool vk_cmd_submit(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore *done)
+{
+    VK(vkEndCommandBuffer(cmd->buf));
+
+    struct vk_cmdpool *pool = cmd->pool;
+    VkQueue queue = pool->queues[pool->qindex];
+
+    VkSubmitInfo sinfo = {
+        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+        .commandBufferCount = 1,
+        .pCommandBuffers = &cmd->buf,
+        .waitSemaphoreCount = cmd->num_deps,
+        .pWaitSemaphores = cmd->deps,
+        .pWaitDstStageMask = cmd->depstages,
+    };
+
+    if (done) {
+        sinfo.signalSemaphoreCount = 1;
+        sinfo.pSignalSemaphores = &cmd->done;
+        *done = cmd->done;
+    }
+
+    VK(vkResetFences(vk->dev, 1, &cmd->fence));
+    VK(vkQueueSubmit(queue, 1, &sinfo, cmd->fence));
+    MP_TRACE(vk, "Submitted command on queue %p (QF %d)\n", (void *)queue,
+             pool->qf);
+
+    for (int i = 0; i < cmd->num_deps; i++)
+        cmd->deps[i] = NULL;
+    cmd->num_deps = 0;
+
+    vk->last_cmd = cmd;
+    return true;
+
+error:
+    return false;
+}
+
+void vk_cmd_cycle_queues(struct mpvk_ctx *vk)
+{
+    struct vk_cmdpool *pool = vk->pool;
+    pool->qindex = (pool->qindex + 1) % pool->qcount;
+}
+
+const VkImageSubresourceRange vk_range = {
+    .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+    .levelCount = 1,
+    .layerCount = 1,
+};
+
+const VkImageSubresourceLayers vk_layers = {
+    .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+    .layerCount = 1,
+};
diff --git a/video/out/vulkan/utils.h b/video/out/vulkan/utils.h
new file mode 100644
index 0000000000..5bde48089d
--- /dev/null
+++ b/video/out/vulkan/utils.h
@@ -0,0 +1,153 @@
+#pragma once
+
+#include "video/out/vo.h"
+#include "video/out/gpu/context.h"
+#include "video/mp_image.h"
+
+#include "common.h"
+#include "formats.h"
+
+#define VK_LOAD_PFN(name) PFN_##name pfn_##name = (PFN_##name) \
+                              vkGetInstanceProcAddr(vk->inst, #name);
+
+// Return a human-readable name for a VkResult error code
+const char* vk_err(VkResult res);
+
+// Convenience macros to simplify a lot of common boilerplate
+#define VK_ASSERT(res, str)                               \
+    do {                                                  \
+        if (res != VK_SUCCESS) {                          \
+            MP_ERR(vk, str ": %s\n", vk_err(res));        \
+            goto error;                                   \
+        }                                                 \
+    } while (0)
+
+#define VK(cmd)                                           \
+    do {                                                  \
+        MP_TRACE(vk, #cmd "\n");                          \
+        VkResult res ## __LINE__ = (cmd);                 \
+        VK_ASSERT(res ## __LINE__, #cmd);                 \
+    } while (0)
+
+// Uninits everything in the correct order
+void mpvk_uninit(struct mpvk_ctx *vk);
+
+// Initialization functions: As a rule of thumb, these need to be called in
+// this order, followed by vk_malloc_init, followed by RA initialization, and
+// finally followed by swapchain initialization.
+
+// Create a vulkan instance. Returns false on failure.
+bool mpvk_instance_init(struct mpvk_ctx *vk, struct mp_log *log, bool debug);
+
+// Generate a VkSurfaceKHR usable for video output. Returns false on failure.
+// Must be called after mpvk_instance_init.
+bool mpvk_surface_init(struct vo *vo, struct mpvk_ctx *vk);
+
+// Find a suitable physical device for use with rendering, one which also
+// supports the surface.
+// name: only match a device with this name
+// sw: also allow software/virtual devices
+bool mpvk_find_phys_device(struct mpvk_ctx *vk, const char *name, bool sw);
+
+// Pick a suitable surface format that's supported by this physical device.
+bool mpvk_pick_surface_format(struct mpvk_ctx *vk);
+
+struct mpvk_device_opts {
+    int queue_count; // number of queues to use
+};
+
+// Create a logical device and initialize the vk_cmdpools
+bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts);
+
+// Wait until all commands submitted to all queues have completed
+void mpvk_pool_wait_idle(struct mpvk_ctx *vk, struct vk_cmdpool *pool);
+void mpvk_dev_wait_idle(struct mpvk_ctx *vk);
+
+// Wait until at least one command submitted to any queue has completed, and
+// process the callbacks. Good for event loops that need to delay until a
+// command completes. Will block at most `timeout` nanoseconds. If called with
+// a timeout of 0, it only garbage collects completed commands without blocking.
+void mpvk_pool_poll_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool,
+                         uint64_t timeout);
+void mpvk_dev_poll_cmds(struct mpvk_ctx *vk, uint32_t timeout);
+
+// Since lots of vulkan operations need to be done lazily once the affected
+// resources are no longer in use, provide an abstraction for tracking these.
+// In practice, these are only checked and run when submitting new commands, so
+// the actual execution may be delayed by a frame.
+typedef void (*vk_cb)(void *priv, void *arg);
+
+struct vk_callback {
+    vk_cb run;
+    void *priv;
+    void *arg; // as a convenience, you also get to pass an arg for "free"
+};
+
+// Associate a callback with the completion of all currently pending commands.
+// This will essentially run once the device is completely idle.
+void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg);
+
+#define MPVK_MAX_CMD_DEPS 8
+
+// Helper wrapper around command buffers that also tracks dependencies,
+// callbacks and synchronization primitives
+struct vk_cmd {
+    struct vk_cmdpool *pool; // pool it was allocated from
+    VkCommandBuffer buf;
+    VkFence fence;     // the fence guards cmd buffer reuse
+    VkSemaphore done;  // the semaphore signals when execution is done
+    // The semaphores represent dependencies that need to complete before
+    // this command can be executed. These are *not* owned by the vk_cmd
+    VkSemaphore deps[MPVK_MAX_CMD_DEPS];
+    VkPipelineStageFlags depstages[MPVK_MAX_CMD_DEPS];
+    int num_deps;
+    // Since VkFences are useless, we have to manually track "callbacks"
+    // to fire once the VkFence completes. These are used for multiple purposes,
+    // ranging from garbage collection (resource deallocation) to fencing.
+    struct vk_callback *callbacks;
+    int num_callbacks;
+};
+
+// Associate a callback with the completion of the current command. The
+// callback will be run once the command completes, or shortly thereafter.
+void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg);
+
+// Associate a dependency for the current command. The command will wait at
+// the given pipeline stage until this semaphore has been signalled.
+void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep,
+                VkPipelineStageFlagBits depstage);
+
+#define MPVK_MAX_QUEUES 8
+#define MPVK_MAX_CMDS 64
+
+// Command pool / queue family hybrid abstraction
+struct vk_cmdpool {
+    VkQueueFamilyProperties props;
+    uint32_t qf; // queue family index
+    VkCommandPool pool;
+    VkQueue queues[MPVK_MAX_QUEUES];
+    int qcount;
+    int qindex;
+    // Command buffers associated with this pool
+    struct vk_cmd cmds[MPVK_MAX_CMDS];
+    int cindex;
+    int cindex_pending;
+};
+
+// Fetch the next command buffer from a command pool and begin recording to it.
+// Returns NULL on failure.
+struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool);
+
+// Finish the currently recording command buffer and submit it for execution.
+// If `done` is not NULL, it will be set to a semaphore that will signal once
+// the command completes. (And MUST have a corresponding semaphore wait)
+// Returns whether successful.
+bool vk_cmd_submit(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore *done);
+
+// Rotate the queues for each vk_cmdpool. Call this once per frame to ensure
+// good parallelism between frames when using multiple queues.
+void vk_cmd_cycle_queues(struct mpvk_ctx *vk);
+
+// Predefined structs for a simple non-layered, non-mipped image
+extern const VkImageSubresourceRange vk_range;
+extern const VkImageSubresourceLayers vk_layers;
diff --git a/wscript b/wscript
index dd47956392..964b7878c7 100644
--- a/wscript
+++ b/wscript
@@ -802,6 +802,10 @@ video_output_features = [
         'fmsg': "No OpenGL video output found or enabled. " +
                 "Aborting. If you really mean to compile without OpenGL " +
                 "video outputs use --disable-gl.",
+    }, {
+        'name': '--vulkan',
+        'desc': 'Vulkan context support',
+        'func': check_cc(header_name='vulkan/vulkan.h', lib='vulkan'),
     }, {
         'name': 'egl-helpers',
         'desc': 'EGL helper functions',
diff --git a/wscript_build.py b/wscript_build.py
index 68cfafb94f..86b51daaa2 100644
--- a/wscript_build.py
+++ b/wscript_build.py
@@ -445,6 +445,12 @@ def build(ctx):
         ( "video/out/w32_common.c",              "win32-desktop" ),
         ( "video/out/win32/displayconfig.c",     "win32-desktop" ),
         ( "video/out/win32/droptarget.c",        "win32-desktop" ),
+        ( "video/out/vulkan/utils.c",            "vulkan" ),
+        ( "video/out/vulkan/malloc.c",           "vulkan" ),
+        ( "video/out/vulkan/formats.c",          "vulkan" ),
+        ( "video/out/vulkan/ra_vk.c",            "vulkan" ),
+        ( "video/out/vulkan/context.c",          "vulkan" ),
+        ( "video/out/vulkan/context_xlib.c",     "vulkan && x11" ),
         ( "video/out/win32/exclusive_hack.c",    "gl-win32" ),
         ( "video/out/wayland_common.c",          "wayland" ),
         ( "video/out/wayland/buffer.c",          "wayland" ),
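
To make the intended call pattern of the vk_cmd helpers declared in
video/out/vulkan/utils.h concrete, here is a rough per-frame sketch. It is
illustrative only and not part of the patch: it assumes a swapchain image has
already been acquired, and the names `render_frame`, `free_on_completion`,
`acquired` and `staging`, as well as the chosen pipeline stage, are
hypothetical.

    // Illustrative sketch only -- not part of this patch.
    #include <stdbool.h>
    #include <stdlib.h>

    #include "video/out/vulkan/utils.h"

    // Hypothetical callback: free some staging data once the GPU is done with it
    static void free_on_completion(void *priv, void *arg)
    {
        free(priv);
        (void)arg;
    }

    // Hypothetical per-frame helper: record, submit and schedule cleanup
    static bool render_frame(struct mpvk_ctx *vk, VkSemaphore acquired,
                             void *staging, VkSemaphore *out_done)
    {
        struct vk_cmd *cmd = vk_cmd_begin(vk, vk->pool);
        if (!cmd)
            return false;

        // Make color writes wait until the acquired swapchain image is ready
        vk_cmd_dep(cmd, acquired, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);

        // ... record rendering commands into cmd->buf here ...

        // Defer freeing the staging data until this command has completed
        vk_cmd_callback(cmd, free_on_completion, staging, NULL);

        // *out_done signals on completion and must be consumed by a matching
        // semaphore wait, e.g. in vkQueuePresentKHR
        if (!vk_cmd_submit(vk, cmd, out_done))
            return false;

        // Rotate queues once per frame so consecutive frames can overlap
        vk_cmd_cycle_queues(vk);
        return true;
    }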