#include "video/out/gpu/utils.h"
#include "video/out/gpu/spirv.h"
#include "ra_vk.h"
#include "malloc.h"
static struct ra_fns ra_fns_vk;
enum queue_type {
GRAPHICS,
COMPUTE,
TRANSFER,
};
// For ra.priv
struct ra_vk {
struct mpvk_ctx *vk;
struct ra_tex *clear_tex; // stupid hack for clear()
struct vk_cmd *cmd; // currently recording cmd
};
struct mpvk_ctx *ra_vk_get(struct ra *ra)
{
if (ra->fns != &ra_fns_vk)
return NULL;
struct ra_vk *p = ra->priv;
return p->vk;
}
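// Queue the currently recording command buffer (if any) for submission and
// stop recording. No-op if no command buffer is active.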
static void vk_submit(struct ra *ra)
{
struct ra_vk *p = ra->priv;
struct mpvk_ctx *vk = ra_vk_get(ra);
if (p->cmd) {
vk_cmd_queue(vk, p->cmd);
p->cmd = NULL;
}
}
// Returns a command buffer, or NULL on error
static struct vk_cmd *vk_require_cmd(struct ra *ra, enum queue_type type)
{
struct ra_vk *p = ra->priv;
struct mpvk_ctx *vk = ra_vk_get(ra);
struct vk_cmdpool *pool;
switch (type) {
case GRAPHICS: pool = vk->pool_graphics; break;
case COMPUTE: pool = vk->pool_compute; break;
// GRAPHICS and COMPUTE also imply TRANSFER capability (vulkan spec)
case TRANSFER:
pool = vk->pool_transfer;
if (!pool)
pool = vk->pool_compute;
if (!pool)
pool = vk->pool_graphics;
break;
default: abort();
}
assert(pool);
if (p->cmd && p->cmd->pool == pool)
return p->cmd;
vk_submit(ra);
p->cmd = vk_cmd_begin(vk, pool);
return p->cmd;
}
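// Generates a deferred destructor: fun##_lazy() registers `fun` as a callback
// on the currently recording command buffer (or on the device if nothing is
// recording), so the object is only destroyed once the GPU is done with it.
// For example, MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, struct ra_tex) defines
// vk_tex_destroy_lazy(), which is used as ra_fns_vk.tex_destroy below.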
#define MAKE_LAZY_DESTRUCTOR(fun, argtype) \
static void fun##_lazy(struct ra *ra, argtype *arg) { \
struct ra_vk *p = ra->priv; \
struct mpvk_ctx *vk = ra_vk_get(ra); \
if (p->cmd) { \
vk_cmd_callback(p->cmd, (vk_cb) fun, ra, arg); \
} else { \
vk_dev_callback(vk, (vk_cb) fun, ra, arg); \
} \
}
static void vk_destroy_ra(struct ra *ra)
{
struct ra_vk *p = ra->priv;
struct mpvk_ctx *vk = ra_vk_get(ra);
vk_submit(ra);
mpvk_flush_commands(vk);
mpvk_poll_commands(vk, UINT64_MAX);
ra_tex_free(ra, &p->clear_tex);
talloc_free(ra);
}
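// Query the device for supported texture formats and populate ra->formats,
// along with format-related capabilities (1D/3D textures, max texture size,
// blit support).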
static bool vk_setup_formats(struct ra *ra)
{
struct mpvk_ctx *vk = ra_vk_get(ra);
for (const struct vk_format *vk_fmt = vk_formats; vk_fmt->name; vk_fmt++) {
VkFormatProperties prop;
vkGetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->iformat, &prop);
// As a bare minimum, we need to sample from an allocated image
VkFormatFeatureFlags flags = prop.optimalTilingFeatures;
if (!(flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT))
continue;
VkFormatFeatureFlags linear_bits, render_bits;
linear_bits = VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
render_bits = VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT;
struct ra_format *fmt = talloc_zero(ra, struct ra_format);
*fmt = (struct ra_format) {
.name = vk_fmt->name,
.priv = (void *)vk_fmt,
.ctype = vk_fmt->ctype,
.ordered = !vk_fmt->fucked_order,
.num_components = vk_fmt->components,
.pixel_size = vk_fmt->bytes,
.linear_filter = !!(flags & linear_bits),
.renderable = !!(flags & render_bits),
};
for (int i = 0; i < 4; i++)
fmt->component_size[i] = fmt->component_depth[i] = vk_fmt->bits[i];
fmt->glsl_format = ra_fmt_glsl_format(fmt);
MP_TARRAY_APPEND(ra, ra->formats, ra->num_formats, fmt);
}
// Populate some other capabilities related to formats while we're at it
VkImageType imgType[3] = {
VK_IMAGE_TYPE_1D,
VK_IMAGE_TYPE_2D,
VK_IMAGE_TYPE_3D
};
// R8_UNORM is supported on literally every single vulkan implementation
const VkFormat testfmt = VK_FORMAT_R8_UNORM;
for (int d = 0; d < 3; d++) {
VkImageFormatProperties iprop;
VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd,
testfmt, imgType[d], VK_IMAGE_TILING_OPTIMAL,
VK_IMAGE_USAGE_SAMPLED_BIT, 0, &iprop);
switch (imgType[d]) {
case VK_IMAGE_TYPE_1D:
if (res == VK_SUCCESS)
ra->caps |= RA_CAP_TEX_1D;
break;
case VK_IMAGE_TYPE_2D:
// 2D formats must be supported by RA, so ensure this is the case
VK_ASSERT(res, "Querying 2D format limits");
ra->max_texture_wh = MPMIN(iprop.maxExtent.width, iprop.maxExtent.height);
break;
case VK_IMAGE_TYPE_3D:
if (res == VK_SUCCESS)
ra->caps |= RA_CAP_TEX_3D;
break;
}
}
// RA_CAP_BLIT implies both blitting between images as well as blitting
// directly to the swapchain image, so check for all three operations
bool blittable = true;
VkFormatProperties prop;
vkGetPhysicalDeviceFormatProperties(vk->physd, testfmt, &prop);
if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_SRC_BIT))
blittable = false;
if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT))
blittable = false;
vkGetPhysicalDeviceFormatProperties(vk->physd, vk->surf_format.format, &prop);
if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT))
blittable = false;
if (blittable)
ra->caps |= RA_CAP_BLIT;
return true;
error:
return false;
}
struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log)
{
assert(vk->dev);
assert(vk->alloc);
struct ra *ra = talloc_zero(NULL, struct ra);
ra->log = log;
ra->fns = &ra_fns_vk;
struct ra_vk *p = ra->priv = talloc_zero(ra, struct ra_vk);
p->vk = vk;
ra->caps |= vk->spirv->ra_caps;
ra->glsl_version = vk->spirv->glsl_version;
ra->glsl_vulkan = true;
ra->max_shmem = vk->limits.maxComputeSharedMemorySize;
ra->max_pushc_size = vk->limits.maxPushConstantsSize;
if (vk->pool_compute) {
ra->caps |= RA_CAP_COMPUTE;
// If we have more compute queues than graphics queues, we probably
// want to be using them. (This seems mostly relevant for AMD)
if (vk->pool_compute->num_queues > vk->pool_graphics->num_queues)
ra->caps |= RA_CAP_PARALLEL_COMPUTE;
}
if (!vk_setup_formats(ra))
goto error;
// UBO support is required
ra->caps |= RA_CAP_BUF_RO | RA_CAP_FRAGCOORD;
// textureGather is only supported in GLSL 400+
if (ra->glsl_version >= 400)
ra->caps |= RA_CAP_GATHER;
// Try creating a shader storage buffer
struct ra_buf_params ssbo_params = {
.type = RA_BUF_TYPE_SHADER_STORAGE,
.size = 16,
};
struct ra_buf *ssbo = ra_buf_create(ra, &ssbo_params);
if (ssbo) {
ra->caps |= RA_CAP_BUF_RW;
ra_buf_free(ra, &ssbo);
}
// To support clear() by region, we need to allocate a dummy 1x1 image that
// will be used as the source of blit operations
struct ra_tex_params clear_params = {
.dimensions = 1, // no point in using a 2D image if height = 1
.w = 1,
.h = 1,
.d = 1,
.format = ra_find_float16_format(ra, 4),
.blit_src = 1,
.host_mutable = 1,
};
p->clear_tex = ra_tex_create(ra, &clear_params);
if (!p->clear_tex) {
MP_ERR(ra, "Failed creating 1x1 dummy texture for clear()!\n");
goto error;
}
return ra;
error:
vk_destroy_ra(ra);
return NULL;
}
// Boilerplate wrapper around vkCreateRenderPass to ensure passes remain
// compatible. The renderpass will automatically transition the image out of
// initialLayout and into finalLayout.
static VkResult vk_create_render_pass(VkDevice dev, const struct ra_format *fmt,
VkAttachmentLoadOp loadOp,
VkImageLayout initialLayout,
VkImageLayout finalLayout,
VkRenderPass *out)
{
struct vk_format *vk_fmt = fmt->priv;
assert(fmt->renderable);
VkRenderPassCreateInfo rinfo = {
.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
.attachmentCount = 1,
.pAttachments = &(VkAttachmentDescription) {
.format = vk_fmt->iformat,
.samples = VK_SAMPLE_COUNT_1_BIT,
.loadOp = loadOp,
.storeOp = VK_ATTACHMENT_STORE_OP_STORE,
.initialLayout = initialLayout,
.finalLayout = finalLayout,
},
.subpassCount = 1,
.pSubpasses = &(VkSubpassDescription) {
.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
.colorAttachmentCount = 1,
.pColorAttachments = &(VkAttachmentReference) {
.attachment = 0,
.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
},
},
};
return vkCreateRenderPass(dev, &rinfo, MPVK_ALLOCATOR, out);
}
// For ra_tex.priv
struct ra_tex_vk {
bool external_img;
enum queue_type upload_queue;
VkImageType type;
VkImage img;
struct vk_memslice mem;
// for sampling
VkImageView view;
VkSampler sampler;
// for rendering
VkFramebuffer framebuffer;
VkRenderPass dummyPass;
// for uploading
struct ra_buf_pool pbo;
// "current" metadata, can change during the course of execution
VkImageLayout current_layout;
VkAccessFlags current_access;
// the signal guards reuse, and can be NULL
struct vk_signal *sig;
VkPipelineStageFlags sig_stage;
VkSemaphore ext_dep; // external semaphore, not owned by the ra_tex
};
void ra_tex_vk_external_dep(struct ra *ra, struct ra_tex *tex, VkSemaphore dep)
{
struct ra_tex_vk *tex_vk = tex->priv;
assert(!tex_vk->ext_dep);
tex_vk->ext_dep = dep;
}
// Small helper to ease image barrier creation. If `discard` is set, the contents
// of the image will be undefined after the barrier
static void tex_barrier(struct ra *ra, struct vk_cmd *cmd, struct ra_tex *tex,
VkPipelineStageFlags stage, VkAccessFlags newAccess,
VkImageLayout newLayout, bool discard)
{
struct mpvk_ctx *vk = ra_vk_get(ra);
struct ra_tex_vk *tex_vk = tex->priv;
if (tex_vk->ext_dep) {
vk_cmd_dep(cmd, tex_vk->ext_dep, stage);
tex_vk->ext_dep = NULL;
}
VkImageMemoryBarrier imgBarrier = {
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.oldLayout = tex_vk->current_layout,
.newLayout = newLayout,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.srcAccessMask = tex_vk->current_access,
.dstAccessMask = newAccess,
.image = tex_vk->img,
.subresourceRange = vk_range,
};
if (discard) {
imgBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
imgBarrier.srcAccessMask = 0;
}
VkEvent event = NULL;
vk_cmd_wait(vk, cmd, &tex_vk->sig, stage, &event);
bool need_trans = tex_vk->current_layout != newLayout ||
tex_vk->current_access != newAccess;
// Transitioning to VK_IMAGE_LAYOUT_UNDEFINED is a pseudo-operation
// that for us means we don't need to perform the actual transition
if (need_trans && newLayout != VK_IMAGE_LAYOUT_UNDEFINED) {
if (event) {
vkCmdWaitEvents(cmd->buf, 1, &event, tex_vk->sig_stage,
stage, 0, NULL, 0, NULL, 1, &imgBarrier);
} else {
// If we're not using an event, then the source stage is irrelevant
// because we're coming from a different queue anyway, so we can
// safely set it to TOP_OF_PIPE.
imgBarrier.srcAccessMask = 0;
vkCmdPipelineBarrier(cmd->buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
stage, 0, 0, NULL, 0, NULL, 1, &imgBarrier);
}
}
tex_vk->current_layout = newLayout;
tex_vk->current_access = newAccess;
}
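// Mark the texture as busy: signal a synchronization point at the given stage,
// so the next tex_barrier() on this texture waits for this command to reach
// that stage before reusing the image.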
static void tex_signal(struct ra *ra, struct vk_cmd *cmd, struct ra_tex *tex,
VkPipelineStageFlags stage)
{
struct ra_tex_vk *tex_vk = tex->priv;
struct mpvk_ctx *vk = ra_vk_get(ra);
assert(!tex_vk->sig);
tex_vk->sig = vk_cmd_signal(vk, cmd, stage);
tex_vk->sig_stage = stage;
}
static void vk_tex_destroy(struct ra *ra, struct ra_tex *tex)
{
if (!tex)
return;
struct mpvk_ctx *vk = ra_vk_get(ra);
struct ra_tex_vk *tex_vk = tex->priv;
ra_buf_pool_uninit(ra, &tex_vk->pbo);
vk_signal_destroy(vk, &tex_vk->sig);
vkDestroyFramebuffer(vk->dev, tex_vk->framebuffer, MPVK_ALLOCATOR);
vkDestroyRenderPass(vk->dev, tex_vk->dummyPass, MPVK_ALLOCATOR);
vkDestroySampler(vk->dev, tex_vk->sampler, MPVK_ALLOCATOR);
vkDestroyImageView(vk->dev, tex_vk->view, MPVK_ALLOCATOR);
if (!tex_vk->external_img) {
vkDestroyImage(vk->dev, tex_vk->img, MPVK_ALLOCATOR);
vk_free_memslice(vk, tex_vk->mem);
}
talloc_free(tex);
}
MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, struct ra_tex);
// Initializes non-VkImage values like the image view, samplers, etc.
static bool vk_init_image(struct ra *ra, struct ra_tex *tex)
{
struct mpvk_ctx *vk = ra_vk_get(ra);
struct ra_tex_params *params = &tex->params;
struct ra_tex_vk *tex_vk = tex->priv;
assert(tex_vk->img);
tex_vk->current_layout = VK_IMAGE_LAYOUT_UNDEFINED;
tex_vk->current_access = 0;
if (params->render_src || params->render_dst) {
static const VkImageViewType viewType[] = {
[VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D,
[VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D,
[VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D,
};
const struct vk_format *fmt = params->format->priv;
VkImageViewCreateInfo vinfo = {
.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
.image = tex_vk->img,
.viewType = viewType[tex_vk->type],
.format = fmt->iformat,
.subresourceRange = vk_range,
};
VK(vkCreateImageView(vk->dev, &vinfo, MPVK_ALLOCATOR, &tex_vk->view));
}
if (params->render_src) {
assert(params->format->linear_filter || !params->src_linear);
VkFilter filter = params->src_linear
? VK_FILTER_LINEAR
: VK_FILTER_NEAREST;
VkSamplerAddressMode wrap = params->src_repeat
? VK_SAMPLER_ADDRESS_MODE_REPEAT
: VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
VkSamplerCreateInfo sinfo = {
.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
.magFilter = filter,
.minFilter = filter,
.addressModeU = wrap,
.addressModeV = wrap,
.addressModeW = wrap,
.maxAnisotropy = 1.0,
};
VK(vkCreateSampler(vk->dev, &sinfo, MPVK_ALLOCATOR, &tex_vk->sampler));
}
if (params->render_dst) {
// Framebuffers need to be created against a specific render pass
// layout, so we need to temporarily create a skeleton/dummy render
// pass for vulkan to figure out the compatibility
VK(vk_create_render_pass(vk->dev, params->format,
VK_ATTACHMENT_LOAD_OP_DONT_CARE,
VK_IMAGE_LAYOUT_UNDEFINED,
VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
&tex_vk->dummyPass));
VkFramebufferCreateInfo finfo = {
.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
.renderPass = tex_vk->dummyPass,
.attachmentCount = 1,
.pAttachments = &tex_vk->view,
.width = tex->params.w,
.height = tex->params.h,
.layers = 1,
};
VK(vkCreateFramebuffer(vk->dev, &finfo, MPVK_ALLOCATOR,
&tex_vk->framebuffer));
// NOTE: Normally we would free the dummyPass again here, but a bug
// in the nvidia vulkan driver causes a segfault if you do.
}
return true;
error:
return false;
}
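// Create a texture: allocate the VkImage and its device-local memory, set up
// views/samplers/framebuffers as needed, and upload params->initial_data if
// present.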
static struct ra_tex *vk_tex_create(struct ra *ra,
const struct ra_tex_params *params)
{
struct mpvk_ctx *vk = ra_vk_get(ra);
assert(!params->format->dummy_format);
struct ra_tex *tex = talloc_zero(NULL, struct ra_tex);
tex->params = *params;
tex->params.initial_data = NULL;
struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk);
tex_vk->upload_queue = GRAPHICS;
const struct vk_format *fmt = params->format->priv;
switch (params->dimensions) {
case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break;
case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break;
case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break;
default: abort();
}
VkImageUsageFlags usage = 0;
if (params->render_src)
usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
if (params->render_dst)
usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
if (params->storage_dst)
usage |= VK_IMAGE_USAGE_STORAGE_BIT;
if (params->blit_src)
usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
if (params->host_mutable || params->blit_dst || params->initial_data)
usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
// Always use the transfer pool if available, for efficiency
if (params->host_mutable && vk->pool_transfer)
tex_vk->upload_queue = TRANSFER;
// Double-check image usage support and fail immediately if invalid
VkImageFormatProperties iprop;
VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd,
fmt->iformat, tex_vk->type, VK_IMAGE_TILING_OPTIMAL, usage, 0,
&iprop);
if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) {
return NULL;
} else {
VK_ASSERT(res, "Querying image format properties");
}
VkFormatProperties prop;
vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop);
VkFormatFeatureFlags flags = prop.optimalTilingFeatures;
bool has_blit_src = flags & VK_FORMAT_FEATURE_BLIT_SRC_BIT,
has_src_linear = flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
if (params->w > iprop.maxExtent.width ||
params->h > iprop.maxExtent.height ||
params->d > iprop.maxExtent.depth ||
(params->blit_src && !has_blit_src) ||
(params->src_linear && !has_src_linear))
{
return NULL;
}
// FIXME: Since we can't keep track of queue family ownership properly,
// and we don't know in advance what types of queue families this image
// will belong to, we're forced to share all of our images between all
// command pools.
uint32_t qfs[3] = {0};
for (int i = 0; i < vk->num_pools; i++)
qfs[i] = vk->pools[i]->qf;
VkImageCreateInfo iinfo = {
.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
.imageType = tex_vk->type,
.format = fmt->iformat,
.extent = (VkExtent3D) { params->w, params->h, params->d },
.mipLevels = 1,
.arrayLayers = 1,
.samples = VK_SAMPLE_COUNT_1_BIT,
.tiling = VK_IMAGE_TILING_OPTIMAL,
.usage = usage,
.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
.sharingMode = vk->num_pools > 1 ? VK_SHARING_MODE_CONCURRENT
: VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = vk->num_pools,
.pQueueFamilyIndices = qfs,
};
VK(vkCreateImage(vk->dev, &iinfo, MPVK_ALLOCATOR, &tex_vk->img));
VkMemoryPropertyFlags memFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
VkMemoryRequirements reqs;
vkGetImageMemoryRequirements(vk->dev, tex_vk->img, &reqs);
struct vk_memslice *mem = &tex_vk->mem;
if (!vk_malloc_generic(vk, reqs, memFlags, mem))
goto error;
VK(vkBindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset));
if (!vk_init_image(ra, tex))
goto error;
if (params->initial_data) {
struct ra_tex_upload_params ul_params = {
.tex = tex,
.invalidate = true,
.src = params->initial_data,
.stride = params->w * fmt->bytes,
};
if (!ra->fns->tex_upload(ra, &ul_params))
goto error;
}
return tex;
error:
vk_tex_destroy(ra, tex);
return NULL;
}
struct ra_tex *ra_vk_wrap_swapchain_img(struct ra *ra, VkImage vkimg,
VkSwapchainCreateInfoKHR info)
{
struct mpvk_ctx *vk = ra_vk_get(ra);
struct ra_tex *tex = NULL;
const struct ra_format *format = NULL;
for (int i = 0; i < ra->num_formats; i++) {
const struct vk_format *fmt = ra->formats[i]->priv;
if (fmt->iformat == vk->surf_format.format) {
format = ra->formats[i];
break;
}
}
if (!format) {
MP_ERR(ra, "Could not find ra_format suitable for wrapped swchain image "
"with surface format 0x%x\n", vk->surf_format.format);
goto error;
}
tex = talloc_zero(NULL, struct ra_tex);
tex->params = (struct ra_tex_params) {
.format = format,
.dimensions = 2,
.w = info.imageExtent.width,
.h = info.imageExtent.height,
.d = 1,
.blit_src = !!(info.imageUsage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT),
.blit_dst = !!(info.imageUsage & VK_IMAGE_USAGE_TRANSFER_DST_BIT),
.render_src = !!(info.imageUsage & VK_IMAGE_USAGE_SAMPLED_BIT),
.render_dst = !!(info.imageUsage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT),
.storage_dst = !!(info.imageUsage & VK_IMAGE_USAGE_STORAGE_BIT),
};
struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk);
tex_vk->type = VK_IMAGE_TYPE_2D;
tex_vk->external_img = true;
tex_vk->img = vkimg;
if (!vk_init_image(ra, tex))
goto error;
return tex;
error:
vk_tex_destroy(ra, tex);
return NULL;
}
// For ra_buf.priv
struct ra_buf_vk {
struct vk_bufslice slice;
int refcount; // 1 = object allocated but not in use, > 1 = in use
bool needsflush;
enum queue_type update_queue;
// "current" metadata, can change during course of execution
VkPipelineStageFlags current_stage;
VkAccessFlags current_access;
};
static void vk_buf_deref(struct ra *ra, struct ra_buf *buf)
{
if (!buf)
return;
struct mpvk_ctx *vk = ra_vk_get(ra);
struct ra_buf_vk *buf_vk = buf->priv;
if (--buf_vk->refcount == 0) {
vk_free_memslice(vk, buf_vk->slice.mem);
talloc_free(buf);
}
}
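// Prepare a buffer (sub)range for use at the given stage/access, inserting a
// pipeline barrier if the access type changes (or host writes need to be made
// visible), and keep the buffer alive until the command buffer completes.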
static void buf_barrier(struct ra *ra, struct vk_cmd *cmd, struct ra_buf *buf,
VkPipelineStageFlags newStage,
VkAccessFlags newAccess, int offset, size_t size)
{
struct ra_buf_vk *buf_vk = buf->priv;
VkBufferMemoryBarrier buffBarrier = {
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.srcAccessMask = buf_vk->current_access,
.dstAccessMask = newAccess,
.buffer = buf_vk->slice.buf,
.offset = offset,
.size = size,
};
if (buf_vk->needsflush || buf->params.host_mapped) {
buffBarrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
buf_vk->current_stage = VK_PIPELINE_STAGE_HOST_BIT;
buf_vk->needsflush = false;
}
if (buffBarrier.srcAccessMask != buffBarrier.dstAccessMask) {
vkCmdPipelineBarrier(cmd->buf, buf_vk->current_stage, newStage, 0,
0, NULL, 1, &buffBarrier, 0, NULL);
}
buf_vk->current_stage = newStage;
buf_vk->current_access = newAccess;
buf_vk->refcount++;
vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, ra, buf);
}
#define vk_buf_destroy vk_buf_deref
MAKE_LAZY_DESTRUCTOR(vk_buf_destroy, struct ra_buf);
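// Write `size` bytes at `offset` into the buffer, either by memcpy for
// host-visible buffers or via vkCmdUpdateBuffer on the GPU otherwise.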
static void vk_buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset,
const void *data, size_t size)
{
assert(buf->params.host_mutable || buf->params.initial_data);
struct ra_buf_vk *buf_vk = buf->priv;
// For host-mapped buffers, we can just directly memcpy the buffer contents.
// Otherwise, we can update the buffer from the GPU using a command buffer.
if (buf_vk->slice.data) {
assert(offset + size <= buf->params.size);
uintptr_t addr = (uintptr_t)buf_vk->slice.data + offset;
memcpy((void *)addr, data, size);
buf_vk->needsflush = true;
} else {
struct vk_cmd *cmd = vk_require_cmd(ra, buf_vk->update_queue);
if (!cmd) {
MP_ERR(ra, "Failed updating buffer!\n");
return;
}
buf_barrier(ra, cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, offset, size);
VkDeviceSize bufOffset = buf_vk->slice.mem.offset + offset;
assert(bufOffset == MP_ALIGN_UP(bufOffset, 4));
vkCmdUpdateBuffer(cmd->buf, buf_vk->slice.buf, bufOffset, size, data);
}
}
static struct ra_buf *vk_buf_create(struct ra *ra,
const struct ra_buf_params *params)
{
struct mpvk_ctx *vk = ra_vk_get(ra);
struct ra_buf *buf = talloc_zero(NULL, struct ra_buf);
buf->params = *params;
struct ra_buf_vk *buf_vk = buf->priv = talloc_zero(buf, struct ra_buf_vk);
buf_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
buf_vk->current_access = 0;
buf_vk->refcount = 1;
VkBufferUsageFlags bufFlags = 0;
VkMemoryPropertyFlags memFlags = 0;
VkDeviceSize align = 4; // alignment 4 is needed for buf_update
switch (params->type) {
case RA_BUF_TYPE_TEX_UPLOAD:
bufFlags |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
// Use TRANSFER-style updates for large enough buffers for efficiency
if (params->size > 1024*1024) // 1 MB
buf_vk->update_queue = TRANSFER;
break;
case RA_BUF_TYPE_UNIFORM:
bufFlags |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
align = MP_ALIGN_UP(align, vk->limits.minUniformBufferOffsetAlignment);
break;
case RA_BUF_TYPE_SHADER_STORAGE:
bufFlags |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
align = MP_ALIGN_UP(align, vk->limits.minStorageBufferOffsetAlignment);
buf_vk->update_queue = COMPUTE;
break;
case RA_BUF_TYPE_VERTEX:
bufFlags |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
break;
default: abort();
}
if (params->host_mutable || params->initial_data) {
bufFlags |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
align = MP_ALIGN_UP(align, vk->limits.optimalBufferCopyOffsetAlignment);
}
if (params->host_mapped) {
memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
}
if (!vk_malloc_buffer(vk, bufFlags, memFlags, params->size, align,
&buf_vk->slice))
{
goto error;
}
if (params->host_mapped)
buf->data = buf_vk->slice.data;
if (params->initial_data)
vk_buf_update(ra, buf, 0, params->initial_data, params->size);
buf->params.initial_data = NULL; // do this after vk_buf_update
return buf;
error:
vk_buf_destroy(ra, buf);
return NULL;
}
static bool vk_buf_poll(struct ra *ra, struct ra_buf *buf)
{
struct ra_buf_vk *buf_vk = buf->priv;
return buf_vk->refcount == 1;
}
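// Upload texel data to a texture. Uploads without a source buffer go through
// the internal buffer pool (ra_tex_upload_pbo); otherwise the source buffer is
// copied into the image with vkCmdCopyBufferToImage.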
static bool vk_tex_upload(struct ra *ra,
const struct ra_tex_upload_params *params)
{
struct ra_tex *tex = params->tex;
struct ra_tex_vk *tex_vk = tex->priv;
if (!params->buf)
return ra_tex_upload_pbo(ra, &tex_vk->pbo, params);
assert(!params->src);
assert(params->buf);
struct ra_buf *buf = params->buf;
struct ra_buf_vk *buf_vk = buf->priv;
VkBufferImageCopy region = {
.bufferOffset = buf_vk->slice.mem.offset + params->buf_offset,
.bufferRowLength = tex->params.w,
.bufferImageHeight = tex->params.h,
.imageSubresource = vk_layers,
.imageExtent = (VkExtent3D){tex->params.w, tex->params.h, tex->params.d},
};
if (tex->params.dimensions == 2) {
int pix_size = tex->params.format->pixel_size;
region.bufferRowLength = params->stride / pix_size;
if (region.bufferRowLength * pix_size != params->stride) {
MP_ERR(ra, "Texture upload strides must be a multiple of the texel "
"size!\n");
goto error;
}
if (params->rc) {
struct mp_rect *rc = params->rc;
region.imageOffset = (VkOffset3D){rc->x0, rc->y0, 0};
region.imageExtent = (VkExtent3D){mp_rect_w(*rc), mp_rect_h(*rc), 1};
}
}
uint64_t size = region.bufferRowLength * region.bufferImageHeight *
region.imageExtent.depth;
struct vk_cmd *cmd = vk_require_cmd(ra, tex_vk->upload_queue);
if (!cmd)
goto error;
buf_barrier(ra, cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_TRANSFER_READ_BIT, region.bufferOffset, size);
tex_barrier(ra, cmd, tex, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
params->invalidate);
vkCmdCopyBufferToImage(cmd->buf, buf_vk->slice.buf, tex_vk->img,
tex_vk->current_layout, 1, &region);
tex_signal(ra, cmd, tex, VK_PIPELINE_STAGE_TRANSFER_BIT);
return true;
error:
return false;
}
#define MPVK_NUM_DS MPVK_MAX_STREAMING_DEPTH
// For ra_renderpass.priv
struct ra_renderpass_vk {
// Pipeline / render pass
VkPipeline pipe;
VkPipelineLayout pipeLayout;
VkRenderPass renderPass;
VkImageLayout initialLayout;
VkImageLayout finalLayout;
// Descriptor set (bindings)
VkDescriptorSetLayout dsLayout;
VkDescriptorPool dsPool;
VkDescriptorSet dss[MPVK_NUM_DS];
int dindex;
// Vertex buffers (vertices)
struct ra_buf_pool vbo;
// For updating
VkWriteDescriptorSet *dswrite;
VkDescriptorImageInfo *dsiinfo;
VkDescriptorBufferInfo *dsbinfo;
};
static void vk_renderpass_destroy(struct ra *ra, struct ra_renderpass *pass)
{
if (!pass)
return;
struct mpvk_ctx *vk = ra_vk_get(ra);
struct ra_renderpass_vk *pass_vk = pass->priv;
ra_buf_pool_uninit(ra, &pass_vk->vbo);
vkDestroyPipeline(vk->dev, pass_vk->pipe, MPVK_ALLOCATOR);
vkDestroyRenderPass(vk->dev, pass_vk->renderPass, MPVK_ALLOCATOR);
vkDestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, MPVK_ALLOCATOR);
vkDestroyDescriptorPool(vk->dev, pass_vk->dsPool, MPVK_ALLOCATOR);
vkDestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, MPVK_ALLOCATOR);
talloc_free(pass);
}
MAKE_LAZY_DESTRUCTOR(vk_renderpass_destroy, struct ra_renderpass);
static const VkDescriptorType dsType[] = {
[RA_VARTYPE_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
[RA_VARTYPE_IMG_W] = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
[RA_VARTYPE_BUF_RO] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
[RA_VARTYPE_BUF_RW] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
};
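// Find a VkFormat that matches a vertex attribute's type, component count and
// size, and that the device supports for vertex buffers.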
static bool vk_get_input_format(struct ra *ra, struct ra_renderpass_input *inp,
VkFormat *out_fmt)
{
struct mpvk_ctx *vk = ra_vk_get(ra);
enum ra_ctype ctype;
switch (inp->type) {
case RA_VARTYPE_FLOAT: ctype = RA_CTYPE_FLOAT; break;
case RA_VARTYPE_BYTE_UNORM: ctype = RA_CTYPE_UNORM; break;
default: abort();
}
assert(inp->dim_m == 1);
for (const struct vk_format *fmt = vk_formats; fmt->name; fmt++) {
if (fmt->ctype != ctype)
continue;
if (fmt->components != inp->dim_v)
continue;
if (fmt->bytes != ra_renderpass_input_layout(inp).size)
continue;
// Ensure this format is valid for vertex attributes
VkFormatProperties prop;
vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop);
if (!(prop.bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT))
continue;
*out_fmt = fmt->iformat;
return true;
}
return false;
}
static const char vk_cache_magic[4] = {'R','A','V','K'};
static const int vk_cache_version = 2;
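// Layout of params->cached_program: this header, followed by the vertex,
// fragment and compute SPIR-V blobs and the raw VkPipelineCache data, in that
// order.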
struct vk_cache_header {
char magic[sizeof(vk_cache_magic)];
int cache_version;
char compiler[SPIRV_NAME_MAX_LEN];
int compiler_version;
size_t vert_spirv_len;
size_t frag_spirv_len;
size_t comp_spirv_len;
size_t pipecache_len;
};
static bool vk_use_cached_program(const struct ra_renderpass_params *params,
const struct spirv_compiler *spirv,
struct bstr *vert_spirv,
struct bstr *frag_spirv,
struct bstr *comp_spirv,
struct bstr *pipecache)
{
struct bstr cache = params->cached_program;
if (cache.len < sizeof(struct vk_cache_header))
return false;
struct vk_cache_header *header = (struct vk_cache_header *)cache.start;
cache = bstr_cut(cache, sizeof(*header));
if (strncmp(header->magic, vk_cache_magic, sizeof(vk_cache_magic)) != 0)
return false;
if (header->cache_version != vk_cache_version)
return false;
if (strncmp(header->compiler, spirv->name, sizeof(header->compiler)) != 0)
return false;
if (header->compiler_version != spirv->compiler_version)
return false;
#define GET(ptr) \
if (cache.len < header->ptr##_len) \
return false; \
*ptr = bstr_splice(cache, 0, header->ptr##_len); \
cache = bstr_cut(cache, ptr->len);
GET(vert_spirv);
GET(frag_spirv);
GET(comp_spirv);
GET(pipecache);
return true;
}
static VkResult vk_compile_glsl(struct ra *ra, void *tactx,
enum glsl_shader type, const char *glsl,
struct bstr *spirv)
{
struct mpvk_ctx *vk = ra_vk_get(ra);
VkResult ret = VK_SUCCESS;
int msgl = MSGL_DEBUG;
if (!vk->spirv->fns->compile_glsl(vk->spirv, tactx, type, glsl, spirv)) {
ret = VK_ERROR_INVALID_SHADER_NV;
msgl = MSGL_ERR;
}
static const char *shader_names[] = {
[GLSL_SHADER_VERTEX] = "vertex",
[GLSL_SHADER_FRAGMENT] = "fragment",
[GLSL_SHADER_COMPUTE] = "compute",
};
if (mp_msg_test(ra->log, msgl)) {
MP_MSG(ra, msgl, "%s shader source:\n", shader_names[type]);
mp_log_source(ra->log, msgl, glsl);
}
return ret;
}
static const VkShaderStageFlags stageFlags[] = {
[RA_RENDERPASS_TYPE_RASTER] = VK_SHADER_STAGE_FRAGMENT_BIT,
[RA_RENDERPASS_TYPE_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT,
};
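// Create a render pass: build the descriptor set layout/pool and pipeline
// layout from the declared inputs, compile (or load cached) SPIR-V, create the
// graphics or compute pipeline, and serialize the results back into
// pass->params.cached_program.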
static struct ra_renderpass *vk_renderpass_create(struct ra *ra,
const struct ra_renderpass_params *params)
{
struct mpvk_ctx *vk = ra_vk_get(ra);
bool success = false;
assert(vk->spirv);
struct ra_renderpass *pass = talloc_zero(NULL, struct ra_renderpass);
pass->params = *ra_renderpass_params_copy(pass, params);
pass->params.cached_program = (bstr){0};
struct ra_renderpass_vk *pass_vk = pass->priv =
talloc_zero(pass, struct ra_renderpass_vk);
// temporary allocations/objects
void *tmp = talloc_new(NULL);
VkPipelineCache pipeCache = NULL;
VkShaderModule vert_shader = NULL;
VkShaderModule frag_shader = NULL;
VkShaderModule comp_shader = NULL;
int dsCount[RA_VARTYPE_COUNT] = {0};
VkDescriptorSetLayoutBinding *bindings = NULL;
int num_bindings = 0;
for (int i = 0; i < params->num_inputs; i++) {
struct ra_renderpass_input *inp = &params->inputs[i];
switch (inp->type) {
case RA_VARTYPE_TEX:
case RA_VARTYPE_IMG_W:
case RA_VARTYPE_BUF_RO:
case RA_VARTYPE_BUF_RW: {
VkDescriptorSetLayoutBinding desc = {
.binding = inp->binding,
.descriptorType = dsType[inp->type],
.descriptorCount = 1,
.stageFlags = stageFlags[params->type],
};
MP_TARRAY_APPEND(tmp, bindings, num_bindings, desc);
dsCount[inp->type]++;
break;
}
default: abort();
}
}
VkDescriptorPoolSize *dsPoolSizes = NULL;
int poolSizeCount = 0;
for (enum ra_vartype t = 0; t < RA_VARTYPE_COUNT; t++) {
if (dsCount[t] > 0) {
VkDescriptorPoolSize dssize = {
.type = dsType[t],
.descriptorCount = dsCount[t] * MPVK_NUM_DS,
};
MP_TARRAY_APPEND(tmp, dsPoolSizes, poolSizeCount, dssize);
}
}
VkDescriptorPoolCreateInfo pinfo = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
.maxSets = MPVK_NUM_DS,
.pPoolSizes = dsPoolSizes,
.poolSizeCount = poolSizeCount,
};
VK(vkCreateDescriptorPool(vk->dev, &pinfo, MPVK_ALLOCATOR, &pass_vk->dsPool));
pass_vk->dswrite = talloc_array(pass, VkWriteDescriptorSet, num_bindings);
pass_vk->dsiinfo = talloc_array(pass, VkDescriptorImageInfo, num_bindings);
pass_vk->dsbinfo = talloc_array(pass, VkDescriptorBufferInfo, num_bindings);
VkDescriptorSetLayoutCreateInfo dinfo = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
.pBindings = bindings,
.bindingCount = num_bindings,
};
VK(vkCreateDescriptorSetLayout(vk->dev, &dinfo, MPVK_ALLOCATOR,
&pass_vk->dsLayout));
VkDescriptorSetLayout layouts[MPVK_NUM_DS];
for (int i = 0; i < MPVK_NUM_DS; i++)
layouts[i] = pass_vk->dsLayout;
VkDescriptorSetAllocateInfo ainfo = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
.descriptorPool = pass_vk->dsPool,
.descriptorSetCount = MPVK_NUM_DS,
.pSetLayouts = layouts,
};
VK(vkAllocateDescriptorSets(vk->dev, &ainfo, pass_vk->dss));
VkPipelineLayoutCreateInfo linfo = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 1,
.pSetLayouts = &pass_vk->dsLayout,
.pushConstantRangeCount = params->push_constants_size ? 1 : 0,
.pPushConstantRanges = &(VkPushConstantRange){
.stageFlags = stageFlags[params->type],
.offset = 0,
.size = params->push_constants_size,
},
};
VK(vkCreatePipelineLayout(vk->dev, &linfo, MPVK_ALLOCATOR,
&pass_vk->pipeLayout));
struct bstr vert = {0}, frag = {0}, comp = {0}, pipecache = {0};
if (vk_use_cached_program(params, vk->spirv, &vert, &frag, &comp, &pipecache)) {
MP_VERBOSE(ra, "Using cached SPIR-V and VkPipeline.\n");
} else {
pipecache.len = 0;
switch (params->type) {
case RA_RENDERPASS_TYPE_RASTER:
VK(vk_compile_glsl(ra, tmp, GLSL_SHADER_VERTEX,
params->vertex_shader, &vert));
VK(vk_compile_glsl(ra, tmp, GLSL_SHADER_FRAGMENT,
params->frag_shader, &frag));
comp.len = 0;
break;
case RA_RENDERPASS_TYPE_COMPUTE:
VK(vk_compile_glsl(ra, tmp, GLSL_SHADER_COMPUTE,
params->compute_shader, &comp));
frag.len = 0;
vert.len = 0;
break;
}
}
VkPipelineCacheCreateInfo pcinfo = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,
.pInitialData = pipecache.start,
.initialDataSize = pipecache.len,
};
VK(vkCreatePipelineCache(vk->dev, &pcinfo, MPVK_ALLOCATOR, &pipeCache));
VkShaderModuleCreateInfo sinfo = {
.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
};
switch (params->type) {
case RA_RENDERPASS_TYPE_RASTER: {
sinfo.pCode = (uint32_t *)vert.start;
sinfo.codeSize = vert.len;
VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &vert_shader));
sinfo.pCode = (uint32_t *)frag.start;
sinfo.codeSize = frag.len;
VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &frag_shader));
VkVertexInputAttributeDescription *attrs = talloc_array(tmp,
VkVertexInputAttributeDescription, params->num_vertex_attribs);
for (int i = 0; i < params->num_vertex_attribs; i++) {
struct ra_renderpass_input *inp = &params->vertex_attribs[i];
attrs[i] = (VkVertexInputAttributeDescription) {
.location = i,
.binding = 0,
.offset = inp->offset,
};
if (!vk_get_input_format(ra, inp, &attrs[i].format)) {
MP_ERR(ra, "No suitable VkFormat for vertex attrib '%s'!\n",
inp->name);
goto error;
}
}
// This is the most common case, so optimize towards it. In this case,
// the renderpass will take care of almost all layout transitions
pass_vk->initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
pass_vk->finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
VkAttachmentLoadOp loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
// If we're blending, then we need to explicitly load the previous
// contents of the color attachment
if (pass->params.enable_blend)
loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
// If we're invalidating the target, we don't need to load or transition
if (pass->params.invalidate_target) {
pass_vk->initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
}
VK(vk_create_render_pass(vk->dev, params->target_format, loadOp,
pass_vk->initialLayout, pass_vk->finalLayout,
&pass_vk->renderPass));
static const VkBlendFactor blendFactors[] = {
[RA_BLEND_ZERO] = VK_BLEND_FACTOR_ZERO,
[RA_BLEND_ONE] = VK_BLEND_FACTOR_ONE,
[RA_BLEND_SRC_ALPHA] = VK_BLEND_FACTOR_SRC_ALPHA,
[RA_BLEND_ONE_MINUS_SRC_ALPHA] = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA,
};
VkGraphicsPipelineCreateInfo cinfo = {
.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
.stageCount = 2,
.pStages = (VkPipelineShaderStageCreateInfo[]) {
{
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_VERTEX_BIT,
.module = vert_shader,
.pName = "main",
}, {
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_FRAGMENT_BIT,
.module = frag_shader,
.pName = "main",
}
},
.pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) {
.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
.vertexBindingDescriptionCount = 1,
.pVertexBindingDescriptions = &(VkVertexInputBindingDescription) {
.binding = 0,
.stride = params->vertex_stride,
.inputRate = VK_VERTEX_INPUT_RATE_VERTEX,
},
.vertexAttributeDescriptionCount = params->num_vertex_attribs,
.pVertexAttributeDescriptions = attrs,
},
.pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
},
.pViewportState = &(VkPipelineViewportStateCreateInfo) {
.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
.viewportCount = 1,
.scissorCount = 1,
},
.pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
.polygonMode = VK_POLYGON_MODE_FILL,
.cullMode = VK_CULL_MODE_NONE,
.lineWidth = 1.0f,
},
.pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
},
.pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) {
.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
.attachmentCount = 1,
.pAttachments = &(VkPipelineColorBlendAttachmentState) {
.blendEnable = params->enable_blend,
.colorBlendOp = VK_BLEND_OP_ADD,
.srcColorBlendFactor = blendFactors[params->blend_src_rgb],
.dstColorBlendFactor = blendFactors[params->blend_dst_rgb],
.alphaBlendOp = VK_BLEND_OP_ADD,
.srcAlphaBlendFactor = blendFactors[params->blend_src_alpha],
.dstAlphaBlendFactor = blendFactors[params->blend_dst_alpha],
.colorWriteMask = VK_COLOR_COMPONENT_R_BIT |
VK_COLOR_COMPONENT_G_BIT |
VK_COLOR_COMPONENT_B_BIT |
VK_COLOR_COMPONENT_A_BIT,
},
},
.pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
.dynamicStateCount = 2,
.pDynamicStates = (VkDynamicState[]){
VK_DYNAMIC_STATE_VIEWPORT,
VK_DYNAMIC_STATE_SCISSOR,
},
},
.layout = pass_vk->pipeLayout,
.renderPass = pass_vk->renderPass,
};
VK(vkCreateGraphicsPipelines(vk->dev, pipeCache, 1, &cinfo,
MPVK_ALLOCATOR, &pass_vk->pipe));
break;
}
case RA_RENDERPASS_TYPE_COMPUTE: {
sinfo.pCode = (uint32_t *)comp.start;
sinfo.codeSize = comp.len;
VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &comp_shader));
VkComputePipelineCreateInfo cinfo = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.stage = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = comp_shader,
.pName = "main",
},
.layout = pass_vk->pipeLayout,
};
VK(vkCreateComputePipelines(vk->dev, pipeCache, 1, &cinfo,
MPVK_ALLOCATOR, &pass_vk->pipe));
break;
}
}
// Update params->cached_program
struct bstr cache = {0};
VK(vkGetPipelineCacheData(vk->dev, pipeCache, &cache.len, NULL));
cache.start = talloc_size(tmp, cache.len);
VK(vkGetPipelineCacheData(vk->dev, pipeCache, &cache.len, cache.start));
struct vk_cache_header header = {
.cache_version = vk_cache_version,
.compiler_version = vk->spirv->compiler_version,
.vert_spirv_len = vert.len,
.frag_spirv_len = frag.len,
.comp_spirv_len = comp.len,
.pipecache_len = cache.len,
};
for (int i = 0; i < MP_ARRAY_SIZE(header.magic); i++)
header.magic[i] = vk_cache_magic[i];
for (int i = 0; i < sizeof(vk->spirv->name); i++)
header.compiler[i] = vk->spirv->name[i];
struct bstr *prog = &pass->params.cached_program;
bstr_xappend(pass, prog, (struct bstr){ (char *) &header, sizeof(header) });
bstr_xappend(pass, prog, vert);
bstr_xappend(pass, prog, frag);
bstr_xappend(pass, prog, comp);
bstr_xappend(pass, prog, cache);
success = true;
error:
if (!success) {
vk_renderpass_destroy(ra, pass);
pass = NULL;
}
vkDestroyShaderModule(vk->dev, vert_shader, MPVK_ALLOCATOR);
vkDestroyShaderModule(vk->dev, frag_shader, MPVK_ALLOCATOR);
vkDestroyShaderModule(vk->dev, comp_shader, MPVK_ALLOCATOR);
vkDestroyPipelineCache(vk->dev, pipeCache, MPVK_ALLOCATOR);
talloc_free(tmp);
return pass;
}
static const VkPipelineStageFlags passStages[] = {
[RA_RENDERPASS_TYPE_RASTER] = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
[RA_RENDERPASS_TYPE_COMPUTE] = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
};
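// Fill in the descriptor write (and image/buffer info) for one bound input,
// issuing the barriers needed to make the referenced texture or buffer usable
// from the shader stage of this pass.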
static void vk_update_descriptor(struct ra *ra, struct vk_cmd *cmd,
struct ra_renderpass *pass,
struct ra_renderpass_input_val val,
VkDescriptorSet ds, int idx)
{
struct ra_renderpass_vk *pass_vk = pass->priv;
struct ra_renderpass_input *inp = &pass->params.inputs[val.index];
VkWriteDescriptorSet *wds = &pass_vk->dswrite[idx];
*wds = (VkWriteDescriptorSet) {
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = ds,
.dstBinding = inp->binding,
.descriptorCount = 1,
.descriptorType = dsType[inp->type],
};
switch (inp->type) {
case RA_VARTYPE_TEX: {
struct ra_tex *tex = *(struct ra_tex **)val.data;
struct ra_tex_vk *tex_vk = tex->priv;
assert(tex->params.render_src);
tex_barrier(ra, cmd, tex, passStages[pass->params.type],
VK_ACCESS_SHADER_READ_BIT,
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, false);
VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx];
*iinfo = (VkDescriptorImageInfo) {
.sampler = tex_vk->sampler,
.imageView = tex_vk->view,
.imageLayout = tex_vk->current_layout,
};
wds->pImageInfo = iinfo;
break;
}
case RA_VARTYPE_IMG_W: {
struct ra_tex *tex = *(struct ra_tex **)val.data;
struct ra_tex_vk *tex_vk = tex->priv;
assert(tex->params.storage_dst);
tex_barrier(ra, cmd, tex, passStages[pass->params.type],
VK_ACCESS_SHADER_WRITE_BIT,
VK_IMAGE_LAYOUT_GENERAL, false);
VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx];
*iinfo = (VkDescriptorImageInfo) {
.imageView = tex_vk->view,
.imageLayout = tex_vk->current_layout,
};
wds->pImageInfo = iinfo;
break;
}
case RA_VARTYPE_BUF_RO:
case RA_VARTYPE_BUF_RW: {
struct ra_buf *buf = *(struct ra_buf **)val.data;
struct ra_buf_vk *buf_vk = buf->priv;
VkAccessFlags access = VK_ACCESS_SHADER_READ_BIT;
if (inp->type == RA_VARTYPE_BUF_RW)
access |= VK_ACCESS_SHADER_WRITE_BIT;
buf_barrier(ra, cmd, buf, passStages[pass->params.type],
access, buf_vk->slice.mem.offset, buf->params.size);
VkDescriptorBufferInfo *binfo = &pass_vk->dsbinfo[idx];
*binfo = (VkDescriptorBufferInfo) {
.buffer = buf_vk->slice.buf,
.offset = buf_vk->slice.mem.offset,
.range = buf->params.size,
};
wds->pBufferInfo = binfo;
break;
}
}
}
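// Release per-run state for a bound input; currently this only signals
// textures so they can be safely reused by later commands.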
static void vk_release_descriptor(struct ra *ra, struct vk_cmd *cmd,
struct ra_renderpass *pass,
struct ra_renderpass_input_val val)
{
struct ra_renderpass_input *inp = &pass->params.inputs[val.index];
switch (inp->type) {
case RA_VARTYPE_IMG_W:
case RA_VARTYPE_TEX: {
struct ra_tex *tex = *(struct ra_tex **)val.data;
tex_signal(ra, cmd, tex, passStages[pass->params.type]);
break;
}
}
}
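// Execute a render pass: bind the pipeline and descriptors, push constants,
// then either dispatch the compute workgroups or upload the vertex data and
// draw into the target framebuffer.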
static void vk_renderpass_run(struct ra *ra,
const struct ra_renderpass_run_params *params)
{
struct mpvk_ctx *vk = ra_vk_get(ra);
struct ra_renderpass *pass = params->pass;
struct ra_renderpass_vk *pass_vk = pass->priv;
static const enum queue_type types[] = {
[RA_RENDERPASS_TYPE_RASTER] = GRAPHICS,
[RA_RENDERPASS_TYPE_COMPUTE] = COMPUTE,
};
struct vk_cmd *cmd = vk_require_cmd(ra, types[pass->params.type]);
if (!cmd)
goto error;
static const VkPipelineBindPoint bindPoint[] = {
[RA_RENDERPASS_TYPE_RASTER] = VK_PIPELINE_BIND_POINT_GRAPHICS,
[RA_RENDERPASS_TYPE_COMPUTE] = VK_PIPELINE_BIND_POINT_COMPUTE,
};
vkCmdBindPipeline(cmd->buf, bindPoint[pass->params.type], pass_vk->pipe);
VkDescriptorSet ds = pass_vk->dss[pass_vk->dindex++];
pass_vk->dindex %= MPVK_NUM_DS;
for (int i = 0; i < params->num_values; i++)
vk_update_descriptor(ra, cmd, pass, params->values[i], ds, i);
if (params->num_values > 0) {
vkUpdateDescriptorSets(vk->dev, params->num_values, pass_vk->dswrite,
0, NULL);
}
vkCmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type],
pass_vk->pipeLayout, 0, 1, &ds, 0, NULL);
if (pass->params.push_constants_size) {
vkCmdPushConstants(cmd->buf, pass_vk->pipeLayout,
stageFlags[pass->params.type], 0,
pass->params.push_constants_size,
params->push_constants);
}
switch (pass->params.type) {
case RA_RENDERPASS_TYPE_COMPUTE:
vkCmdDispatch(cmd->buf, params->compute_groups[0],
params->compute_groups[1],
params->compute_groups[2]);
break;
case RA_RENDERPASS_TYPE_RASTER: {
struct ra_tex *tex = params->target;
struct ra_tex_vk *tex_vk = tex->priv;
assert(tex->params.render_dst);
struct ra_buf_params buf_params = {
.type = RA_BUF_TYPE_VERTEX,
.size = params->vertex_count * pass->params.vertex_stride,
.host_mutable = true,
};
struct ra_buf *buf = ra_buf_pool_get(ra, &pass_vk->vbo, &buf_params);
if (!buf) {
MP_ERR(ra, "Failed allocating vertex buffer!\n");
goto error;
}
struct ra_buf_vk *buf_vk = buf->priv;
vk_buf_update(ra, buf, 0, params->vertex_data, buf_params.size);
buf_barrier(ra, cmd, buf, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT,
buf_vk->slice.mem.offset, buf_params.size);
vkCmdBindVertexBuffers(cmd->buf, 0, 1, &buf_vk->slice.buf,
&buf_vk->slice.mem.offset);
tex_barrier(ra, cmd, tex, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, pass_vk->initialLayout,
pass->params.invalidate_target);
VkViewport viewport = {
.x = params->viewport.x0,
.y = params->viewport.y0,
.width = mp_rect_w(params->viewport),
.height = mp_rect_h(params->viewport),
};
VkRect2D scissor = {
.offset = {params->scissors.x0, params->scissors.y0},
.extent = {mp_rect_w(params->scissors), mp_rect_h(params->scissors)},
};
vkCmdSetViewport(cmd->buf, 0, 1, &viewport);
vkCmdSetScissor(cmd->buf, 0, 1, &scissor);
VkRenderPassBeginInfo binfo = {
.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
.renderPass = pass_vk->renderPass,
.framebuffer = tex_vk->framebuffer,
.renderArea = (VkRect2D){{0, 0}, {tex->params.w, tex->params.h}},
};
vkCmdBeginRenderPass(cmd->buf, &binfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdDraw(cmd->buf, params->vertex_count, 1, 0, 0);
vkCmdEndRenderPass(cmd->buf);
// The renderPass implicitly transitions the texture to this layout
tex_vk->current_layout = pass_vk->finalLayout;
tex_vk->current_access = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
tex_signal(ra, cmd, tex, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);
break;
}
default: abort();
};
for (int i = 0; i < params->num_values; i++)
vk_release_descriptor(ra, cmd, pass, params->values[i]);
// flush the work so far into its own command buffer, for better cross-frame
// granularity
vk_submit(ra);
error:
return;
}
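// Copy a rectangle from src to dst, using vkCmdCopyImage when no scaling is
// needed and the formats are compatible, and vkCmdBlitImage (nearest filter)
// otherwise.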
static void vk_blit(struct ra *ra, struct ra_tex *dst, struct ra_tex *src,
struct mp_rect *dst_rc, struct mp_rect *src_rc)
{
assert(src->params.blit_src);
assert(dst->params.blit_dst);
struct ra_tex_vk *src_vk = src->priv;
struct ra_tex_vk *dst_vk = dst->priv;
struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
if (!cmd)
return;
tex_barrier(ra, cmd, src, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_TRANSFER_READ_BIT,
VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
false);
bool discard = dst_rc->x0 == 0 &&
dst_rc->y0 == 0 &&
dst_rc->x1 == dst->params.w &&
dst_rc->y1 == dst->params.h;
tex_barrier(ra, cmd, dst, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
discard);
// Under certain conditions we can use vkCmdCopyImage instead of
// vkCmdBlitImage, namely when the blit operation does not require
// scaling and the formats are compatible.
if (src->params.format->pixel_size == dst->params.format->pixel_size &&
mp_rect_w(*src_rc) == mp_rect_w(*dst_rc) &&
mp_rect_h(*src_rc) == mp_rect_h(*dst_rc) &&
mp_rect_w(*src_rc) >= 0 && mp_rect_h(*src_rc) >= 0)
{
VkImageCopy region = {
.srcSubresource = vk_layers,
.dstSubresource = vk_layers,
.srcOffset = {src_rc->x0, src_rc->y0, 0},
.dstOffset = {dst_rc->x0, dst_rc->y0, 0},
.extent = {mp_rect_w(*src_rc), mp_rect_h(*src_rc), 1},
};
vkCmdCopyImage(cmd->buf, src_vk->img, src_vk->current_layout,
dst_vk->img, dst_vk->current_layout, 1, &region);
} else {
VkImageBlit region = {
.srcSubresource = vk_layers,
.dstSubresource = vk_layers,
.srcOffsets = {{src_rc->x0, src_rc->y0, 0},
{src_rc->x1, src_rc->y1, 1}},
.dstOffsets = {{dst_rc->x0, dst_rc->y0, 0},
{dst_rc->x1, dst_rc->y1, 1}},
};
vkCmdBlitImage(cmd->buf, src_vk->img, src_vk->current_layout,
dst_vk->img, dst_vk->current_layout, 1, &region,
VK_FILTER_NEAREST);
}
tex_signal(ra, cmd, src, VK_PIPELINE_STAGE_TRANSFER_BIT);
tex_signal(ra, cmd, dst, VK_PIPELINE_STAGE_TRANSFER_BIT);
}
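// Clear a texture to a solid color: full-image clears use
// vkCmdClearColorImage, partial clears are emulated by blitting from the 1x1
// clear_tex allocated in ra_create_vk.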
static void vk_clear(struct ra *ra, struct ra_tex *tex, float color[4],
struct mp_rect *rc)
{
struct ra_vk *p = ra->priv;
struct ra_tex_vk *tex_vk = tex->priv;
assert(tex->params.blit_dst);
struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
if (!cmd)
return;
struct mp_rect full = {0, 0, tex->params.w, tex->params.h};
if (!rc || mp_rect_equals(rc, &full)) {
// To clear the entire image, we can use the efficient clear command
tex_barrier(ra, cmd, tex, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, true);
VkClearColorValue clearColor = {0};
for (int c = 0; c < 4; c++)
clearColor.float32[c] = color[c];
vkCmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->current_layout,
&clearColor, 1, &vk_range);
tex_signal(ra, cmd, tex, VK_PIPELINE_STAGE_TRANSFER_BIT);
} else {
// To simulate per-region clearing, we blit from a 1x1 texture instead
struct ra_tex_upload_params ul_params = {
.tex = p->clear_tex,
.invalidate = true,
.src = &color[0],
};
vk_tex_upload(ra, &ul_params);
vk_blit(ra, tex, p->clear_tex, rc, &(struct mp_rect){0, 0, 1, 1});
}
}
static int vk_desc_namespace(enum ra_vartype type)
{
return 0;
}
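// GPU timers use a ring of timestamp query pairs. vk_timer_start reads back
// the pair last recorded in the current slot (reporting 0 if the results are
// not ready yet) and then writes the start timestamp; vk_timer_stop writes the
// end timestamp and returns the most recent complete result.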
#define VK_QUERY_POOL_SIZE (MPVK_MAX_STREAMING_DEPTH * 4)
struct vk_timer {
VkQueryPool pool;
int index;
uint64_t result;
};
static void vk_timer_destroy(struct ra *ra, ra_timer *ratimer)
{
if (!ratimer)
return;
struct mpvk_ctx *vk = ra_vk_get(ra);
struct vk_timer *timer = ratimer;
vkDestroyQueryPool(vk->dev, timer->pool, MPVK_ALLOCATOR);
talloc_free(timer);
}
MAKE_LAZY_DESTRUCTOR(vk_timer_destroy, ra_timer);
static ra_timer *vk_timer_create(struct ra *ra)
{
struct mpvk_ctx *vk = ra_vk_get(ra);
struct vk_timer *timer = talloc_zero(NULL, struct vk_timer);
struct VkQueryPoolCreateInfo qinfo = {
.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
.queryType = VK_QUERY_TYPE_TIMESTAMP,
.queryCount = VK_QUERY_POOL_SIZE,
};
VK(vkCreateQueryPool(vk->dev, &qinfo, MPVK_ALLOCATOR, &timer->pool));
return (ra_timer *)timer;
error:
vk_timer_destroy(ra, timer);
return NULL;
}
static void vk_timer_record(struct ra *ra, VkQueryPool pool, int index,
VkPipelineStageFlags stage)
{
struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
if (!cmd)
return;
vkCmdWriteTimestamp(cmd->buf, stage, pool, index);
}
static void vk_timer_start(struct ra *ra, ra_timer *ratimer)
{
struct mpvk_ctx *vk = ra_vk_get(ra);
struct vk_timer *timer = ratimer;
timer->index = (timer->index + 2) % VK_QUERY_POOL_SIZE;
uint64_t out[2];
VkResult res = vkGetQueryPoolResults(vk->dev, timer->pool, timer->index, 2,
sizeof(out), &out[0], sizeof(uint64_t),
VK_QUERY_RESULT_64_BIT);
switch (res) {
case VK_SUCCESS:
timer->result = (out[1] - out[0]) * vk->limits.timestampPeriod;
break;
case VK_NOT_READY:
timer->result = 0;
break;
default:
MP_WARN(vk, "Failed reading timer query result: %s\n", vk_err(res));
return;
};
vk_timer_record(ra, timer->pool, timer->index,
VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
}
static uint64_t vk_timer_stop(struct ra *ra, ra_timer *ratimer)
{
struct vk_timer *timer = ratimer;
vk_timer_record(ra, timer->pool, timer->index + 1,
VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
return timer->result;
}
static struct ra_fns ra_fns_vk = {
.destroy = vk_destroy_ra,
.tex_create = vk_tex_create,
.tex_destroy = vk_tex_destroy_lazy,
.tex_upload = vk_tex_upload,
.buf_create = vk_buf_create,
.buf_destroy = vk_buf_destroy_lazy,
.buf_update = vk_buf_update,
.buf_poll = vk_buf_poll,
.clear = vk_clear,
.blit = vk_blit,
.uniform_layout = std140_layout,
.push_constant_layout = std430_layout,
.desc_namespace = vk_desc_namespace,
.renderpass_create = vk_renderpass_create,
.renderpass_destroy = vk_renderpass_destroy_lazy,
.renderpass_run = vk_renderpass_run,
.timer_create = vk_timer_create,
.timer_destroy = vk_timer_destroy_lazy,
.timer_start = vk_timer_start,
.timer_stop = vk_timer_stop,
};
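// Finish rendering to a wrapped swapchain image: transition it to
// PRESENT_SRC_KHR and hand the recording command buffer back to the caller,
// who is responsible for submitting it, instead of queuing it internally.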
struct vk_cmd *ra_vk_submit(struct ra *ra, struct ra_tex *tex)
{
struct ra_vk *p = ra->priv;
struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
if (!cmd)
return NULL;
struct ra_tex_vk *tex_vk = tex->priv;
assert(tex_vk->external_img);
tex_barrier(ra, cmd, tex, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
VK_ACCESS_MEMORY_READ_BIT, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
false);
// Return this directly instead of going through vk_submit
p->cmd = NULL;
return cmd;
}