diff --git a/video/out/gpu/ra.h b/video/out/gpu/ra.h
index ffb010960a..08ccdaee70 100644
--- a/video/out/gpu/ra.h
+++ b/video/out/gpu/ra.h
@@ -53,6 +53,7 @@ enum {
     RA_CAP_GLOBAL_UNIFORM = 1 << 8, // supports using "naked" uniforms (not UBO)
     RA_CAP_GATHER = 1 << 9, // supports textureGather in GLSL
     RA_CAP_FRAGCOORD = 1 << 10, // supports reading from gl_FragCoord
+    RA_CAP_PARALLEL_COMPUTE = 1 << 11, // supports parallel compute shaders
 };
 
 enum ra_ctype {
diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c
index 3f0959931d..1b50166dc4 100644
--- a/video/out/gpu/video.c
+++ b/video/out/gpu/video.c
@@ -1237,6 +1237,11 @@ static void finish_pass_tex(struct gl_video *p, struct ra_tex **dst_tex,
         return;
     }
 
+    // If RA_CAP_PARALLEL_COMPUTE is set, try to prefer compute shaders
+    // over fragment shaders wherever possible.
+    if (!p->pass_compute.active && (p->ra->caps & RA_CAP_PARALLEL_COMPUTE))
+        pass_is_compute(p, 16, 16);
+
     if (p->pass_compute.active) {
         gl_sc_uniform_image2D_wo(p->sc, "out_image", *dst_tex);
         if (!p->pass_compute.directly_writes)
diff --git a/video/out/vulkan/ra_vk.c b/video/out/vulkan/ra_vk.c
index 905fc89596..f0353629e6 100644
--- a/video/out/vulkan/ra_vk.c
+++ b/video/out/vulkan/ra_vk.c
@@ -208,8 +208,13 @@ struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log)
     ra->max_shmem = vk->limits.maxComputeSharedMemorySize;
     ra->max_pushc_size = vk->limits.maxPushConstantsSize;
 
-    if (vk->pool_compute)
+    if (vk->pool_compute) {
         ra->caps |= RA_CAP_COMPUTE;
+        // If we have more compute queues than graphics queues, we probably
+        // want to be using them. (This seems mostly relevant for AMD)
+        if (vk->pool_compute->num_queues > vk->pool_graphics->num_queues)
+            ra->caps |= RA_CAP_PARALLEL_COMPUTE;
+    }
 
     if (!vk_setup_formats(ra))
         goto error;