diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index 76b9d829ab..5a4d17e454 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -1755,7 +1755,21 @@ static void pass_sample(struct gl_video *p, struct img_tex tex,
     } else if (strcmp(name, "oversample") == 0) {
         pass_sample_oversample(p->sc, scaler, w, h);
     } else if (scaler->kernel && scaler->kernel->polar) {
-        pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
+        // Use a compute shader where possible, and fall back to the slower
+        // texture fragment sampler otherwise. Also use the fragment shader
+        // for very large kernels to avoid exhausting shmem.
+        if (p->gl->glsl_version < 430 || scaler->kernel->f.radius > 16) {
+            pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
+        } else {
+            // For performance we want to load at least as many pixels
+            // horizontally as there are threads in a warp (32 for nvidia), as
+            // well as enough to take advantage of shmem parallelism
+            const int warp_size = 32, threads = 256;
+            compute_size_minimum(p, warp_size, threads / warp_size);
+            pass_compute_polar(p->sc, scaler, tex.components,
+                               p->compute_w, p->compute_h,
+                               (float)w / tex.w, (float)h / tex.h);
+        }
     } else if (scaler->kernel) {
         pass_sample_separated(p, tex, scaler, w, h);
     } else {
diff --git a/video/out/opengl/video_shaders.c b/video/out/opengl/video_shaders.c
index a7ecf1a448..fe6e944168 100644
--- a/video/out/opengl/video_shaders.c
+++ b/video/out/opengl/video_shaders.c
@@ -106,9 +106,11 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler
 }
 
 // Subroutine for computing and adding an individual texel contribution
-// If subtexel < 0, samples directly. Otherwise, takes the texel from cN[comp]
+// If subtexel < 0 and offset < 0, samples directly.
+// If subtexel >= 0, takes the texel from cN[subtexel]
+// If offset >= 0, takes the texel from inN[rel.y+y+offset][rel.x+x+offset]
 static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler,
-                         int x, int y, int subtexel, int components)
+                         int x, int y, int subtexel, int offset, int components)
 {
     double radius = scaler->kernel->f.radius * scaler->kernel->filter_scale;
     double radius_cutoff = scaler->kernel->radius_cutoff;
@@ -137,12 +139,19 @@ static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler,
     }
     GLSL(wsum += w;)
 
-    if (subtexel < 0) {
+    if (subtexel < 0 && offset < 0) {
         GLSLF("c0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y);
         GLSL(color += vec4(w) * c0;)
-    } else {
+    } else if (subtexel >= 0) {
         for (int n = 0; n < components; n++)
             GLSLF("color[%d] += w * c%d[%d];\n", n, n, subtexel);
+    } else if (offset >= 0) {
+        for (int n = 0; n <components; n++)
+            GLSLF("color[%d] += w * in%d[rel.y+%d][rel.x+%d];\n", n, n,
+                  y + offset, x + offset);
+    } else {
+        // invalid usage
+        abort();
     }
 
     if (maybe_skippable)
@@ -192,13 +201,13 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
                     static const int yo[4] = {1, 1, 0, 0};
                     if (x+xo[p] > bound || y+yo[p] > bound)
                         continue;
-                    polar_sample(sc, scaler, x+xo[p], y+yo[p], p, components);
+                    polar_sample(sc, scaler, x+xo[p], y+yo[p], p, -1, components);
                 }
             } else {
                 // switch to direct sampling instead, for efficiency/compatibility
                 for (int yy = y; yy <= bound && yy <= y+1; yy++) {
                     for (int xx = x; xx <= bound && xx <= x+1; xx++)
-                        polar_sample(sc, scaler, xx, yy, -1, components);
+                        polar_sample(sc, scaler, xx, yy, -1, -1, components);
                 }
             }
         }
@@ -208,6 +217,54 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
     GLSLF("}\n");
 }
 
+void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
+                        int components, int bw, int bh, float ratiox,
+                        float ratioy)
+{   // Emit GLSL for a polar scaler that reads its taps from shared memory
+    int bound = ceil(scaler->kernel->radius_cutoff);
+    int offset = bound - 1; // padding top/left (bottom/right needs bound)
+    int padding = offset + bound; // total padding, both sides combined
+
+    // The shared arrays must fit every texel the work group may sample:
+    // ceil(bw / ratiox) by ceil(bh / ratioy) texels, plus padding + 1.
+    int iw = (int)ceil(bw / ratiox) + padding + 1,
+        ih = (int)ceil(bh / ratioy) + padding + 1;
+    // NOTE(review): iw * ih is not checked against any shmem limit here
+    GLSL(color = vec4(0.0);)
+    GLSLF("{\n");
+    GLSL(vec2 wpos = texmap0(gl_WorkGroupID * gl_WorkGroupSize);)
+    GLSL(vec2 wbase = wpos - pt * fract(wpos * size - vec2(0.5));)
+    GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));)
+    GLSL(vec2 base = pos - pt * fcoord;)
+    GLSL(ivec2 rel = ivec2(round((base - wbase) * size));)
+    GLSLF("float w, d, wsum = 0.0;\n");
+    gl_sc_uniform_tex(sc, "lut", scaler->gl_target, scaler->gl_lut);
+
+    // Declare one shared float array of ih x iw entries per component
+    for (int c = 0; c < components; c++)
+        GLSLHF("shared float in%d[%d][%d];\n", c, ih, iw);
+    // Each invocation starts at its local ID and steps by (bw, bh)
+    GLSL(vec4 c;)
+    GLSLF("for (int y = int(gl_LocalInvocationID.y); y < %d; y += %d) {\n", ih, bh);
+    GLSLF("for (int x = int(gl_LocalInvocationID.x); x < %d; x += %d) {\n", iw, bw);
+    GLSLF("c = texture(tex, wbase + pt * vec2(x - %d, y - %d));\n", offset, offset);
+    for (int c = 0; c < components; c++)
+        GLSLF("in%d[y][x] = c[%d];\n", c, c);
+    GLSLF("}}\n"); // closes both load loops
+    GLSL(groupMemoryBarrier();)
+    GLSL(barrier();) // all loads must be done before any tap is read
+
+    // Emit one weighted tap per kernel position, read from the inN arrays
+    GLSLF("// scaler samples\n");
+    for (int y = 1-bound; y <= bound; y++) {
+        for (int x = 1-bound; x <= bound; x++)
+            polar_sample(sc, scaler, x, y, -1, offset, components);
+    }
+
+    GLSL(color = color / vec4(wsum);) // normalize by the summed weights
+    GLSLF("}\n");
+}
+
 static void bicubic_calcweights(struct gl_shader_cache *sc, const char *t, const char *s)
 {
     // Explanation of how bicubic scaling with only 4 texel fetches is done:
diff --git a/video/out/opengl/video_shaders.h b/video/out/opengl/video_shaders.h
index e0594f28f3..597027ca6b 100644
--- a/video/out/opengl/video_shaders.h
+++ b/video/out/opengl/video_shaders.h
@@ -32,6 +32,9 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler
                                int d_x, int d_y);
 void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
                        int components, int glsl_version);
+void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
+                        int components, int bw, int bh, float ratiox,
+                        float ratioy);
 void pass_sample_bicubic_fast(struct gl_shader_cache *sc);
 void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
                             int w, int h);