diff --git a/video/out/opengl/common.c b/video/out/opengl/common.c
index c7a714817a..f6202e2c8c 100644
--- a/video/out/opengl/common.c
+++ b/video/out/opengl/common.c
@@ -625,8 +625,10 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n),
     }
 
     // GL_ARB_compute_shader & GL_ARB_shader_image_load_store
-    if (gl->DispatchCompute && gl->BindImageTexture)
+    if (gl->DispatchCompute && gl->BindImageTexture) {
         gl->mpgl_caps |= MPGL_CAP_COMPUTE_SHADER;
+        gl->GetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &gl->max_shmem);
+    }
 
     // Provided for simpler handling if no framebuffer support is available.
     if (!gl->BindFramebuffer)
diff --git a/video/out/opengl/common.h b/video/out/opengl/common.h
index 6d8015c8b3..abc8f30192 100644
--- a/video/out/opengl/common.h
+++ b/video/out/opengl/common.h
@@ -87,6 +87,7 @@ struct GL {
     int glsl_version;           // e.g. 130 for GLSL 1.30
     char *extensions;           // Equivalent to GL_EXTENSIONS
     int mpgl_caps;              // Bitfield of MPGL_CAP_* constants
+    int max_shmem;              // Maximum shared memory for compute shaders
     bool debug_context;         // use of e.g. GLX_CONTEXT_DEBUG_BIT_ARB
 
     // Use mpgl_get_native_display() instead. Also, this is set to use the
diff --git a/video/out/opengl/gl_headers.h b/video/out/opengl/gl_headers.h
index e57cab35dc..9b9d1a506a 100644
--- a/video/out/opengl/gl_headers.h
+++ b/video/out/opengl/gl_headers.h
@@ -86,6 +86,7 @@
 // --- GL 4.3 or GL_ARB_compute_shader
 
 #define GL_COMPUTE_SHADER                 0x91B9
+#define GL_MAX_COMPUTE_SHARED_MEMORY_SIZE 0x8262
 
 // --- GL 4.3 or GL_ARB_shader_storage_buffer_object
 
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index 91adc62660..9867751684 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -1714,6 +1714,50 @@ static void pass_sample_separated(struct gl_video *p, struct img_tex src,
     pass_sample_separated_gen(p->sc, scaler, 1, 0);
 }
 
+// Picks either the compute shader version or the regular sampler version
+// depending on hardware support
+static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler,
+                                       struct img_tex tex, int w, int h)
+{
+    GL *gl = p->gl;
+
+    GLenum reqs = MPGL_CAP_COMPUTE_SHADER | MPGL_CAP_NESTED_ARRAY;
+    if ((gl->mpgl_caps & reqs) != reqs) // every cap in reqs must be present
+        goto fallback;
+
+    int bound = ceil(scaler->kernel->radius_cutoff);
+    int offset = bound - 1; // padding top/left
+    int padding = offset + bound; // total padding
+
+    float ratiox = (float)w / tex.w,
+          ratioy = (float)h / tex.h;
+
+    // For performance we want to load at least as many pixels
+    // horizontally as there are threads in a warp (32 for nvidia), as
+    // well as enough to take advantage of shmem parallelism
+    const int warp_size = 32, threads = 256;
+    int bw = warp_size;
+    int bh = threads / bw;
+
+    // We need to sample everything from base_min to base_max, so make sure
+    // we have enough room in shmem
+    int iw = (int)ceil(bw / ratiox) + padding + 1,
+        ih = (int)ceil(bh / ratioy) + padding + 1;
+
+    int shmem_req = iw * ih * tex.components * sizeof(GLfloat);
+    if (shmem_req > gl->max_shmem)
+        goto fallback;
+
+    compute_size_minimum(p, bw, bh);
+    pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih);
+    return;
+
+fallback:
+    // Fall back to regular polar shader when compute shaders are unsupported
+    // or the kernel is too big for shmem
+    pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
+}
+
 // Sample from img_tex, with the src rectangle given by it.
 // The dst rectangle is implicit by what the caller will do next, but w and h
 // must still be what is going to be used (to dimension FBOs correctly).
@@ -1753,21 +1797,7 @@ static void pass_sample(struct gl_video *p, struct img_tex tex,
     } else if (strcmp(name, "oversample") == 0) {
         pass_sample_oversample(p->sc, scaler, w, h);
     } else if (scaler->kernel && scaler->kernel->polar) {
-        GLenum reqs = MPGL_CAP_COMPUTE_SHADER | MPGL_CAP_NESTED_ARRAY;
-        if ((p->gl->mpgl_caps & reqs) && scaler->kernel->f.radius <= 16) {
-            // For performance we want to load at least as many pixels
-            // horizontally as there are threads in a warp (32 for nvidia), as
-            // well as enough to take advantage of shmem parallelism
-            const int warp_size = 32, threads = 256;
-            compute_size_minimum(p, warp_size, threads / warp_size);
-            pass_compute_polar(p->sc, scaler, tex.components,
-                               p->compute_w, p->compute_h,
-                               (float)w / tex.w, (float)h / tex.h);
-        } else {
-            // Fall back to regular polar shader when compute shaders are
-            // unsupported or the kernel is too big for shmem
-            pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
-        }
+        pass_dispatch_sample_polar(p, scaler, tex, w, h);
     } else if (scaler->kernel) {
         pass_sample_separated(p, tex, scaler, w, h);
     } else {
diff --git a/video/out/opengl/video_shaders.c b/video/out/opengl/video_shaders.c
index 854c829f1d..c0ca40b48e 100644
--- a/video/out/opengl/video_shaders.c
+++ b/video/out/opengl/video_shaders.c
@@ -217,18 +217,13 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
     GLSLF("}\n");
 }
 
+// bw/bh: block size
+// iw/ih: input size (pre-calculated to fit all required texels)
 void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
-                        int components, int bw, int bh, float ratiox,
-                        float ratioy)
+                        int components, int bw, int bh, int iw, int ih)
 {
     int bound = ceil(scaler->kernel->radius_cutoff);
     int offset = bound - 1; // padding top/left
-    int padding = offset + bound; // total padding
-
-    // We need to sample everything from base_min to base_max, so make sure
-    // we have enough space to fit all relevant texels in shmem
-    int iw = (int)ceil(bw / ratiox) + padding + 1,
-        ih = (int)ceil(bh / ratioy) + padding + 1;
 
     GLSL(color = vec4(0.0);)
     GLSLF("{\n");
diff --git a/video/out/opengl/video_shaders.h b/video/out/opengl/video_shaders.h
index 597027ca6b..af59d9b678 100644
--- a/video/out/opengl/video_shaders.h
+++ b/video/out/opengl/video_shaders.h
@@ -33,8 +33,7 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler
 void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
                        int components, int glsl_version);
 void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
-                        int components, int bw, int bh, float ratiox,
-                        float ratioy);
+                        int components, int bw, int bh, int iw, int ih);
 void pass_sample_bicubic_fast(struct gl_shader_cache *sc);
 void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
                             int w, int h);