vo_opengl: check against shmem limits

The radius check was not strict enough, and in particular not correct on all
platforms. To fix this, query the hardware's actual shared-memory capability
instead of relying on a hard-coded maximum radius.
This commit is contained in:
Niklas Haas 2017-07-26 01:42:19 +02:00
parent 9875f14ad4
commit b31020b193
No known key found for this signature in database
GPG Key ID: 9A09076581B27402
6 changed files with 54 additions and 26 deletions

View File

@ -625,8 +625,10 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n),
}
// GL_ARB_compute_shader & GL_ARB_shader_image_load_store
if (gl->DispatchCompute && gl->BindImageTexture)
if (gl->DispatchCompute && gl->BindImageTexture) {
gl->mpgl_caps |= MPGL_CAP_COMPUTE_SHADER;
gl->GetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &gl->max_shmem);
}
// Provided for simpler handling if no framebuffer support is available.
if (!gl->BindFramebuffer)

View File

@ -87,6 +87,7 @@ struct GL {
int glsl_version; // e.g. 130 for GLSL 1.30
char *extensions; // Equivalent to GL_EXTENSIONS
int mpgl_caps; // Bitfield of MPGL_CAP_* constants
int max_shmem; // Maximum shared memory for compute shaders
bool debug_context; // use of e.g. GLX_CONTEXT_DEBUG_BIT_ARB
// Use mpgl_get_native_display() instead. Also, this is set to use the

View File

@ -86,6 +86,7 @@
// --- GL 4.3 or GL_ARB_compute_shader
#define GL_COMPUTE_SHADER 0x91B9
#define GL_MAX_COMPUTE_SHARED_MEMORY_SIZE 0x8262
// --- GL 4.3 or GL_ARB_shader_storage_buffer_object

View File

@ -1714,6 +1714,50 @@ static void pass_sample_separated(struct gl_video *p, struct img_tex src,
pass_sample_separated_gen(p->sc, scaler, 1, 0);
}
// Picks either the compute shader version or the regular sampler version
// depending on hardware support.
// w/h: output size the caller will render at (used to derive the sampling
// ratio and hence the shmem footprint of the compute path).
static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler,
                                       struct img_tex tex, int w, int h)
{
    GL *gl = p->gl;
    GLenum reqs = MPGL_CAP_COMPUTE_SHADER | MPGL_CAP_NESTED_ARRAY;
    // Both capability bits are required. Note: `!(caps & reqs)` would only
    // reject the case where *neither* bit is set, silently taking the
    // compute path on hardware missing one of the two features.
    if ((gl->mpgl_caps & reqs) != reqs)
        goto fallback;

    int bound = ceil(scaler->kernel->radius_cutoff);
    int offset = bound - 1;       // padding top/left
    int padding = offset + bound; // total padding

    float ratiox = (float)w / tex.w,
          ratioy = (float)h / tex.h;

    // For performance we want to load at least as many pixels horizontally
    // as there are threads in a warp (32 for nvidia), as well as enough to
    // take advantage of shmem parallelism
    const int warp_size = 32, threads = 256;
    int bw = warp_size;
    int bh = threads / bw;

    // We need to sample everything from base_min to base_max, so make sure
    // we have enough room in shmem
    int iw = (int)ceil(bw / ratiox) + padding + 1,
        ih = (int)ceil(bh / ratioy) + padding + 1;

    // Estimated shared-memory requirement of the tile cache; if it exceeds
    // what the driver reports (GL_MAX_COMPUTE_SHARED_MEMORY_SIZE), the
    // compute shader would fail to link, so fall back instead.
    int shmem_req = iw * ih * tex.components * sizeof(GLfloat);
    if (shmem_req > gl->max_shmem)
        goto fallback;

    compute_size_minimum(p, bw, bh);
    pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih);
    return;

fallback:
    // Fall back to regular polar shader when compute shaders are unsupported
    // or the kernel is too big for shmem
    pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
}
// Sample from img_tex, with the src rectangle given by it.
// The dst rectangle is implicit by what the caller will do next, but w and h
// must still be what is going to be used (to dimension FBOs correctly).
@ -1753,21 +1797,7 @@ static void pass_sample(struct gl_video *p, struct img_tex tex,
} else if (strcmp(name, "oversample") == 0) {
pass_sample_oversample(p->sc, scaler, w, h);
} else if (scaler->kernel && scaler->kernel->polar) {
GLenum reqs = MPGL_CAP_COMPUTE_SHADER | MPGL_CAP_NESTED_ARRAY;
if ((p->gl->mpgl_caps & reqs) && scaler->kernel->f.radius <= 16) {
// For performance we want to load at least as many pixels
// horizontally as there are threads in a warp (32 for nvidia), as
// well as enough to take advantage of shmem parallelism
const int warp_size = 32, threads = 256;
compute_size_minimum(p, warp_size, threads / warp_size);
pass_compute_polar(p->sc, scaler, tex.components,
p->compute_w, p->compute_h,
(float)w / tex.w, (float)h / tex.h);
} else {
// Fall back to regular polar shader when compute shaders are
// unsupported or the kernel is too big for shmem
pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
}
pass_dispatch_sample_polar(p, scaler, tex, w, h);
} else if (scaler->kernel) {
pass_sample_separated(p, tex, scaler, w, h);
} else {

View File

@ -217,18 +217,13 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
GLSLF("}\n");
}
// bw/bh: block size
// iw/ih: input size (pre-calculated to fit all required texels)
void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
int components, int bw, int bh, float ratiox,
float ratioy)
int components, int bw, int bh, int iw, int ih)
{
int bound = ceil(scaler->kernel->radius_cutoff);
int offset = bound - 1; // padding top/left
int padding = offset + bound; // total padding
// We need to sample everything from base_min to base_max, so make sure
// we have enough space to fit all relevant texels in shmem
int iw = (int)ceil(bw / ratiox) + padding + 1,
ih = (int)ceil(bh / ratioy) + padding + 1;
GLSL(color = vec4(0.0);)
GLSLF("{\n");

View File

@ -33,8 +33,7 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler
void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
int components, int glsl_version);
void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
int components, int bw, int bh, float ratiox,
float ratioy);
int components, int bw, int bh, int iw, int ih);
void pass_sample_bicubic_fast(struct gl_shader_cache *sc);
void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
int w, int h);