mirror of https://github.com/mpv-player/mpv
vo_opengl: check against shmem limits
The radius check was not strict enough, and in particular did not hold on all platforms. To fix this, check the actual hardware capabilities (the maximum compute shader shared memory size) instead of relying on a hard-coded maximum radius.
This commit is contained in:
parent
9875f14ad4
commit
b31020b193
|
@ -625,8 +625,10 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n),
|
|||
}
|
||||
|
||||
// GL_ARB_compute_shader & GL_ARB_shader_image_load_store
|
||||
if (gl->DispatchCompute && gl->BindImageTexture)
|
||||
if (gl->DispatchCompute && gl->BindImageTexture) {
|
||||
gl->mpgl_caps |= MPGL_CAP_COMPUTE_SHADER;
|
||||
gl->GetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &gl->max_shmem);
|
||||
}
|
||||
|
||||
// Provided for simpler handling if no framebuffer support is available.
|
||||
if (!gl->BindFramebuffer)
|
||||
|
|
|
@ -87,6 +87,7 @@ struct GL {
|
|||
int glsl_version; // e.g. 130 for GLSL 1.30
|
||||
char *extensions; // Equivalent to GL_EXTENSIONS
|
||||
int mpgl_caps; // Bitfield of MPGL_CAP_* constants
|
||||
int max_shmem; // Maximum shared memory for compute shaders
|
||||
bool debug_context; // use of e.g. GLX_CONTEXT_DEBUG_BIT_ARB
|
||||
|
||||
// Use mpgl_get_native_display() instead. Also, this is set to use the
|
||||
|
|
|
@ -86,6 +86,7 @@
|
|||
// --- GL 4.3 or GL_ARB_compute_shader
|
||||
|
||||
#define GL_COMPUTE_SHADER 0x91B9
|
||||
#define GL_MAX_COMPUTE_SHARED_MEMORY_SIZE 0x8262
|
||||
|
||||
// --- GL 4.3 or GL_ARB_shader_storage_buffer_object
|
||||
|
||||
|
|
|
@ -1714,6 +1714,50 @@ static void pass_sample_separated(struct gl_video *p, struct img_tex src,
|
|||
pass_sample_separated_gen(p->sc, scaler, 1, 0);
|
||||
}
|
||||
|
||||
// Picks either the compute shader version or the regular sampler version
|
||||
// depending on hardware support
|
||||
static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler,
|
||||
struct img_tex tex, int w, int h)
|
||||
{
|
||||
GL *gl = p->gl;
|
||||
|
||||
GLenum reqs = MPGL_CAP_COMPUTE_SHADER | MPGL_CAP_NESTED_ARRAY;
|
||||
if (!(gl->mpgl_caps & reqs))
|
||||
goto fallback;
|
||||
|
||||
int bound = ceil(scaler->kernel->radius_cutoff);
|
||||
int offset = bound - 1; // padding top/left
|
||||
int padding = offset + bound; // total padding
|
||||
|
||||
float ratiox = (float)w / tex.w,
|
||||
ratioy = (float)h / tex.h;
|
||||
|
||||
// For performance we want to load at least as many pixels
|
||||
// horizontally as there are threads in a warp (32 for nvidia), as
|
||||
// well as enough to take advantage of shmem parallelism
|
||||
const int warp_size = 32, threads = 256;
|
||||
int bw = warp_size;
|
||||
int bh = threads / bw;
|
||||
|
||||
// We need to sample everything from base_min to base_max, so make sure
|
||||
// we have enough room in shmem
|
||||
int iw = (int)ceil(bw / ratiox) + padding + 1,
|
||||
ih = (int)ceil(bh / ratioy) + padding + 1;
|
||||
|
||||
int shmem_req = iw * ih * tex.components * sizeof(GLfloat);
|
||||
if (shmem_req > gl->max_shmem)
|
||||
goto fallback;
|
||||
|
||||
compute_size_minimum(p, bw, bh);
|
||||
pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih);
|
||||
return;
|
||||
|
||||
fallback:
|
||||
// Fall back to regular polar shader when compute shaders are unsupported
|
||||
// or the kernel is too big for shmem
|
||||
pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
|
||||
}
|
||||
|
||||
// Sample from img_tex, with the src rectangle given by it.
|
||||
// The dst rectangle is implicit by what the caller will do next, but w and h
|
||||
// must still be what is going to be used (to dimension FBOs correctly).
|
||||
|
@ -1753,21 +1797,7 @@ static void pass_sample(struct gl_video *p, struct img_tex tex,
|
|||
} else if (strcmp(name, "oversample") == 0) {
|
||||
pass_sample_oversample(p->sc, scaler, w, h);
|
||||
} else if (scaler->kernel && scaler->kernel->polar) {
|
||||
GLenum reqs = MPGL_CAP_COMPUTE_SHADER | MPGL_CAP_NESTED_ARRAY;
|
||||
if ((p->gl->mpgl_caps & reqs) && scaler->kernel->f.radius <= 16) {
|
||||
// For performance we want to load at least as many pixels
|
||||
// horizontally as there are threads in a warp (32 for nvidia), as
|
||||
// well as enough to take advantage of shmem parallelism
|
||||
const int warp_size = 32, threads = 256;
|
||||
compute_size_minimum(p, warp_size, threads / warp_size);
|
||||
pass_compute_polar(p->sc, scaler, tex.components,
|
||||
p->compute_w, p->compute_h,
|
||||
(float)w / tex.w, (float)h / tex.h);
|
||||
} else {
|
||||
// Fall back to regular polar shader when compute shaders are
|
||||
// unsupported or the kernel is too big for shmem
|
||||
pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
|
||||
}
|
||||
pass_dispatch_sample_polar(p, scaler, tex, w, h);
|
||||
} else if (scaler->kernel) {
|
||||
pass_sample_separated(p, tex, scaler, w, h);
|
||||
} else {
|
||||
|
|
|
@ -217,18 +217,13 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
|
|||
GLSLF("}\n");
|
||||
}
|
||||
|
||||
// bw/bh: block size
|
||||
// iw/ih: input size (pre-calculated to fit all required texels)
|
||||
void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
|
||||
int components, int bw, int bh, float ratiox,
|
||||
float ratioy)
|
||||
int components, int bw, int bh, int iw, int ih)
|
||||
{
|
||||
int bound = ceil(scaler->kernel->radius_cutoff);
|
||||
int offset = bound - 1; // padding top/left
|
||||
int padding = offset + bound; // total padding
|
||||
|
||||
// We need to sample everything from base_min to base_max, so make sure
|
||||
// we have enough space to fit all relevant texels in shmem
|
||||
int iw = (int)ceil(bw / ratiox) + padding + 1,
|
||||
ih = (int)ceil(bh / ratioy) + padding + 1;
|
||||
|
||||
GLSL(color = vec4(0.0);)
|
||||
GLSLF("{\n");
|
||||
|
|
|
@ -33,8 +33,7 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler
|
|||
void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
|
||||
int components, int glsl_version);
|
||||
void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
|
||||
int components, int bw, int bh, float ratiox,
|
||||
float ratioy);
|
||||
int components, int bw, int bh, int iw, int ih);
|
||||
void pass_sample_bicubic_fast(struct gl_shader_cache *sc);
|
||||
void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
|
||||
int w, int h);
|
||||
|
|
Loading…
Reference in New Issue