diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c index 76b9d829ab..5a4d17e454 100644 --- a/video/out/opengl/video.c +++ b/video/out/opengl/video.c @@ -1755,7 +1755,21 @@ static void pass_sample(struct gl_video *p, struct img_tex tex, } else if (strcmp(name, "oversample") == 0) { pass_sample_oversample(p->sc, scaler, w, h); } else if (scaler->kernel && scaler->kernel->polar) { - pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version); + // Use a compute shader where possible, fallback to the slower texture + // fragment sampler otherwise. Also use the fragment shader for + // very large kernels to avoid exhausting shmem + if (p->gl->glsl_version < 430 || scaler->kernel->f.radius > 16) { + pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version); + } else { + // For performance we want to load at least as many pixels + // horizontally as there are threads in a warp (32 for nvidia), as + // well as enough to take advantage of shmem parallelism + const int warp_size = 32, threads = 256; + compute_size_minimum(p, warp_size, threads / warp_size); + pass_compute_polar(p->sc, scaler, tex.components, + p->compute_w, p->compute_h, + (float)w / tex.w, (float)h / tex.h); + } } else if (scaler->kernel) { pass_sample_separated(p, tex, scaler, w, h); } else { diff --git a/video/out/opengl/video_shaders.c b/video/out/opengl/video_shaders.c index a7ecf1a448..fe6e944168 100644 --- a/video/out/opengl/video_shaders.c +++ b/video/out/opengl/video_shaders.c @@ -106,9 +106,11 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler } // Subroutine for computing and adding an individual texel contribution -// If subtexel < 0, samples directly. Otherwise, takes the texel from cN[comp] +// If subtexel < 0 and offset < 0, samples directly. +// If subtexel >= 0, takes the texel from cN[subtexel] +// If offset >= 0, takes the texel from inN[rel.y+y+offset][rel.x+x+offset] static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler, - int x, int y, int subtexel, int components) + int x, int y, int subtexel, int offset, int components) { double radius = scaler->kernel->f.radius * scaler->kernel->filter_scale; double radius_cutoff = scaler->kernel->radius_cutoff; @@ -137,12 +139,19 @@ static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler, } GLSL(wsum += w;) - if (subtexel < 0) { + if (subtexel < 0 && offset < 0) { GLSLF("c0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y); GLSL(color += vec4(w) * c0;) - } else { + } else if (subtexel >= 0) { for (int n = 0; n < components; n++) GLSLF("color[%d] += w * c%d[%d];\n", n, n, subtexel); + } else if (offset >= 0) { + for (int n = 0; n bound || y+yo[p] > bound) continue; - polar_sample(sc, scaler, x+xo[p], y+yo[p], p, components); + polar_sample(sc, scaler, x+xo[p], y+yo[p], p, -1, components); } } else { // switch to direct sampling instead, for efficiency/compatibility for (int yy = y; yy <= bound && yy <= y+1; yy++) { for (int xx = x; xx <= bound && xx <= x+1; xx++) - polar_sample(sc, scaler, xx, yy, -1, components); + polar_sample(sc, scaler, xx, yy, -1, -1, components); } } } @@ -208,6 +217,54 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler, GLSLF("}\n"); } +void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, + int components, int bw, int bh, float ratiox, + float ratioy) +{ + int bound = ceil(scaler->kernel->radius_cutoff); + int offset = bound - 1; // padding top/left + int padding = offset + bound; // total padding + + // We need to sample everything from base_min to base_max, so make sure + // we have enough space to fit all relevant texels in shmem + int iw = (int)ceil(bw / ratiox) + padding + 1, + ih = (int)ceil(bh / ratioy) + padding + 1; + + GLSL(color = vec4(0.0);) + GLSLF("{\n"); + GLSL(vec2 wpos = texmap0(gl_WorkGroupID * gl_WorkGroupSize);) + GLSL(vec2 wbase = wpos - pt * fract(wpos * size - vec2(0.5));) + GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));) + GLSL(vec2 base = pos - pt * fcoord;) + GLSL(ivec2 rel = ivec2(round((base - wbase) * size));) + GLSLF("float w, d, wsum = 0.0;\n"); + gl_sc_uniform_tex(sc, "lut", scaler->gl_target, scaler->gl_lut); + + // Load all relevant texels into shmem + for (int c = 0; c < components; c++) + GLSLHF("shared float in%d[%d][%d];\n", c, ih, iw); + + GLSL(vec4 c;) + GLSLF("for (int y = int(gl_LocalInvocationID.y); y < %d; y += %d) {\n", ih, bh); + GLSLF("for (int x = int(gl_LocalInvocationID.x); x < %d; x += %d) {\n", iw, bw); + GLSLF("c = texture(tex, wbase + pt * vec2(x - %d, y - %d));\n", offset, offset); + for (int c = 0; c < components; c++) + GLSLF("in%d[y][x] = c[%d];\n", c, c); + GLSLF("}}\n"); + GLSL(groupMemoryBarrier();) + GLSL(barrier();) + + // Dispatch the actual samples + GLSLF("// scaler samples\n"); + for (int y = 1-bound; y <= bound; y++) { + for (int x = 1-bound; x <= bound; x++) + polar_sample(sc, scaler, x, y, -1, offset, components); + } + + GLSL(color = color / vec4(wsum);) + GLSLF("}\n"); +} + static void bicubic_calcweights(struct gl_shader_cache *sc, const char *t, const char *s) { // Explanation of how bicubic scaling with only 4 texel fetches is done: diff --git a/video/out/opengl/video_shaders.h b/video/out/opengl/video_shaders.h index e0594f28f3..597027ca6b 100644 --- a/video/out/opengl/video_shaders.h +++ b/video/out/opengl/video_shaders.h @@ -32,6 +32,9 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler int d_x, int d_y); void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler, int components, int glsl_version); +void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, + int components, int bw, int bh, float ratiox, + float ratioy); void pass_sample_bicubic_fast(struct gl_shader_cache *sc); void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler, int w, int h);