From b9406917849748152c45b9347da1ef204970f59e Mon Sep 17 00:00:00 2001 From: Niklas Haas Date: Sun, 17 Sep 2017 05:37:24 +0200 Subject: [PATCH] vo_gpu: drop the RA_CAP_NESTED_ARRAY req from EWA compute Almost as fast as the old code, but more general. Notably, glslang doesn't support nested arrays. (cf. https://github.com/KhronosGroup/glslang/issues/1057) Also much cleaner code-wise, so I think I'll keep it even if glslang implements array_of_arrays. --- video/out/gpu/video.c | 2 +- video/out/gpu/video_shaders.c | 48 +++++++++++++++++------------------ 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c index 9f1654e584..476dae14a8 100644 --- a/video/out/gpu/video.c +++ b/video/out/gpu/video.c @@ -1671,7 +1671,7 @@ static void pass_sample_separated(struct gl_video *p, struct img_tex src, static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler, struct img_tex tex, int w, int h) { - uint64_t reqs = RA_CAP_COMPUTE | RA_CAP_NESTED_ARRAY; + uint64_t reqs = RA_CAP_COMPUTE; if ((p->ra->caps & reqs) != reqs) goto fallback; diff --git a/video/out/gpu/video_shaders.c b/video/out/gpu/video_shaders.c index 60c5ce82ac..48a8bc2eae 100644 --- a/video/out/gpu/video_shaders.c +++ b/video/out/gpu/video_shaders.c @@ -97,11 +97,11 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler } // Subroutine for computing and adding an individual texel contribution -// If subtexel < 0 and offset < 0, samples directly. -// If subtexel >= 0, takes the texel from cN[subtexel] -// If offset >= 0, takes the texel from inN[rel.y+y+offset][rel.x+x+offset] +// If planar is false, samples directly +// If planar is true, takes the pixel from inX[idx] where X is the component and +// `idx` must be defined by the caller static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler, - int x, int y, int subtexel, int offset, int components) + int x, int y, int components, bool planar) { double radius = scaler->kernel->f.radius * scaler->kernel->filter_scale; double radius_cutoff = scaler->kernel->radius_cutoff; @@ -130,19 +130,12 @@ static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler, } GLSL(wsum += w;) - if (subtexel < 0 && offset < 0) { - GLSLF("c0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y); - GLSL(color += vec4(w) * c0;) - } else if (subtexel >= 0) { + if (planar) { for (int n = 0; n < components; n++) - GLSLF("color[%d] += w * c%d[%d];\n", n, n, subtexel); - } else if (offset >= 0) { - for (int n = 0; n lut); @@ -180,8 +174,8 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler, if (use_gather) { // Gather the four surrounding texels simultaneously for (int n = 0; n < components; n++) { - GLSLF("c%d = textureGatherOffset(tex, base, ivec2(%d, %d), %d);\n", - n, x, y, n); + GLSLF("in%d = textureGatherOffset(tex, base, " + "ivec2(%d, %d), %d);\n", n, x, y, n); } // Mix in all of the points with their weights @@ -192,13 +186,14 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler, static const int yo[4] = {1, 1, 0, 0}; if (x+xo[p] > bound || y+yo[p] > bound) continue; - polar_sample(sc, scaler, x+xo[p], y+yo[p], p, -1, components); + GLSLF("idx = %d;\n", p); + polar_sample(sc, scaler, x+xo[p], y+yo[p], components, true); } } else { // switch to direct sampling instead, for efficiency/compatibility for (int yy = y; yy <= bound && yy <= y+1; yy++) { for (int xx = x; xx <= bound && xx <= x+1; xx++) - polar_sample(sc, scaler, xx, yy, -1, -1, components); + polar_sample(sc, scaler, xx, yy, components, false); } } } @@ -223,20 +218,20 @@ void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));) GLSL(vec2 base = pos - pt * fcoord;) GLSL(ivec2 rel = ivec2(round((base - wbase) * size));) + GLSL(int idx;) GLSLF("float w, d, wsum = 0.0;\n"); gl_sc_uniform_texture(sc, "lut", scaler->lut); // Load all relevant texels into shmem - gl_sc_enable_extension(sc, "GL_ARB_arrays_of_arrays"); for (int c = 0; c < components; c++) - GLSLHF("shared float in%d[%d][%d];\n", c, ih, iw); + GLSLHF("shared float in%d[%d];\n", c, ih * iw); GLSL(vec4 c;) GLSLF("for (int y = int(gl_LocalInvocationID.y); y < %d; y += %d) {\n", ih, bh); GLSLF("for (int x = int(gl_LocalInvocationID.x); x < %d; x += %d) {\n", iw, bw); GLSLF("c = texture(tex, wbase + pt * vec2(x - %d, y - %d));\n", offset, offset); for (int c = 0; c < components; c++) - GLSLF("in%d[y][x] = c[%d];\n", c, c); + GLSLF("in%d[%d * y + x] = c[%d];\n", c, iw, c); GLSLF("}}\n"); GLSL(groupMemoryBarrier();) GLSL(barrier();) @@ -244,8 +239,11 @@ void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, // Dispatch the actual samples GLSLF("// scaler samples\n"); for (int y = 1-bound; y <= bound; y++) { - for (int x = 1-bound; x <= bound; x++) - polar_sample(sc, scaler, x, y, -1, offset, components); + for (int x = 1-bound; x <= bound; x++) { + GLSLF("idx = %d * rel.y + rel.x + %d;\n", iw, + iw * (y + offset) + x + offset); + polar_sample(sc, scaler, x, y, components, true); + } } GLSL(color = color / vec4(wsum);)