vo_opengl: use textureGatherOffset for polar filters

This is more efficient on my machine (nvidia), but only when applied to
groups of exactly 4 texels. So we switch to the more efficient
textureGather for groups of 4. Some notes:

- textureGatherOffset seems to be faster than textureGather by a
  non-negligible amount, but for some reason, textureOffset is still
  slower than a straight-up texture
- textureGather* requires GLSL 400; and at least on nvidia, this
  requires actually allocating a GL 4.0 context.
- the code in opengl/common.c that clamped the GLSL version to 330 is
  deprecated, because the old user shader style has been removed
  completely in the meantime
- To combat the growing complexity of the polar sampling code, we drop
  the antiringing functionality from EWA shaders completely, since it
  never really worked well for EWA to begin with. (Horrific artifacting)
This commit is contained in:
Niklas Haas 2017-07-05 00:25:32 +02:00
parent b387f82aa4
commit ad0d6caac7
6 changed files with 91 additions and 45 deletions

View File

@ -4034,7 +4034,7 @@ The following video options are currently all specific to ``--vo=opengl`` and
0.0 and 1.0. The default value of 0.0 disables antiringing entirely.
Note that this doesn't affect the special filters ``bilinear`` and
``bicubic_fast``, nor does it affect any polar (EWA) scalers.
``--scale-window=<window>``, ``--cscale-window=<window>``, ``--dscale-window=<window>``, ``--tscale-window=<window>``
(Advanced users only) Choose a custom windowing function for the kernel.

View File

@ -579,8 +579,8 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n),
int glsl_major = 0, glsl_minor = 0;
if (shader && sscanf(shader, "%d.%d", &glsl_major, &glsl_minor) == 2)
gl->glsl_version = glsl_major * 100 + glsl_minor;
// GLSL 400 defines "sample" as keyword - breaks custom shaders.
gl->glsl_version = MPMIN(gl->glsl_version, 330);
// restrict GLSL version to be forwards compatible
gl->glsl_version = MPMIN(gl->glsl_version, 400);
}
if (is_software_gl(gl)) {

View File

@ -92,6 +92,7 @@ static const struct mpgl_driver *const backends[] = {
// 0-terminated list of desktop GL versions a backend should try to
// initialize. The first entry is the most preferred version.
const int mpgl_preferred_gl_versions[] = {
400,
330,
320,
310,

View File

@ -1583,7 +1583,7 @@ static void pass_sample(struct gl_video *p, struct img_tex tex,
} else if (strcmp(name, "oversample") == 0) {
pass_sample_oversample(p->sc, scaler, w, h);
} else if (scaler->kernel && scaler->kernel->polar) {
pass_sample_polar(p->sc, scaler);
pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
} else if (scaler->kernel) {
pass_sample_separated(p, tex, scaler, w, h);
} else {

View File

@ -105,62 +105,106 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler
GLSLF("}\n");
}
void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler)
// Subroutine for computing and adding an individual texel contribution
// If subtexel < 0, samples directly. Otherwise, takes the texel from cN[comp]
static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler,
int x, int y, int subtexel, int components)
{
double radius = scaler->kernel->f.radius * scaler->kernel->filter_scale;
double radius_cutoff = scaler->kernel->radius_cutoff;
int bound = ceil(radius_cutoff);
bool use_ar = scaler->conf.antiring > 0;
// Since we can't know the subpixel position in advance, assume a
// worst case scenario
int yy = y > 0 ? y-1 : y;
int xx = x > 0 ? x-1 : x;
double dmax = sqrt(xx*xx + yy*yy);
// Skip samples definitely outside the radius
if (dmax >= radius_cutoff)
return;
GLSLF("d = length(vec2(%d.0, %d.0) - fcoord)/%f;\n", x, y, radius);
// Check for samples that might be skippable
bool maybe_skippable = dmax >= radius_cutoff - M_SQRT2;
if (maybe_skippable)
GLSLF("if (d < %f) {\n", radius_cutoff / radius);
// get the weight for this pixel
if (scaler->gl_target == GL_TEXTURE_1D) {
GLSLF("w = texture1D(lut, LUT_POS(d, %d.0)).r;\n",
scaler->lut_size);
} else {
GLSLF("w = texture(lut, vec2(0.5, LUT_POS(d, %d.0))).r;\n",
scaler->lut_size);
}
GLSL(wsum += w;)
if (subtexel < 0) {
GLSLF("c0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y);
GLSL(color += vec4(w) * c0;)
} else {
for (int n = 0; n < components; n++)
GLSLF("color[%d] += w * c%d[%d];\n", n, n, subtexel);
}
if (maybe_skippable)
GLSLF("}\n");
}
// Generate shader code for sampling with a polar (EWA) filter.
// `components` is the number of color components to process; `glsl_version`
// gates the use of textureGatherOffset (GLSL 400+ only). Note: antiringing is
// intentionally not supported for polar scalers.
void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
                       int components, int glsl_version)
{
    GLSL(color = vec4(0.0);)
    GLSLF("{\n");
    GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));)
    GLSL(vec2 base = pos - fcoord * pt;)
    GLSLF("float w, d, wsum = 0.0;\n");
    // One temporary vec4 per component, so gathered texels can be reused
    // across the four contributions of a 2x2 group
    for (int n = 0; n < components; n++)
        GLSLF("vec4 c%d;\n", n);

    gl_sc_uniform_tex(sc, "lut", scaler->gl_target, scaler->gl_lut);

    GLSLF("// scaler samples\n");
    int bound = ceil(scaler->kernel->radius_cutoff);
    // Iterate in 2x2 groups so each group can potentially use textureGather
    for (int y = 1-bound; y <= bound; y += 2) {
        for (int x = 1-bound; x <= bound; x += 2) {
            // First we figure out whether it's more efficient to use direct
            // sampling or gathering. The problem is that gathering 4 texels
            // only to discard some of them is very wasteful, so only do it if
            // we suspect it will be a win rather than a loss. This is the case
            // exactly when all four texels are within bounds
            bool use_gather = sqrt(x*x + y*y) < scaler->kernel->radius_cutoff;

            // textureGather is only supported in GLSL 400+
            if (glsl_version < 400)
                use_gather = false;

            if (use_gather) {
                // Gather the four surrounding texels simultaneously
                for (int n = 0; n < components; n++) {
                    GLSLF("c%d = textureGatherOffset(tex, base, ivec2(%d, %d), %d);\n",
                          n, x, y, n);
                }

                // Mix in all of the points with their weights
                for (int p = 0; p < 4; p++) {
                    // The four texels are gathered counterclockwise starting
                    // from the bottom left
                    static const int xo[4] = {0, 1, 1, 0};
                    static const int yo[4] = {1, 1, 0, 0};
                    if (x+xo[p] > bound || y+yo[p] > bound)
                        continue;
                    polar_sample(sc, scaler, x+xo[p], y+yo[p], p, components);
                }
            } else {
                // switch to direct sampling instead, for efficiency/compatibility
                for (int yy = y; yy <= bound && yy <= y+1; yy++) {
                    for (int xx = x; xx <= bound && xx <= x+1; xx++)
                        polar_sample(sc, scaler, xx, yy, -1, components);
                }
            }
        }
    }

    GLSL(color = color / vec4(wsum);)
    GLSLF("}\n");
}

View File

@ -30,7 +30,8 @@ extern const struct m_sub_options deband_conf;
void sampler_prelude(struct gl_shader_cache *sc, int tex_num);
void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler,
int d_x, int d_y);
void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
                       int components, int glsl_version);
void pass_sample_bicubic_fast(struct gl_shader_cache *sc);
void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
int w, int h);