vo_opengl: support HDR peak detection

This is done via compute shaders. As a consequence, the tone mapping
algorithms had to be rewritten to compute their known constants in GLSL
(ahead of time), instead of computing them once on the CPU side. This
didn't affect performance.

Using shmem/SSBO atomics in this way is extremely fast on nvidia, but it
might be slow on other platforms. Needs testing.

Unfortunately, setting up the SSBO still requires OpenGL calls, which
means I can't have it in video_shaders.c, where it belongs. But I'll
defer worrying about that until the backend refactor, since then I'll be
breaking up the video/video_shaders structure anyway.
This commit is contained in:
Niklas Haas 2017-07-17 21:39:06 +02:00
parent aad6ba018a
commit b196cadf9f
10 changed files with 201 additions and 22 deletions

View File

@ -4752,6 +4752,14 @@ The following video options are currently all specific to ``--vo=opengl`` and
linear
Specifies the scale factor to use while stretching. Defaults to 1.0.
``--hdr-compute-peak``
Compute the HDR peak per-frame instead of relying on tagged metadata. These values
are averaged over local regions as well as over several frames to prevent
the value from jittering around too much. This option basically gives you
dynamic, per-scene tone mapping. Requires compute shaders, which is a
fairly recent OpenGL feature, and will probably also perform horribly on
some drivers, so enable at your own risk.
``--tone-mapping-desaturate=<value>``
Apply desaturation for highlights that exceed this level of brightness. The
higher the parameter, the more color information will be preserved. This

View File

@ -344,6 +344,11 @@ static const struct gl_functions gl_functions[] = {
{0}
},
},
{
.ver_core = 430,
.extension = "GL_ARB_shader_storage_buffer_object",
.provides = MPGL_CAP_SSBO,
},
{
.ver_core = 430,
.extension = "GL_ARB_compute_shader",

View File

@ -54,6 +54,7 @@ enum {
MPGL_CAP_EXT16 = (1 << 18), // GL_EXT_texture_norm16
MPGL_CAP_ARB_FLOAT = (1 << 19), // GL_ARB_texture_float
MPGL_CAP_EXT_CR_HFLOAT = (1 << 20), // GL_EXT_color_buffer_half_float
MPGL_CAP_SSBO = (1 << 21), // GL_ARB_shader_storage_buffer_object
MPGL_CAP_SW = (1 << 30), // indirect or sw renderer
};

View File

@ -83,6 +83,11 @@
#define GL_COMPUTE_SHADER 0x91B9
// -- GL 4.3 or GL_ARB_shader_storage_buffer_object
#define GL_SHADER_STORAGE_BUFFER 0x90D2
#define GL_SHADER_STORAGE_BARRIER_BIT 0x00002000
// --- GL_NV_vdpau_interop
#define GLvdpauSurfaceNV GLintptr

View File

@ -473,6 +473,13 @@ struct sc_uniform {
GLenum img_iformat;
};
// One shader storage buffer block registered with the shader cache for the
// shader currently being built.
struct sc_buffer {
// GLSL block name; also the key find_buffer() compares against
char *name;
// text emitted between the braces of the generated buffer block
// (the member declarations); produced by gl_sc_ssbo()
char *format;
// binding index, used in the generated layout() qualifier and as the
// index passed to BindBufferBase
GLuint binding;
// GL buffer object that gets bound to `binding`
GLuint ssbo;
};
struct sc_cached_uniform {
GLint loc;
union uniform_val v;
@ -503,6 +510,7 @@ struct gl_shader_cache {
bstr text;
int next_texture_unit;
int next_image_unit;
int next_buffer_binding;
struct gl_vao *vao; // deprecated
struct sc_entry *entries;
@ -512,6 +520,8 @@ struct gl_shader_cache {
struct sc_uniform *uniforms;
int num_uniforms;
struct sc_buffer *buffers;
int num_buffers;
const struct gl_vao_entry *vertex_entries;
size_t vertex_size;
@ -562,6 +572,11 @@ void gl_sc_reset(struct gl_shader_cache *sc)
}
}
gl->ActiveTexture(GL_TEXTURE0);
for (int n = 0; n < sc->num_buffers; n++) {
struct sc_buffer *b = &sc->buffers[n];
gl->BindBufferBase(GL_SHADER_STORAGE_BUFFER, b->binding, 0);
}
}
sc->prelude_text.len = 0;
@ -570,8 +585,14 @@ void gl_sc_reset(struct gl_shader_cache *sc)
for (int n = 0; n < sc->num_uniforms; n++)
talloc_free(sc->uniforms[n].name);
sc->num_uniforms = 0;
for (int n = 0; n < sc->num_buffers; n++) {
talloc_free(sc->buffers[n].name);
talloc_free(sc->buffers[n].format);
}
sc->num_buffers = 0;
sc->next_texture_unit = 1; // not 0, as 0 is "free for use"
sc->next_image_unit = 1;
sc->next_buffer_binding = 1;
sc->vertex_entries = NULL;
sc->vertex_size = 0;
sc->current_shader = NULL;
@ -680,6 +701,21 @@ static struct sc_uniform *find_uniform(struct gl_shader_cache *sc,
return &sc->uniforms[sc->num_uniforms - 1];
}
static struct sc_buffer *find_buffer(struct gl_shader_cache *sc,
const char *name)
{
for (int n = 0; n < sc->num_buffers; n++) {
if (strcmp(sc->buffers[n].name, name) == 0)
return &sc->buffers[n];
}
// not found -> add it
struct sc_buffer new = {
.name = talloc_strdup(NULL, name),
};
MP_TARRAY_APPEND(sc, sc->buffers, sc->num_buffers, new);
return &sc->buffers[sc->num_buffers - 1];
}
const char *mp_sampler_type(GLenum texture_target)
{
switch (texture_target) {
@ -738,6 +774,20 @@ void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint textur
u->img_iformat = iformat;
}
// Register the shader storage buffer object `ssbo` under the GLSL block name
// `name` and hand it the next free buffer binding index.
//
// `format` plus the variadic arguments are a printf-style description of the
// block's member declarations; the formatted result is what gets emitted
// inside the generated `buffer <name> { ... };` declaration. The formatted
// string is allocated with `sc` passed as the ta parent.
void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, GLuint ssbo,
                char *format, ...)
{
    struct sc_buffer *b = find_buffer(sc, name);
    b->binding = sc->next_buffer_binding++;
    b->ssbo = ssbo;

    // Build the new text before touching the old one, so the call stays
    // well-defined even if the arguments refer to the previous string.
    va_list ap;
    va_start(ap, format);
    char *text = ta_vasprintf(sc, format, ap);
    va_end(ap);

    // find_buffer() may have returned an entry that was already registered
    // under this name; release its previous text instead of dropping the
    // pointer (fresh entries start with format == NULL, which is a no-op).
    talloc_free(b->format);
    b->format = text;
}
void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, GLfloat f)
{
struct sc_uniform *u = find_uniform(sc, name);
@ -1217,6 +1267,12 @@ struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc, GLenum type)
ADD(comp, "uniform %s %s;\n", u->glsl_type, u->name);
}
for (int n = 0; n < sc->num_buffers; n++) {
struct sc_buffer *b = &sc->buffers[n];
ADD(comp, "layout(std430, binding=%d) buffer %s { %s };\n",
b->binding, b->name, b->format);
}
ADD_BSTR(comp, sc->prelude_text);
ADD_BSTR(comp, sc->header_text);
@ -1271,6 +1327,10 @@ struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc, GLenum type)
for (int n = 0; n < sc->num_uniforms; n++)
update_uniform(gl, entry, &sc->uniforms[n], n);
for (int n = 0; n < sc->num_buffers; n++) {
struct sc_buffer *b = &sc->buffers[n];
gl->BindBufferBase(GL_SHADER_STORAGE_BUFFER, b->binding, b->ssbo);
}
gl->ActiveTexture(GL_TEXTURE0);

View File

@ -149,6 +149,8 @@ void gl_sc_uniform_tex(struct gl_shader_cache *sc, char *name, GLenum target,
void gl_sc_uniform_tex_ui(struct gl_shader_cache *sc, char *name, GLuint texture);
void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture,
GLuint iformat, GLenum access);
// Register the shader storage buffer `ssbo` for the current shader under the
// GLSL buffer block name `name`. `format` and the arguments after it are a
// printf-style string giving the block's member declarations.
void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, GLuint ssbo,
char *format, ...);
void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, GLfloat f);
void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, GLint f);
void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, GLfloat f[2]);

View File

@ -236,9 +236,11 @@ struct gl_video {
struct fbotex integer_fbo[4];
struct fbotex indirect_fbo;
struct fbotex blend_subs_fbo;
struct fbotex screen_fbo;
struct fbotex output_fbo;
struct fbosurface surfaces[FBOSURFACES_MAX];
struct fbotex vdpau_deinterleave_fbo[2];
GLuint hdr_peak_ssbo;
int surface_idx;
int surface_now;
@ -368,6 +370,7 @@ const struct m_sub_options gl_video_conf = {
{"hable", TONE_MAPPING_HABLE},
{"gamma", TONE_MAPPING_GAMMA},
{"linear", TONE_MAPPING_LINEAR})),
OPT_FLAG("hdr-compute-peak", compute_hdr_peak, 0),
OPT_FLOAT("tone-mapping-param", tone_mapping_param, 0),
OPT_FLOAT("tone-mapping-desaturate", tone_mapping_desat, 0),
OPT_FLAG("opengl-pbo", pbo, 0),
@ -541,6 +544,7 @@ static void uninit_rendering(struct gl_video *p)
fbotex_uninit(&p->indirect_fbo);
fbotex_uninit(&p->blend_subs_fbo);
fbotex_uninit(&p->screen_fbo);
for (int n = 0; n < FBOSURFACES_MAX; n++)
fbotex_uninit(&p->surfaces[n].fbotex);
@ -2358,6 +2362,8 @@ static void pass_scale_main(struct gl_video *p)
// by previous passes (i.e. linear scaling)
static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool osd)
{
GL *gl = p->gl;
// Figure out the target color space from the options, or auto-guess if
// none were set
struct mp_colorspace dst = {
@ -2417,10 +2423,42 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
dst.gamma = MP_CSP_TRC_GAMMA22;
}
bool detect_peak = p->opts.compute_hdr_peak && mp_trc_is_hdr(src.gamma);
if (detect_peak) {
pass_describe(p, "detect HDR peak");
compute_size_minimum(p, 8, 8); // 8x8 is good for performance
if (!p->hdr_peak_ssbo) {
struct {
GLuint sig_peak_raw;
GLuint index;
GLuint frame_max[PEAK_DETECT_FRAMES+1];
} peak_ssbo = {0};
// Prefill with safe values
int safe = MP_REF_WHITE * mp_trc_nom_peak(p->image_params.color.gamma);
peak_ssbo.sig_peak_raw = PEAK_DETECT_FRAMES * safe;
for (int i = 0; i < PEAK_DETECT_FRAMES+1; i++)
peak_ssbo.frame_max[i] = safe;
gl->GenBuffers(1, &p->hdr_peak_ssbo);
gl->BindBuffer(GL_SHADER_STORAGE_BUFFER, p->hdr_peak_ssbo);
gl->BufferData(GL_SHADER_STORAGE_BUFFER, sizeof(peak_ssbo),
&peak_ssbo, GL_STREAM_COPY);
gl->BindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
}
gl_sc_ssbo(p->sc, "PeakDetect", p->hdr_peak_ssbo,
"uint sig_peak_raw;"
"uint index;"
"uint frame_max[%d];", PEAK_DETECT_FRAMES + 1
);
}
// Adapt from src to dst as necessary
pass_color_map(p->sc, src, dst, p->opts.hdr_tone_mapping,
p->opts.tone_mapping_param, p->opts.tone_mapping_desat,
p->use_linear && !osd);
detect_peak, p->use_linear && !osd);
if (p->use_lut_3d) {
gl_sc_uniform_tex(p->sc, "lut_3d", GL_TEXTURE_3D, p->lut_3d_texture);
@ -2710,6 +2748,17 @@ static void pass_draw_to_screen(struct gl_video *p, int fbo)
pass_colormanage(p, p->image_params.color, false);
// Since finish_pass_direct doesn't work with compute shaders, and neither
// does the checkerboard/dither code, we may need an indirection via
// p->screen_fbo here.
if (p->compute_w > 0 && p->compute_h > 0) {
int o_w = p->dst_rect.x1 - p->dst_rect.x0,
o_h = p->dst_rect.y1 - p->dst_rect.y0;
finish_pass_fbo(p, &p->screen_fbo, o_w, o_h, FBOTEX_FUZZY);
struct img_tex tmp = img_tex_fbo(&p->screen_fbo, PLANE_RGB, p->components);
copy_img_tex(p, &(int){0}, tmp);
}
if (p->has_alpha){
if (p->opts.alpha_mode == ALPHA_BLEND_TILES) {
// Draw checkerboard pattern to indicate transparency
@ -3326,6 +3375,7 @@ static void check_gl_features(struct gl_video *p)
bool have_mglsl = gl->glsl_version >= 130; // modern GLSL (1st class arrays etc.)
bool have_texrg = gl->mpgl_caps & MPGL_CAP_TEX_RG;
bool have_tex16 = !gl->es || (gl->mpgl_caps & MPGL_CAP_EXT16);
bool have_compute = gl->glsl_version >= 430; // easiest way to ensure all
const GLint auto_fbo_fmts[] = {GL_RGBA16, GL_RGBA16F, GL_RGB10_A2,
GL_RGBA8, 0};
@ -3436,6 +3486,10 @@ static void check_gl_features(struct gl_video *p)
p->opts.deband = 0;
MP_WARN(p, "Disabling debanding (GLSL version too old).\n");
}
if (!have_compute && p->opts.compute_hdr_peak) {
p->opts.compute_hdr_peak = 0;
MP_WARN(p, "Disabling HDR peak computation (no compute shaders).\n");
}
}
static void init_gl(struct gl_video *p)
@ -3471,6 +3525,7 @@ void gl_video_uninit(struct gl_video *p)
gl_sc_destroy(p->sc);
gl->DeleteTextures(1, &p->lut_3d_texture);
gl->DeleteBuffers(1, &p->hdr_peak_ssbo);
gl_timer_free(p->upload_timer);
gl_timer_free(p->blit_timer);

View File

@ -99,6 +99,9 @@ enum tone_mapping {
TONE_MAPPING_LINEAR,
};
// How many frames to average over for HDR peak detection
#define PEAK_DETECT_FRAMES 100
struct gl_video_opts {
int dumb_mode;
struct scaler_config scaler[4];
@ -109,6 +112,7 @@ struct gl_video_opts {
int target_trc;
int target_brightness;
int hdr_tone_mapping;
int compute_hdr_peak;
float tone_mapping_param;
float tone_mapping_desat;
int linear_scaling;

View File

@ -521,7 +521,8 @@ void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, floa
GLSLF("color.rgb *= vec3(1.0/%f);\n", peak);
}
// Tone map from a known peak brightness to the range [0,1]
// Tone map from a known peak brightness to the range [0,1]. If ref_peak
// is 0, we will use peak detection instead
static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak,
enum tone_mapping algo, float param, float desat)
{
@ -531,8 +532,42 @@ static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak,
GLSL(float luma = dot(src_luma, color.rgb);)
GLSL(float luma_orig = luma;)
if (!ref_peak) {
// For performance, we want to do as few atomic operations on global
// memory as possible, so use an atomic in shmem for the work group.
// We also want slightly more stable values, so use the group average
// instead of the group max
GLSLHF("shared uint group_sum = 0;\n");
GLSLF("atomicAdd(group_sum, uint(luma * %f));\n", MP_REF_WHITE);
// Have one thread in each work group update the frame maximum
GLSL(memoryBarrierBuffer();)
GLSL(barrier();)
GLSL(if (gl_LocalInvocationIndex == 0))
GLSL(atomicMax(frame_max[index], group_sum /
(gl_WorkGroupSize.x * gl_WorkGroupSize.y));)
// Finally, have one thread per invocation update the total maximum
// and advance the index
GLSL(memoryBarrierBuffer();)
GLSL(barrier();)
GLSL(if (gl_GlobalInvocationID == ivec3(0)) {) // do this once per invocation
GLSLF("uint next = (index + 1) %% %d;\n", PEAK_DETECT_FRAMES+1);
GLSLF("sig_peak_raw = sig_peak_raw + frame_max[index] - frame_max[next];\n");
GLSLF("frame_max[next] = %d;\n", (int)MP_REF_WHITE);
GLSL(index = next;)
GLSL(})
GLSL(memoryBarrierBuffer();)
GLSL(barrier();)
GLSLF("const float sig_peak = 1.0/%f * float(sig_peak_raw);\n",
MP_REF_WHITE * PEAK_DETECT_FRAMES);
} else {
GLSLHF("const float sig_peak = %f;\n", ref_peak);
}
// Desaturate the color using a coefficient dependent on the brightness
if (desat > 0 && ref_peak > desat) {
if (desat > 0) {
GLSLF("float overbright = max(luma - %f, 1e-6) / max(luma, 1e-6);\n", desat);
GLSL(color.rgb = mix(color.rgb, vec3(luma), overbright);)
}
@ -542,23 +577,23 @@ static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak,
GLSLF("luma = clamp(%f * luma, 0.0, 1.0);\n", isnan(param) ? 1.0 : param);
break;
case TONE_MAPPING_MOBIUS: {
float j = isnan(param) ? 0.3 : param;
// solve for M(j) = j; M(ref_peak) = 1.0; M'(j) = 1.0
case TONE_MAPPING_MOBIUS:
GLSLF("const float j = %f;\n", isnan(param) ? 0.3 : param);
// solve for M(j) = j; M(sig_peak) = 1.0; M'(j) = 1.0
// where M(x) = scale * (x+a)/(x+b)
float a = -j*j * (ref_peak - 1) / (j*j - 2*j + ref_peak),
b = (j*j - 2*j*ref_peak + ref_peak) / (ref_peak - 1);
GLSLF("luma = mix(%f * (luma + %f) / (luma + %f), luma, luma <= %f);\n",
(b*b + 2*b*j + j*j) / (b - a), a, b, j);
GLSLF("const float a = -j*j * (sig_peak - 1) / (j*j - 2*j + sig_peak);\n");
GLSLF("const float b = (j*j - 2*j*sig_peak + sig_peak) / "
"max(1e-6, sig_peak - 1);\n");
GLSLF("const float scale = (b*b + 2*b*j + j*j) / (b-a);\n");
GLSL(luma = mix(luma, scale * (luma + a) / (luma + b), luma > j);)
break;
}
case TONE_MAPPING_REINHARD: {
float contrast = isnan(param) ? 0.5 : param,
offset = (1.0 - contrast) / contrast;
GLSLF("luma = luma / (luma + %f);\n", offset);
GLSLF("luma *= %f;\n", (ref_peak + offset) / ref_peak);
GLSLF("const float lumascale = (sig_peak + %f) / sig_peak;\n", offset);
GLSL(luma *= lumascale;)
break;
}
@ -568,20 +603,19 @@ static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak,
GLSLHF("return ((x * (%f*x + %f)+%f)/(x * (%f*x + %f) + %f)) - %f;\n",
A, C*B, D*E, A, B, D*F, E/F);
GLSLHF("}\n");
GLSLF("luma = hable(luma) / hable(%f);\n", ref_peak);
GLSL(luma = hable(luma) / hable(sig_peak);)
break;
}
case TONE_MAPPING_GAMMA: {
float gamma = isnan(param) ? 1.8 : param;
GLSLF("luma = pow(luma * 1.0/%f, %f);\n", ref_peak, 1.0/gamma);
GLSLF("luma = pow(luma / sig_peak, %f);\n", 1.0/gamma);
break;
}
case TONE_MAPPING_LINEAR: {
float coeff = isnan(param) ? 1.0 : param;
GLSLF("luma = %f * luma;\n", coeff / ref_peak);
GLSLF("luma = %f / sig_peak * luma;\n", coeff);
break;
}
@ -596,11 +630,15 @@ static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak,
// Map colors from one source space to another. These source spaces must be
// known (i.e. not MP_CSP_*_AUTO), as this function won't perform any
// auto-guessing. If is_linear is true, we assume the input has already been
// linearized (e.g. for linear-scaling)
// linearized (e.g. for linear-scaling). If `detect_peak` is true, we will
// detect the peak instead of relying on metadata. Note that this requires
// the caller to have already bound the appropriate SSBO and set up the
// compute shader metadata
void pass_color_map(struct gl_shader_cache *sc,
struct mp_colorspace src, struct mp_colorspace dst,
enum tone_mapping algo, float tone_mapping_param,
float tone_mapping_desat, bool is_linear)
float tone_mapping_desat, bool detect_peak,
bool is_linear)
{
GLSLF("// color mapping\n");
@ -643,8 +681,8 @@ void pass_color_map(struct gl_shader_cache *sc,
// Tone map to prevent clipping when the source signal peak exceeds the
// encodable range
if (src.sig_peak > dst_range) {
pass_tone_map(sc, src.sig_peak / dst_range, algo, tone_mapping_param,
tone_mapping_desat);
float ref_peak = detect_peak ? 0 : src.sig_peak / dst_range;
pass_tone_map(sc, ref_peak, algo, tone_mapping_param, tone_mapping_desat);
}
// Adapt to the right colorspace if necessary

View File

@ -44,7 +44,8 @@ void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, floa
void pass_color_map(struct gl_shader_cache *sc,
struct mp_colorspace src, struct mp_colorspace dst,
enum tone_mapping algo, float tone_mapping_param,
float tone_mapping_desat, bool is_linear);
float tone_mapping_desat, bool use_detected_peak,
bool is_linear);
void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts,
AVLFG *lfg);