diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c index 6bf0bb31a1..be49551dfb 100644 --- a/video/out/gpu/video.c +++ b/video/out/gpu/video.c @@ -2494,7 +2494,7 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool if (detect_peak && !p->hdr_peak_ssbo) { struct { float average[2]; - uint32_t frame_sum; + int32_t frame_sum; uint32_t frame_max; uint32_t counter; } peak_ssbo = { @@ -2520,7 +2520,7 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool pass_is_compute(p, 8, 8, true); // 8x8 is good for performance gl_sc_ssbo(p->sc, "PeakDetect", p->hdr_peak_ssbo, "vec2 average;" - "uint frame_sum;" + "int frame_sum;" "uint frame_max;" "uint counter;" ); diff --git a/video/out/gpu/video_shaders.c b/video/out/gpu/video_shaders.c index fbccd56eb3..127db58ea2 100644 --- a/video/out/gpu/video_shaders.c +++ b/video/out/gpu/video_shaders.c @@ -574,21 +574,24 @@ static void hdr_update_peak(struct gl_shader_cache *sc, GLSL(sig_avg = max(1e-3, average.x);) GLSL(sig_peak = max(1.00, average.y);) + // Chosen to avoid overflowing on an 8K buffer + const float log_min = 1e-3, log_scale = 400.0, sig_scale = 10000.0; + // For performance, and to avoid overflows, we tally up the sub-results per // pixel using shared memory first - GLSLH(shared uint wg_sum;) + GLSLH(shared int wg_sum;) GLSLH(shared uint wg_max;) - GLSL(wg_sum = wg_max = 0;) + GLSL(wg_sum = 0; wg_max = 0;) GLSL(barrier();) - GLSLF("uint sig_uint = uint(sig_max * %f);\n", MP_REF_WHITE); - GLSL(atomicAdd(wg_sum, sig_uint);) - GLSL(atomicMax(wg_max, sig_uint);) + GLSLF("float sig_log = log(max(sig_max, %f));\n", log_min); + GLSLF("atomicAdd(wg_sum, int(sig_log * %f));\n", log_scale); + GLSLF("atomicMax(wg_max, uint(sig_max * %f));\n", sig_scale); // Have one thread per work group update the global atomics GLSL(memoryBarrierShared();) GLSL(barrier();) GLSL(if (gl_LocalInvocationIndex == 0) {) - GLSL( uint wg_avg = wg_sum / (gl_WorkGroupSize.x * gl_WorkGroupSize.y);) + GLSL( int wg_avg = wg_sum / int(gl_WorkGroupSize.x * gl_WorkGroupSize.y);) GLSL( atomicAdd(frame_sum, wg_avg);) GLSL( atomicMax(frame_max, wg_max);) GLSL( memoryBarrierBuffer();) @@ -600,7 +603,8 @@ static void hdr_update_peak(struct gl_shader_cache *sc, GLSL(if (gl_LocalInvocationIndex == 0 && atomicAdd(counter, 1) == num_wg - 1) {) GLSL( counter = 0;) GLSL( vec2 cur = vec2(float(frame_sum) / float(num_wg), frame_max);) - GLSLF(" cur *= 1.0/%f;\n", MP_REF_WHITE); + GLSLF(" cur *= vec2(1.0/%f, 1.0/%f);\n", log_scale, sig_scale); + GLSL( cur.x = exp(cur.x);) // Use an IIR low-pass filter to smooth out the detected values, with a // configurable decay rate based on the desired time constant (tau) @@ -615,7 +619,7 @@ static void hdr_update_peak(struct gl_shader_cache *sc, GLSL( average = mix(average, cur, weight);) // Reset SSBO state for the next frame - GLSL( frame_max = frame_sum = 0;) + GLSL( frame_sum = 0; frame_max = 0;) GLSL( memoryBarrierBuffer();) GLSL(}) }