From 4c2edecd7dc83caaaa37c797d66d9077e105eaee Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.xyz>
Date: Sat, 10 Feb 2018 22:42:11 +0100
Subject: [PATCH] vo_gpu: refactor HDR peak detection algorithm

The major changes are as follows:

1. Use `uint32_t` instead of `unsigned int` for the SSBO size
   calculation. This doesn't really matter, since a too-big buffer will
   still work just fine, but since GLSL's `uint` is a 32-bit integer by
   definition, this is the correct way to do it.

2. Pre-divide the frame_sum by the num_wg immediately at the end of a
   frame. This change was made to prevent overflow. At 4K screen size,
   the existing code is already at serious risk of overflow, which
   became especially apparent once I started playing with longer
   averaging sizes. Pre-dividing by num_wg makes the running total just
   about fit into 32 bits even for worst-case PQ content. (It's
   technically also faster and easier this way, so I should have done
   it to begin with.) Rename `frame_sum` to `frame_avg` to clearly
   signal the change in semantics.

3. Implement a scene transition detection algorithm. This basically
   compares the current frame's average brightness against the
   (averaged) value of the past frames. If the difference exceeds a
   threshold, which I configured experimentally, we reset the peak
   detection SSBO's state immediately - so that it just contains the
   current frame. This prevents annoying "eye adaptation"-like effects
   on scene transitions.

4. As a result of the previous change, we can now use a much larger
   buffer size by default, which gives a more stable and less flickery
   result. I experimented with values between 20 and 256 and settled on
   the new value of 64, i.e. PEAK_DETECT_FRAMES is now 63 and the
   per-frame arrays hold PEAK_DETECT_FRAMES+1 entries. (I also switched
   to a power-of-2 array size, because I like powers of two.)
---
 video/out/gpu/video.c         | 18 ++++++++---------
 video/out/gpu/video.h         |  2 +-
 video/out/gpu/video_shaders.c | 37 +++++++++++++++++++++++++++++------
 3 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c
index 9bf7baeb77..c27004e63b 100644
--- a/video/out/gpu/video.c
+++ b/video/out/gpu/video.c
@@ -2448,13 +2448,13 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
     bool detect_peak = p->opts.compute_hdr_peak >= 0 && mp_trc_is_hdr(src.gamma);
     if (detect_peak && !p->hdr_peak_ssbo) {
         struct {
-            unsigned int counter;
-            unsigned int frame_idx;
-            unsigned int frame_num;
-            unsigned int frame_max[PEAK_DETECT_FRAMES+1];
-            unsigned int frame_sum[PEAK_DETECT_FRAMES+1];
-            unsigned int total_max;
-            unsigned int total_sum;
+            uint32_t counter;
+            uint32_t frame_idx;
+            uint32_t frame_num;
+            uint32_t frame_max[PEAK_DETECT_FRAMES+1];
+            uint32_t frame_sum[PEAK_DETECT_FRAMES+1];
+            uint32_t total_max;
+            uint32_t total_sum;
         } peak_ssbo = {0};
 
         struct ra_buf_params params = {
@@ -2479,9 +2479,9 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
             "uint frame_idx;"
             "uint frame_num;"
             "uint frame_max[%d];"
-            "uint frame_sum[%d];"
+            "uint frame_avg[%d];"
             "uint total_max;"
-            "uint total_sum;",
+            "uint total_avg;",
             PEAK_DETECT_FRAMES + 1,
             PEAK_DETECT_FRAMES + 1
         );
diff --git a/video/out/gpu/video.h b/video/out/gpu/video.h
index 71666059f9..dad6d447f6 100644
--- a/video/out/gpu/video.h
+++ b/video/out/gpu/video.h
@@ -96,7 +96,7 @@ enum tone_mapping {
 };
 
 // How many frames to average over for HDR peak detection
-#define PEAK_DETECT_FRAMES 20
+#define PEAK_DETECT_FRAMES 63
 
 struct gl_video_opts {
     int dumb_mode;
diff --git a/video/out/gpu/video_shaders.c b/video/out/gpu/video_shaders.c
index 8e33255390..eb3f287236 100644
--- a/video/out/gpu/video_shaders.c
+++ b/video/out/gpu/video_shaders.c
@@ -559,6 +559,10 @@ void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, floa
 // under a typical presentation gamma of about 2.0.
 static const float sdr_avg = 0.25;
 
+// The threshold for which to consider an average luminance difference to be
+// a sign of a scene change.
+static const int scene_threshold = 0.2 * MP_REF_WHITE;
+
 static void hdr_update_peak(struct gl_shader_cache *sc)
 {
     // For performance, we want to do as few atomic operations on global
@@ -578,14 +582,16 @@ static void hdr_update_peak(struct gl_shader_cache *sc)
     GLSL(if (gl_LocalInvocationIndex == 0) {)
     GLSL(    uint wg_avg = wg_sum / (gl_WorkGroupSize.x * gl_WorkGroupSize.y);)
     GLSL(    atomicMax(frame_max[frame_idx], wg_avg);)
-    GLSL(    atomicAdd(frame_sum[frame_idx], wg_avg);)
+    GLSL(    atomicAdd(frame_avg[frame_idx], wg_avg);)
     GLSL(})
 
+    const float refi = 1.0 / MP_REF_WHITE;
+
     // Update the sig_peak/sig_avg from the old SSBO state
     GLSL(uint num_wg = gl_NumWorkGroups.x * gl_NumWorkGroups.y;)
     GLSL(if (frame_num > 0) {)
-    GLSLF("    float peak = float(total_max) / (%f * float(frame_num));\n", MP_REF_WHITE);
-    GLSLF("    float avg = float(total_sum) / (%f * float(frame_num * num_wg));\n", MP_REF_WHITE);
+    GLSLF("    float peak = %f * float(total_max) / float(frame_num);\n", refi);
+    GLSLF("    float avg = %f * float(total_avg) / float(frame_num);\n", refi);
     GLSLF("    sig_peak = max(1.0, peak);\n");
     GLSLF("    sig_avg  = max(%f, avg);\n", sdr_avg);
     GLSL(});
@@ -594,12 +600,31 @@ static void hdr_update_peak(struct gl_shader_cache *sc)
     GLSL(memoryBarrierBuffer();)
     GLSL(barrier();)
     GLSL(if (gl_LocalInvocationIndex == 0 && atomicAdd(counter, 1) == num_wg - 1) {)
+
+    // Since we sum up all the workgroups, we also still need to divide the
+    // average by the number of work groups
     GLSL(    counter = 0;)
+    GLSL(    frame_avg[frame_idx] /= num_wg;)
+    GLSL(    uint cur_max = frame_max[frame_idx];)
+    GLSL(    uint cur_avg = frame_avg[frame_idx];)
+
+    // Scene change detection
+    GLSL(    int diff = int(frame_num * cur_avg) - int(total_avg);)
+    GLSLF("  if (abs(diff) > frame_num * %d) {\n", scene_threshold);
+    GLSL(        frame_num = 0;)
+    GLSL(        total_max = total_avg = 0;)
+    GLSLF("      for (uint i = 0; i < %d; i++)\n", PEAK_DETECT_FRAMES+1);
+    GLSL(            frame_max[i] = frame_avg[i] = 0;)
+    GLSL(        frame_max[frame_idx] = cur_max;)
+    GLSL(        frame_avg[frame_idx] = cur_avg;)
+    GLSL(    })
+
     // Add the current frame, then subtract and reset the next frame
     GLSLF("  uint next = (frame_idx + 1) %% %d;\n", PEAK_DETECT_FRAMES+1);
-    GLSL(    total_max += frame_max[frame_idx] - frame_max[next];)
-    GLSL(    total_sum += frame_sum[frame_idx] - frame_sum[next];)
-    GLSL(    frame_max[next] = frame_sum[next] = 0;)
+    GLSL(    total_max += cur_max - frame_max[next];)
+    GLSL(    total_avg += cur_avg - frame_avg[next];)
+    GLSL(    frame_max[next] = frame_avg[next] = 0;)
+
     // Update the index and count
     GLSL(    frame_idx = next;)
     GLSLF("  frame_num = min(frame_num + 1, %d);\n", PEAK_DETECT_FRAMES);