diff --git a/video/out/gpu/error_diffusion.c b/video/out/gpu/error_diffusion.c index 2bcd2084dd..88c0903d35 100644 --- a/video/out/gpu/error_diffusion.c +++ b/video/out/gpu/error_diffusion.c @@ -67,8 +67,8 @@ int mp_ef_compute_shared_memory_size(const struct error_diffusion_kernel *k, int shifted_columns = compute_rightmost_shifted_column(k) + 1; // The shared memory is an array of size rows*shifted_columns. Each element - // is three int, for each RGB component. - return rows * shifted_columns * 3 * 4; + // is a single uint encoding all three RGB components. + return rows * shifted_columns * 4; } void pass_error_diffusion(struct gl_shader_cache *sc, @@ -104,18 +104,13 @@ void pass_error_diffusion(struct gl_shader_cache *sc, int ring_buffer_columns = compute_rightmost_shifted_column(k) + 1; int ring_buffer_size = ring_buffer_rows * ring_buffer_columns; - const char *rgb = "rgb"; - // Defines the ring buffer in shared memory. - for (int comp = 0; comp < 3; comp++) - GLSLH("shared int err_%c[%d];\n", rgb[comp], ring_buffer_size); + GLSLH("shared uint err_rgb8[%d];\n", ring_buffer_size); // Initialize the ring buffer. - GLSL("for (int i = int(gl_LocalInvocationIndex); i < %d; i += %d) {\n", + GLSL("for (int i = int(gl_LocalInvocationIndex); i < %d; i += %d) ", ring_buffer_size, block_size); - for (int comp = 0; comp < 3; comp++) - GLSL("err_%c[i] = 0;\n", rgb[comp]); - GLSL("}\n"); + GLSL("err_rgb8[i] = 0;\n"); GLSL("for (int block_id = 0; block_id < %d; ++block_id) {\n", blocks); @@ -141,54 +136,87 @@ void pass_error_diffusion(struct gl_shader_cache *sc, // The dithering will quantize pixel value into multiples of 1/dither_quant. int dither_quant = (1 << depth) - 1; - // The absolute value of the errors to propagate is less than 1/dither_quant, - // multiply by dither_quant24 to have them processed with int in 24 bit - // precision. - double dither_quant24 = (double)(1 << 24) * dither_quant; + + // We encode errors in RGB components into a single 32-bit unsigned integer.
+ // The error we propagate from the current pixel is in the range + // [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite obvious, the + // sum of all errors propagated into a pixel is also in the same range. + // It's possible to map errors in this range into [-127, 127], and use an + // unsigned 8-bit integer to store each of them (using standard two's complement). + // The three 8-bit unsigned integers can then be encoded into a single + // 32-bit unsigned integer, with two 4-bit paddings to prevent addition + // overflow from affecting the other components. There are at most 12 + // addition operations on each pixel, so 4-bit padding should be enough. + // The overflow from the R component will be discarded. + // + // The following figure shows what the encoding looks like. + // + // +------------------------------------+ + // |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB| + // +------------------------------------+ + // + + // The bitshift positions for the R and G components. + int bitshift_r = 24, bitshift_g = 12; + // The multiplier we use to map [-0.5, 0.5] to [-127, 127]. + int uint8_mul = 127 * 2; // Adding the error previously propagated into current pixel, and clear it // in the buffer. - GLSL("pix += vec3(err_r[idx], err_g[idx], err_b[idx]) / %f;\n", dither_quant24); - for (int comp = 0; comp < 3; comp++) - GLSL("err_%c[idx] = 0;\n", rgb[comp]); - - // Dithering to depth. - GLSL("vec3 dithered = floor(pix * %d.0 + 0.5) / %d.0;\n", dither_quant, dither_quant); - GLSL("ivec3 err = ivec3((pix - dithered) * %f + 0.5);\n", dither_quant24); + GLSL("uint err_u32 = err_rgb8[idx] + %uu;\n", + (128u << bitshift_r) | (128u << bitshift_g) | 128u); + GLSL("pix = pix * %d.0 + vec3(" + "int((err_u32 >> %d) & 255u) - 128," + "int((err_u32 >> %d) & 255u) - 128," + "int( err_u32 & 255u) - 128" + ") / %d.0;\n", dither_quant, bitshift_r, bitshift_g, uint8_mul); + GLSL("err_rgb8[idx] = 0;\n"); // Write the dithered pixel.
- GLSL("imageStore(out_image, ivec2(x, y), vec4(dithered, 0.0));\n"); + GLSL("vec3 dithered = round(pix);\n"); + GLSL("imageStore(out_image, ivec2(x, y), vec4(dithered / %d.0, 0.0));\n", + dither_quant); + + GLSL("vec3 err_divided = (pix - dithered) * %d.0 / %d.0;\n", + uint8_mul, k->divisor); + GLSL("ivec3 tmp;\n"); + + // Group error propagations that share the same weight factor, in order to + // reduce the number of times the error has to be encoded. + for (int dividend = 1; dividend <= k->divisor; dividend++) { + bool err_assigned = false; + + for (int y = 0; y <= EF_MAX_DELTA_Y; y++) { + for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) { + if (k->pattern[y][x - EF_MIN_DELTA_X] != dividend) + continue; + + if (!err_assigned) { + err_assigned = true; + + GLSL("tmp = ivec3(round(err_divided * %d.0));\n", dividend); + + GLSL("err_u32 = " + "(uint(tmp.r & 255) << %d)|" + "(uint(tmp.g & 255) << %d)|" + " uint(tmp.b & 255);\n", + bitshift_r, bitshift_g); + } - GLSL("int nidx;\n"); - for (int y = 0; y <= EF_MAX_DELTA_Y; y++) { - for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) { - if (k->pattern[y][x - EF_MIN_DELTA_X] != 0) { int shifted_x = x + y * k->shift; // Unlike the right border, errors propagated out from left // border will remain in the ring buffer. This will produce // visible artifacts near the left border, especially for // shift=3 kernels. - bool left_border_check = x < 0; - - if (left_border_check) - GLSL("if (x >= %d) {\n", -x); + if (x < 0) + GLSL("if (x >= %d) ", -x); // Calculate the new position in the ring buffer to propagate // the error into. int ring_buffer_delta = shifted_x * ring_buffer_rows + y; - GLSL("nidx = (idx + %d) %% %d;\n", ring_buffer_delta, ring_buffer_size); - - // Propagate the error with atomic operation.
- for (int comp = 0; comp < 3; comp++) { - GLSL("atomicAdd(err_%c[nidx], err.%c * %d / %d);\n", - rgb[comp], rgb[comp], - k->pattern[y][x - EF_MIN_DELTA_X], - k->divisor); - } - - if (left_border_check) - GLSL("}\n"); + GLSL("atomicAdd(err_rgb8[(idx + %d) %% %d], err_u32);\n", + ring_buffer_delta, ring_buffer_size); } } }