From f7f947f96098bfca451ca85a9b4f36175907abd6 Mon Sep 17 00:00:00 2001
From: wm4 <wm4@nowhere>
Date: Mon, 13 Apr 2020 02:36:46 +0200
Subject: [PATCH] zimg: add support for some RGB fringe formats

This covers 8 and 16 bit packed RGB formats. It doesn't really help with
any actual use-cases, other than giving the finger to libswscale.

One problem is with different color depths. For example, rgb565 provides
1 bit more resolution to the green channel. zimg can only dither to a
uniform depth. I tried dithering to the highest depth and shifting away
1 bit for the lower channels, but that looked ugly (or I messed up
somewhere), so instead it dithers to the lowest depth, and adjusts the
value range if needed. Testing with bgr4_byte (extreme case with 1/2/1
depths), it looks more "grainy" (ordered dithering artifacts) than
libswscale, but it also looks cleaner and smoother. It doesn't have
libswscale's weird red-shift. So I call it a success.

Big endian formats need to be handled explicitly; the generic big endian
swapper code assumes byte-aligned components.

Unpacking is done with shifts and 3 LUTs. This is symmetric to the
packer. Using a generated palette might be better, but I preferred to
keep the symmetry, and not having to mess with a generated palette and
the pal8 code.

This uses FFmpeg pixfmt constants directly. I would have preferred
keeping zimg completely separate. But neither do I want to add an IMGFMT
alias for every one of these formats, nor do I want to extend our imgfmt
code such that it can provide a complete description of each packed RGB
format (similar to FFmpeg pixdesc).

It also appears that FFmpeg pixdesc as well as the FFmpeg pixfmt doxygen
have an error regarding RGB8: the R/B bit depths are swapped. libswscale
appears to be handling them differently. Not completely sure, as this is
the only packed format case with R/B having different depths (instead
of G, the middle component, where things are symmetric).
---
 video/zimg.c | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)

diff --git a/video/zimg.c b/video/zimg.c
index 9caa052f51..1a0c4e3b0b 100644
--- a/video/zimg.c
+++ b/video/zimg.c
@@ -18,12 +18,14 @@
 #include <math.h>
 
 #include <libavutil/bswap.h>
+#include <libavutil/pixfmt.h>
 
 #include "common/common.h"
 #include "common/msg.h"
 #include "csputils.h"
 #include "options/m_config.h"
 #include "options/m_option.h"
+#include "video/fmt-conversion.h"
 #include "video/img_format.h"
 #include "zimg.h"
 
@@ -85,6 +87,10 @@ struct mp_zimg_repack {
     // Called with user==mp_zimg_repack.
     zimg_filter_graph_callback repack;
 
+    // Output bit depth. If 0, use format defaults. (Used by some packers. This
+    // is simpler than defining fringe planar RGB formats for each depth.)
+    int override_depth;
+
     // Endian-swap (done before/after actual repacker).
     int endian_size;            // 0=no swapping, 2/4=word byte size to swap
     int endian_items[4];        // number of words per pixel/plane
@@ -95,6 +101,11 @@ struct mp_zimg_repack {
     //  unpack: p1 is src, p2 is dst
     void (*packed_repack_scanline)(void *p1, void *p2[], int x0, int x1);
 
+    // Fringe RGB.
+    uint8_t comp_size;
+    uint8_t comp_shifts[3];
+    uint8_t *comp_lut; // 256 * 3
+
     // Temporary memory for slice-wise repacking. This may be set even if repack
     // is not set (then it may be used to avoid alignment issues). This has
     // about one slice worth of data.
@@ -462,6 +473,93 @@ static int packed_repack(void *user, unsigned i, unsigned x0, unsigned x1)
     return 0;
 }
 
+struct fringe_rgb_repacker { // describes one packed RGB format (8 or 16 bit)
+    // To avoid making a mess of IMGFMT_*, we use av formats directly.
+    enum AVPixelFormat avfmt;
+    // If true, use BGR instead of RGB.
+    //  False:  LSB - R - G - B - pad - MSB
+    //  True:   LSB - B - G - R - pad - MSB
+    bool rev_order;
+    // Size in bits of each component, strictly from LSB to MSB.
+    int bits[3];
+    bool be; // big endian variant: setup requests a 2-byte endian swap
+};
+
+static const struct fringe_rgb_repacker fringe_rgb_repackers[] = {
+    {AV_PIX_FMT_BGR4_BYTE,  false,  {1, 2, 1}},
+    {AV_PIX_FMT_RGB4_BYTE,  true,   {1, 2, 1}},
+    {AV_PIX_FMT_BGR8,       false,  {3, 3, 2}},
+    {AV_PIX_FMT_RGB8,       true,   {2, 3, 3}}, // pixdesc/doc swap R/B depths?
+    {AV_PIX_FMT_RGB444LE,   true,   {4, 4, 4}},
+    {AV_PIX_FMT_RGB444BE,   true,   {4, 4, 4}, .be = true},
+    {AV_PIX_FMT_BGR444LE,   false,  {4, 4, 4}},
+    {AV_PIX_FMT_BGR444BE,   false,  {4, 4, 4}, .be = true},
+    {AV_PIX_FMT_BGR565LE,   false,  {5, 6, 5}},
+    {AV_PIX_FMT_BGR565BE,   false,  {5, 6, 5}, .be = true},
+    {AV_PIX_FMT_RGB565LE,   true,   {5, 6, 5}},
+    {AV_PIX_FMT_RGB565BE,   true,   {5, 6, 5}, .be = true},
+    {AV_PIX_FMT_BGR555LE,   false,  {5, 5, 5}},
+    {AV_PIX_FMT_BGR555BE,   false,  {5, 5, 5}, .be = true},
+    {AV_PIX_FMT_RGB555LE,   true,   {5, 5, 5}},
+    {AV_PIX_FMT_RGB555BE,   true,   {5, 5, 5}, .be = true},
+};
+
+#define PA_SHIFT_LUT8(name, packed_t) /* 3 planes -> packed pixel */        \
+    static void name(void *dst, void *src[], int x0, int x1, uint8_t *lut,  \
+                     uint8_t s0, uint8_t s1, uint8_t s2) {                  \
+        for (int x = x0; x < x1; x++) {                                     \
+            ((packed_t *)dst)[x] = /* lut + 256 * n: LUT of src[n] */       \
+                (lut[((uint8_t *)src[0])[x] + 256 * 0] << s0) |             \
+                (lut[((uint8_t *)src[1])[x] + 256 * 1] << s1) |             \
+                (lut[((uint8_t *)src[2])[x] + 256 * 2] << s2);              \
+        }                                                                   \
+    }
+
+// Inverse of PA_SHIFT_LUT8: split each packed pixel into three 8 bit planes.
+#define UN_SHIFT_LUT8(name, packed_t)                                       \
+    static void name(void *src, void *dst[], int x0, int x1, uint8_t *lut,  \
+                     uint8_t s0, uint8_t s1, uint8_t s2) {                  \
+        for (int x = x0; x < x1; x++) {                                     \
+            packed_t c = ((packed_t *)src)[x]; /* LUT masks extra bits */   \
+            ((uint8_t *)dst[0])[x] = lut[((c >> s0) & 0xFF) + 256 * 0];     \
+            ((uint8_t *)dst[1])[x] = lut[((c >> s1) & 0xFF) + 256 * 1];     \
+            ((uint8_t *)dst[2])[x] = lut[((c >> s2) & 0xFF) + 256 * 2];     \
+        }                                                                   \
+    }
+
+PA_SHIFT_LUT8(pa_shift_lut8_8,  uint8_t)   // pack into 1-byte pixels
+PA_SHIFT_LUT8(pa_shift_lut8_16, uint16_t)  // pack into 2-byte pixels
+UN_SHIFT_LUT8(un_shift_lut8_8,  uint8_t)   // unpack 1-byte pixels
+UN_SHIFT_LUT8(un_shift_lut8_16, uint16_t)  // unpack 2-byte pixels
+
+static int fringe_rgb_repack(void *user, unsigned i, unsigned x0, unsigned x1)
+{   // Repack callback: converts pixels [x0, x1) of line i; always returns 0.
+    struct mp_zimg_repack *r = user;
+    // p1: line i of the packed image (plane 0 of r->mpi, relative to mpi_y0).
+    void *p1 = r->mpi->planes[0] + r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0);
+    // p2[n]: line of the r->tmp plane holding packed component n (LSB first).
+    void *p2[4] = {0};
+    for (int p = 0; p < r->num_planes; p++) {
+        int s = r->components[p];
+        p2[p] = r->tmp->planes[s] +
+                r->tmp->stride[s] * (ptrdiff_t)(i & r->zmask[s]);
+    }
+
+    assert(r->comp_size == 1 || r->comp_size == 2);
+    // Pick the routine by direction (r->pack) and packed pixel size in bytes.
+    void (*repack)(void *p1, void *p2[], int x0, int x1, uint8_t *lut,
+                   uint8_t s0, uint8_t s1, uint8_t s2) = NULL;
+    if (r->pack) {
+        repack = r->comp_size == 1 ? pa_shift_lut8_8 : pa_shift_lut8_16;
+    } else {
+        repack = r->comp_size == 1 ? un_shift_lut8_8 : un_shift_lut8_16;
+    }
+    repack(p1, p2, x0, x1, r->comp_lut,
+           r->comp_shifts[0], r->comp_shifts[1], r->comp_shifts[2]);
+
+    return 0;
+}
+
 static int unpack_pal(void *user, unsigned i, unsigned x0, unsigned x1)
 {
     struct mp_zimg_repack *r = user;
@@ -588,6 +686,78 @@ static void wrap_buffer(struct mp_zimg_repack *r,
     r->user_mpi = mpi;
 }
 
+static void setup_fringe_rgb_packer(struct mp_zimg_repack *r,
+                                    struct mp_zimg_context *ctx)
+{   // Sets up r for a format listed in fringe_rgb_repackers[], if it is one.
+    enum AVPixelFormat avfmt = imgfmt2pixfmt(r->zimgfmt);
+    // Find the table entry; if there is none, r is left untouched.
+    const struct fringe_rgb_repacker *fmt = NULL;
+    for (int n = 0; n < MP_ARRAY_SIZE(fringe_rgb_repackers); n++) {
+        if (fringe_rgb_repackers[n].avfmt == avfmt) {
+            fmt = &fringe_rgb_repackers[n];
+            break;
+        }
+    }
+
+    if (!fmt)
+        return;
+    // The zimg-side format becomes planar 8 bit RGB (planes = comps. 2, 3, 1).
+    struct mp_regular_imgfmt gbrp = {
+        .component_type = MP_COMPONENT_TYPE_UINT,
+        .forced_csp = MP_CSP_RGB,
+        .component_size = 1,
+        .num_planes = 3,
+        .planes = { {1, {2}}, {1, {3}}, {1, {1}} },
+        .chroma_w = 1,
+        .chroma_h = 1,
+    };
+    r->zimgfmt = mp_find_regular_imgfmt(&gbrp);
+    if (!r->zimgfmt)
+        return;
+    if (ctx) // the LUT is only allocated when a context is passed
+        r->comp_lut = talloc_array(ctx, uint8_t, 256 * 3);
+    r->repack = fringe_rgb_repack;
+    static const int c_order_rgb[] = {3, 1, 2}; // R, G, B -> 1-based gbrp plane
+    static const int c_order_bgr[] = {2, 1, 3}; // B, G, R -> 1-based gbrp plane
+    for (int n = 0; n < 3; n++)
+        r->components[n] = (fmt->rev_order ? c_order_bgr : c_order_rgb)[n] - 1;
+
+    if (r->pack) {
+        // Dither to lowest depth - loses some precision, but result is saner.
+        r->override_depth = fmt->bits[0];
+        for (int n = 0; n < 3; n++)
+            r->override_depth = MPMIN(r->override_depth, fmt->bits[n]);
+    }
+    // Lay out components from bit 0 upwards; build a 256-entry LUT for each.
+    int bitpos = 0;
+    for (int n = 0; n < 3; n++) {
+        int bits = fmt->bits[n];
+        r->comp_shifts[n] = bitpos;
+        if (r->comp_lut) {
+            uint8_t *lut = r->comp_lut + 256 * n;
+            uint8_t zmax = r->pack ? (1 << r->override_depth) - 1 : 255;
+            uint8_t cmax = (1 << bits) - 1;
+            for (int v = 0; v < 256; v++) {
+                if (r->pack) {
+                    lut[v] = (v * cmax + zmax / 2) / zmax; // [0,zmax]->[0,cmax]
+                } else {
+                    lut[v] = (v & cmax) * zmax / cmax; // low bits -> [0,255]
+                }
+            }
+        }
+        bitpos += bits;
+    }
+
+    r->comp_size = (bitpos + 7) / 8; // packed pixel size in bytes
+    assert(r->comp_size == 1 || r->comp_size == 2);
+
+    if (fmt->be) { // byte-swap one 2-byte word per packed pixel
+        assert(r->comp_size == 2);
+        r->endian_size = 2;
+        r->endian_items[0] = 1;
+    }
+}
+
 static void setup_nv_packer(struct mp_zimg_repack *r)
 {
     struct mp_regular_imgfmt desc;
@@ -782,6 +952,8 @@ static bool setup_format_ne(zimg_image_format *zfmt, struct mp_zimg_repack *r,
         setup_misc_packer(r);
     if (!r->repack)
         setup_regular_rgb_packer(r);
+    if (!r->repack)
+        setup_fringe_rgb_packer(r, ctx);
 
     struct mp_regular_imgfmt desc;
     if (!mp_get_regular_imgfmt(&desc, r->zimgfmt))
@@ -876,6 +1048,8 @@ static bool setup_format_ne(zimg_image_format *zfmt, struct mp_zimg_repack *r,
 
     // (Formats like P010 are basically reported as P016.)
     zfmt->depth = desc.component_size * 8 + MPMIN(0, desc.component_pad);
+    if (r->override_depth)
+        zfmt->depth = r->override_depth;
 
     zfmt->pixel_range = fmt.color.levels == MP_CSP_LEVELS_PC ?
                         ZIMG_RANGE_FULL : ZIMG_RANGE_LIMITED;