vo_gpu: make the vertex attribs dynamic

This has several advantages:

1. no more redundant texcoords when we don't need them
2. no more arbitrary limit on how many textures we can bind
3. (that extends to user shaders as well)
4. no more arbitrary limits on tscale radius

To realize this, the VAO was moved from a hacky stateful approach (gl_sc_set_vertex_format) - which always bothered me since it was required for compute shaders as well even though they ignored it - to be a proper parameter of gl_sc_dispatch_draw, and internally plumbed into gl_sc_generate, which will make a (properly mangled) deep copy into params.vertex_attribs.
2024-12-25 08:12:17 +00:00 · 2017-09-28 00:07:42 +02:00 · 2017-09-28 00:07:42 +02:00 · 67fd5882b8
commit 67fd5882b8
parent 002a0ce232
7 changed files with 102 additions and 104 deletions
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@ -3972,10 +3972,6 @@ The following video options are currently all specific to ``--vo=gpu`` and
    ``--tscale`` are separable convolution filters (use ``--tscale=help`` to
    get a list). The default is ``mitchell``.

-    Note that the maximum supported filter radius is currently 3, due to
-    limitations in the number of video textures that can be loaded
-    simultaneously.
-
 ``--scale-param1=<value>``, ``--scale-param2=<value>``, ``--cscale-param1=<value>``, ``--cscale-param2=<value>``, ``--dscale-param1=<value>``, ``--dscale-param2=<value>``, ``--tscale-param1=<value>``, ``--tscale-param2=<value>``
    Set filter parameters. Ignored if the filter is not tunable. Currently,
    this affects the following filter parameters:
--- a/video/out/gpu/osd.c
+++ b/video/out/gpu/osd.c
@ -47,7 +47,6 @@ static const struct ra_renderpass_input vertex_vao[] = {
    {"position",  RA_VARTYPE_FLOAT,      2, 1, offsetof(struct vertex, position)},
    {"texcoord" , RA_VARTYPE_FLOAT,      2, 1, offsetof(struct vertex, texcoord)},
    {"ass_color", RA_VARTYPE_BYTE_UNORM, 4, 1, offsetof(struct vertex, ass_color)},
-    {0}
 };

 struct mpgl_osd_part {
@ -231,8 +230,6 @@ bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index,
        abort();
    }

-    gl_sc_set_vertex_format(sc, vertex_vao, sizeof(struct vertex));
-
    return true;
 }

@ -317,7 +314,8 @@ void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index,
    const int *factors = &blend_factors[part->format][0];
    gl_sc_blend(sc, factors[0], factors[1], factors[2], factors[3]);

-    gl_sc_dispatch_draw(sc, fbo.tex, part->vertices, part->num_vertices);
+    gl_sc_dispatch_draw(sc, fbo.tex, vertex_vao, MP_ARRAY_SIZE(vertex_vao),
+                        sizeof(struct vertex), part->vertices, part->num_vertices);
 }

 static void set_res(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode)
--- a/video/out/gpu/shader_cache.c
+++ b/video/out/gpu/shader_cache.c
@ -449,20 +449,6 @@ void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name,
        transpose3x3(&u->v.f[0]);
 }

-// Tell the shader generator (and later gl_sc_draw_data()) about the vertex
-// data layout and attribute names. The entries array is terminated with a {0}
-// entry. The array memory must remain valid indefinitely (for now).
-void gl_sc_set_vertex_format(struct gl_shader_cache *sc,
-                             const struct ra_renderpass_input *entries,
-                             int vertex_stride)
-{
-    sc->params.vertex_attribs = (struct ra_renderpass_input *)entries;
-    sc->params.num_vertex_attribs = 0;
-    while (entries[sc->params.num_vertex_attribs].name)
-        sc->params.num_vertex_attribs++;
-    sc->params.vertex_stride = vertex_stride;
-}
-
 void gl_sc_blend(struct gl_shader_cache *sc,
                 enum ra_blend blend_src_rgb,
                 enum ra_blend blend_dst_rgb,
@ -577,16 +563,6 @@ static bool create_pass(struct gl_shader_cache *sc, struct sc_entry *entry)
    if (sc->text.len)
        mp_log_source(sc->log, MSGL_V, sc->text.start);

-    // The vertex shader uses mangled names for the vertex attributes, so that
-    // the fragment shader can use the "real" names. But the shader is expecting
-    // the vertex attribute names (at least with older GLSL targets for GL).
-    params.vertex_attribs = talloc_memdup(tmp, params.vertex_attribs,
-                params.num_vertex_attribs * sizeof(params.vertex_attribs[0]));
-    for (int n = 0; n < params.num_vertex_attribs; n++) {
-        struct ra_renderpass_input *attrib = &params.vertex_attribs[n];
-        attrib->name = talloc_asprintf(tmp, "vertex_%s", attrib->name);
-    }
-
    const char *cache_header = "mpv shader cache v1\n";
    char *cache_filename = NULL;
    char *cache_dir = NULL;
@ -773,7 +749,9 @@ static void add_uniforms(struct gl_shader_cache *sc, bstr *dst)
 //    and fragment operations needed for the next program have to be re-added.)
 static void gl_sc_generate(struct gl_shader_cache *sc,
                           enum ra_renderpass_type type,
-                           const struct ra_format *target_format)
+                           const struct ra_format *target_format,
+                           const struct ra_renderpass_input *vao,
+                           int vao_len, size_t vertex_stride)
 {
    int glsl_version = sc->ra->glsl_version;
    int glsl_es = sc->ra->glsl_es ? glsl_version : 0;
@ -785,9 +763,6 @@ static void gl_sc_generate(struct gl_shader_cache *sc,
    assert(!sc->needs_reset);
    sc->needs_reset = true;

-    // gl_sc_set_vertex_format() must always be called
-    assert(sc->params.vertex_attribs);
-
    // If using a UBO, pick a binding (needed for shader generation)
    if (sc->ubo_size)
        sc->ubo_binding = gl_sc_next_binding(sc, RA_VARTYPE_BUF_RO);
@ -844,8 +819,8 @@ static void gl_sc_generate(struct gl_shader_cache *sc,
        bstr *vert_body = &sc->tmp[2];
        ADD(vert_body, "void main() {\n");
        bstr *frag_vaos = &sc->tmp[3];
-        for (int n = 0; n < sc->params.num_vertex_attribs; n++) {
-            const struct ra_renderpass_input *e = &sc->params.vertex_attribs[n];
+        for (int n = 0; n < vao_len; n++) {
+            const struct ra_renderpass_input *e = &vao[n];
            const char *glsl_type = vao_glsl_type(e);
            char loc[32] = {0};
            if (sc->ra->glsl_vulkan)
@ -956,6 +931,19 @@ static void gl_sc_generate(struct gl_shader_cache *sc,
            .total = bstrdup(entry, *hash_total),
            .timer = timer_pool_create(sc->ra),
        };
+
+        // The vertex shader uses mangled names for the vertex attributes, so
+        // that the fragment shader can use the "real" names. But the shader is
+        // expecting the vertex attribute names (at least with older GLSL
+        // targets for GL).
+        sc->params.vertex_stride = vertex_stride;
+        for (int n = 0; n < vao_len; n++) {
+            struct ra_renderpass_input attrib = vao[n];
+            attrib.name = talloc_asprintf(entry, "vertex_%s", attrib.name);
+            MP_TARRAY_APPEND(sc, sc->params.vertex_attribs,
+                             sc->params.num_vertex_attribs, attrib);
+        }
+
        for (int n = 0; n < sc->num_uniforms; n++) {
            struct sc_cached_uniform u = {0};
            if (sc->uniforms[n].type == SC_UNIFORM_TYPE_GLOBAL) {
@ -997,11 +985,14 @@ static void gl_sc_generate(struct gl_shader_cache *sc,

 struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
                                        struct ra_tex *target,
-                                        void *ptr, size_t num)
+                                        const struct ra_renderpass_input *vao,
+                                        int vao_len, size_t vertex_stride,
+                                        void *vertices, size_t num_vertices)
 {
    struct timer_pool *timer = NULL;

-    gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format);
+    gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format,
+                   vao, vao_len, vertex_stride);
    if (!sc->current_shader)
        goto error;

@ -1015,8 +1006,8 @@ struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
        .num_values = sc->num_values,
        .push_constants = sc->current_shader->pushc,
        .target = target,
-        .vertex_data = ptr,
-        .vertex_count = num,
+        .vertex_data = vertices,
+        .vertex_count = num_vertices,
        .viewport = full_rc,
        .scissors = full_rc,
    };
@ -1035,7 +1026,7 @@ struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc,
 {
    struct timer_pool *timer = NULL;

-    gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE, NULL);
+    gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE, NULL, NULL, 0, 0);
    if (!sc->current_shader)
        goto error;

--- a/video/out/gpu/shader_cache.h
+++ b/video/out/gpu/shader_cache.h
@ -43,9 +43,6 @@ void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name,
                        bool transpose, float *v);
 void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name,
                        bool transpose, float *v);
-void gl_sc_set_vertex_format(struct gl_shader_cache *sc,
-                             const struct ra_renderpass_input *vertex_attribs,
-                             int vertex_stride);
 void gl_sc_blend(struct gl_shader_cache *sc,
                 enum ra_blend blend_src_rgb,
                 enum ra_blend blend_dst_rgb,
@ -54,6 +51,8 @@ void gl_sc_blend(struct gl_shader_cache *sc,
 void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name);
 struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
                                        struct ra_tex *target,
+                                        const struct ra_renderpass_input *vao,
+                                        int vao_len, size_t vertex_stride,
                                        void *ptr, size_t num);
 struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc,
                                           int w, int h, int d);
--- a/video/out/gpu/user_shaders.h
+++ b/video/out/gpu/user_shaders.h
@ -22,7 +22,7 @@
 #include "ra.h"

 #define SHADER_MAX_HOOKS 16
-#define SHADER_MAX_BINDS 6
+#define SHADER_MAX_BINDS 16
 #define MAX_SZEXP_SIZE 32

 enum szexp_op {
--- a/video/out/gpu/video.c
+++ b/video/out/gpu/video.c
@ -60,28 +60,12 @@ static const char *const fixed_tscale_filters[] = {
 // must be sorted, and terminated with 0
 int filter_sizes[] =
    {2, 4, 6, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 0};
-int tscale_sizes[] = {2, 4, 6, 0}; // limited by TEXUNIT_VIDEO_NUM
+int tscale_sizes[] = {2, 4, 6, 8, 0};

 struct vertex_pt {
    float x, y;
 };

-struct vertex {
-    struct vertex_pt position;
-    struct vertex_pt texcoord[TEXUNIT_VIDEO_NUM];
-};
-
-static const struct ra_renderpass_input vertex_vao[] = {
-    {"position",  RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)},
-    {"texcoord0", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[0])},
-    {"texcoord1", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[1])},
-    {"texcoord2", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[2])},
-    {"texcoord3", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[3])},
-    {"texcoord4", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[4])},
-    {"texcoord5", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[5])},
-    {0}
-};
-
 struct texplane {
    struct ra_tex *tex;
    int w, h;
@ -213,6 +197,13 @@ struct gl_video {
    bool dumb_mode;
    bool forced_dumb_mode;

+    // Cached vertex array, to avoid re-allocation per frame. For simplicity,
+    // our vertex format is simply a list of `vertex_pt`s, since this greatly
+    // simplifies offset calculation at the cost of (unneeded) flexibility.
+    struct vertex_pt *tmp_vertex;
+    struct ra_renderpass_input *vao;
+    int vao_len;
+
    const struct ra_format *fbo_format;
    struct ra_tex *merge_tex[4];
    struct ra_tex *scale_tex[4];
@ -252,8 +243,8 @@ struct gl_video {

    // temporary during rendering
    struct compute_info pass_compute; // compute shader metadata for this pass
-    struct image pass_img[TEXUNIT_VIDEO_NUM]; // bound images for this pass
-    int pass_img_num;
+    struct image *pass_imgs;          // bound images for this pass
+    int num_pass_imgs;
    struct saved_img *saved_imgs;     // saved (named) images for this frame
    int num_saved_imgs;

@ -631,13 +622,12 @@ static struct image image_wrap(struct ra_tex *tex, enum plane_type type,
    };
 }

-// Bind an image to a free texture unit and return its ID. At most
-// TEXUNIT_VIDEO_NUM texture units can be bound at once
+// Bind an image to a free texture unit and return its ID.
 static int pass_bind(struct gl_video *p, struct image img)
 {
-    assert(p->pass_img_num < TEXUNIT_VIDEO_NUM);
-    p->pass_img[p->pass_img_num] = img;
-    return p->pass_img_num++;
+    int idx = p->num_pass_imgs;
+    MP_TARRAY_APPEND(p, p->pass_imgs, p->num_pass_imgs, img);
+    return idx;
 }

 // Rotation by 90° and flipping.
@ -1062,8 +1052,8 @@ static void pass_prepare_src_tex(struct gl_video *p)
 {
    struct gl_shader_cache *sc = p->sc;

-    for (int n = 0; n < p->pass_img_num; n++) {
-        struct image *s = &p->pass_img[n];
+    for (int n = 0; n < p->num_pass_imgs; n++) {
+        struct image *s = &p->pass_imgs[n];
        if (!s->tex)
            continue;

@ -1087,6 +1077,11 @@ static void pass_prepare_src_tex(struct gl_video *p)
    }
 }

+static void cleanup_binds(struct gl_video *p)
+{
+    p->num_pass_imgs = 0;
+}
+
 // Sets the appropriate compute shader metadata for an implicit compute pass
 // bw/bh: block size
 static void pass_is_compute(struct gl_video *p, int bw, int bh)
@ -1098,12 +1093,6 @@ static void pass_is_compute(struct gl_video *p, int bw, int bh)
    };
 }

-static void cleanup_binds(struct gl_video *p)
-{
-    memset(&p->pass_img, 0, sizeof(p->pass_img));
-    p->pass_img_num = 0;
-}
-
 // w/h: the width/height of the compute shader's operating domain (e.g. the
 // target target that needs to be written, or the source texture that needs to
 // be reduced)
@ -1115,7 +1104,6 @@ static void dispatch_compute(struct gl_video *p, int w, int h,
            info.threads_h > 0 ? info.threads_h : info.block_h);

    pass_prepare_src_tex(p);
-    gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex));

    // Since we don't actually have vertices, we pretend for convenience
    // reasons that we do and calculate the right texture coordinates based on
@ -1123,14 +1111,13 @@ static void dispatch_compute(struct gl_video *p, int w, int h,
    gl_sc_uniform_vec2(p->sc, "out_scale", (float[2]){ 1.0 / w, 1.0 / h });
    PRELUDE("#define outcoord(id) (out_scale * (vec2(id) + vec2(0.5)))\n");

-    for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) {
-        struct image *s = &p->pass_img[n];
+    for (int n = 0; n < p->num_pass_imgs; n++) {
+        struct image *s = &p->pass_imgs[n];
        if (!s->tex)
            continue;

        // We need to rescale the coordinates to the true texture size
-        char tex_scale[32];
-        snprintf(tex_scale, sizeof(tex_scale), "tex_scale%d", n);
+        char *tex_scale = mp_tprintf(32, "tex_scale%d", n);
        gl_sc_uniform_vec2(p->sc, tex_scale, (float[2]){
                (float)s->w / s->tex->params.w,
                (float)s->h / s->tex->params.h,
@ -1155,7 +1142,24 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p,
                                            struct ra_fbo fbo,
                                            const struct mp_rect *dst)
 {
-    struct vertex va[6] = {0};
+    // The first element is reserved for `vec2 position`
+    int num_vertex_attribs = 1 + p->num_pass_imgs;
+    size_t vertex_stride = num_vertex_attribs * sizeof(struct vertex_pt);
+
+    // Expand the VAO if necessary
+    while (p->vao_len < num_vertex_attribs) {
+        MP_TARRAY_APPEND(p, p->vao, p->vao_len, (struct ra_renderpass_input) {
+            .name = talloc_asprintf(p, "texcoord%d", p->vao_len - 1),
+            .type = RA_VARTYPE_FLOAT,
+            .dim_v = 2,
+            .dim_m = 1,
+            .offset = p->vao_len * sizeof(struct vertex_pt),
+        });
+    }
+
+    int num_vertices = 6; // quad as triangle list
+    int num_attribs_total = num_vertices * num_vertex_attribs;
+    MP_TARRAY_GROW(p, p->tmp_vertex, num_attribs_total);

    struct gl_transform t;
    gl_transform_ortho_fbo(&t, fbo);
@ -1166,11 +1170,12 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p,
    gl_transform_vec(t, &x[1], &y[1]);

    for (int n = 0; n < 4; n++) {
-        struct vertex *v = &va[n];
-        v->position.x = x[n / 2];
-        v->position.y = y[n % 2];
-        for (int i = 0; i < p->pass_img_num; i++) {
-            struct image *s = &p->pass_img[i];
+        struct vertex_pt *vs = &p->tmp_vertex[num_vertex_attribs * n];
+        // vec2 position in idx 0
+        vs[0].x = x[n / 2];
+        vs[0].y = y[n % 2];
+        for (int i = 0; i < p->num_pass_imgs; i++) {
+            struct image *s = &p->pass_imgs[i];
            if (!s->tex)
                continue;
            struct gl_transform tr = s->transform;
@ -1178,22 +1183,28 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p,
            float ty = (n % 2) * s->h;
            gl_transform_vec(tr, &tx, &ty);
            bool rect = s->tex->params.non_normalized;
-            v->texcoord[i].x = tx / (rect ? 1 : s->tex->params.w);
-            v->texcoord[i].y = ty / (rect ? 1 : s->tex->params.h);
+            // vec2 texcoordN in idx N+1
+            vs[i + 1].x = tx / (rect ? 1 : s->tex->params.w);
+            vs[i + 1].y = ty / (rect ? 1 : s->tex->params.h);
        }
    }

-    va[4] = va[2];
-    va[5] = va[1];
+    memmove(&p->tmp_vertex[num_vertex_attribs * 4],
+            &p->tmp_vertex[num_vertex_attribs * 2],
+            vertex_stride);

-    return gl_sc_dispatch_draw(p->sc, fbo.tex, va, 6);
+    memmove(&p->tmp_vertex[num_vertex_attribs * 5],
+            &p->tmp_vertex[num_vertex_attribs * 1],
+            vertex_stride);
+
+    return gl_sc_dispatch_draw(p->sc, fbo.tex, p->vao, p->vao_len, vertex_stride,
+                               p->tmp_vertex, num_vertices);
 }

 static void finish_pass_fbo(struct gl_video *p, struct ra_fbo fbo,
                            const struct mp_rect *dst)
 {
    pass_prepare_src_tex(p);
-    gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex));
    pass_record(p, render_pass_quad(p, fbo, dst));
    debug_check_gl(p, "after rendering");
    cleanup_binds(p);
@ -1340,7 +1351,7 @@ static void saved_img_store(struct gl_video *p, const char *name,
 static bool pass_hook_setup_binds(struct gl_video *p, const char *name,
                                  struct image img, struct tex_hook *hook)
 {
-    for (int t = 0; t < TEXUNIT_VIDEO_NUM; t++) {
+    for (int t = 0; t < SHADER_MAX_BINDS; t++) {
        char *bind_name = (char *)hook->bind_tex[t];

        if (!bind_name)
@ -1370,7 +1381,7 @@ static bool pass_hook_setup_binds(struct gl_video *p, const char *name,
            // Clean up texture bindings and move on to the next hook
            MP_DBG(p, "Skipping hook on %s due to no texture named %s.\n",
                   name, bind_name);
-            p->pass_img_num -= t;
+            p->num_pass_imgs -= t;
            return false;
        }

@ -1481,7 +1492,7 @@ static void pass_opt_hook_point(struct gl_video *p, const char *name,
                goto found;
        }

-        for (int b = 0; b < TEXUNIT_VIDEO_NUM; b++) {
+        for (int b = 0; b < SHADER_MAX_BINDS; b++) {
            if (hook->bind_tex[b] && strcmp(hook->bind_tex[b], name) == 0)
                goto found;
        }
@ -2855,7 +2866,6 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
    } else {
        assert(tscale->kernel && !tscale->kernel->polar);
        size = ceil(tscale->kernel->size);
-        assert(size <= TEXUNIT_VIDEO_NUM);
    }

    int radius = size/2;
@ -3580,6 +3590,14 @@ struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log,
    p->opts = *opts;
    for (int n = 0; n < SCALER_COUNT; n++)
        p->scaler[n] = (struct scaler){.index = n};
+    // our VAO always has the vec2 position as the first element
+    MP_TARRAY_APPEND(p, p->vao, p->vao_len, (struct ra_renderpass_input) {
+        .name = "position",
+        .type = RA_VARTYPE_FLOAT,
+        .dim_v = 2,
+        .dim_m = 1,
+        .offset = 0,
+    });
    init_gl(p);
    reinit_from_options(p);
    return p;
--- a/video/out/gpu/video.h
+++ b/video/out/gpu/video.h
@ -28,10 +28,6 @@
 #include "video/csputils.h"
 #include "video/out/filter_kernels.h"

-// Assume we have this many texture units for sourcing additional passes.
-// The actual texture unit assignment is dynamic.
-#define TEXUNIT_VIDEO_NUM 6
-
 struct scaler_fun {
    char *name;
    float params[2];