diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst index 02cb4d826d..80dfdceb54 100644 --- a/DOCS/man/options.rst +++ b/DOCS/man/options.rst @@ -4329,16 +4329,26 @@ The following video options are currently all specific to ``--vo=opengl`` and should be stored in the texture, up to 4 (rgba). By default, this value is equal to the number of components in HOOKED. - COMPUTE bw bh + COMPUTE <bw> <bh> [<tw> <th>] Specifies that this shader should be treated as a compute shader, with the block size bw and bh. The compute shader will be dispatched with however many blocks are necessary to completely tile over the output. - Compute shaders in mpv are treated similarly to fragment shaders, and - are still required to produce an output color. In addition, mpv - provides a special function NAME_map(id) to map from the global ID - space to the texture coordinates for all bound textures. The only real - difference is the fact that you can use shared memory inside compute - shaders. + Within each block, there will be tw*th threads, forming a single work + group. In other words: tw and th specify the work group size, which can + be different from the block size. So for example, a compute shader with + bw, bh = 32 and tw, th = 8 running on a 500x500 texture would dispatch + 16x16 blocks (rounded up), each with 8x8 threads. + + Compute shaders in mpv are treated a bit differently from fragment + shaders. Instead of defining a ``vec4 hook`` that produces an output + sample, you directly define ``void hook`` which writes to a fixed + writeonly image unit named ``out_image`` (this is bound by mpv) using + ``imageStore``. To help translate texture coordinates in the absence of + vertices, mpv provides a special function ``NAME_map(id)`` to map from + the texel space of the output image to the texture coordinates for all + bound textures. In particular, ``NAME_pos`` is equivalent to + ``NAME_map(gl_GlobalInvocationID)``, although using this only really + makes sense if (tw,th) == (bw,bh). 
Each bound mpv texture (via ``BIND``) will make available the following definitions to that shader pass, where NAME is the name of the bound diff --git a/video/out/opengl/user_shaders.c b/video/out/opengl/user_shaders.c index 799367f3e1..58a1ac9e64 100644 --- a/video/out/opengl/user_shaders.c +++ b/video/out/opengl/user_shaders.c @@ -259,7 +259,14 @@ static bool parse_hook(struct mp_log *log, struct bstr *body, } if (bstr_eatstart0(&line, "COMPUTE")) { - if (bstr_sscanf(line, "%d %d", &out->compute_w, &out->compute_h) != 2) { + struct compute_info *ci = &out->compute; + int num = bstr_sscanf(line, "%d %d %d %d", &ci->block_w, &ci->block_h, + &ci->threads_w, &ci->threads_h); + + if (num == 2 || num == 4) { + ci->active = true; + ci->directly_writes = true; + } else { mp_err(log, "Error while parsing COMPUTE!\n"); return false; } diff --git a/video/out/opengl/user_shaders.h b/video/out/opengl/user_shaders.h index 888422608c..5f3f1d0d93 100644 --- a/video/out/opengl/user_shaders.h +++ b/video/out/opengl/user_shaders.h @@ -55,6 +55,13 @@ struct szexp { } val; }; +struct compute_info { + bool active; + int block_w, block_h; // Block size (each block corresponds to one WG) + int threads_w, threads_h; // How many threads form a working group + bool directly_writes; // If true, shader is assumed to imageStore(out_image) +}; + struct gl_user_shader_hook { struct bstr pass_desc; struct bstr hook_tex[SHADER_MAX_HOOKS]; @@ -66,8 +73,7 @@ struct gl_user_shader_hook { struct szexp height[MAX_SZEXP_SIZE]; struct szexp cond[MAX_SZEXP_SIZE]; int components; - int compute_w; - int compute_h; + struct compute_info compute; }; struct gl_user_shader_tex { diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c index 024b8d4bbe..f9f31e31cc 100644 --- a/video/out/opengl/utils.c +++ b/video/out/opengl/utils.c @@ -768,8 +768,8 @@ static const char *mp_image2D_type(GLenum access) } } -void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture, - GLuint 
iformat, GLenum access) +void gl_sc_uniform_image2D(struct gl_shader_cache *sc, const char *name, + GLuint texture, GLuint iformat, GLenum access) { gl_sc_enable_extension(sc, "GL_ARB_shader_image_load_store"); diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h index 2a15d85b71..48e139dcc7 100644 --- a/video/out/opengl/utils.h +++ b/video/out/opengl/utils.h @@ -150,8 +150,8 @@ void gl_sc_uniform_tex(struct gl_shader_cache *sc, char *name, GLenum target, void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name, struct ra_tex *tex); void gl_sc_uniform_tex_ui(struct gl_shader_cache *sc, char *name, GLuint texture); -void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture, - GLuint iformat, GLenum access); +void gl_sc_uniform_image2D(struct gl_shader_cache *sc, const char *name, + GLuint texture, GLuint iformat, GLenum access); void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, GLuint ssbo, char *format, ...) PRINTF_ATTRIBUTE(4, 5); void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, GLfloat f); diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c index b6be230b53..811c7b717b 100644 --- a/video/out/opengl/video.c +++ b/video/out/opengl/video.c @@ -262,9 +262,9 @@ struct gl_video { // temporary during rendering struct img_tex pass_tex[TEXUNIT_VIDEO_NUM]; + struct compute_info pass_compute; // compute shader metadata for this pass int pass_tex_num; int texture_w, texture_h; - int compute_w, compute_h; // presence indicates the use of a compute shader struct gl_transform texture_offset; // texture transform without rotation int components; bool use_linear; @@ -1132,26 +1132,28 @@ static void pass_prepare_src_tex(struct gl_video *p) } } -// Update the compute work group size requirements for the current shader. 
-// Since we assume that all shaders can work with bigger working groups, just -// never smaller ones, this effectively becomes the maximum of all size -// requirements -static void compute_size_minimum(struct gl_video *p, int bw, int bh) +// Sets the appropriate compute shader metadata for an implicit compute pass +// bw/bh: block size +static void pass_is_compute(struct gl_video *p, int bw, int bh) { - p->compute_w = MPMAX(p->compute_w, bw); - p->compute_h = MPMAX(p->compute_h, bh); + p->pass_compute = (struct compute_info){ + .active = true, + .block_w = bw, + .block_h = bh, + }; } // w/h: the width/height of the compute shader's operating domain (e.g. the // target target that needs to be written, or the source texture that needs to // be reduced) -// bw/bh: the width/height of the block (working group), which is tiled over -// w/h as necessary -static void dispatch_compute(struct gl_video *p, int w, int h, int bw, int bh) +static void dispatch_compute(struct gl_video *p, int w, int h, + struct compute_info info) { GL *gl = p->gl; - PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n", bw, bh); + PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n", + info.threads_w > 0 ? info.threads_w : info.block_w, + info.threads_h > 0 ? info.threads_h : info.block_h); pass_prepare_src_tex(p); gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); @@ -1188,8 +1190,8 @@ static void dispatch_compute(struct gl_video *p, int w, int h, int bw, int bh) // always round up when dividing to make sure we don't leave off a part of // the image - int num_x = (w + bw - 1) / bw, - num_y = (h + bh - 1) / bh; + int num_x = info.block_w > 0 ? (w + info.block_w - 1) / info.block_w : 1, + num_y = info.block_h > 0 ? 
(h + info.block_h - 1) / info.block_h : 1; gl->DispatchCompute(num_x, num_y, 1); gl_sc_reset(p->sc); @@ -1263,18 +1265,19 @@ static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo, { fbotex_change(dst_fbo, p->gl, p->log, w, h, p->opts.fbo_format, flags); - if (p->compute_w > 0 && p->compute_h > 0) { + if (p->pass_compute.active) { gl_sc_uniform_image2D(p->sc, "out_image", dst_fbo->texture, dst_fbo->iformat, GL_WRITE_ONLY); - GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);) - dispatch_compute(p, w, h, p->compute_w, p->compute_h); + if (!p->pass_compute.directly_writes) + GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);) + + dispatch_compute(p, w, h, p->pass_compute); p->gl->MemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT); + p->pass_compute = (struct compute_info){0}; } else { finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh, &(struct mp_rect){0, 0, w, h}); } - - p->compute_w = p->compute_h = 0; } static const char *get_tex_swizzle(struct img_tex *img) @@ -1756,7 +1759,7 @@ static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler if (shmem_req > gl->max_shmem) goto fallback; - compute_size_minimum(p, bw, bh); + pass_is_compute(p, bw, bh); pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih); return; @@ -1923,13 +1926,17 @@ static void user_hook(struct gl_video *p, struct img_tex tex, { struct gl_user_shader_hook *shader = priv; assert(shader); + load_shader(p, shader->pass_body); pass_describe(p, "user shader: %.*s (%s)", BSTR_P(shader->pass_desc), plane_names[tex.type]); - compute_size_minimum(p, shader->compute_w, shader->compute_h); - load_shader(p, shader->pass_body); - GLSLF("color = hook();\n"); + if (shader->compute.active) { + p->pass_compute = shader->compute; + GLSLF("hook();\n"); + } else { + GLSLF("color = hook();\n"); + } // Make sure we at least create a legal FBO on failure, since it's better // to do this and display an error message than just crash OpenGL @@ 
-2487,7 +2494,7 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool bool detect_peak = p->opts.compute_hdr_peak && mp_trc_is_hdr(src.gamma); if (detect_peak) { pass_describe(p, "detect HDR peak"); - compute_size_minimum(p, 8, 8); // 8x8 is good for performance + pass_is_compute(p, 8, 8); // 8x8 is good for performance if (!p->hdr_peak_ssbo) { struct { @@ -2808,7 +2815,7 @@ static void pass_draw_to_screen(struct gl_video *p, int fbo) // Since finish_pass_direct doesn't work with compute shaders, and neither // does the checkerboard/dither code, we may need an indirection via // p->screen_fbo here. - if (p->compute_w > 0 && p->compute_h > 0) { + if (p->pass_compute.active) { int o_w = p->dst_rect.x1 - p->dst_rect.x0, o_h = p->dst_rect.y1 - p->dst_rect.y0; finish_pass_fbo(p, &p->screen_fbo, o_w, o_h, FBOTEX_FUZZY);