mirror of
https://github.com/mpv-player/mpv
synced 2025-03-24 12:22:25 +00:00
vo_opengl: make compute shaders more flexible
This allows users to do their own custom sample writing, mainly meant to address use cases such as RAVU. Also clean up the compute shader code a bit.
This commit is contained in:
parent
e7d31d12be
commit
83f3910398
@ -4329,16 +4329,26 @@ The following video options are currently all specific to ``--vo=opengl`` and
|
|||||||
should be stored in the texture, up to 4 (rgba). By default, this value
|
should be stored in the texture, up to 4 (rgba). By default, this value
|
||||||
is equal to the number of components in HOOKED.
|
is equal to the number of components in HOOKED.
|
||||||
|
|
||||||
COMPUTE bw bh
|
COMPUTE <bw> <bh> [<tw> <th>]
|
||||||
Specifies that this shader should be treated as a compute shader, with
|
Specifies that this shader should be treated as a compute shader, with
|
||||||
the block size bw and bh. The compute shader will be dispatched with
|
the block size bw and bh. The compute shader will be dispatched with
|
||||||
however many blocks are necessary to completely tile over the output.
|
however many blocks are necessary to completely tile over the output.
|
||||||
Compute shaders in mpv are treated similarly to fragment shaders, and
|
Within each block, there will be tw*th threads, forming a single work
|
||||||
are still required to produce an output color. In addition, mpv
|
group. In other words: tw and th specify the work group size, which can
|
||||||
provides a special function NAME_map(id) to map from the global ID
|
be different from the block size. So for example, a compute shader with
|
||||||
space to the texture coordinates for all bound textures. The only real
|
bw, bh = 32 and tw, th = 8 running on a 500x500 texture would dispatch
|
||||||
difference is the fact that you can use shared memory inside compute
|
16x16 blocks (rounded up), each with 8x8 threads.
|
||||||
shaders.
|
|
||||||
|
Compute shaders in mpv are treated a bit differently from fragment
|
||||||
|
shaders. Instead of defining a ``vec4 hook`` that produces an output
|
||||||
|
sample, you directly define ``void hook`` which writes to a fixed
|
||||||
|
writeonly image unit named ``out_image`` (this is bound by mpv) using
|
||||||
|
``imageStore``. To help translate texture coordinates in the absence of
|
||||||
|
vertices, mpv provides a special function ``NAME_map(id)`` to map from
|
||||||
|
the texel space of the output image to the texture coordinates for all
|
||||||
|
bound textures. In particular, ``NAME_pos`` is equivalent to
|
||||||
|
``NAME_map(gl_GlobalInvocationID)``, although using this only really
|
||||||
|
makes sense if (tw,th) == (bw,bh).
|
||||||
|
|
||||||
Each bound mpv texture (via ``BIND``) will make available the following
|
Each bound mpv texture (via ``BIND``) will make available the following
|
||||||
definitions to that shader pass, where NAME is the name of the bound
|
definitions to that shader pass, where NAME is the name of the bound
|
||||||
|
@ -259,7 +259,14 @@ static bool parse_hook(struct mp_log *log, struct bstr *body,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (bstr_eatstart0(&line, "COMPUTE")) {
|
if (bstr_eatstart0(&line, "COMPUTE")) {
|
||||||
if (bstr_sscanf(line, "%d %d", &out->compute_w, &out->compute_h) != 2) {
|
struct compute_info *ci = &out->compute;
|
||||||
|
int num = bstr_sscanf(line, "%d %d %d %d", &ci->block_w, &ci->block_h,
|
||||||
|
&ci->threads_w, &ci->threads_h);
|
||||||
|
|
||||||
|
if (num == 2 || num == 4) {
|
||||||
|
ci->active = true;
|
||||||
|
ci->directly_writes = true;
|
||||||
|
} else {
|
||||||
mp_err(log, "Error while parsing COMPUTE!\n");
|
mp_err(log, "Error while parsing COMPUTE!\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -55,6 +55,13 @@ struct szexp {
|
|||||||
} val;
|
} val;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct compute_info {
|
||||||
|
bool active;
|
||||||
|
int block_w, block_h; // Block size (each block corresponds to one WG)
|
||||||
|
int threads_w, threads_h; // How many threads form a working group
|
||||||
|
bool directly_writes; // If true, shader is assumed to imageStore(out_image)
|
||||||
|
};
|
||||||
|
|
||||||
struct gl_user_shader_hook {
|
struct gl_user_shader_hook {
|
||||||
struct bstr pass_desc;
|
struct bstr pass_desc;
|
||||||
struct bstr hook_tex[SHADER_MAX_HOOKS];
|
struct bstr hook_tex[SHADER_MAX_HOOKS];
|
||||||
@ -66,8 +73,7 @@ struct gl_user_shader_hook {
|
|||||||
struct szexp height[MAX_SZEXP_SIZE];
|
struct szexp height[MAX_SZEXP_SIZE];
|
||||||
struct szexp cond[MAX_SZEXP_SIZE];
|
struct szexp cond[MAX_SZEXP_SIZE];
|
||||||
int components;
|
int components;
|
||||||
int compute_w;
|
struct compute_info compute;
|
||||||
int compute_h;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gl_user_shader_tex {
|
struct gl_user_shader_tex {
|
||||||
|
@ -768,8 +768,8 @@ static const char *mp_image2D_type(GLenum access)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture,
|
void gl_sc_uniform_image2D(struct gl_shader_cache *sc, const char *name,
|
||||||
GLuint iformat, GLenum access)
|
GLuint texture, GLuint iformat, GLenum access)
|
||||||
{
|
{
|
||||||
gl_sc_enable_extension(sc, "GL_ARB_shader_image_load_store");
|
gl_sc_enable_extension(sc, "GL_ARB_shader_image_load_store");
|
||||||
|
|
||||||
|
@ -150,8 +150,8 @@ void gl_sc_uniform_tex(struct gl_shader_cache *sc, char *name, GLenum target,
|
|||||||
void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name,
|
void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name,
|
||||||
struct ra_tex *tex);
|
struct ra_tex *tex);
|
||||||
void gl_sc_uniform_tex_ui(struct gl_shader_cache *sc, char *name, GLuint texture);
|
void gl_sc_uniform_tex_ui(struct gl_shader_cache *sc, char *name, GLuint texture);
|
||||||
void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture,
|
void gl_sc_uniform_image2D(struct gl_shader_cache *sc, const char *name,
|
||||||
GLuint iformat, GLenum access);
|
GLuint texture, GLuint iformat, GLenum access);
|
||||||
void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, GLuint ssbo,
|
void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, GLuint ssbo,
|
||||||
char *format, ...) PRINTF_ATTRIBUTE(4, 5);
|
char *format, ...) PRINTF_ATTRIBUTE(4, 5);
|
||||||
void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, GLfloat f);
|
void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, GLfloat f);
|
||||||
|
@ -262,9 +262,9 @@ struct gl_video {
|
|||||||
|
|
||||||
// temporary during rendering
|
// temporary during rendering
|
||||||
struct img_tex pass_tex[TEXUNIT_VIDEO_NUM];
|
struct img_tex pass_tex[TEXUNIT_VIDEO_NUM];
|
||||||
|
struct compute_info pass_compute; // compute shader metadata for this pass
|
||||||
int pass_tex_num;
|
int pass_tex_num;
|
||||||
int texture_w, texture_h;
|
int texture_w, texture_h;
|
||||||
int compute_w, compute_h; // presence indicates the use of a compute shader
|
|
||||||
struct gl_transform texture_offset; // texture transform without rotation
|
struct gl_transform texture_offset; // texture transform without rotation
|
||||||
int components;
|
int components;
|
||||||
bool use_linear;
|
bool use_linear;
|
||||||
@ -1132,26 +1132,28 @@ static void pass_prepare_src_tex(struct gl_video *p)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update the compute work group size requirements for the current shader.
|
// Sets the appropriate compute shader metadata for an implicit compute pass
|
||||||
// Since we assume that all shaders can work with bigger working groups, just
|
// bw/bh: block size
|
||||||
// never smaller ones, this effectively becomes the maximum of all size
|
static void pass_is_compute(struct gl_video *p, int bw, int bh)
|
||||||
// requirements
|
|
||||||
static void compute_size_minimum(struct gl_video *p, int bw, int bh)
|
|
||||||
{
|
{
|
||||||
p->compute_w = MPMAX(p->compute_w, bw);
|
p->pass_compute = (struct compute_info){
|
||||||
p->compute_h = MPMAX(p->compute_h, bh);
|
.active = true,
|
||||||
|
.block_w = bw,
|
||||||
|
.block_h = bh,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// w/h: the width/height of the compute shader's operating domain (e.g. the
|
// w/h: the width/height of the compute shader's operating domain (e.g. the
|
||||||
// target texture that needs to be written, or the source texture that needs to
|
// target texture that needs to be written, or the source texture that needs to
|
||||||
// be reduced)
|
// be reduced)
|
||||||
// bw/bh: the width/height of the block (working group), which is tiled over
|
static void dispatch_compute(struct gl_video *p, int w, int h,
|
||||||
// w/h as necessary
|
struct compute_info info)
|
||||||
static void dispatch_compute(struct gl_video *p, int w, int h, int bw, int bh)
|
|
||||||
{
|
{
|
||||||
GL *gl = p->gl;
|
GL *gl = p->gl;
|
||||||
|
|
||||||
PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n", bw, bh);
|
PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n",
|
||||||
|
info.threads_w > 0 ? info.threads_w : info.block_w,
|
||||||
|
info.threads_h > 0 ? info.threads_h : info.block_h);
|
||||||
|
|
||||||
pass_prepare_src_tex(p);
|
pass_prepare_src_tex(p);
|
||||||
gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex));
|
gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex));
|
||||||
@ -1188,8 +1190,8 @@ static void dispatch_compute(struct gl_video *p, int w, int h, int bw, int bh)
|
|||||||
|
|
||||||
// always round up when dividing to make sure we don't leave off a part of
|
// always round up when dividing to make sure we don't leave off a part of
|
||||||
// the image
|
// the image
|
||||||
int num_x = (w + bw - 1) / bw,
|
int num_x = info.block_w > 0 ? (w + info.block_w - 1) / info.block_w : 1,
|
||||||
num_y = (h + bh - 1) / bh;
|
num_y = info.block_h > 0 ? (h + info.block_h - 1) / info.block_h : 1;
|
||||||
|
|
||||||
gl->DispatchCompute(num_x, num_y, 1);
|
gl->DispatchCompute(num_x, num_y, 1);
|
||||||
gl_sc_reset(p->sc);
|
gl_sc_reset(p->sc);
|
||||||
@ -1263,18 +1265,19 @@ static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo,
|
|||||||
{
|
{
|
||||||
fbotex_change(dst_fbo, p->gl, p->log, w, h, p->opts.fbo_format, flags);
|
fbotex_change(dst_fbo, p->gl, p->log, w, h, p->opts.fbo_format, flags);
|
||||||
|
|
||||||
if (p->compute_w > 0 && p->compute_h > 0) {
|
if (p->pass_compute.active) {
|
||||||
gl_sc_uniform_image2D(p->sc, "out_image", dst_fbo->texture,
|
gl_sc_uniform_image2D(p->sc, "out_image", dst_fbo->texture,
|
||||||
dst_fbo->iformat, GL_WRITE_ONLY);
|
dst_fbo->iformat, GL_WRITE_ONLY);
|
||||||
|
if (!p->pass_compute.directly_writes)
|
||||||
GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);)
|
GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);)
|
||||||
dispatch_compute(p, w, h, p->compute_w, p->compute_h);
|
|
||||||
|
dispatch_compute(p, w, h, p->pass_compute);
|
||||||
p->gl->MemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
|
p->gl->MemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
|
||||||
|
p->pass_compute = (struct compute_info){0};
|
||||||
} else {
|
} else {
|
||||||
finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh,
|
finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh,
|
||||||
&(struct mp_rect){0, 0, w, h});
|
&(struct mp_rect){0, 0, w, h});
|
||||||
}
|
}
|
||||||
|
|
||||||
p->compute_w = p->compute_h = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char *get_tex_swizzle(struct img_tex *img)
|
static const char *get_tex_swizzle(struct img_tex *img)
|
||||||
@ -1756,7 +1759,7 @@ static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler
|
|||||||
if (shmem_req > gl->max_shmem)
|
if (shmem_req > gl->max_shmem)
|
||||||
goto fallback;
|
goto fallback;
|
||||||
|
|
||||||
compute_size_minimum(p, bw, bh);
|
pass_is_compute(p, bw, bh);
|
||||||
pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih);
|
pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih);
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@ -1923,13 +1926,17 @@ static void user_hook(struct gl_video *p, struct img_tex tex,
|
|||||||
{
|
{
|
||||||
struct gl_user_shader_hook *shader = priv;
|
struct gl_user_shader_hook *shader = priv;
|
||||||
assert(shader);
|
assert(shader);
|
||||||
|
load_shader(p, shader->pass_body);
|
||||||
|
|
||||||
pass_describe(p, "user shader: %.*s (%s)", BSTR_P(shader->pass_desc),
|
pass_describe(p, "user shader: %.*s (%s)", BSTR_P(shader->pass_desc),
|
||||||
plane_names[tex.type]);
|
plane_names[tex.type]);
|
||||||
|
|
||||||
compute_size_minimum(p, shader->compute_w, shader->compute_h);
|
if (shader->compute.active) {
|
||||||
load_shader(p, shader->pass_body);
|
p->pass_compute = shader->compute;
|
||||||
|
GLSLF("hook();\n");
|
||||||
|
} else {
|
||||||
GLSLF("color = hook();\n");
|
GLSLF("color = hook();\n");
|
||||||
|
}
|
||||||
|
|
||||||
// Make sure we at least create a legal FBO on failure, since it's better
|
// Make sure we at least create a legal FBO on failure, since it's better
|
||||||
// to do this and display an error message than just crash OpenGL
|
// to do this and display an error message than just crash OpenGL
|
||||||
@ -2487,7 +2494,7 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
|
|||||||
bool detect_peak = p->opts.compute_hdr_peak && mp_trc_is_hdr(src.gamma);
|
bool detect_peak = p->opts.compute_hdr_peak && mp_trc_is_hdr(src.gamma);
|
||||||
if (detect_peak) {
|
if (detect_peak) {
|
||||||
pass_describe(p, "detect HDR peak");
|
pass_describe(p, "detect HDR peak");
|
||||||
compute_size_minimum(p, 8, 8); // 8x8 is good for performance
|
pass_is_compute(p, 8, 8); // 8x8 is good for performance
|
||||||
|
|
||||||
if (!p->hdr_peak_ssbo) {
|
if (!p->hdr_peak_ssbo) {
|
||||||
struct {
|
struct {
|
||||||
@ -2808,7 +2815,7 @@ static void pass_draw_to_screen(struct gl_video *p, int fbo)
|
|||||||
// Since finish_pass_direct doesn't work with compute shaders, and neither
|
// Since finish_pass_direct doesn't work with compute shaders, and neither
|
||||||
// does the checkerboard/dither code, we may need an indirection via
|
// does the checkerboard/dither code, we may need an indirection via
|
||||||
// p->screen_fbo here.
|
// p->screen_fbo here.
|
||||||
if (p->compute_w > 0 && p->compute_h > 0) {
|
if (p->pass_compute.active) {
|
||||||
int o_w = p->dst_rect.x1 - p->dst_rect.x0,
|
int o_w = p->dst_rect.x1 - p->dst_rect.x0,
|
||||||
o_h = p->dst_rect.y1 - p->dst_rect.y0;
|
o_h = p->dst_rect.y1 - p->dst_rect.y0;
|
||||||
finish_pass_fbo(p, &p->screen_fbo, o_w, o_h, FBOTEX_FUZZY);
|
finish_pass_fbo(p, &p->screen_fbo, o_w, o_h, FBOTEX_FUZZY);
|
||||||
|
Loading…
Reference in New Issue
Block a user