vo_opengl: move timers to struct ra

In order to prevent code duplication and keep the ra abstraction as
small as possible, `ra` only implements the actual timer queries;
it does not do pooling/averaging of the results. That logic is instead
moved to a ra-neutral struct timer_pool in utils.c.
This commit is contained in:
Niklas Haas 2017-08-05 18:20:45 +02:00
parent a680c643eb
commit f2298f394e
9 changed files with 247 additions and 168 deletions

View File

@ -61,9 +61,10 @@
#define GL_TEXTURE_RECTANGLE 0x84F5
// --- GL 3.3
// --- GL 3.3 or GL_ARB_timer_query
#define GL_TIME_ELAPSED 0x88BF
#define GL_TIMESTAMP 0x8E28
// --- GL 4.3 or GL_ARB_debug_output

View File

@ -282,125 +282,6 @@ void gl_set_debug_logger(GL *gl, struct mp_log *log)
gl->DebugMessageCallback(log ? gl_debug_cb : NULL, log);
}
// Maximum number of simultaneous query objects to keep around. Reducing this
// number might cause rendering to block until the result of a previous query is
// available
#define QUERY_OBJECT_NUM 8
// State for one GL timer: a small ring of query objects plus a ring buffer of
// the results read back from them, with running statistics.
struct gl_timer {
// GL function table used for all query calls made on behalf of this timer
GL *gl;
// Ring of query object names; gl_timer_start() cycles through them
GLuint query[QUERY_OBJECT_NUM];
// Index into query[] of the object the next gl_timer_start() will use
int query_idx;
// these numbers are all in nanoseconds
// Ring buffer of recorded results
uint64_t samples[PERF_SAMPLE_COUNT];
// Index into samples[] of the slot the next result is written to
int sample_idx;
// Number of results recorded so far, capped at PERF_SAMPLE_COUNT
int sample_count;
// Running sum of the values currently held in samples[]
uint64_t avg_sum;
// Largest value currently held in samples[]
uint64_t peak;
};
// Return a snapshot of the statistics gathered by this timer so far.
//
// The returned .samples pointer aliases timer->samples. .index is the position
// of the oldest valid sample in that ring buffer and .count the number of
// valid samples. .avg is left at 0 while no sample has been recorded.
struct mp_pass_perf gl_timer_measure(struct gl_timer *timer)
{
    assert(timer);

    // sample_idx is the *next* write slot, so it may be 0 or smaller than
    // sample_count once the ring has wrapped. C's % keeps the sign of the
    // dividend, so bias by PERF_SAMPLE_COUNT first; otherwise the result can
    // be negative and index in front of the buffer.
    int oldest = (timer->sample_idx - timer->sample_count + PERF_SAMPLE_COUNT)
                 % PERF_SAMPLE_COUNT;
    int newest = (timer->sample_idx - 1 + PERF_SAMPLE_COUNT)
                 % PERF_SAMPLE_COUNT;

    struct mp_pass_perf res = {
        .count = timer->sample_count,
        .index = oldest,
        .peak = timer->peak,
        .samples = timer->samples,
    };

    res.last = timer->samples[newest];

    if (timer->sample_count > 0)
        res.avg = timer->avg_sum / timer->sample_count;

    return res;
}
// Allocate a timer bound to the given GL context. If the context exposes
// GenQueries, the query ring is populated; otherwise the names stay zero.
struct gl_timer *gl_timer_create(GL *gl)
{
    struct gl_timer *t = talloc_ptrtype(NULL, t);
    *t = (struct gl_timer){ .gl = gl };

    if (!gl->GenQueries)
        return t;

    gl->GenQueries(QUERY_OBJECT_NUM, t->query);
    return t;
}
// Release a timer and its query objects. Passing NULL is allowed.
void gl_timer_free(struct gl_timer *timer)
{
    if (timer) {
        GL *ctx = timer->gl;

        // this is a no-op on already uninitialized queries
        if (ctx && ctx->DeleteQueries)
            ctx->DeleteQueries(QUERY_OBJECT_NUM, timer->query);

        talloc_free(timer);
    }
}
// Push one result into the sample ring buffer and keep the running sum,
// sample count and peak consistent with the buffer contents.
static void gl_timer_record(struct gl_timer *timer, GLuint64 new)
{
    // Replace the value in the current slot, remembering what it held
    int slot = timer->sample_idx;
    uint64_t evicted = timer->samples[slot];
    timer->samples[slot] = new;
    timer->sample_idx = (slot + 1) % PERF_SAMPLE_COUNT;

    // The sum tracks exactly the values stored in the buffer
    timer->avg_sum += new - evicted;
    timer->sample_count = MPMIN(timer->sample_count + 1, PERF_SAMPLE_COUNT);

    if (new >= timer->peak) {
        timer->peak = new;
        return;
    }

    if (timer->peak != evicted)
        return;

    // The value we just dropped may have been the peak; rescan the whole
    // buffer to find the current maximum
    uint64_t highest = new;
    for (int n = 0; n < PERF_SAMPLE_COUNT; n++)
        highest = MPMAX(highest, timer->samples[n]);
    timer->peak = highest;
}
// Begin a GL_TIME_ELAPSED measurement on the next query object in the ring.
// If that object has been used before, its result is fetched and recorded
// first; fetching can block if the result is not available yet. This shouldn't
// happen in practice (if it does, consider increasing QUERY_OBJECT_NUM).
// IMPORTANT: only one gl_timer object may ever be active at a single time.
// The calling code *MUST* ensure this.
void gl_timer_start(struct gl_timer *timer)
{
    assert(timer);

    GL *gl = timer->gl;
    if (!gl->BeginQuery)
        return;

    // Take the next query object and advance the ring
    int slot = timer->query_idx;
    GLuint query = timer->query[slot];
    timer->query_idx = (slot + 1) % QUERY_OBJECT_NUM;

    // A query object that is already live holds an older result; read it
    // out and record it before reusing the object
    if (gl->IsQuery(query)) {
        GLuint64 elapsed;
        gl->GetQueryObjectui64v(query, GL_QUERY_RESULT, &elapsed);
        gl_timer_record(timer, elapsed);
    }

    gl->BeginQuery(GL_TIME_ELAPSED, query);
}
// End the GL_TIME_ELAPSED query that is currently active on this context.
// Does nothing when the context has no EndQuery entry point.
void gl_timer_stop(GL *gl)
{
    if (!gl->EndQuery)
        return;

    gl->EndQuery(GL_TIME_ELAPSED);
}
// Upload a texture, going through a PBO. PBO supposedly can facilitate
// asynchronous copy from CPU to GPU, so this is an optimization. Note that
// changing format/type/tex_w/tex_h or reusing the PBO in the same frame can

View File

@ -55,14 +55,6 @@ void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num);
void gl_set_debug_logger(GL *gl, struct mp_log *log);
struct gl_timer;
struct gl_timer *gl_timer_create(GL *gl);
void gl_timer_free(struct gl_timer *timer);
void gl_timer_start(struct gl_timer *timer);
void gl_timer_stop(GL *gl);
struct mp_pass_perf gl_timer_measure(struct gl_timer *timer);
#define NUM_PBO_BUFFERS 3
struct gl_pbo_upload {

View File

@ -264,6 +264,10 @@ enum {
RA_TEX_UPLOAD_DISCARD = 1 << 0, // discard pre-existing data not in the region
};
// This is an opaque type provided by the implementation, but we want to at
// least give it a saner name than void* for code readability purposes.
typedef void ra_timer;
// Rendering API entrypoints. (Note: there are some additional hidden features
// you need to take care of. For example, hwdec mapping will be provided
// separately from ra, but might need to call into ra private code.)
@ -347,6 +351,24 @@ struct ra_fns {
// This is an extremely common operation.
void (*renderpass_run)(struct ra *ra,
const struct ra_renderpass_run_params *params);
// Create a timer object. Returns NULL on failure, or if timers are
// unavailable.
ra_timer *(*timer_create)(struct ra *ra);
void (*timer_destroy)(struct ra *ra, ra_timer *timer);
// Start recording a timer. Note that valid usage requires you to pair
// every start with a stop. Trying to start a timer twice, or trying to
// stop a timer before having started it, constitutes invalid usage.
void (*timer_start)(struct ra *ra, ra_timer *timer);
// Stop recording a timer. This also returns any results that have been
// measured since the last usage of this ra_timer. It's important to note
// that GPU timer measurements are asynchronous, so this function does not
// always produce a value - and the values it does produce are typically
// delayed by a few frames. When no value is available, this returns 0.
uint64_t (*timer_stop)(struct ra *ra, ra_timer *timer);
};
struct ra_tex *ra_tex_create(struct ra *ra, const struct ra_tex_params *params);

View File

@ -840,6 +840,75 @@ static void gl_renderpass_run(struct ra *ra,
pass_gl->first_run = false;
}
// Timers in GL use query objects, and are asynchronous. So pool a few of
// these together. GL_QUERY_OBJECT_NUM should be large enough to avoid this
// ever blocking. We can afford to throw query objects around, there's no
// practical limit on them and their overhead is small.
#define GL_QUERY_OBJECT_NUM 8
// GL implementation of ra_timer: a ring of GL_TIMESTAMP query pairs.
struct gl_timer {
// Query objects that receive the timestamp issued by gl_timer_start()
GLuint start[GL_QUERY_OBJECT_NUM];
// Query objects that receive the timestamp issued by gl_timer_stop()
GLuint stop[GL_QUERY_OBJECT_NUM];
// Slot in start[]/stop[] used by the current measurement; advanced (modulo
// GL_QUERY_OBJECT_NUM) at the end of gl_timer_stop()
int idx;
// stop - start of the slot that gl_timer_start() read back before reusing
// it, or 0 if nothing was read; this is what gl_timer_stop() returns
uint64_t result;
};
static ra_timer *gl_timer_create(struct ra *ra)
{
GL *gl = ra_gl_get(ra);
if (!gl->GenQueries)
return NULL;
struct gl_timer *timer = talloc_zero(NULL, struct gl_timer);
gl->GenQueries(GL_QUERY_OBJECT_NUM, timer->start);
gl->GenQueries(GL_QUERY_OBJECT_NUM, timer->stop);
return (ra_timer *)timer;
}
// Destroy a GL timer object (ra_fns.timer_destroy). NULL is accepted and
// ignored.
static void gl_timer_destroy(struct ra *ra, ra_timer *ratimer)
{
    struct gl_timer *timer = ratimer;

    if (timer) {
        GL *gl = ra_gl_get(ra);

        gl->DeleteQueries(GL_QUERY_OBJECT_NUM, timer->start);
        gl->DeleteQueries(GL_QUERY_OBJECT_NUM, timer->stop);
        talloc_free(timer);
    }
}
// Start a measurement (ra_fns.timer_start): read back whatever the current
// slot still holds from its previous use, then issue the start timestamp.
static void gl_timer_start(struct ra *ra, ra_timer *ratimer)
{
    GL *gl = ra_gl_get(ra);
    struct gl_timer *timer = ratimer;

    GLuint q_start = timer->start[timer->idx];
    GLuint q_stop = timer->stop[timer->idx];

    // If this query object already contains a result, we need to retrieve it
    // before the slot gets overwritten; otherwise report "no value" (0)
    uint64_t elapsed = 0;
    if (gl->IsQuery(q_start)) {
        uint64_t t_begin = 0, t_end = 0;
        gl->GetQueryObjectui64v(q_start, GL_QUERY_RESULT, &t_begin);
        gl->GetQueryObjectui64v(q_stop, GL_QUERY_RESULT, &t_end);
        elapsed = t_end - t_begin;
    }
    timer->result = elapsed;

    gl->QueryCounter(q_start, GL_TIMESTAMP);
}
// Stop a measurement (ra_fns.timer_stop): issue the stop timestamp for the
// current slot, move on to the next slot, and return the value that the
// matching gl_timer_start() call read back (0 if there was none).
static uint64_t gl_timer_stop(struct ra *ra, ra_timer *ratimer)
{
    GL *gl = ra_gl_get(ra);
    struct gl_timer *timer = ratimer;

    int slot = timer->idx;
    timer->idx = (slot + 1) % GL_QUERY_OBJECT_NUM;
    gl->QueryCounter(timer->stop[slot], GL_TIMESTAMP);

    return timer->result;
}
static struct ra_fns ra_fns_gl = {
.destroy = gl_destroy,
.tex_create = gl_tex_create,
@ -853,4 +922,8 @@ static struct ra_fns ra_fns_gl = {
.renderpass_create = gl_renderpass_create,
.renderpass_destroy = gl_renderpass_destroy,
.renderpass_run = gl_renderpass_run,
.timer_create = gl_timer_create,
.timer_destroy = gl_timer_destroy,
.timer_start = gl_timer_start,
.timer_stop = gl_timer_stop,
};

View File

@ -16,6 +16,7 @@
#include "shader_cache.h"
#include "formats.h"
#include "ra_gl.h"
#include "utils.h"
// Force cache flush if more than this number of shaders is created.
#define SC_MAX_ENTRIES 48
@ -42,7 +43,7 @@ struct sc_entry {
struct sc_cached_uniform *cached_uniforms;
int num_cached_uniforms;
bstr total;
struct gl_timer *timer;
struct timer_pool *timer;
};
struct gl_shader_cache {
@ -108,11 +109,6 @@ struct gl_shader_cache *gl_sc_create(struct ra *ra, struct mpv_global *global,
// Unbind all GL state managed by sc - the current program and texture units.
static void gl_sc_reset(struct gl_shader_cache *sc)
{
GL *gl = sc->gl;
if (sc->needs_reset)
gl_timer_stop(gl);
sc->prelude_text.len = 0;
sc->header_text.len = 0;
sc->text.len = 0;
@ -135,7 +131,7 @@ static void sc_flush_cache(struct gl_shader_cache *sc)
struct sc_entry *e = sc->entries[n];
if (e->pass)
sc->ra->fns->renderpass_destroy(sc->ra, e->pass);
gl_timer_free(e->timer);
timer_pool_destroy(e->timer);
talloc_free(e);
}
sc->num_entries = 0;
@ -541,12 +537,7 @@ static void add_uniforms(struct gl_shader_cache *sc, bstr *dst)
// 1. Unbind the program and all textures.
// 2. Reset the sc state and prepare for a new shader program. (All uniforms
// and fragment operations needed for the next program have to be re-added.)
// The return value is a mp_pass_perf containing performance metrics for the
// execution of the generated shader. (Note: execution is measured up until
// the corresponding gl_sc_reset call)
// 'type' must be valid
static struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc,
enum ra_renderpass_type type)
static void gl_sc_generate(struct gl_shader_cache *sc, enum ra_renderpass_type type)
{
int glsl_version = sc->ra->glsl_version;
int glsl_es = sc->ra->glsl_es ? glsl_version : 0;
@ -703,7 +694,7 @@ static struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc,
entry = talloc_ptrtype(NULL, entry);
*entry = (struct sc_entry){
.total = bstrdup(entry, *hash_total),
.timer = gl_timer_create(sc->gl),
.timer = timer_pool_create(sc->ra),
};
for (int n = 0; n < sc->num_uniforms; n++) {
struct sc_cached_uniform u = {0};
@ -716,7 +707,7 @@ static struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc,
MP_TARRAY_APPEND(sc, sc->entries, sc->num_entries, entry);
}
if (!entry->pass)
return (struct mp_pass_perf){0}; // not sure what to return?
return;
assert(sc->num_uniforms == entry->num_cached_uniforms);
assert(sc->num_uniforms == entry->pass->params.num_inputs);
@ -725,20 +716,21 @@ static struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc,
for (int n = 0; n < sc->num_uniforms; n++)
update_uniform(sc, entry, &sc->uniforms[n], n);
gl_timer_start(entry->timer);
sc->current_shader = entry;
return gl_timer_measure(entry->timer);
}
struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
struct ra_tex *target,
void *ptr, size_t num)
{
struct mp_pass_perf perf = gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER);
struct timer_pool *timer = NULL;
gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER);
if (!sc->current_shader)
goto error;
timer = sc->current_shader->timer;
struct mp_rect full_rc = {0, 0, target->params.w, target->params.h};
struct ra_renderpass_run_params run = {
@ -752,20 +744,26 @@ struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
.scissors = full_rc,
};
timer_pool_start(timer);
sc->ra->fns->renderpass_run(sc->ra, &run);
timer_pool_stop(timer);
error:
gl_sc_reset(sc);
return perf;
return timer_pool_measure(timer);
}
struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc,
int w, int h, int d)
{
struct mp_pass_perf perf = gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE);
struct timer_pool *timer = NULL;
gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE);
if (!sc->current_shader)
goto error;
timer = sc->current_shader->timer;
struct ra_renderpass_run_params run = {
.pass = sc->current_shader->pass,
.values = sc->values,
@ -773,9 +771,11 @@ struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc,
.compute_groups = {w, h, d},
};
timer_pool_start(timer);
sc->ra->fns->renderpass_run(sc->ra, &run);
timer_pool_stop(timer);
error:
gl_sc_reset(sc);
return perf;
return timer_pool_measure(timer);
}

View File

@ -1,4 +1,5 @@
#include "common/msg.h"
#include "video/out/vo.h"
#include "utils.h"
// Standard parallel 2D projection, except y1 < y0 means that the coordinate
@ -118,3 +119,105 @@ void fbotex_uninit(struct fbotex *fbo)
*fbo = (struct fbotex) {0};
}
}
// Wraps one ra_timer and accumulates the values it returns into a ring buffer
// with running statistics.
struct timer_pool {
// Rendering API instance whose fns->timer_* callbacks drive the timer
struct ra *ra;
// Backend timer object created by ra->fns->timer_create()
ra_timer *timer;
bool running; // detect invalid usage
// Ring buffer of the non-zero results returned by ra->fns->timer_stop()
uint64_t samples[PERF_SAMPLE_COUNT];
// Index into samples[] of the slot the next result is written to
int sample_idx;
// Number of results recorded so far, capped at PERF_SAMPLE_COUNT
int sample_count;
// Running sum of the values currently held in samples[]
uint64_t avg_sum;
// Largest value currently held in samples[]
uint64_t peak;
};
// Create a timer pool on top of a freshly created ra_timer. Returns NULL if
// the backend cannot provide a timer or the pool cannot be allocated.
struct timer_pool *timer_pool_create(struct ra *ra)
{
    ra_timer *backend = ra->fns->timer_create(ra);
    if (!backend)
        return NULL;

    struct timer_pool *pool = talloc(NULL, struct timer_pool);
    if (pool) {
        // Everything except the two handles starts out zeroed
        *pool = (struct timer_pool){ .ra = ra, .timer = backend };
        return pool;
    }

    // Don't leak the backend timer if the pool itself could not be allocated
    ra->fns->timer_destroy(ra, backend);
    return NULL;
}
void timer_pool_destroy(struct timer_pool *pool)
{
if (!pool)
return;
pool->ra->fns->timer_destroy(pool->ra, pool->timer);
talloc_free(pool);
}
void timer_pool_start(struct timer_pool *pool)
{
if (!pool)
return;
assert(!pool->running);
pool->ra->fns->timer_start(pool->ra, pool->timer);
pool->running = true;
}
// Stop the running measurement and, if the backend handed back a value, push
// it into the sample ring buffer and update sum, count and peak. A NULL pool
// is silently ignored; stopping a pool that is not running trips an assertion.
void timer_pool_stop(struct timer_pool *pool)
{
    if (!pool)
        return;

    assert(pool->running);
    uint64_t sample = pool->ra->fns->timer_stop(pool->ra, pool->timer);
    pool->running = false;

    // 0 means the backend had no result to report for this call
    if (!sample)
        return;

    // Replace the value in the current slot, remembering what it held
    int slot = pool->sample_idx;
    uint64_t evicted = pool->samples[slot];
    pool->samples[slot] = sample;
    pool->sample_idx = (slot + 1) % PERF_SAMPLE_COUNT;

    // The sum tracks exactly the values stored in the buffer
    pool->avg_sum += sample - evicted;
    pool->sample_count = MPMIN(pool->sample_count + 1, PERF_SAMPLE_COUNT);

    if (sample >= pool->peak) {
        pool->peak = sample;
        return;
    }

    if (pool->peak != evicted)
        return;

    // The value we just dropped may have been the peak; rescan the whole
    // buffer to find the current maximum
    uint64_t highest = sample;
    for (int n = 0; n < PERF_SAMPLE_COUNT; n++)
        highest = MPMAX(highest, pool->samples[n]);
    pool->peak = highest;
}
// Return a snapshot of the statistics gathered by this pool so far.
//
// A NULL pool yields an all-zero mp_pass_perf. Otherwise the returned .samples
// pointer aliases pool->samples, .index is the position of the oldest valid
// sample in that ring buffer and .count the number of valid samples. .avg is
// left at 0 while no sample has been recorded.
struct mp_pass_perf timer_pool_measure(struct timer_pool *pool)
{
    if (!pool)
        return (struct mp_pass_perf){0};

    // sample_idx is the *next* write slot, so it may be 0 or smaller than
    // sample_count once the ring has wrapped. C's % keeps the sign of the
    // dividend, so bias by PERF_SAMPLE_COUNT first; otherwise the result can
    // be negative and index in front of the buffer.
    int oldest = (pool->sample_idx - pool->sample_count + PERF_SAMPLE_COUNT)
                 % PERF_SAMPLE_COUNT;
    int newest = (pool->sample_idx - 1 + PERF_SAMPLE_COUNT)
                 % PERF_SAMPLE_COUNT;

    struct mp_pass_perf res = {
        .count = pool->sample_count,
        .index = oldest,
        .peak = pool->peak,
        .samples = pool->samples,
    };

    res.last = pool->samples[newest];

    if (pool->sample_count > 0)
        res.avg = pool->avg_sum / pool->sample_count;

    return res;
}

View File

@ -77,3 +77,12 @@ bool fbotex_change(struct fbotex *fbo, struct ra *ra, struct mp_log *log,
#define FBOTEX_FUZZY_W 1
#define FBOTEX_FUZZY_H 2
#define FBOTEX_FUZZY (FBOTEX_FUZZY_W | FBOTEX_FUZZY_H)
// A wrapper around ra_timer that does result pooling, averaging etc.
struct timer_pool;
struct timer_pool *timer_pool_create(struct ra *ra);
void timer_pool_destroy(struct timer_pool *pool);
void timer_pool_start(struct timer_pool *pool);
void timer_pool_stop(struct timer_pool *pool);
struct mp_pass_perf timer_pool_measure(struct timer_pool *pool);

View File

@ -269,8 +269,8 @@ struct gl_video {
struct pass_info pass_redraw[PASS_INFO_MAX];
struct pass_info *pass;
int pass_idx;
struct gl_timer *upload_timer;
struct gl_timer *blit_timer;
struct timer_pool *upload_timer;
struct timer_pool *blit_timer;
// intermediate textures
struct saved_tex saved_tex[SHADER_MAX_SAVED];
@ -3097,11 +3097,11 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
rc.y1 = -p->vp_h - p->dst_rect.y0;
rc.y0 = -p->vp_h - p->dst_rect.y1;
}
gl_timer_start(p->blit_timer);
timer_pool_start(p->blit_timer);
p->ra->fns->blit(p->ra, target, p->output_fbo.tex,
rc.x0, rc.y0, &rc);
gl_timer_stop(gl);
pass_record(p, gl_timer_measure(p->blit_timer));
timer_pool_stop(p->blit_timer);
pass_record(p, timer_pool_measure(p->blit_timer));
}
}
}
@ -3233,7 +3233,6 @@ static void reinterleave_vdpau(struct gl_video *p, struct gl_hwdec_frame *frame,
// Returns false on failure.
static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t id)
{
GL *gl = p->gl;
struct video_image *vimg = &p->image;
if (vimg->id == id)
@ -3255,10 +3254,10 @@ static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t
struct gl_hwdec_frame gl_frame = {0};
pass_describe(p, "map frame (hwdec)");
gl_timer_start(p->upload_timer);
timer_pool_start(p->upload_timer);
bool ok = p->hwdec->driver->map_frame(p->hwdec, vimg->mpi, &gl_frame) >= 0;
gl_timer_stop(gl);
pass_record(p, gl_timer_measure(p->upload_timer));
timer_pool_stop(p->upload_timer);
pass_record(p, timer_pool_measure(p->upload_timer));
vimg->hwdec_mapped = true;
if (ok) {
@ -3290,7 +3289,7 @@ static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t
// Software decoding
assert(mpi->num_planes == p->plane_count);
gl_timer_start(p->upload_timer);
timer_pool_start(p->upload_timer);
for (int n = 0; n < p->plane_count; n++) {
struct texplane *plane = &vimg->planes[n];
@ -3310,10 +3309,10 @@ static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t
MP_VERBOSE(p, "DR enabled: %s\n", p->using_dr_path ? "yes" : "no");
}
}
gl_timer_stop(gl);
timer_pool_stop(p->upload_timer);
const char *mode = p->using_dr_path ? "DR" : p->opts.pbo ? "PBO" : "naive";
pass_describe(p, "upload frame (%s)", mode);
pass_record(p, gl_timer_measure(p->upload_timer));
pass_record(p, timer_pool_measure(p->upload_timer));
return true;
@ -3488,12 +3487,10 @@ static void check_gl_features(struct gl_video *p)
static void init_gl(struct gl_video *p)
{
GL *gl = p->gl;
debug_check_gl(p, "before init_gl");
p->upload_timer = gl_timer_create(gl);
p->blit_timer = gl_timer_create(gl);
p->upload_timer = timer_pool_create(p->ra);
p->blit_timer = timer_pool_create(p->ra);
debug_check_gl(p, "after init_gl");
@ -3515,8 +3512,9 @@ void gl_video_uninit(struct gl_video *p)
ra_tex_free(p->ra, &p->lut_3d_texture);
gl->DeleteBuffers(1, &p->hdr_peak_ssbo);
gl_timer_free(p->upload_timer);
gl_timer_free(p->blit_timer);
timer_pool_destroy(p->upload_timer);
timer_pool_destroy(p->blit_timer);
for (int i = 0; i < PASS_INFO_MAX; i++) {
talloc_free(p->pass_fresh[i].desc.start);
talloc_free(p->pass_redraw[i].desc.start);