vo_opengl: add time queries

To avoid blocking the CPU, we use 8 timer query objects and rotate through
them, blocking only at the last possible moment (right before we need to
reuse a query object on the next pass through the ring buffer). I tested
it out on my machine and 4 query objects were enough to guarantee
block-free querying, but the extra margin shouldn't hurt.

Frame render times are simply printed at the end of each frame, via MP_DBG.
This might be improved in the future. (In particular, I want to expose
these numbers as properties so that users get some more visible feedback
about render times.)

Currently, we measure pass_render_frame and pass_draw_to_screen
separately because the former might be called multiple times due to
interpolation. Doing it this way gives more faithful numbers. Same goes
for frame upload times.
This commit is contained in:
Niklas Haas 2016-06-05 21:55:30 +02:00 committed by wm4
parent 88b584656d
commit 8ceb935bd8
5 changed files with 208 additions and 0 deletions

View File

@ -273,6 +273,23 @@ static const struct gl_functions gl_functions[] = {
{0}
},
},
{
.ver_core = 330,
.extension = "GL_ARB_timer_query",
.functions = (const struct gl_function[]) {
DEF_FN(GenQueries),
DEF_FN(DeleteQueries),
DEF_FN(BeginQuery),
DEF_FN(EndQuery),
DEF_FN(QueryCounter),
DEF_FN(IsQuery),
DEF_FN(GetQueryObjectiv),
DEF_FN(GetQueryObjecti64v),
DEF_FN(GetQueryObjectuiv),
DEF_FN(GetQueryObjectui64v),
{0}
},
},
{
.ver_core = 430,
.ver_es_core = 300,

View File

@ -186,6 +186,17 @@ struct GL {
GLenum (GLAPIENTRY *ClientWaitSync)(GLsync, GLbitfield, GLuint64);
void (GLAPIENTRY *DeleteSync)(GLsync sync);
void (GLAPIENTRY *GenQueries)(GLsizei, GLuint *);
void (GLAPIENTRY *DeleteQueries)(GLsizei, const GLuint *);
void (GLAPIENTRY *BeginQuery)(GLenum, GLuint);
void (GLAPIENTRY *EndQuery)(GLenum);
void (GLAPIENTRY *QueryCounter)(GLuint, GLenum);
GLboolean (GLAPIENTRY *IsQuery)(GLuint);
void (GLAPIENTRY *GetQueryObjectiv)(GLuint, GLenum, GLint *);
void (GLAPIENTRY *GetQueryObjecti64v)(GLuint, GLenum, GLint64 *);
void (GLAPIENTRY *GetQueryObjectuiv)(GLuint, GLenum, GLuint *);
void (GLAPIENTRY *GetQueryObjectui64v)(GLuint, GLenum, GLuint64 *);
void (GLAPIENTRY *VDPAUInitNV)(const GLvoid *, const GLvoid *);
void (GLAPIENTRY *VDPAUFiniNV)(void);
GLvdpauSurfaceNV (GLAPIENTRY *VDPAURegisterOutputSurfaceNV)

View File

@ -1019,3 +1019,129 @@ void gl_sc_gen_shader_and_reset(struct gl_shader_cache *sc)
gl_sc_reset(sc);
}
// Maximum number of simultaneous query objects to keep around per timer.
// They are used as a ring: a query object is only read back when it is about
// to be reused. Reducing this number might cause rendering to block until the
// result of a previous query is available.
#define QUERY_OBJECT_NUM 8
// How many samples to keep around, for the sake of average and peak
// calculations. This corresponds to a few seconds of playback (the exact
// duration varies, since one sample is recorded per completed query).
#define QUERY_SAMPLE_SIZE 256
// State of one GPU timer: a ring of GL query objects plus a ring buffer of
// the most recent results and running statistics over them.
struct gl_timer {
// GL function table used for all query calls
GL *gl;
// Ring of query object names, filled by GenQueries (if available)
GLuint query[QUERY_OBJECT_NUM];
// Index into query[] of the next query object to use
int query_idx;
// Ring buffer of recorded results; unused slots are zero
GLuint64 samples[QUERY_SAMPLE_SIZE];
// Index into samples[] where the next result will be written
int sample_idx;
// Number of valid entries in samples[], capped at QUERY_SAMPLE_SIZE
int sample_count;
// Sum of all values currently stored in samples[]
uint64_t avg_sum;
// Largest value currently tracked as the peak over samples[]
uint64_t peak;
};
// Reports how many samples the timer currently holds. This is 0 until the
// first result has been recorded and never exceeds QUERY_SAMPLE_SIZE.
int gl_timer_sample_count(struct gl_timer *timer)
{
    int held = timer->sample_count;
    return held;
}
// Returns the most recently recorded sample divided by 1000 (the stored
// values are raw query results; presumably nanoseconds, yielding
// microseconds). Returns 0 if nothing has been recorded yet, since unused
// sample slots are zero-initialized.
uint64_t gl_timer_last_us(struct gl_timer *timer)
{
    // sample_idx is the slot the *next* sample will be written to, so the
    // latest sample lives in the slot just before it. QUERY_SAMPLE_SIZE is
    // added before taking the modulo so the index stays non-negative when
    // sample_idx is 0: in C, (-1 % N) evaluates to -1, which would index
    // samples[-1] and read out of bounds.
    int last = (timer->sample_idx + QUERY_SAMPLE_SIZE - 1) % QUERY_SAMPLE_SIZE;
    return timer->samples[last] / 1000;
}
// Returns the mean of all currently held samples, divided by 1000.
// Yields 0 while no samples have been recorded.
uint64_t gl_timer_avg_us(struct gl_timer *timer)
{
    int count = timer->sample_count;
    // Nothing recorded yet: avoid dividing by zero
    return count > 0 ? timer->avg_sum / count / 1000 : 0;
}
// Returns the tracked peak sample value, divided by 1000.
uint64_t gl_timer_peak_us(struct gl_timer *timer)
{
    uint64_t peak = timer->peak;
    return peak / 1000;
}
// Allocates a new timer bound to the given GL function table. All statistics
// start out zeroed. If the GL context provides GenQueries, the ring of query
// objects is generated right away; otherwise the query names stay zero.
// The returned object must be released with gl_timer_free().
struct gl_timer *gl_timer_create(GL *gl)
{
    struct gl_timer *timer = talloc_ptrtype(NULL, timer);
    *timer = (struct gl_timer){ .gl = gl };

    // Without query support there is nothing more to set up
    if (!gl->GenQueries)
        return timer;

    gl->GenQueries(QUERY_OBJECT_NUM, timer->query);
    return timer;
}
// Destroys a timer created by gl_timer_create(), deleting its query objects
// (when the GL context supports that) and freeing the allocation.
// Passing NULL is allowed and does nothing.
void gl_timer_free(struct gl_timer *timer)
{
    if (timer) {
        GL *gl = timer->gl;

        // Query names that were never generated are simply ignored by the
        // delete call, so no per-entry bookkeeping is needed here
        if (gl && gl->DeleteQueries)
            gl->DeleteQueries(QUERY_OBJECT_NUM, timer->query);

        talloc_free(timer);
    }
}
// Pushes a new query result into the sample ring buffer, replacing the oldest
// entry, and keeps the running sum, sample count and peak consistent with the
// buffer contents.
static void gl_timer_record(struct gl_timer *timer, GLuint64 new)
{
    // Store the new value, remembering the one it evicts
    int slot = timer->sample_idx;
    GLuint64 evicted = timer->samples[slot];
    timer->samples[slot] = new;
    timer->sample_idx = (slot + 1) % QUERY_SAMPLE_SIZE;

    // The running sum gains the new value and loses the evicted one
    timer->avg_sum = timer->avg_sum + new - evicted;
    if (timer->sample_count < QUERY_SAMPLE_SIZE)
        timer->sample_count++;

    // A new maximum simply replaces the peak
    if (new >= timer->peak) {
        timer->peak = new;
        return;
    }

    // The peak is unaffected unless the evicted value may have been it
    if (timer->peak != evicted)
        return;

    // The old peak just left the buffer: rescan everything for the new one
    uint64_t highest = new;
    for (int i = 0; i < QUERY_SAMPLE_SIZE; i++) {
        if (timer->samples[i] > highest)
            highest = timer->samples[i];
    }
    timer->peak = highest;
}
// Begins a GL_TIME_ELAPSED measurement using the next query object in the
// ring. If that query object was used before, its result is fetched and
// recorded first, which can block if the result is not available yet. This
// shouldn't ever happen in practice, though. (If it does, consider
// increasing QUERY_OBJECT_NUM.) Does nothing if the GL context lacks
// BeginQuery.
// IMPORTANT: only one gl_timer object may ever be active at a single time.
// The calling code *MUST* ensure this.
void gl_timer_start(struct gl_timer *timer)
{
    GL *gl = timer->gl;
    if (!gl->BeginQuery)
        return;

    // Take the next query object and advance the ring position
    int slot = timer->query_idx;
    GLuint id = timer->query[slot];
    timer->query_idx = (slot + 1) % QUERY_OBJECT_NUM;

    // A query object that has been started before still carries the result
    // of its previous run; collect it before the object gets reused
    if (gl->IsQuery(id)) {
        GLuint64 elapsed;
        gl->GetQueryObjectui64v(id, GL_QUERY_RESULT, &elapsed);
        gl_timer_record(timer, elapsed);
    }

    gl->BeginQuery(GL_TIME_ELAPSED, id);
}
// Ends the GL_TIME_ELAPSED measurement opened by gl_timer_start().
// Does nothing if the GL context lacks EndQuery.
void gl_timer_stop(struct gl_timer *timer)
{
    GL *gl = timer->gl;
    if (!gl->EndQuery)
        return;

    gl->EndQuery(GL_TIME_ELAPSED);
}

View File

@ -172,4 +172,16 @@ void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name);
void gl_sc_gen_shader_and_reset(struct gl_shader_cache *sc);
void gl_sc_reset(struct gl_shader_cache *sc);
// Opaque GPU timer built on GL query objects.
struct gl_timer;
// Allocate a timer for the given GL context; release with gl_timer_free().
struct gl_timer *gl_timer_create(GL *gl);
// Delete the timer's query objects and free it. NULL is accepted.
void gl_timer_free(struct gl_timer *timer);
// Begin / end a GL_TIME_ELAPSED measurement. Both are no-ops if the GL
// context lacks the required query functions. Only one timer may be active
// (started but not yet stopped) at any time.
void gl_timer_start(struct gl_timer *timer);
void gl_timer_stop(struct gl_timer *timer);
// Number of samples currently held (0 until the first result is recorded).
int gl_timer_sample_count(struct gl_timer *timer);
// Most recent / average / peak of the held samples, each divided by 1000.
uint64_t gl_timer_last_us(struct gl_timer *timer);
uint64_t gl_timer_avg_us(struct gl_timer *timer);
uint64_t gl_timer_peak_us(struct gl_timer *timer);
#endif

View File

@ -196,6 +196,10 @@ struct gl_video {
GLuint nnedi3_weights_buffer;
struct gl_timer *upload_timer;
struct gl_timer *render_timer;
struct gl_timer *present_timer;
struct mp_image_params real_image_params; // configured format
struct mp_image_params image_params; // texture format (mind hwdec case)
struct mp_imgfmt_desc image_desc;
@ -2497,6 +2501,11 @@ static void pass_render_frame(struct gl_video *p)
if (p->dumb_mode)
return;
// start the render timer here. it will continue to the end of this
// function, to measure the time needed to draw (excluding screen
// presentation)
gl_timer_start(p->render_timer);
p->use_linear = p->opts.linear_scaling || p->opts.sigmoid_upscaling;
pass_read_video(p);
pass_opt_hook_point(p, "NATIVE", &p->texture_offset);
@ -2553,10 +2562,14 @@ static void pass_render_frame(struct gl_video *p)
}
pass_opt_hook_point(p, "SCALED", NULL);
gl_timer_stop(p->render_timer);
}
static void pass_draw_to_screen(struct gl_video *p, int fbo)
{
gl_timer_start(p->present_timer);
if (p->dumb_mode)
pass_render_frame_dumb(p, fbo);
@ -2582,6 +2595,8 @@ static void pass_draw_to_screen(struct gl_video *p, int fbo)
pass_dither(p);
finish_pass_direct(p, fbo, p->vp_w, p->vp_h, &p->dst_rect);
gl_timer_stop(p->present_timer);
}
// Draws an interpolate frame to fbo, based on the frame timing in t
@ -2754,6 +2769,16 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
p->frames_drawn += 1;
}
// Logs the last, average and peak values of a timer at debug level, labelled
// with the given name. Nothing is printed while the timer has no samples.
static void timer_dbg(struct gl_video *p, const char *name, struct gl_timer *t)
{
    if (gl_timer_sample_count(t) <= 0)
        return;

    // The values are narrowed to int to match the %d conversions below
    int last = (int)gl_timer_last_us(t);
    int avg = (int)gl_timer_avg_us(t);
    int peak = (int)gl_timer_peak_us(t);

    MP_DBG(p, "%s time: last %dus avg %dus peak %dus\n", name,
           last, avg, peak);
}
// (fbo==0 makes BindFramebuffer select the screen backbuffer)
void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
{
@ -2857,6 +2882,11 @@ done:
gl->Flush();
p->frames_rendered++;
// Report performance metrics
timer_dbg(p, "upload", p->upload_timer);
timer_dbg(p, "render", p->render_timer);
timer_dbg(p, "present", p->present_timer);
}
// vp_w/vp_h is the implicit size of the target framebuffer.
@ -2971,6 +3001,8 @@ static bool gl_video_upload_image(struct gl_video *p, struct mp_image *mpi)
assert(mpi->num_planes == p->plane_count);
gl_timer_start(p->upload_timer);
mp_image_t pbo_mpi = *mpi;
bool pbo = map_image(p, &pbo_mpi);
if (pbo) {
@ -2998,6 +3030,8 @@ static bool gl_video_upload_image(struct gl_video *p, struct mp_image *mpi)
if (pbo)
gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
gl_timer_stop(p->upload_timer);
return true;
error:
@ -3227,6 +3261,10 @@ static void init_gl(struct gl_video *p)
gl->DeleteTextures(1, &tex);
}
p->upload_timer = gl_timer_create(p->gl);
p->render_timer = gl_timer_create(p->gl);
p->present_timer = gl_timer_create(p->gl);
debug_check_gl(p, "after init_gl");
}
@ -3245,6 +3283,10 @@ void gl_video_uninit(struct gl_video *p)
gl->DeleteTextures(1, &p->lut_3d_texture);
gl_timer_free(p->upload_timer);
gl_timer_free(p->render_timer);
gl_timer_free(p->present_timer);
mpgl_osd_destroy(p->osd);
gl_set_debug_logger(gl, NULL);