From dfe846f9de30e2606a38afb6ca6de755e84df3df Mon Sep 17 00:00:00 2001 From: Niklas Haas Date: Fri, 25 Feb 2022 22:07:56 +0100 Subject: [PATCH] vo_gpu_next: add support for hardware decoding There are two major ways of going about this: 1. Expose the native ra_gl/ra_pl/ra_d3d11 objects to the pre-existing hwdec mappers, and then add code in vo_gpu_next to rewrap those ra_tex objects into pl_tex. 2. Wrap the underlying pl_opengl/pl_d3d11 into a ra_pl object and expose it to the hwdec mappers, then directly use the resulting pl_tex. I ultimately opted for approach 1 because it enables compatibility with more hardware decoders, specifically including ones that use native OpenGL calls currently. The second approach only really works with cuda_vk and vaapi_pl. --- DOCS/man/vo.rst | 4 - video/out/gpu_next/context.h | 1 - video/out/vo_gpu_next.c | 203 ++++++++++++++++++++++++++++------- 3 files changed, 167 insertions(+), 41 deletions(-) diff --git a/DOCS/man/vo.rst b/DOCS/man/vo.rst index cd995df3c7..1843a56e4f 100644 --- a/DOCS/man/vo.rst +++ b/DOCS/man/vo.rst @@ -262,10 +262,6 @@ Available video output drivers are: the same set of features as ``--vo=gpu``. See `GPU renderer options`_ for a list. - Currently, this only supports Vulkan, OpenGL, D3D11 and no hardware - decoding. Unlike ``--vo=gpu``, the FBO formats are not tunable, but you can - still set ``--gpu-dumb-mode=yes`` to forcibly disable their use. - Should generally be faster and higher quality, but some features may still be missing or misbehave. Expect (and report!) bugs. See here for a list of known differences and bugs: diff --git a/video/out/gpu_next/context.h b/video/out/gpu_next/context.h index 2e2bc3cd29..b98b9e7251 100644 --- a/video/out/gpu_next/context.h +++ b/video/out/gpu_next/context.h @@ -26,7 +26,6 @@ struct gl_video_opts; struct gpu_ctx { struct mp_log *log; - struct ra_ctx *ra_ctx; pl_log pllog; diff --git a/video/out/vo_gpu_next.c b/video/out/vo_gpu_next.c index 0ba5eb5f77..0b78bf0acc 100644 --- a/video/out/vo_gpu_next.c +++ b/video/out/vo_gpu_next.c @@ -32,15 +32,22 @@ #include "options/path.h" #include "osdep/io.h" #include "stream/stream.h" -#include "video/mp_image.h" #include "video/fmt-conversion.h" +#include "video/mp_image.h" +#include "video/out/placebo/ra_pl.h" #include "placebo/utils.h" #include "gpu/context.h" +#include "gpu/hwdec.h" #include "gpu/video.h" #include "gpu/video_shaders.h" #include "sub/osd.h" #include "gpu_next/context.h" +#if HAVE_GL && defined(PL_HAVE_OPENGL) +#include +#include "video/out/opengl/ra_gl.h" +#endif + struct osd_entry { pl_tex tex; struct pl_overlay_part *parts; @@ -75,6 +82,11 @@ struct priv { struct mpv_global *global; struct ra_ctx *ra_ctx; struct gpu_ctx *context; + struct ra_hwdec_ctx hwdec_ctx; + + // Pooled/cached mappers, for performance + struct ra_hwdec_mapper **hwdec_mappers; + int num_hwdec_mappers; pl_log pllog; pl_gpu gpu; @@ -306,6 +318,7 @@ struct frame_priv { struct vo *vo; struct osd_state subs; uint64_t osd_sync; + struct ra_hwdec_mapper *hwdec_mapper; }; static int plane_data_from_imgfmt(struct pl_plane_data out_data[4], @@ -441,19 +454,70 @@ static struct pl_color_space get_mpi_csp(struct vo *vo, struct mp_image *mpi) return csp; } +// For RAs not based on ra_pl, this creates a new pl_tex wrapper +static pl_tex hwdec_get_tex(struct frame_priv *fp, int n) +{ + struct priv *p = fp->vo->priv; + struct ra_tex *ratex = fp->hwdec_mapper->tex[n]; + struct ra *ra = fp->hwdec_mapper->ra; + if (ra_pl_get(ra)) + return (pl_tex) ratex->priv; + +#if HAVE_GL && defined(PL_HAVE_OPENGL) + if (ra_is_gl(ra) && pl_opengl_get(p->gpu)) { + struct pl_opengl_wrap_params par = { + .width = ratex->params.w, + .height = ratex->params.h, + }; + + ra_gl_get_format(ratex->params.format, &par.iformat, + &(GLenum){0}, &(GLenum){0}); + ra_gl_get_raw_tex(ra, ratex, &par.texture, &par.target); + return pl_opengl_wrap(p->gpu, &par); + } +#endif + + // TODO: d3d11 wrapping/unwrapping + + MP_ERR(p, "Failed mapping hwdec frame? Open a bug!\n"); + return false; +} + static bool map_frame(pl_gpu gpu, pl_tex *tex, const struct pl_source_frame *src, struct pl_frame *frame) { struct mp_image *mpi = src->frame_data; const struct mp_image_params *par = &mpi->params; struct frame_priv *fp = mpi->priv; - struct pl_plane_data data[4] = {0}; struct vo *vo = fp->vo; struct priv *p = vo->priv; - // TODO: implement support for hwdec wrappers + struct ra_hwdec *hwdec = ra_hwdec_get(&p->hwdec_ctx, mpi->imgfmt); + if (hwdec) { + if (MP_TARRAY_POP(p->hwdec_mappers, p->num_hwdec_mappers, &fp->hwdec_mapper)) { + if (!mp_image_params_equal(&mpi->params, &fp->hwdec_mapper->src_params)) + ra_hwdec_mapper_free(&fp->hwdec_mapper); + } + + if (!fp->hwdec_mapper) { + fp->hwdec_mapper = ra_hwdec_mapper_create(hwdec, &mpi->params); + if (!fp->hwdec_mapper) { + MP_ERR(p, "Initializing texture for hardware decoding failed.\n"); + return false; + } + } + + if (ra_hwdec_mapper_map(fp->hwdec_mapper, mpi) < 0) { + MP_ERR(p, "Mapping hardware decoded surface failed.\n"); + MP_TARRAY_APPEND(p, p->hwdec_mappers, p->num_hwdec_mappers, fp->hwdec_mapper); + fp->hwdec_mapper = NULL; + return false; + } + + par = &fp->hwdec_mapper->dst_params; + } + *frame = (struct pl_frame) { - .num_planes = mpi->num_planes, .color = get_mpi_csp(vo, mpi), .repr = { .sys = mp_csp_to_pl(par->color.space), @@ -485,43 +549,71 @@ static bool map_frame(pl_gpu gpu, pl_tex *tex, const struct pl_source_frame *src default: break; } - enum pl_chroma_location chroma = mp_chroma_to_pl(par->chroma_location); - int planes = plane_data_from_imgfmt(data, &frame->repr.bits, mpi->imgfmt); - for (int n = 0; n < planes; n++) { - struct pl_plane *plane = &frame->planes[n]; - data[n].width = mp_image_plane_w(mpi, n); - data[n].height = mp_image_plane_h(mpi, n); - if (mpi->stride[n] < 0) { - data[n].pixels = mpi->planes[n] + (data[n].height - 1) * mpi->stride[n]; - data[n].row_stride = -mpi->stride[n]; - plane->flipped = true; - } else { - data[n].pixels = mpi->planes[n]; - data[n].row_stride = mpi->stride[n]; + if (hwdec) { + + struct mp_imgfmt_desc desc = mp_imgfmt_get_desc(par->imgfmt); + frame->num_planes = desc.num_planes; + for (int n = 0; n < frame->num_planes; n++) { + struct pl_plane *plane = &frame->planes[n]; + plane->texture = hwdec_get_tex(fp, n); + if (!plane->texture) + return false; + + int *map = plane->component_mapping; + for (int c = 0; c < mp_imgfmt_desc_get_num_comps(&desc); c++) { + if (desc.comps[c].plane != n) + continue; + + // Sort by component offset + uint8_t offset = desc.comps[c].offset; + int index = plane->components++; + while (index > 0 && desc.comps[map[index - 1]].offset > offset) { + map[index] = map[index - 1]; + index--; + } + map[index] = c; + } } - pl_buf buf = get_dr_buf(mpi); - if (buf) { - data[n].buf = buf; - data[n].buf_offset = (uint8_t *) data[n].pixels - buf->data; - data[n].pixels = NULL; - } else if (gpu->limits.callbacks) { - data[n].callback = talloc_free; - data[n].priv = mp_image_new_ref(mpi); + } else { // swdec + + struct pl_plane_data data[4] = {0}; + frame->num_planes = plane_data_from_imgfmt(data, &frame->repr.bits, mpi->imgfmt); + for (int n = 0; n < frame->num_planes; n++) { + struct pl_plane *plane = &frame->planes[n]; + data[n].width = mp_image_plane_w(mpi, n); + data[n].height = mp_image_plane_h(mpi, n); + if (mpi->stride[n] < 0) { + data[n].pixels = mpi->planes[n] + (data[n].height - 1) * mpi->stride[n]; + data[n].row_stride = -mpi->stride[n]; + plane->flipped = true; + } else { + data[n].pixels = mpi->planes[n]; + data[n].row_stride = mpi->stride[n]; + } + + pl_buf buf = get_dr_buf(mpi); + if (buf) { + data[n].buf = buf; + data[n].buf_offset = (uint8_t *) data[n].pixels - buf->data; + data[n].pixels = NULL; + } else if (gpu->limits.callbacks) { + data[n].callback = talloc_free; + data[n].priv = mp_image_new_ref(mpi); + } + + if (!pl_upload_plane(gpu, plane, &tex[n], &data[n])) { + MP_ERR(vo, "Failed uploading frame!\n"); + talloc_free(data[n].priv); + return false; + } } - if (!pl_upload_plane(gpu, plane, &tex[n], &data[n])) { - MP_ERR(vo, "Failed uploading frame!\n"); - talloc_free(data[n].priv); - return false; - } - - if (mpi->fmt.xs[n] || mpi->fmt.ys[n]) { - pl_chroma_location_offset(chroma, &plane->shift_x, &plane->shift_y); - plane->shift_y = -plane->shift_y; - } } + // Update chroma location, must be done after initializing planes + pl_frame_set_chroma_location(frame, mp_chroma_to_pl(par->chroma_location)); + #ifdef PL_HAVE_LAV_DOLBY_VISION if (mpi->dovi) { const AVDOVIMetadata *metadata = (AVDOVIMetadata *) mpi->dovi->data; @@ -558,6 +650,17 @@ static void unmap_frame(pl_gpu gpu, struct pl_frame *frame, struct mp_image *mpi = src->frame_data; struct frame_priv *fp = mpi->priv; struct priv *p = fp->vo->priv; + if (fp->hwdec_mapper) { + // Clean up after wrapped plane textures + if (!ra_pl_get(fp->hwdec_mapper->ra)) { + for (int n = 0; n < frame->num_planes; n++) + pl_tex_destroy(p->gpu, &frame->planes[n].texture); + } + + ra_hwdec_mapper_unmap(fp->hwdec_mapper); + MP_TARRAY_APPEND(p, p->hwdec_mappers, p->num_hwdec_mappers, fp->hwdec_mapper); + fp->hwdec_mapper = NULL; + } for (int i = 0; i < MP_ARRAY_SIZE(fp->subs.entries); i++) { pl_tex tex = fp->subs.entries[i].tex; if (tex) @@ -844,6 +947,9 @@ static void get_vsync(struct vo *vo, struct vo_vsync_info *info) static int query_format(struct vo *vo, int format) { struct priv *p = vo->priv; + if (ra_hwdec_get(&p->hwdec_ctx, format)) + return true; + struct pl_bit_encoding bits; struct pl_plane_data data[4] = {0}; int planes = plane_data_from_imgfmt(data, &bits, format); @@ -1086,6 +1192,10 @@ static int control(struct vo *vo, uint32_t request, void *data) case VOCTRL_EXTERNAL_RESIZE: reconfig(vo, NULL); return true; + + case VOCTRL_LOAD_HWDEC_API: + ra_hwdec_ctx_load_fmt(&p->hwdec_ctx, vo->hwdec_devs, (intptr_t) data); + return true; } int events = 0; @@ -1144,6 +1254,14 @@ static void uninit(struct vo *vo) for (int i = 0; i < p->num_user_hooks; i++) pl_mpv_user_shader_destroy(&p->user_hooks[i].hook); + if (vo->hwdec_devs) { + for (int n = 0; n < p->num_hwdec_mappers; n++) + ra_hwdec_mapper_free(&p->hwdec_mappers[n]); + ra_hwdec_ctx_uninit(&p->hwdec_ctx); + hwdec_devices_set_loader(vo->hwdec_devs, NULL, NULL); + hwdec_devices_destroy(vo->hwdec_devs); + } + char *cache_file = get_cache_file(p); if (cache_file) { FILE *cache = fopen(cache_file, "wb"); @@ -1167,6 +1285,11 @@ static void uninit(struct vo *vo) gpu_ctx_destroy(&p->context); } +static void load_hwdec_api(void *ctx, int imgfmt) +{ + vo_control(ctx, VOCTRL_LOAD_HWDEC_API, (void *)(intptr_t) imgfmt); +} + static int preinit(struct vo *vo) { struct priv *p = vo->priv; @@ -1176,7 +1299,6 @@ static int preinit(struct vo *vo) p->log = vo->log; struct gl_video_opts *gl_opts = p->opts_cache->opts; - p->context = gpu_ctx_create(vo, gl_opts); if (!p->context) goto err_out; @@ -1185,6 +1307,15 @@ static int preinit(struct vo *vo) p->pllog = p->context->pllog; p->gpu = p->context->gpu; p->sw = p->context->swapchain; + p->hwdec_ctx = (struct ra_hwdec_ctx) { + .log = p->log, + .global = p->global, + .ra = p->ra_ctx->ra, + }; + + vo->hwdec_devs = hwdec_devices_create(); + hwdec_devices_set_loader(vo->hwdec_devs, load_hwdec_api, vo); + ra_hwdec_ctx_init(&p->hwdec_ctx, vo->hwdec_devs, gl_opts->hwdec_interop, false); p->rr = pl_renderer_create(p->pllog, p->gpu); p->queue = pl_queue_create(p->gpu);