diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index df640c8af5..e73ef6eee4 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -1013,6 +1013,23 @@ Video
     (default: 3). If this is a number, then fallback will be triggered if
     N frames fail to decode in a row. 1 is equivalent to ``yes``.
 
+``--vd-lavc-dr=<yes|no>``
+    Enable direct rendering (default: no). If this is set to ``yes``, the
+    video will be decoded directly to GPU video memory (or staging buffers).
+    This can speed up video upload, and may help with large resolutions or
+    slow hardware. This works only with the following VOs:
+
+        - ``opengl``: requires at least OpenGL 4.4.
+
+    (In particular, this can't be made to work with ``opengl-cb``.)
+
+    Using video filters of any kind that write to the image data (or output
+    newly allocated frames) will silently disable the DR code path.
+
+    There are some corner cases that will result in undefined behavior (crashes
+    and other strange behavior) if this option is enabled. These are expected
+    to be fixed properly at a later point.
+
 ``--vd-lavc-bitexact``
     Only use bit-exact algorithms in all decoding steps (for codec testing).
 
diff --git a/player/video.c b/player/video.c
index c1aee44674..108d65a35b 100644
--- a/player/video.c
+++ b/player/video.c
@@ -425,6 +425,7 @@ int init_video_decoder(struct MPContext *mpctx, struct track *track)
     d_video->header = track->stream;
     d_video->codec = track->stream->codec;
     d_video->fps = d_video->header->codec->fps;
+    d_video->vo = mpctx->vo_chain->vo;
 
     // Note: at least mpv_opengl_cb_uninit_gl() relies on being able to get
     //       rid of all references to the VO by destroying the VO chain. Thus,
diff --git a/video/decode/dec_video.h b/video/decode/dec_video.h
index 261f47fca8..73570f8ed5 100644
--- a/video/decode/dec_video.h
+++ b/video/decode/dec_video.h
@@ -37,6 +37,7 @@ struct dec_video {
     struct mp_hwdec_devices *hwdec_devs; // video output hwdec handles
     struct sh_stream *header;
     struct mp_codec_params *codec;
+    struct vo *vo; // required for direct rendering into video memory
 
     char *decoder_desc;
 
diff --git a/video/decode/lavc.h b/video/decode/lavc.h
index 44b103e3f5..9e27a6e18c 100644
--- a/video/decode/lavc.h
+++ b/video/decode/lavc.h
@@ -2,6 +2,7 @@
 #define MPV_LAVC_H
 
 #include <stdbool.h>
+#include <pthread.h>
 
 #include <libavcodec/avcodec.h>
 
@@ -72,6 +73,12 @@ typedef struct lavc_ctx {
     struct mp_image_pool *hwdec_swpool;
 
     AVBufferRef *cached_hw_frames_ctx;
+
+    // --- The following fields are protected by dr_lock.
+    pthread_mutex_t dr_lock;
+    bool dr_failed;
+    struct mp_image_pool *dr_pool;
+    int dr_imgfmt, dr_w, dr_h, dr_stride_align;
 } vd_ffmpeg_ctx;
 
 struct vd_lavc_hwdec {
diff --git a/video/decode/vd_lavc.c b/video/decode/vd_lavc.c
index 27861171f5..f1b2a83749 100644
--- a/video/decode/vd_lavc.c
+++ b/video/decode/vd_lavc.c
@@ -57,6 +57,7 @@
 #include "demux/packet.h"
 #include "video/csputils.h"
 #include "video/sws_utils.h"
+#include "video/out/vo.h"
 
 #if LIBAVCODEC_VERSION_MICRO >= 100
 #include <libavutil/mastering_display_metadata.h>
@@ -74,6 +75,7 @@ static void init_avctx(struct dec_video *vd, const char *decoder,
                        struct vd_lavc_hwdec *hwdec);
 static void uninit_avctx(struct dec_video *vd);
 
+static int get_buffer2_direct(AVCodecContext *avctx, AVFrame *pic, int flags);
 static int get_buffer2_hwdec(AVCodecContext *avctx, AVFrame *pic, int flags);
 static enum AVPixelFormat get_format_hwdec(struct AVCodecContext *avctx,
                                            const enum AVPixelFormat *pix_fmt);
@@ -92,6 +94,7 @@ struct vd_lavc_params {
     int check_hw_profile;
     int software_fallback;
     char **avopts;
+    int dr;
 };
 
 static const struct m_opt_choice_alternatives discard_names[] = {
@@ -121,6 +124,7 @@ const struct m_sub_options vd_lavc_conf = {
         OPT_CHOICE_OR_INT("software-fallback", software_fallback, 0, 1, INT_MAX,
                           ({"no", INT_MAX}, {"yes", 1})),
         OPT_KEYVALUELIST("o", avopts, 0),
+        OPT_FLAG("dr", dr, 0),
         {0}
     },
     .size = sizeof(struct vd_lavc_params),
@@ -425,7 +429,11 @@ static struct vd_lavc_hwdec *probe_hwdec(struct dec_video *vd, bool autoprobe,
 
 static void uninit(struct dec_video *vd)
 {
+    vd_ffmpeg_ctx *ctx = vd->priv;
+
     uninit_avctx(vd);
+
+    pthread_mutex_destroy(&ctx->dr_lock);
     talloc_free(vd->priv);
 }
 
@@ -514,6 +522,9 @@ static int init(struct dec_video *vd, const char *decoder)
     ctx->decoder = talloc_strdup(ctx, decoder);
     ctx->hwdec_devs = vd->hwdec_devs;
     ctx->hwdec_swpool = talloc_steal(ctx, mp_image_pool_new(17));
+    ctx->dr_pool = talloc_steal(ctx, mp_image_pool_new(INT_MAX));
+
+    pthread_mutex_init(&ctx->dr_lock, NULL);
 
     reinit(vd);
 
@@ -597,6 +608,12 @@ static void init_avctx(struct dec_video *vd, const char *decoder,
         mp_set_avcodec_threads(vd->log, avctx, lavc_param->threads);
     }
 
+    if (!ctx->hwdec && vd->vo && lavc_param->dr) {
+        avctx->opaque = vd;
+        avctx->get_buffer2 = get_buffer2_direct;
+        avctx->thread_safe_callbacks = 1;
+    }
+
     avctx->flags |= lavc_param->bitexact ? AV_CODEC_FLAG_BITEXACT : 0;
     avctx->flags2 |= lavc_param->fast ? AV_CODEC_FLAG2_FAST : 0;
 
@@ -917,6 +934,87 @@ static enum AVPixelFormat get_format_hwdec(struct AVCodecContext *avctx,
     return select;
 }
 
+static int get_buffer2_direct(AVCodecContext *avctx, AVFrame *pic, int flags)
+{
+    struct dec_video *vd = avctx->opaque;
+    vd_ffmpeg_ctx *p = vd->priv;
+    // get_buffer2 callback: back pic with a buffer obtained from the VO.
+    pthread_mutex_lock(&p->dr_lock); // protects dr_failed, dr_pool and dr_*
+
+    int w = pic->width;
+    int h = pic->height;
+    int linesize_align[AV_NUM_DATA_POINTERS] = {0};
+    avcodec_align_dimensions2(avctx, &w, &h, linesize_align);
+
+    // We assume that different alignments are just different power-of-2s.
+    // Thus, a higher alignment always satisfies a lower alignment.
+    int stride_align = 0;
+    for (int n = 0; n < AV_NUM_DATA_POINTERS; n++)
+        stride_align = MPMAX(stride_align, linesize_align[n]);
+
+    int imgfmt = pixfmt2imgfmt(pic->format);
+    if (!imgfmt)
+        goto fallback; // pic->format did not map to an imgfmt
+
+    if (p->dr_failed)
+        goto fallback; // an earlier call already gave up on DR
+
+    // (For simplicity, we realloc on any parameter change, instead of trying
+    // to be clever.)
+    if (stride_align != p->dr_stride_align || w != p->dr_w || h != p->dr_h ||
+        imgfmt != p->dr_imgfmt)
+    {
+        mp_image_pool_clear(p->dr_pool);
+        p->dr_imgfmt = imgfmt;
+        p->dr_w = w;
+        p->dr_h = h;
+        p->dr_stride_align = stride_align;
+        MP_VERBOSE(p, "DR parameter change to %dx%d %s align=%d\n", w, h,
+                   mp_imgfmt_to_name(imgfmt), stride_align);
+    }
+
+    struct mp_image *img = mp_image_pool_get_no_alloc(p->dr_pool, imgfmt, w, h);
+    if (!img) {
+        MP_VERBOSE(p, "Allocating new DR image...\n");
+        img = vo_get_image(vd->vo, imgfmt, w, h, stride_align);
+        if (!img) {
+            MP_VERBOSE(p, "...failed..\n");
+            goto fallback;
+        }
+
+        // Now make the mp_image part of the pool. This requires doing magic to
+        // the image, so just add it to the pool and get it back to avoid
+        // dealing with magic ourselves. (Normally this never fails.)
+        mp_image_pool_add(p->dr_pool, img);
+        img = mp_image_pool_get_no_alloc(p->dr_pool, imgfmt, w, h);
+        if (!img)
+            goto fallback;
+    }
+
+    // get_buffer2 callers seem very unappreciative of overwriting pic with a
+    // new reference. The AVCodecContext.get_buffer2 comments tell us exactly
+    // what we should do, so follow that.
+    for (int n = 0; n < 4; n++) {
+        pic->data[n] = img->planes[n];
+        pic->linesize[n] = img->stride[n];
+        pic->buf[n] = img->bufs[n];
+        img->bufs[n] = NULL;
+    }
+    talloc_free(img); // its buffer refs were moved into pic above
+
+    pthread_mutex_unlock(&p->dr_lock);
+
+    return 0;
+
+fallback:
+    if (!p->dr_failed)
+        MP_VERBOSE(p, "DR failed - disabling.\n");
+    p->dr_failed = true; // sticky: every later call takes the fallback path
+    pthread_mutex_unlock(&p->dr_lock);
+
+    return avcodec_default_get_buffer2(avctx, pic, flags);
+}
+
 static int get_buffer2_hwdec(AVCodecContext *avctx, AVFrame *pic, int flags)
 {
     struct dec_video *vd = avctx->opaque;
diff --git a/video/out/opengl/common.c b/video/out/opengl/common.c
index 203c14b7ef..c7eee414ac 100644
--- a/video/out/opengl/common.c
+++ b/video/out/opengl/common.c
@@ -327,6 +327,14 @@ static const struct gl_functions gl_functions[] = {
             {0}
         },
     },
+    {
+        .ver_core = 440,
+        .extension = "GL_ARB_buffer_storage",
+        .functions = (const struct gl_function[]) {
+            DEF_FN(BufferStorage),
+            {0}
+        },
+    },
     // Swap control, always an OS specific extension
     // The OSX code loads this manually.
     {
diff --git a/video/out/opengl/common.h b/video/out/opengl/common.h
index c9162f2479..7842c5a910 100644
--- a/video/out/opengl/common.h
+++ b/video/out/opengl/common.h
@@ -192,6 +192,8 @@ struct GL {
     GLenum (GLAPIENTRY *ClientWaitSync)(GLsync, GLbitfield, GLuint64);
     void (GLAPIENTRY *DeleteSync)(GLsync sync);
 
+    void (GLAPIENTRY *BufferStorage)(GLenum, intptr_t, const GLvoid *, GLenum);
+
     void (GLAPIENTRY *GenQueries)(GLsizei, GLuint *);
     void (GLAPIENTRY *DeleteQueries)(GLsizei, const GLuint *);
     void (GLAPIENTRY *BeginQuery)(GLenum,  GLuint);
diff --git a/video/out/opengl/context.c b/video/out/opengl/context.c
index 20b16b73ef..ab98eddbf9 100644
--- a/video/out/opengl/context.c
+++ b/video/out/opengl/context.c
@@ -92,6 +92,7 @@ static const struct mpgl_driver *const backends[] = {
 // 0-terminated list of desktop GL versions a backend should try to
 // initialize. The first entry is the most preferred version.
 const int mpgl_preferred_gl_versions[] = {
+    440,
     400,
     330,
     320,
diff --git a/video/out/opengl/gl_headers.h b/video/out/opengl/gl_headers.h
index bfefc3d3bf..74a4947137 100644
--- a/video/out/opengl/gl_headers.h
+++ b/video/out/opengl/gl_headers.h
@@ -70,6 +70,13 @@
 #define GL_DEBUG_SEVERITY_LOW             0x9148
 #define GL_DEBUG_SEVERITY_NOTIFICATION    0x826B
 
+// --- GL 4.4 or GL_ARB_buffer_storage
+
+#define GL_MAP_PERSISTENT_BIT             0x0040
+#define GL_MAP_COHERENT_BIT               0x0080
+#define GL_DYNAMIC_STORAGE_BIT            0x0100
+#define GL_CLIENT_STORAGE_BIT             0x0200
+
 // --- GL_NV_vdpau_interop
 
 #define GLvdpauSurfaceNV GLintptr
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index ba54dd4c6c..9587eaaa5f 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -174,6 +174,17 @@ struct pass_info {
 
 #define PASS_INFO_MAX (SHADER_MAX_HOOKS + 32)
 
+struct dr_buffer { // one mapped PBO handed out for direct rendering
+    void *ptr;   // start address of the mapping (from MapBufferRange)
+    size_t size; // size of the mapping in bytes
+    GLuint pbo;  // GL buffer object backing ptr
+    // While a PBO is read-accessed by GL, we must not write to the mapped data.
+    // The fence tells us when GL is done, and the mpi reference will keep the
+    // data from being recycled (or from other references gaining write access).
+    GLsync fence;
+    struct mp_image *mpi;
+};
+
 struct gl_video {
     GL *gl;
 
@@ -212,6 +223,11 @@ struct gl_video {
 
     struct video_image image;
 
+    struct dr_buffer *dr_buffers;
+    int num_dr_buffers;
+
+    bool using_dr_path;
+
     bool dumb_mode;
     bool forced_dumb_mode;
 
@@ -933,11 +949,56 @@ static void unmap_current_image(struct gl_video *p)
     }
 }
 
+static struct dr_buffer *gl_find_dr_buffer(struct gl_video *p, uint8_t *ptr)
+{  // Return the DR buffer whose mapped range [ptr, ptr+size) contains ptr.
+   for (int i = 0; i < p->num_dr_buffers; i++) {
+        struct dr_buffer *buf = &p->dr_buffers[i];
+        if (ptr >= (uint8_t *)buf->ptr && ptr < (uint8_t *)buf->ptr + buf->size)
+            return buf;
+    }
+
+    return NULL; // ptr does not point into any DR buffer
+}
+
+static void gc_pending_dr_fences(struct gl_video *p, bool force)
+{
+    GL *gl = p->gl;
+    // Drop image refs of DR buffers whose fence signaled (all fenced if force).
+again:;
+    for (int n = 0; n < p->num_dr_buffers; n++) {
+        struct dr_buffer *buffer = &p->dr_buffers[n];
+        if (!buffer->fence)
+            continue;
+
+        GLenum res = gl->ClientWaitSync(buffer->fence, 0, 0); // non-blocking
+        if (res == GL_ALREADY_SIGNALED || force) {
+            gl->DeleteSync(buffer->fence);
+            buffer->fence = NULL;
+            // Unreferencing the image could cause gl_video_dr_free_buffer()
+            // to be called by the talloc destructor (if it was the last
+            // reference). That removes entries from p->dr_buffers, which
+            // invalidates the buffer pointer and shifts the array contents.
+            // In theory a single unref could even free several dr_buffers.
+            // So clear the slot first, then restart the scan from the
+            // beginning (hence the goto) instead of continuing the loop.
+            struct mp_image *ref = buffer->mpi;
+            buffer->mpi = NULL;
+            talloc_free(ref);
+            goto again;
+        }
+    }
+}
+
 static void unref_current_image(struct gl_video *p)
 {
     unmap_current_image(p);
-    mp_image_unrefp(&p->image.mpi);
     p->image.id = 0;
+
+    mp_image_unrefp(&p->image.mpi);
+
+    // While we're at it, also garbage collect pending fences in here to
+    // get it out of the way.
+    gc_pending_dr_fences(p, false);
 }
 
 // If overlay mode is used, make sure to remove the overlay.
@@ -3088,10 +3149,34 @@ static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t
         plane->flipped = mpi->stride[0] < 0;
 
         gl->BindTexture(plane->gl_target, plane->gl_texture);
-        gl_pbo_upload_tex(&plane->pbo, gl, p->opts.pbo, plane->gl_target,
-                          plane->gl_format, plane->gl_type, plane->w, plane->h,
-                          mpi->planes[n], mpi->stride[n],
+
+        struct dr_buffer *mapped = gl_find_dr_buffer(p, mpi->planes[n]);
+        if (mapped) {
+            assert(mapped->pbo > 0);
+            gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, mapped->pbo);
+            uintptr_t offset = mpi->planes[n] - (uint8_t *)mapped->ptr;
+            gl_upload_tex(gl, plane->gl_target,
+                          plane->gl_format, plane->gl_type,
+                          (void *)offset, mpi->stride[n],
                           0, 0, plane->w, plane->h);
+            gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+            // Make sure the PBO is not reused until GL is done with it. If a
+            // previous operation is pending, "update" it by creating a new
+            // fence that will cover the previous operation as well.
+            gl->DeleteSync(mapped->fence);
+            mapped->fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+            if (!mapped->mpi)
+                mapped->mpi = mp_image_new_ref(mpi);
+        } else {
+            gl_pbo_upload_tex(&plane->pbo, gl, p->opts.pbo, plane->gl_target,
+                              plane->gl_format, plane->gl_type, plane->w, plane->h,
+                              mpi->planes[n], mpi->stride[n],
+                              0, 0, plane->w, plane->h);
+        }
+        if (p->using_dr_path != !!mapped) {
+            p->using_dr_path = !!mapped;
+            MP_VERBOSE(p, "DR enabled: %s\n", p->using_dr_path ? "yes" : "no");
+        }
         gl->BindTexture(plane->gl_target, 0);
     }
     gl_timer_stop(gl);
@@ -3319,6 +3404,13 @@ void gl_video_uninit(struct gl_video *p)
 
     gl_set_debug_logger(gl, NULL);
 
+    // Forcibly destroy possibly remaining image references. This should also
+    // cause gl_video_dr_free_buffer() to be called for the remaining buffers.
+    gc_pending_dr_fences(p, true);
+
+    // Should all have been unreffed already.
+    assert(!p->num_dr_buffers);
+
     talloc_free(p);
 }
 
@@ -3603,3 +3695,58 @@ void gl_video_set_hwdec(struct gl_video *p, struct gl_hwdec *hwdec)
     p->hwdec = hwdec;
     unref_current_image(p);
 }
+
+void *gl_video_dr_alloc_buffer(struct gl_video *p, size_t size)
+{
+    GL *gl = p->gl;
+    // Create a mapped PBO of size bytes; returns its address, or NULL on failure.
+    if (gl->version < 440)
+        return NULL; // BufferStorage is registered for GL 4.4 only
+
+    MP_TARRAY_GROW(p, p->dr_buffers, p->num_dr_buffers);
+    int index = p->num_dr_buffers++;
+    struct dr_buffer *buffer = &p->dr_buffers[index];
+
+    *buffer = (struct dr_buffer){
+        .size = size,
+    };
+
+    unsigned flags = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT |
+                     GL_MAP_COHERENT_BIT;
+
+    gl->GenBuffers(1, &buffer->pbo);
+    gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, buffer->pbo);
+    gl->BufferStorage(GL_PIXEL_UNPACK_BUFFER, size, NULL, flags);
+    buffer->ptr = gl->MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, size, flags);
+    gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+    if (!buffer->ptr) {
+        gl_check_error(p->gl, p->log, "mapping buffer");
+        gl->DeleteBuffers(1, &buffer->pbo);
+        MP_TARRAY_REMOVE_AT(p->dr_buffers, p->num_dr_buffers, index); // undo
+        return NULL;
+    }
+
+    return buffer->ptr; // later passed to gl_video_dr_free_buffer()
+}
+
+void gl_video_dr_free_buffer(struct gl_video *p, void *ptr)
+{
+    GL *gl = p->gl;
+    // Unmap and delete the DR buffer whose mapping starts at ptr.
+    for (int n = 0; n < p->num_dr_buffers; n++) {
+        struct dr_buffer *buffer = &p->dr_buffers[n];
+        if (buffer->ptr == ptr) {
+            assert(!buffer->mpi); // can't be freed while it has a ref
+            gl->DeleteSync(buffer->fence);
+            gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, buffer->pbo);
+            gl->UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+            gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+            gl->DeleteBuffers(1, &buffer->pbo);
+
+            MP_TARRAY_REMOVE_AT(p->dr_buffers, p->num_dr_buffers, n);
+            return;
+        }
+    }
+    // ptr matches no entry in p->dr_buffers - must not happen
+    assert(0);
+}
diff --git a/video/out/opengl/video.h b/video/out/opengl/video.h
index 09083da41b..f3608626e4 100644
--- a/video/out/opengl/video.h
+++ b/video/out/opengl/video.h
@@ -182,4 +182,7 @@ void gl_video_set_hwdec(struct gl_video *p, struct gl_hwdec *hwdec);
 struct vo;
 void gl_video_configure_queue(struct gl_video *p, struct vo *vo);
 
+void *gl_video_dr_alloc_buffer(struct gl_video *p, size_t size);
+void gl_video_dr_free_buffer(struct gl_video *p, void *ptr);
+
 #endif
diff --git a/video/out/vo.c b/video/out/vo.c
index 79fc4f3bb4..e52495e195 100644
--- a/video/out/vo.c
+++ b/video/out/vo.c
@@ -23,9 +23,12 @@
 #include <pthread.h>
 #include <math.h>
 
+#include <libavutil/buffer.h>
+
 #include "mpv_talloc.h"
 
 #include "config.h"
+#include "osdep/atomic.h"
 #include "osdep/timer.h"
 #include "osdep/threads.h"
 #include "misc/dispatch.h"
@@ -113,6 +116,8 @@ struct vo_internal {
     pthread_t thread;
     struct mp_dispatch_queue *dispatch;
 
+    atomic_ullong dr_in_flight;
+
     // --- The following fields are protected by lock
     pthread_mutex_t lock;
     pthread_cond_t wakeup;
@@ -955,6 +960,7 @@ static void *vo_thread(void *ptr)
     talloc_free(in->current_frame);
     in->current_frame = NULL;
     vo->driver->uninit(vo);
+    assert(atomic_load(&vo->in->dr_in_flight) == 0);
     return NULL;
 }
 
@@ -1258,3 +1264,88 @@ int lookup_keymap_table(const struct mp_keymap *map, int key)
         map++;
     return map->to;
 }
+
+struct free_dr_context { // state needed to free a DR buffer on the VO thread
+    struct vo *vo;    // VO whose thread performs the free
+    AVBufferRef *ref; // original buffer ref returned by the driver's get_image
+};
+
+static void vo_thread_free(void *ptr)
+{
+    struct free_dr_context *ctx = ptr;
+    // Release one DR buffer and its context; ptr is a struct free_dr_context.
+    unsigned long long v = atomic_fetch_add(&ctx->vo->in->dr_in_flight, -1);
+    assert(v); // fails if the count was already 0 before the decrement
+
+    av_buffer_unref(&ctx->ref);
+    talloc_free(ctx);
+}
+
+static void free_dr_buffer_on_vo_thread(void *opaque, uint8_t *data)
+{
+    struct free_dr_context *ctx = opaque;
+    // AVBuffer free callback: make sure vo_thread_free() runs on the VO thread.
+    // The image could be unreffed even on the VO thread. In practice, this
+    // matters most on VO destruction.
+    if (pthread_equal(ctx->vo->in->thread, pthread_self())) {
+        vo_thread_free(ctx); // already on the VO thread
+    } else {
+        mp_dispatch_run(ctx->vo->in->dispatch, vo_thread_free, ctx);
+    }
+}
+
+struct get_image_cmd { // request/response record for sync_get_image()
+    struct vo *vo;
+    int imgfmt, w, h, stride_align; // forwarded to vo->driver->get_image
+    struct mp_image *res;           // result; NULL if no image was allocated
+};
+
+static void sync_get_image(void *ptr)
+{
+    struct get_image_cmd *cmd = ptr;
+    struct vo *vo = cmd->vo;
+    // Dispatched by vo_get_image(): allocate via the driver, then wrap the ref.
+    cmd->res = vo->driver->get_image(vo, cmd->imgfmt, cmd->w, cmd->h,
+                                     cmd->stride_align);
+    if (!cmd->res)
+        return;
+
+    // We require exactly 1 AVBufferRef.
+    assert(cmd->res->bufs[0]);
+    assert(!cmd->res->bufs[1]);
+
+    // Apply some magic to get it free'd on the VO thread as well. For this to
+    // work, we create a dummy-ref that aliases the original ref, which is why
+    // the original ref must be writable in the first place. (A newly allocated
+    // image should be always writable of course.)
+    assert(mp_image_is_writeable(cmd->res));
+
+    struct free_dr_context *ctx = talloc_zero(NULL, struct free_dr_context);
+    *ctx = (struct free_dr_context){
+        .vo = vo,
+        .ref = cmd->res->bufs[0],
+    };
+
+    AVBufferRef *new_ref = av_buffer_create(ctx->ref->data, ctx->ref->size,
+                                            free_dr_buffer_on_vo_thread, ctx, 0);
+    if (!new_ref)
+        abort(); // tiny malloc OOM
+
+    cmd->res->bufs[0] = new_ref; // the original ref now lives in ctx->ref
+
+    atomic_fetch_add(&vo->in->dr_in_flight, 1); // undone in vo_thread_free()
+}
+
+struct mp_image *vo_get_image(struct vo *vo, int imgfmt, int w, int h,
+                              int stride_align)
+{
+    if (!vo->driver->get_image)
+        return NULL; // this VO driver does not implement get_image
+    // Run the allocation through the VO's dispatch queue; NULL on failure.
+    struct get_image_cmd cmd = {
+        .vo = vo,
+        .imgfmt = imgfmt, .w = w, .h = h, .stride_align = stride_align,
+    };
+    mp_dispatch_run(vo->in->dispatch, sync_get_image, &cmd);
+    return cmd.res;
+}
diff --git a/video/out/vo.h b/video/out/vo.h
index 6dce8f6c2f..82ec284219 100644
--- a/video/out/vo.h
+++ b/video/out/vo.h
@@ -279,6 +279,36 @@ struct vo_driver {
      */
     int (*control)(struct vo *vo, uint32_t request, void *data);
 
+    /*
+     * lavc callback for direct rendering
+     *
+     * Optional. To make implementation easier, the callback is always run on
+     * the VO thread. The returned mp_image's destructor callback is also called
+     * on the VO thread, even if it's actually unref'ed from another thread.
+     *
+     * The last reference to an image is normally destroyed before ->uninit is
+     * called. Known exception: libmpv screenshots can hold a reference longer,
+     * so this is not a hard guarantee yet.
+     *
+     * The allocated image, or a part of it, can be passed to draw_frame(). The
+     * point of this mechanism is that the decoder directly renders to GPU
+     * staging memory, to avoid a memcpy on frame upload. But this is not a
+     * guarantee. A filter could change the data pointers or return a newly
+     * allocated image. It's even possible that only 1 plane uses the buffer
+     * allocated by the get_image function. The VO has to check for this.
+     *
+     * stride_align is always a value >=1 that is a power of 2. The stride
+     * values of the returned image must be divisible by this value.
+     *
+     * Currently, the returned image must have exactly 1 AVBufferRef set, for
+     * internal implementation simplicity.
+     *
+     * returns: an allocated, refcounted image; if NULL is returned, the caller
+     * will silently fall back to a default allocator
+     */
+    struct mp_image *(*get_image)(struct vo *vo, int imgfmt, int w, int h,
+                                  int stride_align);
+
     /*
      * Render the given frame to the VO's backbuffer. This operation will be
      * followed by a draw_osd and a flip_page[_timed] call.
@@ -410,6 +440,8 @@ double vo_get_estimated_vsync_jitter(struct vo *vo);
 double vo_get_display_fps(struct vo *vo);
 double vo_get_delay(struct vo *vo);
 void vo_discard_timing_info(struct vo *vo);
+struct mp_image *vo_get_image(struct vo *vo, int imgfmt, int w, int h,
+                              int stride_align);
 
 void vo_wakeup(struct vo *vo);
 void vo_wait_default(struct vo *vo, int64_t until_time);
diff --git a/video/out/vo_opengl.c b/video/out/vo_opengl.c
index f5b0bd37c4..d3b8bbffa3 100644
--- a/video/out/vo_opengl.c
+++ b/video/out/vo_opengl.c
@@ -343,6 +343,34 @@ static void wait_events(struct vo *vo, int64_t until_time_us)
     }
 }
 
+static void vo_opengl_free_dr(void *opaque, uint8_t *data)
+{   // Free callback given to mp_image_from_buffer(); opaque is the gl_priv.
+    struct gl_priv *p = opaque;
+    gl_video_dr_free_buffer(p->renderer, data);
+}
+
+static struct mp_image *get_image(struct vo *vo, int imgfmt, int w, int h,
+                                  int stride_align)
+{
+    struct gl_priv *p = vo->priv;
+    // vo_driver.get_image: build an mp_image on top of a renderer DR buffer.
+    int size = mp_image_get_alloc_size(imgfmt, w, h, stride_align);
+    if (size < 0)
+        return NULL;
+    // NOTE(review): presumably the extra bytes allow aligning the start - confirm.
+    int alloc_size = size + stride_align;
+    void *ptr = gl_video_dr_alloc_buffer(p->renderer, alloc_size);
+    if (!ptr)
+        return NULL;
+
+    struct mp_image *res = mp_image_from_buffer(imgfmt, w, h, stride_align,
+                                                ptr, alloc_size, p,
+                                                vo_opengl_free_dr);
+    if (!res)
+        gl_video_dr_free_buffer(p->renderer, ptr); // don't leak the buffer
+    return res;
+}
+
 static void uninit(struct vo *vo)
 {
     struct gl_priv *p = vo->priv;
@@ -427,6 +455,7 @@ const struct vo_driver video_out_opengl = {
     .query_format = query_format,
     .reconfig = reconfig,
     .control = control,
+    .get_image = get_image,
     .draw_frame = draw_frame,
     .flip_page = flip_page,
     .wait_events = wait_events,