diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index bf98370eee..82c51fc503 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -703,8 +703,8 @@ Video
         mechanism in the opengl output path. To use this deinterlacing you
         must pass the option: ``vd-lavc-o=deint=[weave|bob|adaptive]``. Pass
         ``weave`` to not attempt any deinterlacing.
-        10bit HEVC is available if the hardware supports it but it will be
-        rounded down to 8 bits.
+        10bit and 12bit HEVC are available if the hardware supports them and
+        a sufficiently new driver (> 375.xx) is used.
 
         ``cuda-copy`` has the same behaviour as ``cuda`` - including the ability
         to deinterlace inside the decoder. However, traditional deinterlacing
diff --git a/video/decode/cuda.c b/video/decode/cuda.c
index f9dd418fd5..b606315906 100644
--- a/video/decode/cuda.c
+++ b/video/decode/cuda.c
@@ -21,6 +21,7 @@
 #include <libavutil/hwcontext_cuda.h>
 
 #include "common/av_common.h"
+#include "video/fmt-conversion.h"
 #include "video/decode/lavc.h"
 
 typedef struct CUVIDContext {
@@ -114,7 +115,7 @@ static void uninit(struct lavc_ctx *ctx)
 static struct mp_image *process_image(struct lavc_ctx *ctx, struct mp_image *img)
 {
     if (img->imgfmt == IMGFMT_CUDA)
-        img->params.hw_subfmt = IMGFMT_NV12;
+        img->params.hw_subfmt = pixfmt2imgfmt(ctx->avctx->sw_pix_fmt);
     return img;
 }
 
diff --git a/video/fmt-conversion.c b/video/fmt-conversion.c
index 32330ddeb1..8b991c5d19 100644
--- a/video/fmt-conversion.c
+++ b/video/fmt-conversion.c
@@ -112,6 +112,9 @@ static const struct {
 #ifdef AV_PIX_FMT_P010
     {IMGFMT_P010, AV_PIX_FMT_P010},
 #endif
+#ifdef AV_PIX_FMT_P016
+    {IMGFMT_P016, AV_PIX_FMT_P016},
+#endif
 
     {0, AV_PIX_FMT_NONE}
 };
diff --git a/video/img_format.h b/video/img_format.h
index a91dcf865c..ee731aa51c 100644
--- a/video/img_format.h
+++ b/video/img_format.h
@@ -151,8 +151,10 @@ enum mp_imgfmt {
     IMGFMT_NV12,
     IMGFMT_NV21,
 
-    // Like IMGFMT_NV12, but with 16 bits per component
+    // Like IMGFMT_NV12, but with 10 bits per component (and 6 bits of padding)
     IMGFMT_P010,
+    // Like IMGFMT_NV12, but with 16 bits per component
+    IMGFMT_P016,
 
     // RGB/BGR Formats
 
diff --git a/video/out/opengl/hwdec_cuda.c b/video/out/opengl/hwdec_cuda.c
index 539acbd4ba..4dc842706c 100644
--- a/video/out/opengl/hwdec_cuda.c
+++ b/video/out/opengl/hwdec_cuda.c
@@ -42,7 +42,7 @@ struct priv {
     GLuint gl_textures[2];
     CUgraphicsResource cu_res[2];
     CUarray cu_array[2];
-    bool mapped;
+    int sample_width;
 
     CUcontext cuda_ctx;
 };
@@ -81,7 +81,21 @@ static struct mp_image *cuda_download_image(struct mp_hwdec_ctx *ctx,
     if (hw_image->imgfmt != IMGFMT_CUDA)
         return NULL;
 
-    struct mp_image *out = mp_image_pool_get(swpool, IMGFMT_NV12,
+    int sample_width;
+    switch (hw_image->params.hw_subfmt) {
+    case IMGFMT_NV12:
+        sample_width = 1;
+        break;
+    case IMGFMT_P010:
+    case IMGFMT_P016:
+        sample_width = 2;
+        break;
+    default:
+        return NULL;
+    }
+
+    struct mp_image *out = mp_image_pool_get(swpool,
+                                             hw_image->params.hw_subfmt,
                                              hw_image->w, hw_image->h);
     if (!out)
         return NULL;
@@ -101,7 +115,8 @@ static struct mp_image *cuda_download_image(struct mp_hwdec_ctx *ctx,
             .dstHost       = out->planes[n],
             .srcPitch      = hw_image->stride[n],
             .dstPitch      = out->stride[n],
-            .WidthInBytes  = mp_image_plane_w(out, n) * (n + 1),
+            .WidthInBytes  = mp_image_plane_w(out, n) *
+                             (n + 1) * sample_width,
             .Height        = mp_image_plane_h(out, n),
         };
 
@@ -176,11 +191,32 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
     int ret = 0, eret = 0;
 
     assert(params->imgfmt == hw->driver->imgfmt);
-    params->imgfmt = IMGFMT_NV12;
+    params->imgfmt = params->hw_subfmt;
     params->hw_subfmt = 0;
 
     mp_image_set_params(&p->layout, params);
 
+    GLint luma_format, chroma_format;
+    GLenum type;
+    switch (params->imgfmt) {
+    case IMGFMT_NV12:
+        luma_format = GL_R8;
+        chroma_format = GL_RG8;
+        type = GL_UNSIGNED_BYTE;
+        p->sample_width = 1;
+        break;
+    case IMGFMT_P010:
+    case IMGFMT_P016:
+        luma_format = GL_R16;
+        chroma_format = GL_RG16;
+        type = GL_UNSIGNED_SHORT;
+        p->sample_width = 2;
+        break;
+    default:
+        MP_ERR(hw, "Unsupported format: %s\n", mp_imgfmt_to_name(params->imgfmt));
+        return -1;
+    }
+
     ret = CHECK_CU(cuCtxPushCurrent(p->cuda_ctx));
     if (ret < 0)
         return ret;
@@ -193,10 +229,10 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, filter);
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
-        gl->TexImage2D(GL_TEXTURE_2D, 0, n == 0 ? GL_R8 : GL_RG8,
+        gl->TexImage2D(GL_TEXTURE_2D, 0, n == 0 ? luma_format : chroma_format,
                        mp_image_plane_w(&p->layout, n),
                        mp_image_plane_h(&p->layout, n),
-                       0, n == 0 ? GL_RED : GL_RG, GL_UNSIGNED_BYTE, NULL);
+                       0, n == 0 ? GL_RED : GL_RG, type, NULL);
         gl->BindTexture(GL_TEXTURE_2D, 0);
 
         ret = CHECK_CU(cuGraphicsGLRegisterImage(&p->cu_res[n], p->gl_textures[n],
@@ -261,7 +297,7 @@ static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
 
     for (int n = 0; n < 2; n++) {
         // widthInBytes must account for the chroma plane
-        // elements being two bytes wide.
+        // elements being two samples wide.
         CUDA_MEMCPY2D cpy = {
             .srcMemoryType = CU_MEMORYTYPE_DEVICE,
             .dstMemoryType = CU_MEMORYTYPE_ARRAY,
@@ -269,7 +305,8 @@ static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
             .srcPitch      = hw_image->stride[n],
             .srcY          = 0,
             .dstArray      = p->cu_array[n],
-            .WidthInBytes  = mp_image_plane_w(&p->layout, n) * (n + 1),
+            .WidthInBytes  = mp_image_plane_w(&p->layout, n) *
+                             (n + 1) * p->sample_width,
             .Height        = mp_image_plane_h(&p->layout, n),
         };
         ret = CHECK_CU(cuMemcpy2D(&cpy));