diff --git a/configure b/configure index 3a614c76c1..591aa53753 100755 --- a/configure +++ b/configure @@ -2951,6 +2951,7 @@ exr_decoder_deps="zlib" exr_encoder_deps="zlib" ffv1_decoder_select="rangecoder" ffv1_encoder_select="rangecoder" +ffv1_vulkan_encoder_select="vulkan spirv_compiler" ffvhuff_decoder_select="huffyuv_decoder" ffvhuff_encoder_select="huffyuv_encoder" fic_decoder_select="golomb" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 676ff542af..a6e0e0b55e 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -370,6 +370,7 @@ OBJS-$(CONFIG_EXR_ENCODER) += exrenc.o float2half.o OBJS-$(CONFIG_FASTAUDIO_DECODER) += fastaudio.o OBJS-$(CONFIG_FFV1_DECODER) += ffv1dec.o ffv1.o OBJS-$(CONFIG_FFV1_ENCODER) += ffv1enc.o ffv1.o +OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += ffv1enc.o ffv1.o ffv1enc_vulkan.o OBJS-$(CONFIG_FFWAVESYNTH_DECODER) += ffwavesynth.o OBJS-$(CONFIG_FIC_DECODER) += fic.o OBJS-$(CONFIG_FITS_DECODER) += fitsdec.o fits.o diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index d8a5866435..0b559dfc58 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -116,6 +116,7 @@ extern const FFCodec ff_escape130_decoder; extern const FFCodec ff_exr_encoder; extern const FFCodec ff_exr_decoder; extern const FFCodec ff_ffv1_encoder; +extern const FFCodec ff_ffv1_vulkan_encoder; extern const FFCodec ff_ffv1_decoder; extern const FFCodec ff_ffvhuff_encoder; extern const FFCodec ff_ffvhuff_decoder; diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c new file mode 100644 index 0000000000..2f776307c1 --- /dev/null +++ b/libavcodec/ffv1enc_vulkan.c @@ -0,0 +1,1604 @@ +/* + * Copyright (c) 2024 Lynne + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/crc.h" +#include "libavutil/vulkan.h" +#include "libavutil/vulkan_spirv.h" + +#include "avcodec.h" +#include "internal.h" +#include "hwconfig.h" +#include "encode.h" +#include "libavutil/opt.h" +#include "codec_internal.h" + +#include "ffv1.h" +#include "ffv1enc.h" + +/* Parallel Golomb alignment */ +#define LG_ALIGN_W 32 +#define LG_ALIGN_H 32 + +typedef struct VulkanEncodeFFv1Context { + FFV1Context ctx; + + FFVulkanContext s; + FFVkQueueFamilyCtx qf; + FFVkExecPool exec_pool; + + FFVulkanShader setup; + FFVulkanShader reset; + FFVulkanShader rct; + FFVulkanShader enc; + + /* Constant read-only buffers */ + FFVkBuffer quant_buf; + FFVkBuffer rangecoder_static_buf; + FFVkBuffer crc_tab_buf; + + /* Slice data buffer pool */ + AVBufferPool *slice_data_pool; + AVBufferRef *keyframe_slice_data_ref; + + /* Output data buffer */ + AVBufferPool *out_data_pool; + + /* Temporary data buffer */ + AVBufferPool *tmp_data_pool; + + /* Slice results buffer */ + AVBufferPool *results_data_pool; + + /* Intermediate frame pool */ + AVBufferRef *intermediate_frames_ref; + + /* Representation mode */ + enum FFVkShaderRepFormat rep_fmt; + + int num_h_slices; + int num_v_slices; + int force_pcm; + + int is_rgb; + int ppi; + int chunks; +} VulkanEncodeFFv1Context; + +extern const char *ff_source_common_comp; +extern const char *ff_source_rangecoder_comp; +extern const char *ff_source_ffv1_vlc_comp; +extern const char *ff_source_ffv1_common_comp; +extern const char *ff_source_ffv1_reset_comp; 
+extern const char *ff_source_ffv1_enc_common_comp; +extern const char *ff_source_ffv1_enc_rct_comp; +extern const char *ff_source_ffv1_enc_vlc_comp; +extern const char *ff_source_ffv1_enc_ac_comp; +extern const char *ff_source_ffv1_enc_setup_comp; +extern const char *ff_source_ffv1_enc_comp; +extern const char *ff_source_ffv1_enc_rgb_comp; +/* Push constants for the RCT shader. The field order mirrors the GLSL + * pushConstants block emitted in init_rct_shader(), so the two must be + * changed together. */ +typedef struct FFv1VkRCTParameters { + int offset; + uint8_t planar_rgb; + uint8_t transparency; + uint8_t padding[2]; +} FFv1VkRCTParameters; +/* Push constants for the reset shader. The field order mirrors the GLSL + * pushConstants block emitted in init_reset_shader(), so the two must be + * changed together. */ +typedef struct FFv1VkResetParameters { + VkDeviceAddress slice_state; + uint32_t plane_state_size; + uint32_t context_count; + uint8_t codec_planes; + uint8_t key_frame; + uint8_t padding[3]; +} FFv1VkResetParameters; +/* Push constants pushed to both the setup and the encode shader. The field + * order mirrors the GLSL block emitted by add_push_data(), so the two must + * be changed together. */ +typedef struct FFv1VkParameters { + VkDeviceAddress slice_state; + VkDeviceAddress scratch_data; + VkDeviceAddress out_data; + + int32_t sar[2]; + uint32_t chroma_shift[2]; + + uint32_t slice_size_max; + uint32_t plane_state_size; + uint32_t context_count; + uint32_t crcref; + + uint8_t bits_per_raw_sample; + uint8_t context_model; + uint8_t version; + uint8_t micro_version; + uint8_t force_pcm; + uint8_t key_frame; + uint8_t planes; + uint8_t codec_planes; + uint8_t transparency; + uint8_t colorspace; + uint8_t pic_mode; + uint8_t ec; + uint8_t ppi; + uint8_t chunks; + uint8_t padding[2]; +} FFv1VkParameters; +/* Emit, via GLSLC, the GLSL push-constant block that corresponds field by + * field to FFv1VkParameters, then register a compute-stage push-constant + * range of sizeof(FFv1VkParameters) on shd. */ +static void add_push_data(FFVulkanShader *shd) +{ + GLSLC(0, layout(push_constant, scalar) uniform pushConstants { ); + GLSLC(1, u8buf slice_state; ); + GLSLC(1, u8buf scratch_data; ); + GLSLC(1, u8buf out_data; ); + GLSLC(0, ); + GLSLC(1, ivec2 sar; ); + GLSLC(1, uvec2 chroma_shift; ); + GLSLC(0, ); + GLSLC(1, uint slice_size_max; ); + GLSLC(1, uint plane_state_size; ); + GLSLC(1, uint context_count; ); + GLSLC(1, uint32_t crcref; ); + GLSLC(0, ); + GLSLC(1, uint8_t bits_per_raw_sample; ); + GLSLC(1, uint8_t context_model; ); + GLSLC(1, uint8_t version; ); + GLSLC(1, uint8_t micro_version; ); + GLSLC(1, uint8_t force_pcm; ); + GLSLC(1, uint8_t key_frame;
); + GLSLC(1, uint8_t planes; ); + GLSLC(1, uint8_t codec_planes; ); + GLSLC(1, uint8_t transparency; ); + GLSLC(1, uint8_t colorspace; ); + GLSLC(1, uint8_t pic_mode; ); + GLSLC(1, uint8_t ec; ); + GLSLC(1, uint8_t ppi; ); + GLSLC(1, uint8_t chunks; ); + GLSLC(1, uint8_t padding[2]; ); + GLSLC(0, }; ); + ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkParameters), + VK_SHADER_STAGE_COMPUTE_BIT); +} +/** + * Record the RCT shader pass into exec. + * + * Allocates *intermediate_frame from fv->intermediate_frames_ref, registers + * it as a dependency of exec, binds slice_data_buf plus the input and + * intermediate image views to the rct shader, flushes the barriers pending + * in img_bar/buf_bar and dispatches num_h_slices x num_v_slices workgroups. + * A barrier for reading the intermediate frame is left queued in img_bar + * for the caller to submit. + * + * @return the value of err; negative AVERROR on failure. On failure + * *intermediate_frame may already be allocated and is not freed + * here, so the caller has to free it. + */ +static int run_rct(AVCodecContext *avctx, FFVkExecContext *exec, + AVFrame *enc_in, VkImageView *enc_in_views, + AVFrame **intermediate_frame, VkImageView *intermediate_views, + VkImageMemoryBarrier2 *img_bar, int *nb_img_bar, + VkBufferMemoryBarrier2 *buf_bar, int *nb_buf_bar, + FFVkBuffer *slice_data_buf, uint32_t slice_data_size) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFV1Context *f = &fv->ctx; + FFVulkanFunctions *vk = &fv->s.vkfn; + AVHWFramesContext *src_hwfc = (AVHWFramesContext *)enc_in->hw_frames_ctx->data; + FFv1VkRCTParameters pd; + + /* Create a temporary frame */ + *intermediate_frame = av_frame_alloc(); + if (!(*intermediate_frame)) + return AVERROR(ENOMEM); + + RET(av_hwframe_get_buffer(fv->intermediate_frames_ref, + *intermediate_frame, 0)); + + RET(ff_vk_exec_add_dep_frame(&fv->s, exec, *intermediate_frame, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + RET(ff_vk_create_imageviews(&fv->s, exec, intermediate_views, + *intermediate_frame, + fv->rep_fmt)); + + /* Update descriptors */ + ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->rct, + 1, 0, 0, + slice_data_buf, + 0, slice_data_size*f->slice_count, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_img_array(&fv->s, exec, &fv->rct, + enc_in, enc_in_views, + 1, 1, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); + ff_vk_shader_update_img_array(&fv->s, exec, &fv->rct, + *intermediate_frame, intermediate_views, + 1, 2, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); + + ff_vk_frame_barrier(&fv->s, exec, *intermediate_frame, img_bar,
nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + /* Prep the input/output images */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = *nb_img_bar, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = *nb_buf_bar, + }); + *nb_img_bar = 0; + if (*nb_buf_bar) { + slice_data_buf->stage = buf_bar[0].dstStageMask; + slice_data_buf->access = buf_bar[0].dstAccessMask; + *nb_buf_bar = 0; + } + + /* Run the shader */ + ff_vk_exec_bind_shader(&fv->s, exec, &fv->rct); + pd = (FFv1VkRCTParameters) { + .offset = 1 << f->bits_per_raw_sample, + .planar_rgb = ff_vk_mt_is_np_rgb(src_hwfc->sw_format) && + (ff_vk_count_images((AVVkFrame *)enc_in->data[0]) > 1), + .transparency = f->transparency, + }; + ff_vk_shader_update_push_const(&fv->s, exec, &fv->rct, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd), &pd); + + vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1); + + /* Add a post-dispatch barrier before encoding */ + ff_vk_frame_barrier(&fv->s, exec, *intermediate_frame, img_bar, nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + +fail: + return err; +} + +static int vulkan_encode_ffv1_frame(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *pict, int *got_packet) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFV1Context *f = &fv->ctx; + FFVulkanFunctions *vk = &fv->s.vkfn; + FFVkExecContext *exec; + + FFv1VkParameters pd; + + AVFrame *intermediate_frame = NULL; + + /* Temporary data */ + size_t tmp_data_size; + AVBufferRef *tmp_data_ref; + FFVkBuffer *tmp_data_buf; + + /* Slice data */ + AVBufferRef *slice_data_ref; + FFVkBuffer *slice_data_buf; + 
uint32_t plane_state_size; + uint32_t slice_state_size; + uint32_t slice_data_size; + + /* Output data */ + size_t maxsize; + AVBufferRef *out_data_ref; + FFVkBuffer *out_data_buf; + uint8_t *buf_p; + + /* Results data */ + AVBufferRef *results_data_ref; + FFVkBuffer *results_data_buf; + uint32_t *sc; + + int has_inter = avctx->gop_size > 1; + uint32_t context_count = f->context_count[f->context_model]; + + VkImageView in_views[AV_NUM_DATA_POINTERS]; + VkImageView intermediate_views[AV_NUM_DATA_POINTERS]; + + AVFrame *enc_in = (AVFrame *)pict; + VkImageView *enc_in_views = in_views; + + VkMappedMemoryRange invalidate_data[2]; + int nb_invalidate_data = 0; + + VkImageMemoryBarrier2 img_bar[37]; + int nb_img_bar = 0; + VkBufferMemoryBarrier2 buf_bar[8]; + int nb_buf_bar = 0; + + if (!pict) + return 0; + + exec = ff_vk_exec_get(&fv->s, &fv->exec_pool); + ff_vk_exec_start(&fv->s, exec); + + /* Frame state */ + f->cur_enc_frame = pict; + if (avctx->gop_size == 0 || f->picture_number % avctx->gop_size == 0) { + av_buffer_unref(&fv->keyframe_slice_data_ref); + f->key_frame = 1; + f->gob_count++; + } else { + f->key_frame = 0; + } + + f->max_slice_count = f->num_h_slices * f->num_v_slices; + f->slice_count = f->max_slice_count; + + /* Allocate temporary data buffer */ + tmp_data_size = f->slice_count*CONTEXT_SIZE; + err = ff_vk_get_pooled_buffer(&fv->s, &fv->tmp_data_pool, + &tmp_data_ref, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, tmp_data_size, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + if (err < 0) + return err; + tmp_data_buf = (FFVkBuffer *)tmp_data_ref->data; + + /* Allocate slice buffer data */ + if (f->ac == AC_GOLOMB_RICE) + plane_state_size = 8; + else + plane_state_size = CONTEXT_SIZE; + + plane_state_size *= context_count; + slice_state_size = plane_state_size*f->plane_count; + + slice_data_size = 256; /* Overestimation for the SliceContext struct */ + slice_state_size += slice_data_size; + slice_state_size = 
FFALIGN(slice_state_size, 8); + + slice_data_ref = fv->keyframe_slice_data_ref; + if (!slice_data_ref) { + /* Allocate slice data buffer */ + err = ff_vk_get_pooled_buffer(&fv->s, &fv->slice_data_pool, + &slice_data_ref, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, slice_state_size*f->slice_count, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + if (err < 0) + return err; + + /* Only save it if we're going to use it again */ + if (has_inter) + fv->keyframe_slice_data_ref = slice_data_ref; + } + slice_data_buf = (FFVkBuffer *)slice_data_ref->data; + + /* Allocate results buffer */ + err = ff_vk_get_pooled_buffer(&fv->s, &fv->results_data_pool, + &results_data_ref, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, 2*f->slice_count*sizeof(uint32_t), + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + if (err < 0) + return err; + results_data_buf = (FFVkBuffer *)results_data_ref->data; + + /* Output buffer size */ + maxsize = avctx->width*avctx->height*(1 + f->transparency); + if (f->chroma_planes) + maxsize += AV_CEIL_RSHIFT(avctx->width, f->chroma_h_shift) * + AV_CEIL_RSHIFT(f->height, f->chroma_v_shift)*2; + maxsize += f->slice_count * 800; + if (f->version > 3) { + maxsize *= f->bits_per_raw_sample + 1; + } else { + maxsize += f->slice_count * 2 * (avctx->width + avctx->height); + maxsize *= 8*(2*f->bits_per_raw_sample + 5); + } + maxsize >>= 3; + maxsize += FF_INPUT_BUFFER_MIN_SIZE; + + if (maxsize > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE - 32) { + av_log(avctx, AV_LOG_WARNING, "Cannot allocate worst case packet size, " + "the encoding could fail\n"); + maxsize = INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE - 32; + } + + /* Allocate output buffer */ + err = ff_vk_get_pooled_buffer(&fv->s, &fv->out_data_pool, + &out_data_ref, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, maxsize, + 
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT); + if (err < 0) + return err; + + out_data_buf = (FFVkBuffer *)out_data_ref->data; + pkt->data = out_data_buf->mapped_mem; + pkt->size = out_data_buf->size; + pkt->buf = out_data_ref; + + /* Add dependencies */ + ff_vk_exec_add_dep_buf(&fv->s, exec, &tmp_data_ref, 1, 0); + ff_vk_exec_add_dep_buf(&fv->s, exec, &results_data_ref, 1, 0); + ff_vk_exec_add_dep_buf(&fv->s, exec, &slice_data_ref, 1, has_inter); + ff_vk_exec_add_dep_buf(&fv->s, exec, &out_data_ref, 1, 1); + RET(ff_vk_exec_add_dep_frame(&fv->s, exec, enc_in, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + + RET(ff_vk_create_imageviews(&fv->s, exec, enc_in_views, enc_in, + fv->rep_fmt)); + ff_vk_frame_barrier(&fv->s, exec, enc_in, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + /* Setup shader needs the original input */ + ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->setup, + 1, 0, 0, + slice_data_buf, + 0, slice_data_size*f->slice_count, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_img_array(&fv->s, exec, &fv->setup, + enc_in, enc_in_views, + 1, 1, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); + + /* Add a buffer barrier between previous and current frame */ + if (!f->key_frame) { + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = slice_data_buf->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = slice_data_buf->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = slice_data_buf->buf, + .size = VK_WHOLE_SIZE, + .offset = 0, + }; + } + + vk->CmdPipelineBarrier2(exec->buf, 
&(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + nb_img_bar = 0; + if (nb_buf_bar) { + slice_data_buf->stage = buf_bar[0].dstStageMask; + slice_data_buf->access = buf_bar[0].dstAccessMask; + nb_buf_bar = 0; + } + + /* Run setup shader */ + ff_vk_exec_bind_shader(&fv->s, exec, &fv->setup); + pd = (FFv1VkParameters) { + .slice_state = slice_data_buf->address + f->slice_count*256, + .scratch_data = tmp_data_buf->address, + .out_data = out_data_buf->address, + .slice_size_max = out_data_buf->size / f->slice_count, + .bits_per_raw_sample = f->bits_per_raw_sample, + .sar[0] = pict->sample_aspect_ratio.num, + .sar[1] = pict->sample_aspect_ratio.den, + .chroma_shift[0] = f->chroma_h_shift, + .chroma_shift[1] = f->chroma_v_shift, + .plane_state_size = plane_state_size, + .context_count = context_count, + .crcref = f->crcref, + .context_model = fv->ctx.context_model, + .version = f->version, + .micro_version = f->micro_version, + .force_pcm = fv->force_pcm, + .key_frame = f->key_frame, + .planes = av_pix_fmt_count_planes(avctx->sw_pix_fmt), + .codec_planes = f->plane_count, + .transparency = f->transparency, + .colorspace = f->colorspace, + .pic_mode = !(pict->flags & AV_FRAME_FLAG_INTERLACED) ? 3 : + !(pict->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) ? 
2 : 1, + .ec = f->ec, + .ppi = fv->ppi, + .chunks = fv->chunks, + }; + ff_vk_shader_update_push_const(&fv->s, exec, &fv->setup, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd), &pd); + vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1); + + /* Setup shader modified the slice data buffer */ + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = slice_data_buf->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = slice_data_buf->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = slice_data_buf->buf, + .size = slice_data_size*f->slice_count, + .offset = 0, + }; + + if (f->key_frame || f->version > 3) { + FFv1VkResetParameters pd_reset; + + ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->reset, + 1, 0, 0, + slice_data_buf, + 0, slice_data_size*f->slice_count, + VK_FORMAT_UNDEFINED); + + /* Set up the reset shader; it is dispatched after the barrier below */ + ff_vk_exec_bind_shader(&fv->s, exec, &fv->reset); + pd_reset = (FFv1VkResetParameters) { + .slice_state = slice_data_buf->address + f->slice_count*256, + .plane_state_size = plane_state_size, + .context_count = context_count, + .codec_planes = f->plane_count, + .key_frame = f->key_frame, + }; + ff_vk_shader_update_push_const(&fv->s, exec, &fv->reset, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd_reset), &pd_reset); + + /* Sync between setup and reset shaders */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + slice_data_buf->stage = buf_bar[0].dstStageMask; + slice_data_buf->access = buf_bar[0].dstAccessMask; + nb_buf_bar = 0; + + vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, + f->plane_count); + } + + /* Run
RCT shader */ + if (fv->is_rgb) { + RET(run_rct(avctx, exec, + enc_in, enc_in_views, + &intermediate_frame, intermediate_views, + img_bar, &nb_img_bar, buf_bar, &nb_buf_bar, + slice_data_buf, slice_data_size)); + + /* Use the new frame */ + enc_in = intermediate_frame; + enc_in_views = intermediate_views; + } + + /* If the reset shader ran, insert a barrier now. */ + if (f->key_frame || f->version > 3) { + /* Reset shader modified the slice data buffer */ + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = slice_data_buf->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = slice_data_buf->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = slice_data_buf->buf, + .size = slice_data_buf->size - slice_data_size*f->slice_count, + .offset = slice_data_size*f->slice_count, + }; + } + + /* Final barrier before encoding */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + nb_img_bar = 0; + if (nb_buf_bar) { + slice_data_buf->stage = buf_bar[0].dstStageMask; + slice_data_buf->access = buf_bar[0].dstAccessMask; + nb_buf_bar = 0; + } + + /* Main encode shader */ + ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->enc, + 1, 0, 0, + slice_data_buf, + 0, slice_data_size*f->slice_count, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_img_array(&fv->s, exec, &fv->enc, + enc_in, enc_in_views, + 1, 1, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); + ff_vk_shader_update_desc_buffer(&fv->s, exec, + &fv->enc, 1, 2, 0, + results_data_buf, + 0, results_data_buf->size, + VK_FORMAT_UNDEFINED); + + 
ff_vk_exec_bind_shader(&fv->s, exec, &fv->enc); + ff_vk_shader_update_push_const(&fv->s, exec, &fv->enc, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd), &pd); + vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1); + + /* Submit */ + err = ff_vk_exec_submit(&fv->s, exec); + if (err < 0) + return err; + + /* We need the encoded data immediately */ + ff_vk_exec_wait(&fv->s, exec); + av_frame_free(&intermediate_frame); + + /* Invalidate slice/output data if needed */ + if (!(results_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) + invalidate_data[nb_invalidate_data++] = (VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = results_data_buf->mem, + .offset = 0, + .size = VK_WHOLE_SIZE, + }; + if (!(out_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) + invalidate_data[nb_invalidate_data++] = (VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = out_data_buf->mem, + .offset = 0, + .size = VK_WHOLE_SIZE, + }; + if (nb_invalidate_data) + vk->InvalidateMappedMemoryRanges(fv->s.hwctx->act_dev, + nb_invalidate_data, invalidate_data); + + /* First slice is in-place */ + buf_p = pkt->data; + sc = &((uint32_t *)results_data_buf->mapped_mem)[0]; + av_log(avctx, AV_LOG_VERBOSE, "Slice size = %u (max %i), src offset = %u\n", + sc[0], pkt->size / f->slice_count, sc[1]); + av_assert0(sc[0] < pkt->size / f->slice_count); + av_assert0(sc[0] < (1 << 24)); + buf_p += sc[0]; + + /* We have to copy the rest */ + for (int i = 1; i < f->slice_count; i++) { + uint32_t bytes; + uint8_t *bs_start; + + sc = &((uint32_t *)results_data_buf->mapped_mem)[i*2]; + bytes = sc[0]; + bs_start = pkt->data + sc[1]; + + av_log(avctx, AV_LOG_VERBOSE, "Slice size = %u (max %i), src offset = %u\n", + bytes, pkt->size / f->slice_count, sc[1]); + av_assert0(bytes < pkt->size / f->slice_count); + av_assert0(bytes < (1 << 24)); + + memmove(buf_p, bs_start, bytes); + + buf_p += bytes; + } + + f->picture_number++; 
+ pkt->size = buf_p - pkt->data; + pkt->flags |= AV_PKT_FLAG_KEY * f->key_frame; + *got_packet = 1; + + av_log(avctx, AV_LOG_VERBOSE, "Total data = %i\n", + pkt->size); + +fail: + /* Frames added as a dep are always referenced, so we only need to + * clean this up. */ + av_frame_free(&intermediate_frame); + + /* RET() jumps here with a negative err; propagate it instead of + * reporting success without a packet. The success path still returns 0. */ + return err < 0 ? err : 0; +} + +/** + * Create and initialize the hardware frames context used for intermediate + * frames, storing it in fv->intermediate_frames_ref. + * + * The context uses AV_PIX_FMT_VULKAN with the given sw_format, the source + * frame dimensions aligned up to 32, optimal tiling, storage-image usage + * and the mutable-format image flag. + * + * @param sw_format software pixel format of the intermediate frames + * @return 0 on success, a negative AVERROR code on failure; if + * initialization fails, fv->intermediate_frames_ref is unreferenced + */ +static int init_indirect(AVCodecContext *avctx, enum AVPixelFormat sw_format) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + AVHWFramesContext *frames_ctx; + AVVulkanFramesContext *vk_frames; + + fv->intermediate_frames_ref = av_hwframe_ctx_alloc(fv->s.device_ref); + if (!fv->intermediate_frames_ref) + return AVERROR(ENOMEM); + + frames_ctx = (AVHWFramesContext *)fv->intermediate_frames_ref->data; + frames_ctx->format = AV_PIX_FMT_VULKAN; + frames_ctx->sw_format = sw_format; + frames_ctx->width = FFALIGN(fv->s.frames->width, 32); + frames_ctx->height = FFALIGN(fv->s.frames->height, 32); + + vk_frames = frames_ctx->hwctx; + vk_frames->tiling = VK_IMAGE_TILING_OPTIMAL; + vk_frames->usage = VK_IMAGE_USAGE_STORAGE_BIT; + vk_frames->img_flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; + + err = av_hwframe_ctx_init(fv->intermediate_frames_ref); + if (err < 0) { + av_log(avctx, AV_LOG_ERROR, "Unable to initialize frame pool with format %s: %s\n", + av_get_pix_fmt_name(sw_format), av_err2str(err)); + av_buffer_unref(&fv->intermediate_frames_ref); + return err; + } + + return 0; +} + +/** + * Check whether fmt is listed in constraints->valid_sw_formats. + * + * The list is terminated by AV_PIX_FMT_NONE (-1), not by 0: the value 0 is + * AV_PIX_FMT_YUV420P, a regular entry. Both the constraints pointer and + * the list may be NULL when the information is unavailable; that is + * treated as "not supported". + * + * @return 1 if fmt is in the list, 0 otherwise + */ +static int check_support(AVHWFramesConstraints *constraints, + enum AVPixelFormat fmt) +{ + if (!constraints || !constraints->valid_sw_formats) + return 0; + + for (int i = 0; constraints->valid_sw_formats[i] != AV_PIX_FMT_NONE; i++) { + if (constraints->valid_sw_formats[i] == fmt) + return 1; + } + return 0; +} + +/** + * Pick the software pixel format for the intermediate RCT buffer from the + * formats the device reports as valid. + * + * @return the chosen format, or AV_PIX_FMT_NONE if none of the candidate + * formats is supported + */ +static enum AVPixelFormat get_supported_rgb_buffer_fmt(AVCodecContext *avctx) +{ + VulkanEncodeFFv1Context *fv = avctx->priv_data; + + enum AVPixelFormat fmt; + AVHWFramesConstraints *constraints; + constraints = av_hwdevice_get_hwframe_constraints(fv->s.device_ref, + NULL); + + /* What we'd like to optimally have */ + fmt =
fv->ctx.use32bit ? + (fv->ctx.transparency ? AV_PIX_FMT_RGBA128 : AV_PIX_FMT_RGB96) : + (fv->ctx.transparency ? AV_PIX_FMT_RGBA64 : AV_PIX_FMT_RGB48); + if (check_support(constraints, fmt)) + goto end; + + if (fv->ctx.use32bit) { + if (check_support(constraints, (fmt = AV_PIX_FMT_RGBA128))) + goto end; + } else { + if (check_support(constraints, (fmt = AV_PIX_FMT_RGBA64))) + goto end; + + if (!fv->ctx.transparency && + check_support(constraints, (fmt = AV_PIX_FMT_RGB96))) + goto end; + + if (check_support(constraints, (fmt = AV_PIX_FMT_RGBA128))) + goto end; + } + + fmt = AV_PIX_FMT_NONE; + +end: + av_hwframe_constraints_free(&constraints); + return fmt; +} + +static void define_shared_code(AVCodecContext *avctx, FFVulkanShader *shd) +{ + VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFV1Context *f = &fv->ctx; + int smp_bits = fv->ctx.use32bit ? 32 : 16; + + av_bprintf(&shd->src, "#define CONTEXT_SIZE %i\n" ,CONTEXT_SIZE); + av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_MASK 0x%x\n" ,MAX_QUANT_TABLE_MASK); + + if (f->ac == AC_GOLOMB_RICE) { + av_bprintf(&shd->src, "#define PB_UNALIGNED\n" ); + av_bprintf(&shd->src, "#define GOLOMB\n" ); + } + + GLSLF(0, #define TYPE int%i_t ,smp_bits); + GLSLF(0, #define VTYPE2 i%ivec2 ,smp_bits); + GLSLF(0, #define VTYPE3 i%ivec3 ,smp_bits); + GLSLD(ff_source_common_comp); + GLSLD(ff_source_rangecoder_comp); + + if (f->ac == AC_GOLOMB_RICE) + GLSLD(ff_source_ffv1_vlc_comp); + + GLSLD(ff_source_ffv1_common_comp); +} + +static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFVulkanShader *shd = &fv->setup; + FFVulkanDescriptorSetBinding *desc_set; + + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + + RET(ff_vk_shader_init(&fv->s, shd, "ffv1_setup", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_EXT_buffer_reference", + "GL_EXT_buffer_reference2" }, 2, + 1, 1, 1, + 0)); + + av_bprintf(&shd->src, "#define 
MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES); + av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); + av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "rangecoder_static_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "uint8_t zero_one_state[512];", + }, + { /* This descriptor is never used */ + .name = "quant_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]" + "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];", + }, + }; + RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 1, 0)); + + define_shared_code(avctx, shd); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "slice_data_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "SliceContext slice_ctx[1024];", + }, + { + .name = "src", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .dimensions = 2, + .mem_layout = ff_vk_shader_rep_fmt(fv->s.frames->sw_format, + fv->rep_fmt), + .elems = av_pix_fmt_count_planes(fv->s.frames->sw_format), + .mem_quali = "readonly", + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 0, 0)); + + add_push_data(shd); + + GLSLD(ff_source_ffv1_enc_setup_comp); + + RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main", + &spv_opaque)); + RET(ff_vk_shader_link(&fv->s, shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd)); + +fail: + if (spv_opaque) + spv->free_shader(spv, &spv_opaque); + + return err; +} + +static int init_reset_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFVulkanShader *shd = 
&fv->reset; + FFVulkanDescriptorSetBinding *desc_set; + + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + int wg_dim = FFMIN(fv->s.props.properties.limits.maxComputeWorkGroupSize[0], 1024); + + RET(ff_vk_shader_init(&fv->s, shd, "ffv1_reset", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_EXT_buffer_reference", + "GL_EXT_buffer_reference2" }, 2, + wg_dim, 1, 1, + 0)); + + av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES); + av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); + av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "rangecoder_static_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "uint8_t zero_one_state[512];", + }, + { + .name = "quant_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]" + "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];", + }, + }; + RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 1, 0)); + + define_shared_code(avctx, shd); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "slice_data_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .mem_quali = "readonly", + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "SliceContext slice_ctx[1024];", + }, + }; + RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 1, 0, 0)); + + GLSLC(0, layout(push_constant, scalar) uniform pushConstants { ); + GLSLC(1, u8buf slice_state; ); + GLSLC(1, uint plane_state_size; ); + GLSLC(1, uint context_count; ); + GLSLC(1, uint8_t codec_planes; ); + GLSLC(1, uint8_t key_frame; ); + GLSLC(1, uint8_t padding[3]; ); + GLSLC(0, }; ); + ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkResetParameters), + VK_SHADER_STAGE_COMPUTE_BIT); + + 
GLSLD(ff_source_ffv1_reset_comp); + + RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main", + &spv_opaque)); + RET(ff_vk_shader_link(&fv->s, shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd)); + +fail: + if (spv_opaque) + spv->free_shader(spv, &spv_opaque); + + return err; +} + +static int init_rct_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFVulkanShader *shd = &fv->rct; + FFVulkanDescriptorSetBinding *desc_set; + + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + int wg_count = sqrt(fv->s.props.properties.limits.maxComputeWorkGroupInvocations); + + enum AVPixelFormat intermediate_fmt = get_supported_rgb_buffer_fmt(avctx); + if (intermediate_fmt == AV_PIX_FMT_NONE) { + av_log(avctx, AV_LOG_ERROR, "Unable to find a supported compatible " + "pixel format for RCT buffer!\n"); + return AVERROR(ENOTSUP); + } + + RET(init_indirect(avctx, intermediate_fmt)); + + RET(ff_vk_shader_init(&fv->s, shd, "ffv1_rct", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_EXT_buffer_reference", + "GL_EXT_buffer_reference2" }, 2, + wg_count, wg_count, 1, + 0)); + + av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES); + av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); + av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "rangecoder_static_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "uint8_t zero_one_state[512];", + }, + { + .name = "quant_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]" + "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];", + }, + }; + 
RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 1, 0)); + + define_shared_code(avctx, shd); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "slice_data_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .mem_quali = "readonly", + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "SliceContext slice_ctx[1024];", + }, + { + .name = "src", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .dimensions = 2, + .mem_layout = ff_vk_shader_rep_fmt(fv->s.frames->sw_format, + fv->rep_fmt), + .elems = av_pix_fmt_count_planes(fv->s.frames->sw_format), + .mem_quali = "readonly", + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "dst", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .dimensions = 2, + .mem_layout = ff_vk_shader_rep_fmt(intermediate_fmt, + fv->rep_fmt), + .elems = av_pix_fmt_count_planes(intermediate_fmt), + .mem_quali = "writeonly", + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3, 0, 0)); + + GLSLC(0, layout(push_constant, scalar) uniform pushConstants { ); + GLSLC(1, int offset; ); + GLSLC(1, uint8_t planar_rgb; ); + GLSLC(1, uint8_t transparency; ); + GLSLC(1, uint8_t padding[2]; ); + GLSLC(0, }; ); + ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkRCTParameters), + VK_SHADER_STAGE_COMPUTE_BIT); + + GLSLD(ff_source_ffv1_enc_rct_comp); + + RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main", + &spv_opaque)); + RET(ff_vk_shader_link(&fv->s, shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd)); + +fail: + if (spv_opaque) + spv->free_shader(spv, &spv_opaque); + + return err; +} + +static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFV1Context *f = &fv->ctx; + FFVulkanShader *shd = &fv->enc; + FFVulkanDescriptorSetBinding *desc_set; + + AVHWFramesContext *frames_ctx = fv->intermediate_frames_ref ? 
+ (AVHWFramesContext *)fv->intermediate_frames_ref->data : + fv->s.frames; + + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + + RET(ff_vk_shader_init(&fv->s, shd, "ffv1_enc", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_EXT_buffer_reference", + "GL_EXT_buffer_reference2" }, 2, + 1, 1, 1, + 0)); + + av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES); + av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); + av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "rangecoder_static_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "uint8_t zero_one_state[512];", + }, + { + .name = "quant_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]" + "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];", + }, + { + .name = "crc_ieee_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "uint32_t crc_ieee[256];", + }, + }; + + RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3, 1, 0)); + + define_shared_code(avctx, shd); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "slice_data_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "SliceContext slice_ctx[1024];", + }, + { + .name = "src", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .dimensions = 2, + .mem_layout = ff_vk_shader_rep_fmt(frames_ctx->sw_format, + fv->rep_fmt), + .elems = av_pix_fmt_count_planes(frames_ctx->sw_format), + .mem_quali = "readonly", + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "results_data_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = 
VK_SHADER_STAGE_COMPUTE_BIT, + .mem_quali = "writeonly", + .buf_content = "uint32_t slice_results[2048];", + }, + }; + RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3, 0, 0)); + + add_push_data(shd); + + /* Assemble the shader body */ + GLSLD(ff_source_ffv1_enc_common_comp); + + if (f->ac == AC_GOLOMB_RICE) + GLSLD(ff_source_ffv1_enc_vlc_comp); + else + GLSLD(ff_source_ffv1_enc_ac_comp); + + if (fv->is_rgb) + GLSLD(ff_source_ffv1_enc_rgb_comp); + else + GLSLD(ff_source_ffv1_enc_comp); + + RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main", + &spv_opaque)); + RET(ff_vk_shader_link(&fv->s, shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd)); + +fail: + if (spv_opaque) + spv->free_shader(spv, &spv_opaque); + + return err; +} + +static int init_state_transition_data(AVCodecContext *avctx) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + + uint8_t *buf_mapped; + size_t buf_len = 512*sizeof(uint8_t); + + RET(ff_vk_create_buf(&fv->s, &fv->rangecoder_static_buf, + buf_len, + NULL, NULL, + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)); + RET(ff_vk_map_buffer(&fv->s, &fv->rangecoder_static_buf, + &buf_mapped, 0)); + + for (int i = 1; i < 256; i++) { + buf_mapped[256 + i] = fv->ctx.state_transition[i]; + buf_mapped[256 - i] = 256 - (int)fv->ctx.state_transition[i]; + } + + RET(ff_vk_unmap_buffer(&fv->s, &fv->rangecoder_static_buf, 1)); + + /* Update descriptors */ + RET(ff_vk_shader_update_desc_buffer(&fv->s, &fv->exec_pool.contexts[0], + &fv->setup, 0, 0, 0, + &fv->rangecoder_static_buf, + 0, fv->rangecoder_static_buf.size, + VK_FORMAT_UNDEFINED)); + RET(ff_vk_shader_update_desc_buffer(&fv->s, &fv->exec_pool.contexts[0], + &fv->enc, 0, 0, 0, + &fv->rangecoder_static_buf, + 0, fv->rangecoder_static_buf.size, + VK_FORMAT_UNDEFINED)); + +fail: + return err; +} + +static 
int init_quant_table_data(AVCodecContext *avctx) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + + int16_t *buf_mapped; + size_t buf_len = MAX_QUANT_TABLES* + MAX_CONTEXT_INPUTS* + MAX_QUANT_TABLE_SIZE*sizeof(int16_t); + + RET(ff_vk_create_buf(&fv->s, &fv->quant_buf, + buf_len, + NULL, NULL, + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)); + RET(ff_vk_map_buffer(&fv->s, &fv->quant_buf, (void *)&buf_mapped, 0)); + + memcpy(buf_mapped, fv->ctx.quant_tables, + sizeof(fv->ctx.quant_tables)); + + RET(ff_vk_unmap_buffer(&fv->s, &fv->quant_buf, 1)); + RET(ff_vk_shader_update_desc_buffer(&fv->s, &fv->exec_pool.contexts[0], + &fv->enc, 0, 1, 0, + &fv->quant_buf, + 0, fv->quant_buf.size, + VK_FORMAT_UNDEFINED)); + +fail: + return err; +} + +static int init_crc_table_data(AVCodecContext *avctx) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + + uint32_t *buf_mapped; + size_t buf_len = 256*sizeof(int32_t); + + RET(ff_vk_create_buf(&fv->s, &fv->crc_tab_buf, + buf_len, + NULL, NULL, + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)); + RET(ff_vk_map_buffer(&fv->s, &fv->crc_tab_buf, (void *)&buf_mapped, 0)); + + memcpy(buf_mapped, av_crc_get_table(AV_CRC_32_IEEE), buf_len); + + RET(ff_vk_unmap_buffer(&fv->s, &fv->crc_tab_buf, 1)); + RET(ff_vk_shader_update_desc_buffer(&fv->s, &fv->exec_pool.contexts[0], + &fv->enc, 0, 2, 0, + &fv->crc_tab_buf, + 0, fv->crc_tab_buf.size, + VK_FORMAT_UNDEFINED)); + +fail: + return err; +} + +static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFV1Context *f = &fv->ctx; + FFVkSPIRVCompiler *spv; + + if ((err = ff_ffv1_common_init(avctx)) < 0) + return err; + + if (f->ac == 1) + f->ac = AC_RANGE_CUSTOM_TAB; + + 
err = ff_ffv1_encode_setup_plane_info(avctx, avctx->sw_pix_fmt); + if (err < 0) + return err; + + /* Target version 3 by default */ + f->version = 3; + + err = ff_ffv1_encode_init(avctx); + if (err < 0) + return err; + + /* Rice coding did not support high bit depths */ + if (f->bits_per_raw_sample > (f->version > 3 ? 16 : 8)) { + if (f->ac == AC_GOLOMB_RICE) { + av_log(avctx, AV_LOG_WARNING, "bits_per_raw_sample > 8, " + "forcing range coder\n"); + f->ac = AC_RANGE_CUSTOM_TAB; + } + } + + if (f->version < 4 && avctx->gop_size > 1) { + av_log(avctx, AV_LOG_ERROR, "Using inter frames requires version 4 (-level 4)\n"); + return AVERROR_INVALIDDATA; + } + + if (f->version == 4 && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) { + av_log(avctx, AV_LOG_ERROR, "Version 4 is experimental and requires -strict -2\n"); + return AVERROR_INVALIDDATA; + } + + //if (fv->ctx.ac == AC_GOLOMB_RICE) { + if (0) { + int w_a = FFALIGN(avctx->width, LG_ALIGN_W); + int h_a = FFALIGN(avctx->height, LG_ALIGN_H); + int w_sl, h_sl; + + /* Pixels per line an invocation handles */ + int ppi = 0; + /* Chunk size */ + int chunks = 0; + + do { + if (ppi < 2) + ppi++; + chunks++; + w_sl = w_a / (LG_ALIGN_W*ppi); + h_sl = h_a / (LG_ALIGN_H*chunks); + } while (w_sl > MAX_SLICES / h_sl); + + av_log(avctx, AV_LOG_VERBOSE, "Slice config: %ix%i, %i total\n", + LG_ALIGN_W*ppi, LG_ALIGN_H*chunks, w_sl*h_sl); + av_log(avctx, AV_LOG_VERBOSE, "Horizontal slices: %i (%i pixels per invoc)\n", + w_sl, ppi); + av_log(avctx, AV_LOG_VERBOSE, "Vertical slices: %i (%i chunks)\n", + h_sl, chunks); + + f->num_h_slices = w_sl; + f->num_v_slices = h_sl; + + fv->ppi = ppi; + fv->chunks = chunks; + } else { + f->num_h_slices = fv->num_h_slices; + f->num_v_slices = fv->num_v_slices; + + if (f->num_h_slices <= 0) + f->num_h_slices = 32; + if (f->num_v_slices <= 0) + f->num_v_slices = 32; + + f->num_h_slices = FFMIN(f->num_h_slices, avctx->width); + f->num_v_slices = FFMIN(f->num_v_slices, avctx->height); + } + + 
if ((err = ff_ffv1_write_extradata(avctx)) < 0) + return err; + + if (f->version < 4) { + if (((f->chroma_h_shift > 0) && (avctx->width % (64 << f->chroma_h_shift))) || + ((f->chroma_v_shift > 0) && (avctx->height % (64 << f->chroma_v_shift)))) { + av_log(avctx, AV_LOG_ERROR, "Encoding frames with subsampling and unaligned " + "dimensions is only supported in version 4 (-level 4)\n"); + return AVERROR_PATCHWELCOME; + } + } + + if (fv->force_pcm) { + if (f->version < 4) { + av_log(avctx, AV_LOG_ERROR, "PCM coding only supported by version 4 (-level 4)\n"); + return AVERROR_INVALIDDATA; + } else if (f->ac != AC_RANGE_CUSTOM_TAB) { + av_log(avctx, AV_LOG_ERROR, "PCM coding requires range coding\n"); + return AVERROR_INVALIDDATA; + } + } + + /* Init Vulkan */ + err = ff_vk_init(&fv->s, avctx, NULL, avctx->hw_frames_ctx); + if (err < 0) + return err; + + err = ff_vk_qf_init(&fv->s, &fv->qf, VK_QUEUE_COMPUTE_BIT); + if (err < 0) { + av_log(avctx, AV_LOG_ERROR, "Device has no compute queues!\n"); + return err; + } + + err = ff_vk_exec_pool_init(&fv->s, &fv->qf, &fv->exec_pool, + fv->qf.nb_queues*4, + 0, 0, 0, NULL); + if (err < 0) + return err; + + spv = ff_vk_spirv_init(); + if (!spv) { + av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n"); + return AVERROR_EXTERNAL; + } + + /* Detect the special RGB coding mode */ + fv->is_rgb = !(f->colorspace == 0 && avctx->sw_pix_fmt != AV_PIX_FMT_YA8) && + !(avctx->sw_pix_fmt == AV_PIX_FMT_YA8); + + /* bits_per_raw_sample use regular unsigned representation, + * but in higher bit depths, the data is casted to int16_t */ + fv->rep_fmt = FF_VK_REP_UINT; + if (!fv->is_rgb && f->bits_per_raw_sample > 8) + fv->rep_fmt = FF_VK_REP_INT; + + /* Init setup shader */ + err = init_setup_shader(avctx, spv); + if (err < 0) { + spv->uninit(&spv); + return err; + } + + /* Init reset shader */ + err = init_reset_shader(avctx, spv); + if (err < 0) { + spv->uninit(&spv); + return err; + } + + /* Init RCT shader */ + if (fv->is_rgb) 
{ + err = init_rct_shader(avctx, spv); + if (err < 0) { + spv->uninit(&spv); + return err; + } + } + + /* Encode shader */ + err = init_encode_shader(avctx, spv); + if (err < 0) { + spv->uninit(&spv); + return err; + } + + spv->uninit(&spv); + + /* Range coder data */ + err = init_state_transition_data(avctx); + if (err < 0) + return err; + + /* Quantization table data */ + err = init_quant_table_data(avctx); + if (err < 0) + return err; + + /* CRC table buffer */ + err = init_crc_table_data(avctx); + if (err < 0) + return err; + + return 0; +} + +static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx) +{ + VulkanEncodeFFv1Context *fv = avctx->priv_data; + + ff_vk_exec_pool_free(&fv->s, &fv->exec_pool); + + ff_vk_shader_free(&fv->s, &fv->enc); + ff_vk_shader_free(&fv->s, &fv->rct); + ff_vk_shader_free(&fv->s, &fv->reset); + ff_vk_shader_free(&fv->s, &fv->setup); + + av_buffer_unref(&fv->intermediate_frames_ref); + + av_buffer_pool_uninit(&fv->results_data_pool); + + av_buffer_pool_uninit(&fv->out_data_pool); + av_buffer_pool_uninit(&fv->tmp_data_pool); + + av_buffer_unref(&fv->keyframe_slice_data_ref); + av_buffer_pool_uninit(&fv->slice_data_pool); + + ff_vk_free_buf(&fv->s, &fv->quant_buf); + ff_vk_free_buf(&fv->s, &fv->rangecoder_static_buf); + ff_vk_free_buf(&fv->s, &fv->crc_tab_buf); + + ff_vk_uninit(&fv->s); + + return 0; +} + +#define OFFSET(x) offsetof(VulkanEncodeFFv1Context, x) +#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM +static const AVOption vulkan_encode_ffv1_options[] = { + { "slicecrc", "Protect slices with CRCs", OFFSET(ctx.ec), AV_OPT_TYPE_BOOL, + { .i64 = -1 }, -1, 1, VE }, + { "context", "Context model", OFFSET(ctx.context_model), AV_OPT_TYPE_INT, + { .i64 = 0 }, 0, 1, VE }, + { "coder", "Coder type", OFFSET(ctx.ac), AV_OPT_TYPE_INT, + { .i64 = AC_RANGE_CUSTOM_TAB }, -2, 2, VE, .unit = "coder" }, + { "rice", "Golomb rice", 0, AV_OPT_TYPE_CONST, + { .i64 = AC_GOLOMB_RICE }, INT_MIN, INT_MAX, VE, .unit = "coder" }, + 
{ "range_tab", "Range with custom table", 0, AV_OPT_TYPE_CONST, + { .i64 = AC_RANGE_CUSTOM_TAB }, INT_MIN, INT_MAX, VE, .unit = "coder" }, + { "qtable", "Quantization table", OFFSET(ctx.qtable), AV_OPT_TYPE_INT, + { .i64 = -1 }, -1, 2, VE }, + + { "slices_h", "Number of horizontal slices", OFFSET(num_h_slices), AV_OPT_TYPE_INT, + { .i64 = -1 }, -1, 32, VE }, + { "slices_v", "Number of vertical slices", OFFSET(num_v_slices), AV_OPT_TYPE_INT, + { .i64 = -1 }, -1, 32, VE }, + + { "force_pcm", "Code all slices with no prediction", OFFSET(force_pcm), AV_OPT_TYPE_BOOL, + { .i64 = 0 }, 0, 1, VE }, + + { NULL } +}; + +static const FFCodecDefault vulkan_encode_ffv1_defaults[] = { + { "g", "1" }, + { NULL }, +}; + +static const AVClass vulkan_encode_ffv1_class = { + .class_name = "ffv1_vulkan", + .item_name = av_default_item_name, + .option = vulkan_encode_ffv1_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +const AVCodecHWConfigInternal *const vulkan_encode_ffv1_hw_configs[] = { + HW_CONFIG_ENCODER_FRAMES(VULKAN, VULKAN), + NULL, +}; + +const FFCodec ff_ffv1_vulkan_encoder = { + .p.name = "ffv1_vulkan", + CODEC_LONG_NAME("FFmpeg video codec #1 (Vulkan)"), + .p.type = AVMEDIA_TYPE_VIDEO, + .p.id = AV_CODEC_ID_FFV1, + .priv_data_size = sizeof(VulkanEncodeFFv1Context), + .init = &vulkan_encode_ffv1_init, + FF_CODEC_ENCODE_CB(vulkan_encode_ffv1_frame), + .close = &vulkan_encode_ffv1_close, + .p.priv_class = &vulkan_encode_ffv1_class, + .p.capabilities = AV_CODEC_CAP_DELAY | + AV_CODEC_CAP_HARDWARE | + AV_CODEC_CAP_DR1 | + AV_CODEC_CAP_ENCODER_FLUSH | + AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE, + .caps_internal = FF_CODEC_CAP_INIT_CLEANUP | FF_CODEC_CAP_EOF_FLUSH, + .defaults = vulkan_encode_ffv1_defaults, + .p.pix_fmts = (const enum AVPixelFormat[]) { + AV_PIX_FMT_VULKAN, + AV_PIX_FMT_NONE, + }, + .hw_configs = vulkan_encode_ffv1_hw_configs, + .p.wrapper_name = "vulkan", +}; diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index 96b4de0092..351332ee44 
100644 --- a/libavcodec/vulkan/Makefile +++ b/libavcodec/vulkan/Makefile @@ -3,6 +3,14 @@ GEN_CLEANSUFFIXES = *.o *.c *.d clean:: $(RM) $(GEN_CLEANSUFFIXES:%=libavcodec/vulkan/%) +OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/common.o \ + vulkan/rangecoder.o vulkan/ffv1_vlc.o \ + vulkan/ffv1_common.o vulkan/ffv1_reset.o \ + vulkan/ffv1_enc_common.o \ + vulkan/ffv1_enc_rct.o vulkan/ffv1_enc_setup.o \ + vulkan/ffv1_enc_vlc.o vulkan/ffv1_enc_ac.o \ + vulkan/ffv1_enc.o vulkan/ffv1_enc_rgb.o + VULKAN = $(subst $(SRC_PATH)/,,$(wildcard $(SRC_PATH)/libavcodec/vulkan/*.comp)) .SECONDARY: $(VULKAN:.comp=.c) libavcodec/vulkan/%.c: TAG = VULKAN diff --git a/libavcodec/vulkan/common.comp b/libavcodec/vulkan/common.comp new file mode 100644 index 0000000000..deca5d63b1 --- /dev/null +++ b/libavcodec/vulkan/common.comp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2024 Lynne + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Buffer-reference views of a raw address; each exposes one value of the named width as .v */
+layout(buffer_reference, buffer_reference_align = 1) buffer u8buf {
+    uint8_t v;
+};
+
+layout(buffer_reference, buffer_reference_align = 2) buffer u16buf {
+    uint16_t v;
+};
+
+layout(buffer_reference, buffer_reference_align = 4) buffer u32buf {
+    uint32_t v;
+};
+
+layout(buffer_reference, buffer_reference_align = 8) buffer u64buf {
+    uint64_t v;
+};
+/* Reinterpret address b advanced by l bytes as a buffer reference of the given type */
+#define OFFBUF(type, b, l) \
+    type(uint64_t(b) + uint64_t(l))
+/* Keep only the low p bits of a */
+#define zero_extend(a, p) \
+    ((a) & ((1 << (p)) - 1))
+/* Extract the low `bits` bits of val using bitfieldExtract() */
+#define sign_extend(val, bits) \
+    bitfieldExtract(val, 0, bits)
+/* Alias of sign_extend() */
+#define fold(diff, bits) \
+    sign_extend(diff, bits)
+/* Median of a, b and c */
+#define mid_pred(a, b, c) \
+    max(min((a), (b)), min(max((a), (b)), (c)))
+
+/* Round src up to the next multiple of a. TODO: optimize */
+uint align(uint src, uint a)
+{
+    uint res = src % a;
+    if (res == 0)
+        return src;
+    return src + a - res;
+}
+
+/* 64-bit variant of align(). TODO: optimize */
+uint64_t align64(uint64_t src, uint64_t a)
+{
+    uint64_t res = src % a;
+    if (res == 0)
+        return src;
+    return src + a - res;
+}
+/* Reverse the byte order of a 32-bit value (bytes 3,2,1,0), matching the MSB-first byte stores below */
+#define reverse4(src) \
+    (pack32(unpack8(uint32_t(src)).wzyx))
+/* Reverse the byte order of a 64-bit value: byte-swap each half, then swap the halves */
+uint64_t reverse8(uint64_t src)
+{
+    u32vec2 tmp = unpack32(src);
+    tmp.x = reverse4(tmp.x);
+    tmp.y = reverse4(tmp.y);
+    return pack64(tmp.yx);
+}
+/* Bit-writer word configuration: 32-bit words when PB_32 is defined, 64-bit words otherwise */
+#ifdef PB_32
+#define BIT_BUF_TYPE uint32_t
+#define BUF_TYPE u32buf
+#define BUF_REVERSE(src) reverse4(src)
+#define BUF_BITS uint8_t(32)
+#define BUF_BYTES uint8_t(4)
+#define BYTE_EXTRACT(src, byte_off) \
+    (uint8_t(bitfieldExtract((src), ((byte_off) << 3), 8)))
+#else
+#define BIT_BUF_TYPE uint64_t
+#define BUF_TYPE u64buf
+#define BUF_REVERSE(src) reverse8(src)
+#define BUF_BITS uint8_t(64)
+#define BUF_BYTES uint8_t(8)
+#define BYTE_EXTRACT(src, byte_off) \
+    (uint8_t(((src) >> ((byte_off) << 3)) & 0xFF))
+#endif
+/* Bit-writer state; buf_start and buf hold raw 64-bit addresses */
+struct PutBitContext {
+    uint64_t buf_start; /* address the writer was initialized with */
+    uint64_t buf; /* address of the next byte to be stored */
+
+    BIT_BUF_TYPE bit_buf; /* bits accumulated but not yet stored */
+    uint8_t bit_left; /* free bits remaining in bit_buf */
+};
+/* Append n bits of value (OR-ed in unmasked, so presumably it must fit in n bits); a full word is stored MSB-first */
+void put_bits(inout PutBitContext pb, const uint32_t n, uint32_t value)
+{
+    if (n < pb.bit_left) {
+        pb.bit_buf = (pb.bit_buf << n) | value;
+        pb.bit_left -= uint8_t(n);
+    } else {
+        pb.bit_buf <<= pb.bit_left;
+        pb.bit_buf |= (value >> (n - pb.bit_left));
+
+#ifdef PB_UNALIGNED
+        u8buf bs = u8buf(pb.buf);
+        [[unroll]]
+        for (uint8_t i = uint8_t(0); i < BUF_BYTES; i++)
+            bs[i].v = BYTE_EXTRACT(pb.bit_buf, BUF_BYTES - uint8_t(1) - i);
+#else
+#ifdef DEBUG
+        if ((pb.buf % BUF_BYTES) != 0)
+            debugPrintfEXT("put_bits buffer is not aligned!");
+#endif
+
+        BUF_TYPE bs = BUF_TYPE(pb.buf);
+        bs.v = BUF_REVERSE(pb.bit_buf);
+#endif
+        pb.buf = uint64_t(bs) + BUF_BYTES;
+
+        pb.bit_left += BUF_BITS - uint8_t(n);
+        pb.bit_buf = value;
+    }
+}
+/* Store the bits still pending in bit_buf, zero-padded to a whole byte; returns bytes written since buf_start */
+uint32_t flush_put_bits(inout PutBitContext pb)
+{
+    /* Align bits to MSBs */
+    if (pb.bit_left < BUF_BITS)
+        pb.bit_buf <<= pb.bit_left;
+
+    if (pb.bit_left < BUF_BITS) {
+        uint to_write = ((BUF_BITS - pb.bit_left - 1) >> 3) + 1; /* ceil(pending bits / 8) */
+
+        u8buf bs = u8buf(pb.buf);
+        for (int i = 0; i < to_write; i++)
+            bs[i].v = BYTE_EXTRACT(pb.bit_buf, BUF_BYTES - uint8_t(1) - i);
+        pb.buf = uint64_t(bs) + to_write; /* advance only past the bytes actually stored, so the returned length is exact */
+    }
+
+    pb.bit_left = BUF_BITS;
+    pb.bit_buf = 0x0;
+
+    return uint32_t(pb.buf - pb.buf_start);
+}
+/* Start writing at address data with an empty bit buffer; len is not used by this function */
+void init_put_bits(out PutBitContext pb, u8buf data, uint64_t len)
+{
+    pb.buf_start = uint64_t(data);
+    pb.buf = uint64_t(data);
+
+    pb.bit_buf = 0;
+    pb.bit_left = BUF_BITS;
+}
+/* Number of bits written so far, including the ones still pending in bit_buf */
+uint64_t put_bits_count(in PutBitContext pb)
+{
+    return (pb.buf - pb.buf_start)*8 + BUF_BITS - pb.bit_left;
+}
diff --git a/libavcodec/vulkan/ffv1_common.comp b/libavcodec/vulkan/ffv1_common.comp
new file mode 100644
index 0000000000..5b4a882367
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_common.comp
@@ -0,0 +1,74 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Per-slice coder state and geometry; the shaders index these as slice_ctx[slice_idx] */
+struct SliceContext {
+    RangeCoder c;
+
+#ifdef GOLOMB
+    PutBitContext pb; /* 8*8 bytes */
+#endif
+
+    ivec2 slice_dim;
+    ivec2 slice_pos;
+    ivec2 slice_rct_coef;
+
+    uint hdr_len; // only used for golomb
+    int slice_coding_mode;
+};
+
+/* Median of L, top[1] and L + top[1] - top[0]. Arguments: -1, { -1, 0 } */
+int predict(int L, ivec2 top)
+{
+    return mid_pred(L, L + top[1] - top[0], top[1]);
+}
+
+/* Sum of quant_table lookups on neighbour differences. Arguments: { -2, -1 }, { -1, 0, 1 }, 0 */
+int get_context(VTYPE2 cur_l, VTYPE3 top_l, TYPE top2, uint8_t quant_table_idx)
+{
+    const int LT = top_l[0]; /* -1 */
+    const int T = top_l[1]; /* 0 */
+    const int RT = top_l[2]; /* 1 */
+    const int L = cur_l[1]; /* -1 */
+
+    int base = quant_table[quant_table_idx][0][(L - LT) & MAX_QUANT_TABLE_MASK] +
+               quant_table[quant_table_idx][1][(LT - T) & MAX_QUANT_TABLE_MASK] +
+               quant_table[quant_table_idx][2][(T - RT) & MAX_QUANT_TABLE_MASK];
+    /* Tables 3 and 4 are skipped when entry 127 of both is zero */
+    if ((quant_table[quant_table_idx][3][127] == 0) &&
+        (quant_table[quant_table_idx][4][127] == 0))
+        return base;
+
+    const int TT = top2; /* -2 */
+    const int LL = cur_l[0]; /* -2 */
+    return base +
+           quant_table[quant_table_idx][3][(LL - L) & MAX_QUANT_TABLE_MASK] +
+           quant_table[quant_table_idx][4][(TT - T) & MAX_QUANT_TABLE_MASK];
+}
+
+const uint32_t log2_run[41] = {
+    0, 0, 0, 0, 1, 1, 1, 1,
+    2, 2, 2, 2, 3, 3, 3, 3,
+    4, 4, 5, 5, 6, 6, 7, 7,
+    8, 9, 10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23,
+    24,
+};
diff --git a/libavcodec/vulkan/ffv1_enc.comp b/libavcodec/vulkan/ffv1_enc.comp
new file mode 100644
index 0000000000..880d3a37f0
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_enc.comp
@@ -0,0 +1,67 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Encode all planes of one slice row by row, then finalize it; planes 1 and 2 use the chroma-shifted height */
+void encode_slice(inout SliceContext sc, const uint slice_idx)
+{
+    int bits = bits_per_raw_sample;
+
+#ifndef GOLOMB
+    if (sc.slice_coding_mode == 1) { /* raw PCM lines, no prediction or adaptive state */
+        for (int p = 0; p < planes; p++) {
+
+            int h = sc.slice_dim.y;
+            if (p > 0 && p < 3)
+                h >>= chroma_shift.y;
+
+            for (int y = 0; y < h; y++)
+                encode_line_pcm(sc, y, p, 0, bits);
+        }
+    } else
+#endif
+    {
+        uint64_t slice_state_off = uint64_t(slice_state) +
+                                   slice_idx*plane_state_size*codec_planes;
+
+        for (int p = 0; p < planes; p++) {
+            int run_index = 0;
+
+            int h = sc.slice_dim.y;
+            if (p > 0 && p < 3)
+                h >>= chroma_shift.y;
+
+            for (int y = 0; y < h; y++)
+                encode_line(sc, slice_state_off, y, p, 0, bits, run_index);
+
+            /* For the second chroma plane, reuse the first plane's state */
+            if (p != 1)
+                slice_state_off += plane_state_size;
+        }
+    }
+
+    finalize_slice(sc, slice_idx);
+}
+/* Entry point: the slice index is the row-major index of the workgroup in the dispatch grid */
+void main(void)
+{
+    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
+    encode_slice(slice_ctx[slice_idx], slice_idx);
+}
diff --git a/libavcodec/vulkan/ffv1_enc_ac.comp b/libavcodec/vulkan/ffv1_enc_ac.comp
new file mode 100644
index 0000000000..0bbf58c5dd
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_enc_ac.comp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2024 Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Code one binary decision with the state at address `state`; renormalize once range drops below 0x100 */
+void put_rac(inout RangeCoder c, uint64_t state, bool bit)
+{
+    put_rac_norenorm(c, state, bit);
+    if (c.range < 0x100)
+        renorm_encoder(c);
+}
+
+/* Codes v as: zero flag, unary exponent, mantissa bits, sign. Note - only handles signed values */
+void put_symbol(inout RangeCoder c, uint64_t state, int v)
+{
+    bool is_nil = (v == 0);
+    put_rac(c, state, is_nil);
+    if (is_nil)
+        return;
+
+    const int a = abs(v);
+    const int e = findMSB(a);
+
+    state += 1; /* exponent states: offsets 1..10 from the original state address */
+    for (int i = 0; i < e; i++)
+        put_rac(c, state + min(i, 9), true);
+    put_rac(c, state + min(e, 9), false);
+
+    state += 21; /* mantissa states: offsets 22..31 from the original state address */
+    for (int i = e - 1; i >= 0; i--)
+        put_rac(c, state + min(i, 9), bool(bitfieldExtract(a, i, 1)));
+    /* sign state: offset 11 + min(e, 10) from the original state address */
+    put_rac(c, state - 11 + min(e, 10), v < 0);
+}
+/* Write row y of plane p, component comp, as raw samples: `bits` bits each, most significant bit first */
+void encode_line_pcm(inout SliceContext sc, int y, int p, int comp,
+                     int bits)
+{
+    ivec2 sp = sc.slice_pos;
+    int w = sc.slice_dim.x;
+    if (p > 0 && p < 3) {
+        w >>= chroma_shift.x;
+        sp >>= chroma_shift;
+    }
+
+    for (int x = 0; x < w; x++) {
+        uint v = imageLoad(src[p], (sp + ivec2(x, y)))[comp];
+        for (int i = (bits - 1); i >= 0; i--)
+            put_rac_equi(sc.c, bool(bitfieldExtract(v, i, 1)));
+    }
+}
+/* Range-code row y: each sample's residual uses the state block at state + CONTEXT_SIZE*context; run_index is unused here */
+void encode_line(inout SliceContext sc, uint64_t state,
+                 int y, int p, int comp, int bits, const int run_index)
+{
+    ivec2 sp = sc.slice_pos;
+
+    int w = sc.slice_dim.x;
+    if (p > 0 && p < 3) {
+        w >>= chroma_shift.x;
+        sp >>= chroma_shift;
+    }
+
+    for (int x = 0; x < w; x++) {
+        const ivec2 d = get_diff(sp + ivec2(x, y), ivec2(x, y), p, comp, w, bits);
+        put_symbol(sc.c, state + CONTEXT_SIZE*d[0], d[1]);
+    }
+}
diff --git a/libavcodec/vulkan/ffv1_enc_common.comp b/libavcodec/vulkan/ffv1_enc_common.comp
new file mode 100644
index 0000000000..759882f5c9
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_enc_common.comp
@@ -0,0 +1,101 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Returns ivec2(context, folded residual) for the sample at image position pos; off is its position inside the slice, sw the row width */
+ivec2 get_diff(ivec2 pos, ivec2 off, int p, int comp, int sw, int bits)
+{
+    const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0);
+    const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0);
+    /* The border offsets above redirect a left-hand neighbour read one column right and one row up at the slice's left edge */
+    TYPE top2 = TYPE(0);
+    if (off.y > 1)
+        top2 = TYPE(imageLoad(src[p], pos + ivec2(0, -2))[comp]);
+
+    VTYPE3 top = VTYPE3(TYPE(0),
+                        TYPE(0),
+                        TYPE(0));
+    if (off.y > 0 && off != ivec2(0, 1))
+        top[0] = TYPE(imageLoad(src[p], pos + ivec2(-1, -1) + yoff_border1)[comp]);
+    if (off.y > 0) {
+        top[1] = TYPE(imageLoad(src[p], pos + ivec2(0, -1))[comp]);
+        top[2] = TYPE(imageLoad(src[p], pos + ivec2(min(1, sw - off.x - 1), -1))[comp]);
+    }
+
+    VTYPE3 cur = VTYPE3(TYPE(0),
+                        TYPE(0),
+                        imageLoad(src[p], pos)[comp]);
+    if (off.x > 0 && off != ivec2(1, 0))
+        cur[0] = TYPE(imageLoad(src[p], pos + ivec2(-2, 0) + yoff_border2)[comp]);
+    if (off != ivec2(0, 0))
+        cur[1] = TYPE(imageLoad(src[p], pos + ivec2(-1, 0) + yoff_border1)[comp]);
+
+    /* context, diff */
+    ivec2 d = ivec2(get_context(VTYPE2(cur), top, top2, context_model),
+                    cur[2] - predict(cur[1], VTYPE2(top)));
+
+    if (d[0] < 0)
+        d = -d; /* a negative context negates both the context and the residual */
+
+    d[1] = fold(d[1], bits);
+
+    return d;
+}
+/* Terminate the slice's bitstream, append its length (and a CRC when ec != 0), then store size and offset in slice_results */
+void finalize_slice(inout SliceContext sc, const uint slice_idx)
+{
+#ifdef GOLOMB
+    uint32_t enc_len = sc.hdr_len + flush_put_bits(sc.pb);
+#else
+    uint32_t enc_len = rac_terminate(sc.c);
+#endif
+
+    u8buf bs = u8buf(sc.c.bytestream_start);
+
+    /* Append slice length */
+    u8vec4 enc_len_p = unpack8(enc_len);
+    bs[enc_len + 0].v = enc_len_p.z; /* three bytes, most significant first */
+    bs[enc_len + 1].v = enc_len_p.y;
+    bs[enc_len + 2].v = enc_len_p.x;
+    enc_len += 3;
+
+    /* Calculate and write CRC */
+    if (ec != 0) {
+        bs[enc_len].v = uint8_t(0);
+        enc_len++;
+
+        uint32_t crc = crcref;
+        for (int i = 0; i < enc_len; i++)
+            crc = crc_ieee[(crc & 0xFF) ^ uint32_t(bs[i].v)] ^ (crc >> 8);
+
+        if (crcref != 0x00000000)
+            crc ^= 0x8CD88196;
+
+        u8vec4 crc_p = unpack8(crc);
+        bs[enc_len + 0].v = crc_p.x; /* four bytes, least significant first */
+        bs[enc_len + 1].v = crc_p.y;
+        bs[enc_len + 2].v = crc_p.z;
+        bs[enc_len + 3].v = crc_p.w;
+        enc_len += 4;
+    }
+
+    slice_results[slice_idx*2 + 0] = enc_len;
+    slice_results[slice_idx*2 + 1] = uint32_t(uint64_t(bs) - uint64_t(out_data));
+}
+diff
--git a/libavcodec/vulkan/ffv1_enc_rct.comp b/libavcodec/vulkan/ffv1_enc_rct.comp
new file mode 100644
index 0000000000..ad4cbf805f
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_enc_rct.comp
@@ -0,0 +1,82 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Load the pixel at pos: straight from src[0] when planar_rgb == 0, otherwise one component per plane, then swizzled */
+ivec4 load_components(ivec2 pos)
+{
+    if (planar_rgb == 0)
+        return ivec4(imageLoad(src[0], pos));
+
+    ivec4 pix;
+    for (int i = 0; i < (3 + transparency); i++)
+        pix[i] = int(imageLoad(src[i], pos)[0]);
+
+    /* Swizzle out the difference */
+    if (transparency > 0)
+        return pix.brga;
+    return pix.bgra;
+}
+/* Copy one pixel from the source to dst[0] unchanged */
+void bypass_sample(ivec2 pos)
+{
+    imageStore(dst[0], pos, load_components(pos));
+}
+/* Copy the slice's pixels; each invocation starts at its local ID and steps by the workgroup size */
+void bypass_block(in SliceContext sc)
+{
+    ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos;
+    ivec2 end = sc.slice_pos + sc.slice_dim;
+    for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y)
+        for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x)
+            bypass_sample(ivec2(x, y));
+}
+/* Colour-transform one pixel: b and r become differences against g, g gains a weighted sum of them, offset is added to b and r */
+void transform_sample(ivec2 pos, ivec2 rct_coef)
+{
+    ivec4 pix = load_components(pos);
+    pix.b -= pix.g;
+    pix.r -= pix.g;
+    pix.g += (pix.r*rct_coef.x + pix.b*rct_coef.y) >> 2;
+    pix.b += offset;
+    pix.r += offset;
+    imageStore(dst[0], pos, pix);
+}
+/* Transform the slice's pixels with its slice_rct_coef, using the same traversal as bypass_block() */
+void transform_block(in SliceContext sc)
+{
+    const ivec2 rct_coef = sc.slice_rct_coef;
+    const ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos;
+    const ivec2 end = sc.slice_pos + sc.slice_dim;
+
+    for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y)
+        for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x)
+            transform_sample(ivec2(x, y), rct_coef);
+}
+/* Entry point: one workgroup per slice; slices with slice_coding_mode == 1 are copied, all others transformed */
+void main()
+{
+    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
+
+    if (slice_ctx[slice_idx].slice_coding_mode == 1)
+        bypass_block(slice_ctx[slice_idx]);
+    else
+        transform_block(slice_ctx[slice_idx]);
+}
diff --git a/libavcodec/vulkan/ffv1_enc_rgb.comp b/libavcodec/vulkan/ffv1_enc_rgb.comp
new file mode 100644
index 0000000000..c176d94e8b
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_enc_rgb.comp
@@ -0,0 +1,83 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Encode every line of one RGB(A) slice, components in the order 1, 2, 0 (then 3 with alpha), and finalize it. */
+void encode_slice_rgb(inout SliceContext sc, const uint slice_idx)
+{
+    /* Samples are coded with bits_per_raw_sample bits in slice_coding_mode 1
+     * and with one extra bit in every other mode. */
+    int bits = bits_per_raw_sample + int(sc.slice_coding_mode != 1);
+
+    int run_index = 0; /* passed by reference to every encode_line() call of the slice */
+
+#ifndef GOLOMB
+    if (sc.slice_coding_mode == 1) { /* raw (PCM) lines; only built without GOLOMB */
+        if (transparency == 1) {
+            for (int y = 0; y < sc.slice_dim.y; y++) {
+                encode_line_pcm(sc, y, 0, 1, bits);
+                encode_line_pcm(sc, y, 0, 2, bits);
+                encode_line_pcm(sc, y, 0, 0, bits);
+                encode_line_pcm(sc, y, 0, 3, bits);
+            }
+        } else {
+            for (int y = 0; y < sc.slice_dim.y; y++) {
+                encode_line_pcm(sc, y, 0, 1, bits);
+                encode_line_pcm(sc, y, 0, 2, bits);
+                encode_line_pcm(sc, y, 0, 0, bits);
+            }
+        }
+    } else
+#endif
+    {
+        uint64_t slice_state_off = uint64_t(slice_state) +
+                                   slice_idx*plane_state_size*codec_planes;
+        /* Components 2 and 0 share state plane 1; component 1 uses plane 0 and component 3 plane 2. */
+        if (transparency == 1) {
+            for (int y = 0; y < sc.slice_dim.y; y++) {
+                encode_line(sc, slice_state_off + plane_state_size*0,
+                            y, 0, 1, bits, run_index);
+                encode_line(sc, slice_state_off + plane_state_size*1,
+                            y, 0, 2, bits, run_index);
+                encode_line(sc, slice_state_off + plane_state_size*1,
+                            y, 0, 0, bits, run_index);
+                encode_line(sc, slice_state_off + plane_state_size*2,
+                            y, 0, 3, bits, run_index);
+            }
+        } else {
+            for (int y = 0; y < sc.slice_dim.y; y++) {
+                encode_line(sc, slice_state_off + plane_state_size*0,
+                            y, 0, 1, bits, run_index);
+                encode_line(sc, slice_state_off + plane_state_size*1,
+                            y, 0, 2, bits, run_index);
+                encode_line(sc, slice_state_off + plane_state_size*1,
+                            y, 0, 0, bits, run_index);
+            }
+        }
+    }
+
+    finalize_slice(sc, slice_idx);
+}
+/* One workgroup per slice. */
+void main(void)
+{
+    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
+    encode_slice_rgb(slice_ctx[slice_idx], slice_idx);
+}
diff --git
a/libavcodec/vulkan/ffv1_enc_setup.comp b/libavcodec/vulkan/ffv1_enc_setup.comp
new file mode 100644
index 0000000000..d58050f281
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_enc_setup.comp
@@ -0,0 +1,151 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Return where slice number sx (of num_h_slices) starts along a dimension of size width; used for both x and y. */
+uint slice_coord(uint width, uint sx, uint num_h_slices, uint chroma_shift)
+{
+    uint mpw = 1 << chroma_shift;
+    uint awidth = align(width, mpw);
+    /* Versions before 4.3 split the dimension evenly, without alignment. */
+    if ((version < 4) || ((version == 4) && (micro_version < 3)))
+        return width * sx / num_h_slices;
+    /* Otherwise round the boundary to the nearest multiple of mpw; a boundary at awidth is clamped to width. */
+    sx = (2 * awidth * sx + num_h_slices * mpw) / (2 * num_h_slices * mpw) * mpw;
+    if (sx == awidth)
+        sx = width;
+
+    return sx;
+}
+/* Compute this workgroup's slice rectangle, set the RCT coefficients to (1, 1) and start its range coder. */
+void init_slice(out SliceContext sc, const uint slice_idx)
+{
+    /* Set coordinates */
+    uvec2 img_size = imageSize(src[0]);
+    uint sxs = slice_coord(img_size.x, gl_WorkGroupID.x + 0,
+                           gl_NumWorkGroups.x, chroma_shift.x);
+    uint sxe = slice_coord(img_size.x, gl_WorkGroupID.x + 1,
+                           gl_NumWorkGroups.x, chroma_shift.x);
+    uint sys = slice_coord(img_size.y, gl_WorkGroupID.y + 0,
+                           gl_NumWorkGroups.y, chroma_shift.y);
+    uint sye = slice_coord(img_size.y, gl_WorkGroupID.y + 1,
+                           gl_NumWorkGroups.y, chroma_shift.y);
+    /* NOTE(review): sc is `out`, but only slice_pos, slice_dim, slice_rct_coef and c are assigned here - confirm the other members. */
+    sc.slice_pos = ivec2(sxs, sys);
+    sc.slice_dim = ivec2(sxe - sxs, sye - sys);
+    sc.slice_rct_coef = ivec2(1, 1);
+    /* The slice's output starts slice_idx * slice_size_max into out_data. */
+    rac_init(sc.c,
+             OFFBUF(u8buf, out_data, slice_idx * slice_size_max),
+             slice_size_max);
+}
+/* Code one bit with the adaptive state byte at address state, using the full renorm once range < 0x100. */
+void put_rac_full(inout RangeCoder c, uint64_t state, bool bit)
+{
+    put_rac_norenorm(c, state, bit);
+    if (c.range < 0x100)
+        renorm_encoder_full(c);
+}
+/* Range-code unsigned v: an is-zero flag, then findMSB(v) in unary, then the bits below the MSB. */
+void put_symbol_unsigned(inout RangeCoder c, uint64_t state, uint v)
+{
+    bool is_nil = (v == 0);
+    put_rac_full(c, state, is_nil); /* state[0]: 1 means v == 0 */
+    if (is_nil)
+        return;
+
+    const int e = findMSB(v);
+    /* Unary exponent: e one-bits and a terminating zero, in states state[1 + min(i, 9)]. */
+    state += 1;
+    for (int i = 0; i < e; i++)
+        put_rac_full(c, state + min(i, 9), true);
+    put_rac_full(c, state + min(e, 9), false);
+    /* Bits below the MSB, most significant first, in states state[22 + min(i, 9)]. */
+    state += 21;
+    for (int i = e - 1; i >= 0; i--)
+        put_rac_full(c, state + min(i, 9), bool(bitfieldExtract(v, i, 1)));
+}
+/* Reset the CONTEXT_SIZE state bytes at `state` to 128 and range-code the slice header fields. */
+void write_slice_header(inout SliceContext sc, uint64_t state)
+{
+    u8buf sb = u8buf(state);
+
+    [[unroll]]
+    for (int i = 0; i < CONTEXT_SIZE; i++)
+        sb[i].v = uint8_t(128);
+    /* Slice column and row, followed by two literal zeros. */
+    put_symbol_unsigned(sc.c, state, gl_WorkGroupID.x);
+    put_symbol_unsigned(sc.c, state, gl_WorkGroupID.y);
+    put_symbol_unsigned(sc.c, state, 0); /* NOTE(review): presumably slice width - 1 in slice units - confirm */
+    put_symbol_unsigned(sc.c, state, 0); /* NOTE(review): presumably slice height - 1 in slice units - confirm */
+    /* The same context_model value is written once per plane. */
+    for (int i = 0; i < codec_planes; i++)
+        put_symbol_unsigned(sc.c, state, context_model);
+
+    put_symbol_unsigned(sc.c, state, pic_mode);
+    put_symbol_unsigned(sc.c, state, sar.x);
+    put_symbol_unsigned(sc.c, state, sar.y);
+    /* Fields that only exist from version 4 on. */
+    if (version >= 4) {
+        put_rac_full(sc.c, state, sc.slice_coding_mode == 1); /* flag bit, set exactly when the mode is 1 */
+        put_symbol_unsigned(sc.c, state, sc.slice_coding_mode);
+        if (sc.slice_coding_mode != 1 && colorspace == 1) {
+            put_symbol_unsigned(sc.c, state, sc.slice_rct_coef.y); /* .y is written before .x */
+            put_symbol_unsigned(sc.c, state, sc.slice_rct_coef.x);
+        }
+    }
+}
+/* Set the first state byte to 128 and code the key_frame flag; main() calls this for slice 0 only. */
+void write_frame_header(inout SliceContext sc, uint64_t state)
+{
+    u8buf sb = u8buf(state);
+    sb.v = uint8_t(128);
+    put_rac_full(sc.c, state, bool(key_frame));
+}
+/* GOLOMB builds: close the range-coded header and start a bit writer right behind it. */
+#ifdef GOLOMB
+void init_golomb(inout SliceContext sc)
+{
+    sc.hdr_len = rac_terminate(sc.c); /* header size in bytes */
+    init_put_bits(sc.pb,
+                  OFFBUF(u8buf, sc.c.bytestream_start, sc.hdr_len),
+                  slice_size_max - sc.hdr_len);
+}
+#endif
+/* One workgroup per slice: initialise the slice context and write its header(s). */
+void main(void)
+{
+    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
+
+    /* Write slice data */
+    uint64_t scratch_state = uint64_t(scratch_data) + slice_idx*CONTEXT_SIZE; /* CONTEXT_SIZE scratch bytes per slice */
+    u8buf sb = u8buf(scratch_state); /* NOTE(review): sb is not used below */
+
+    init_slice(slice_ctx[slice_idx], slice_idx);
+    /* The frame header goes in front of the first slice's header. */
+    if (slice_idx == 0)
+        write_frame_header(slice_ctx[slice_idx], scratch_state);
+
+    write_slice_header(slice_ctx[slice_idx], scratch_state);
+
+#ifdef GOLOMB
+    init_golomb(slice_ctx[slice_idx]);
+#endif
+}
diff --git a/libavcodec/vulkan/ffv1_enc_vlc.comp b/libavcodec/vulkan/ffv1_enc_vlc.comp
new file mode 100644
index 0000000000..7a4d39e307
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_enc_vlc.comp
@@ -0,0 +1,112 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Run-length bookkeeping bundled in a struct; only calc_new_state() uses it in this file. */
+struct RLEState {
+    int count;
+    int diff;
+    int index; /* index into log2_run[] */
+    bool mode;
+};
+/* NOTE(review): not called by encode_line() below and not equivalent to its run handling - confirm before using. */
+void calc_new_state(inout RLEState state, int context)
+{
+    if (context == 0) /* NOTE(review): encode_line() *enters* run mode on context 0 */
+        state.mode = false;
+
+    if (!state.mode)
+        return;
+
+    if (state.diff > 0) { /* NOTE(review): encode_line() tests d[1] != 0 at this point */
+        while (state.count >= (1 << log2_run[state.index])) {
+            state.count -= 1 << log2_run[state.index];
+            state.index++;
+        }
+        if (state.index > 0)
+            state.index--;
+        state.count = 0;
+        state.mode = false;
+        if (state.diff > 0)
+            state.diff--;
+    } else {
+        state.count++;
+    }
+}
+/* Golomb-Rice code line y of one component of a slice; zero residuals after a context-0 sample are run-length coded. */
+void encode_line(inout SliceContext sc, uint64_t state,
+                 int y, int p, int comp, int bits, inout int run_index) /* state: address of the VlcState array used */
+{
+    ivec2 sp = sc.slice_pos;
+
+    int w = sc.slice_dim.x;
+    if (p > 0 && p < 3) { /* planes 1 and 2 are scaled down by chroma_shift */
+        w >>= chroma_shift.x;
+        sp >>= chroma_shift;
+    }
+
+    int run_count = 0;
+    bool run_mode = false;
+
+    for (int x = 0; x < w; x++) {
+        ivec2 d = get_diff(sp + ivec2(x, y), ivec2(x, y), p, comp, w, bits);
+        /* d[0] is the context and d[1] the folded residual; context 0 switches run mode on. */
+        if (d[0] == 0)
+            run_mode = true;
+
+        if (run_mode) {
+            if (d[1] != 0) {
+                /* A very unlikely loop */
+                while (run_count >= 1 << log2_run[run_index]) {
+                    run_count -= 1 << log2_run[run_index];
+                    run_index++;
+                    put_bits(sc.pb, 1, 1);
+                }
+                /* End of run: the remaining count goes out in 1 + log2_run[run_index] bits. */
+                put_bits(sc.pb, 1 + log2_run[run_index], run_count);
+                if (run_index != 0)
+                    run_index--;
+                run_count = 0;
+                run_mode = false;
+                if (d[1] > 0) /* d[1] is non-zero here; positive values are coded minus one */
+                    d[1]--;
+            } else {
+                run_count++;
+            }
+        }
+        /* Outside run mode (or right after a run ended) code the residual with the state of context d[0]. */
+        if (!run_mode) {
+            VlcState sb = VlcState(state + VLC_STATE_SIZE*d[0]);
+            Symbol sym = get_vlc_symbol(sb, d[1], bits);
+            put_bits(sc.pb, sym.bits, sym.val);
+        }
+    }
+    /* Flush a run that is still open at the end of the line. */
+    if (run_mode) {
+        while (run_count >= (1 << log2_run[run_index])) {
+            run_count -= 1 << log2_run[run_index];
+            run_index++;
+            put_bits(sc.pb, 1, 1);
+        }
+
+        if (run_count > 0)
+            put_bits(sc.pb, 1, 1);
+    }
+}
diff --git a/libavcodec/vulkan/ffv1_reset.comp
b/libavcodec/vulkan/ffv1_reset.comp
new file mode 100644
index 0000000000..c7c7962850
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_reset.comp
@@ -0,0 +1,55 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Reset the coder states of plane gl_WorkGroupID.z of one slice; the invocations stride over the plane's contexts. */
+void main(void)
+{
+    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
+    /* States are left untouched unless this is a keyframe or the slice has a non-zero coding mode. */
+    if (slice_ctx[slice_idx].slice_coding_mode == 0 && key_frame == 0)
+        return;
+
+    uint64_t slice_state_off = uint64_t(slice_state) +
+                               slice_idx*plane_state_size*codec_planes;
+
+#ifdef GOLOMB
+    uint64_t start = slice_state_off +
+                     (gl_WorkGroupID.z*context_count +
+                      gl_LocalInvocationID.x)*VLC_STATE_SIZE;
+    for (uint x = gl_LocalInvocationID.x; x < context_count; x += gl_WorkGroupSize.x) {
+        VlcState sb = VlcState(start);
+        sb.drift = int16_t(0);
+        sb.error_sum = uint16_t(4);
+        sb.bias = int8_t(0);
+        sb.count = uint8_t(1);
+        start += gl_WorkGroupSize.x*VLC_STATE_SIZE;
+    }
+#else
+    uint64_t start = slice_state_off +
+                     (gl_WorkGroupID.z*context_count)*CONTEXT_SIZE +
+                     (gl_LocalInvocationID.x << 2 /* dwords */); /* Bytes */
+    uint count_total = context_count*(CONTEXT_SIZE /* bytes */ >> 2 /* dwords */);
+    for (uint x = gl_LocalInvocationID.x; x < count_total; x += gl_WorkGroupSize.x) {
+        u32buf(start).v = 0x80808080; /* four state bytes of 128 per store */
+        start += gl_WorkGroupSize.x << 2; /* x advances by gl_WorkGroupSize.x dwords, i.e. 4 bytes per invocation */
+    }
+#endif
+}
diff --git a/libavcodec/vulkan/ffv1_vlc.comp b/libavcodec/vulkan/ffv1_vlc.comp
new file mode 100644
index 0000000000..0a53e035b5
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_vlc.comp
@@ -0,0 +1,122 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Adaptive Golomb-Rice statistics of one context, VLC_STATE_SIZE bytes each. */
+#define VLC_STATE_SIZE 8
+layout(buffer_reference, buffer_reference_align = VLC_STATE_SIZE) buffer VlcState {
+    uint32_t error_sum;
+    int16_t drift;
+    int8_t bias;
+    uint8_t count;
+};
+/* Fold residual v into the statistics: halve them when count reaches 128, then step bias by the sign of the drift. */
+void update_vlc_state(inout VlcState state, const int v)
+{
+    int drift = state.drift;
+    int count = state.count;
+    int bias = state.bias;
+    state.error_sum += uint16_t(abs(v));
+    drift += v;
+
+    if (count == 128) { // FIXME: variable
+        count >>= 1;
+        drift >>= 1;
+        state.error_sum >>= 1;
+    }
+    count++;
+    /* bias is clamped to [-128, 127]; drift is pulled back into (-count, 0]. */
+    if (drift <= -count) {
+        bias = max(bias - 1, -128);
+        drift = max(drift + count, -count + 1);
+    } else if (drift > 0) {
+        bias = min(bias + 1, 127);
+        drift = min(drift - count, 0);
+    }
+
+    state.bias = int8_t(bias);
+    state.drift = int16_t(drift);
+    state.count = uint8_t(count);
+}
+/* A codeword: `bits` is its length and `val` its value. */
+struct Symbol {
+    uint32_t bits;
+    uint32_t val;
+};
+/* Unsigned Golomb-Rice codeword for i >= 0 with parameter k; i >> k >= limit selects the escape form. */
+Symbol set_ur_golomb(int i, int k, int limit, int esc_len)
+{
+    int e;
+    Symbol sym;
+
+#ifdef DEBUG
+    if (i < 0)
+        debugPrintfEXT("Error: i is negative!");
+#endif
+
+    e = i >> k;
+    if (e < limit) {
+        sym.bits = e + k + 1; /* e leading zeros, a one, then the k low bits of i */
+        sym.val = (1 << k) + zero_extend(i, k);
+    } else {
+        sym.bits = limit + esc_len; /* escape: i - limit + 1 in a limit + esc_len bit field */
+        sym.val = i - limit + 1;
+    }
+
+    return sym;
+}
+
+/**
+ * write signed golomb rice code (ffv1).
+ */
+Symbol set_sr_golomb(int i, int k, int limit, int esc_len)
+{
+    int v;
+    /* Map signed to unsigned: 0, -1, 1, -2, ... become 0, 1, 2, 3, ... */
+    v = -2 * i - 1;
+    v ^= (v >> 31);
+
+    return set_ur_golomb(v, k, limit, esc_len);
+}
+/* Return the codeword for residual v under the given context state, and update that state with v. */
+Symbol get_vlc_symbol(inout VlcState state, int v, int bits)
+{
+    int i, k, code;
+
+    v = fold(v - int(state.bias), bits);
+    /* k is the smallest shift for which (count << k) >= error_sum. */
+    i = state.count;
+    k = 0;
+    while (i < state.error_sum) { // FIXME: optimize
+        k++;
+        i += i;
+    }
+
+#ifdef DEBUG
+    if (k > 16)
+        debugPrintfEXT("Error: k > 16!");
+#endif
+    /* code is ~v when 2*drift + count is negative and v otherwise. */
+    code = v ^ ((2 * state.drift + state.count) >> 31);
+    /* k and code above were taken from the state as it was before this update. */
+    update_vlc_state(state, v);
+
+    return set_sr_golomb(code, k, 12, bits);
+}
diff --git a/libavcodec/vulkan/rangecoder.comp b/libavcodec/vulkan/rangecoder.comp
new file mode 100644
index 0000000000..13c135f913
--- /dev/null
+++ b/libavcodec/vulkan/rangecoder.comp
@@ -0,0 +1,190 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* Encoder-side range coder state. */
+struct RangeCoder {
+    u8buf bytestream_start; /* first output byte */
+    u8buf bytestream; /* next output byte to be written */
+
+    uint low;
+    uint16_t range; /* the put_rac_* helpers renormalise once this drops below 0x100 */
+    uint8_t outstanding_count; /* deferred bytes, written as 0xFF or 0x00 once the carry is known */
+    uint8_t outstanding_byte; /* byte held back until the carry is known; starts as 0xFF (rac_init) */
+};
+/* Renormalise by one byte: write, defer or replace the pending byte, then shift low and range left by 8. */
+/* Full renorm version that can handle outstanding_byte == 0xFF */
+void renorm_encoder_full(inout RangeCoder c)
+{
+    int bs_cnt = 0;
+    /* 0xFF is what rac_init() stores. NOTE(review): low == 0xFF00 in the next branch also yields 0xFF - confirm that is harmless. */
+    if (c.outstanding_byte == 0xFF) {
+        c.outstanding_byte = uint8_t(c.low >> 8);
+    } else if (c.low <= 0xFF00) { /* write the pending byte, then 0xFF for each deferred byte */
+        c.bytestream[bs_cnt++].v = c.outstanding_byte;
+        uint8_t cnt = c.outstanding_count;
+        for (; cnt > 0; cnt--)
+            c.bytestream[bs_cnt++].v = uint8_t(0xFF);
+        c.outstanding_count = uint8_t(0);
+        c.outstanding_byte = uint8_t(c.low >> 8);
+    } else if (c.low >= 0x10000) { /* carry: write the pending byte + 1, then 0x00 for each deferred byte */
+        c.bytestream[bs_cnt++].v = c.outstanding_byte + uint8_t(1);
+        uint8_t cnt = c.outstanding_count;
+        for (; cnt > 0; cnt--)
+            c.bytestream[bs_cnt++].v = uint8_t(0x00);
+        c.outstanding_count = uint8_t(0);
+        c.outstanding_byte = uint8_t(bitfieldExtract(c.low, 8, 8));
+    } else { /* 0xFF00 < low < 0x10000: defer one more byte */
+        c.outstanding_count++;
+    }
+    /* Advance past the bytes written; only the low 8 bits of low survive, moved up by one byte. */
+    c.bytestream = OFFBUF(u8buf, c.bytestream, bs_cnt);
+    c.range <<= 8;
+    c.low = bitfieldInsert(0, c.low, 8, 8);
+}
+/* Fast renorm: like renorm_encoder_full(), but outstanding_byte is always treated as a real pending byte. */
+/* Cannot deal with outstanding_byte == -1 in the name of speed */
+void renorm_encoder(inout RangeCoder c)
+{
+    uint8_t oc = c.outstanding_count + uint8_t(1);
+    uint low = c.low;
+
+    c.range <<= 8;
+    c.low = bitfieldInsert(0, low, 8, 8);
+    /* 0xFF00 < low < 0x10000: nothing is written, the byte is only counted as deferred. */
+    if (low > 0xFF00 && low < 0x10000) {
+        c.outstanding_count = oc;
+        return;
+    }
+    /* Otherwise write the pending byte followed by oc - 1 filler bytes. */
+    u8buf bs = c.bytestream;
+    uint8_t outstanding_byte = c.outstanding_byte;
+
+    c.bytestream = OFFBUF(u8buf, bs, oc);
+    c.outstanding_count = uint8_t(0);
+    c.outstanding_byte = uint8_t(low >> 8);
+    /* obs is the carry (low >= 0x10000 at this point); fill is 0x00 with a carry and 0xFF without. */
+    uint8_t obs = uint8_t(low > 0xFF00);
+    uint8_t fill = obs - uint8_t(1); /* unsigned underflow */
+
+    bs[0].v = outstanding_byte + obs;
+    for (int i = 1; i < oc; i++)
+        bs[i].v = fill;
+}
+/* Code one bit with the adaptive state byte at address state and update that byte; does not renormalise. */
+void put_rac_norenorm(inout RangeCoder c, uint64_t state, bool bit)
+{
+    u8buf sb = u8buf(state);
+    uint val = uint(sb.v);
+    uint16_t range1 = uint16_t((uint(c.range) * val) >> 8);
+
+#ifdef DEBUG
+    if (val == 0)
+        debugPrintfEXT("Error: state is zero (addr: 0x%lx)", uint64_t(sb));
+    if (range1 >= c.range)
+        debugPrintfEXT("Error: range1 >= c.range");
+    if (range1 <= 0)
+        debugPrintfEXT("Error: range1 <= 0");
+#endif
+    /* A 1 takes the upper range1 of the interval, a 0 the lower range - range1. */
+    uint16_t diff = c.range - range1;
+    if (bit) {
+        c.low += diff;
+        c.range = range1;
+    } else {
+        c.range = diff;
+    }
+    /* zero_one_state is indexed by (bit << 8) + old state: entries 0..255 for a 0 bit, 256..511 for a 1 bit. */
+    sb.v = zero_one_state[(uint(bit) << 8) + val];
+
+#ifdef DEBUG
+    if (sb.v == 0)
+        debugPrintfEXT("Error: inserted zero state from tab %i idx %i", bit, val);
+#endif
+}
+
+/* Equiprobable bit */
+void put_rac_equi(inout RangeCoder c, bool bit)
+{
+    uint16_t range1 = c.range >> 1; /* half of the interval for each bit value, no state byte involved */
+
+#ifdef DEBUG
+    if (range1 >= c.range)
+        debugPrintfEXT("Error: range1 >= c.range");
+    if (range1 <= 0)
+        debugPrintfEXT("Error: range1 <= 0");
+#endif
+
+    if (bit) {
+        c.low += c.range - range1;
+        c.range = range1;
+    } else {
+        c.range -= range1;
+    }
+
+    if (c.range < 0x100)
+        renorm_encoder(c);
+}
+/* Code a 0 bit with a fixed state value of 129; no state byte is read or updated. */
+void put_rac_terminate(inout RangeCoder c)
+{
+    uint16_t range1 = uint16_t((uint(c.range) * 129) >> 8);
+
+#ifdef DEBUG
+    if (range1 >= c.range)
+        debugPrintfEXT("Error: range1 >= c.range");
+    if (range1 <= 0)
+        debugPrintfEXT("Error: range1 <= 0");
+#endif
+
+    c.range -= range1;
+    if (c.range < 0x100)
+        renorm_encoder(c);
+}
+/* Flush the coder: code the terminating bit, then force two renorms with range set to 0xFF. */
+/* Return the number of bytes written. */
+uint32_t rac_terminate(inout RangeCoder c)
+{
+    put_rac_terminate(c);
+    c.range = uint16_t(0xFF);
+    c.low += 0xFF;
+    renorm_encoder(c);
+    c.range = uint16_t(0xFF);
+    renorm_encoder(c);
+
+#ifdef DEBUG
+    if (c.low != 0)
+        debugPrintfEXT("Error: c.low != 0");
+    if (c.range < 0x100)
+        debugPrintfEXT("Error: range < 0x100");
+#endif
+
+    return uint32_t(uint64_t(c.bytestream) - uint64_t(c.bytestream_start));
+}
+/* Start encoding into data with low = 0 and range = 0xFF00. NOTE(review): buf_size is not used. */
+void rac_init(out RangeCoder r, u8buf data, uint64_t buf_size)
+{
+    r.bytestream_start = data;
+    r.bytestream = data;
+    r.low = 0;
+    r.range = uint16_t(0xFF00);
+    r.outstanding_count = uint8_t(0);
+    r.outstanding_byte = uint8_t(0xFF); /* the value renorm_encoder_full() tests for */
+}