mirror of https://git.ffmpeg.org/ffmpeg.git
adc87a5f7c
This uses a more traditional approach, allowing processing of up to period minus two elements per iteration. This also allows the algorithm to work for any vector length. As the T-Head C908 device under test can load 16 elements per loop iteration, there is unsurprisingly a small performance drop (relative to larger periods) when the period is minimal and the parallelism is capped at 13 elements: Before: postfilter_15_c: 21222.2 postfilter_15_rvv_f32: 22007.7 postfilter_512_c: 20189.7 postfilter_512_rvv_f32: 22004.2 postfilter_1022_c: 20189.7 postfilter_1022_rvv_f32: 22004.2 After: postfilter_15_c: 20189.5 postfilter_15_rvv_f32: 7057.2 postfilter_512_c: 20189.5 postfilter_512_rvv_f32: 5667.2 postfilter_1022_c: 20192.7 postfilter_1022_rvv_f32: 5667.2 |
||
---|---|---|
.. | ||
Makefile | ||
aacpsdsp_init.c | ||
aacpsdsp_rvv.S | ||
ac3dsp_init.c | ||
ac3dsp_rvb.S | ||
alacdsp_init.c | ||
alacdsp_rvv.S | ||
audiodsp_init.c | ||
audiodsp_rvf.S | ||
audiodsp_rvv.S | ||
bswapdsp_init.c | ||
bswapdsp_rvb.S | ||
bswapdsp_rvv.S | ||
exrdsp_init.c | ||
exrdsp_rvv.S | ||
fmtconvert_init.c | ||
fmtconvert_rvv.S | ||
g722dsp_init.c | ||
g722dsp_rvv.S | ||
h264_chroma_init_riscv.c | ||
h264_mc_chroma.S | ||
huffyuvdsp_init.c | ||
huffyuvdsp_rvv.S | ||
idctdsp_init.c | ||
idctdsp_rvv.S | ||
jpeg2000dsp_init.c | ||
jpeg2000dsp_rvv.S | ||
opusdsp_init.c | ||
opusdsp_rvv.S | ||
pixblockdsp_init.c | ||
pixblockdsp_rvi.S | ||
pixblockdsp_rvv.S | ||
sbrdsp_init.c | ||
sbrdsp_rvv.S | ||
utvideodsp_init.c | ||
utvideodsp_rvv.S | ||
vorbisdsp_init.c | ||
vorbisdsp_rvv.S |