mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-01-12 02:19:35 +00:00
1f7801c2bc
This work is sponsored by, and copyright, Google.
These are ported from the ARM version; it is essentially a 1:1
port with no extra added features, but with some hand tuning
(especially for the plain copy/avg functions). The ARM version
isn't very register starved to begin with, so there's not much
to be gained from having more spare registers here - we only
avoid having to clobber callee-saved registers.
Examples of runtimes vs the 32 bit version, on a Cortex A53:
ARM AArch64
vp9_avg4_neon: 27.2 23.7
vp9_avg8_neon: 56.5 54.7
vp9_avg16_neon: 169.9 167.4
vp9_avg32_neon: 585.8 585.2
vp9_avg64_neon: 2460.3 2294.7
vp9_avg_8tap_smooth_4h_neon: 132.7 125.2
vp9_avg_8tap_smooth_4hv_neon: 478.8 442.0
vp9_avg_8tap_smooth_4v_neon: 126.0 93.7
vp9_avg_8tap_smooth_8h_neon: 241.7 234.2
vp9_avg_8tap_smooth_8hv_neon: 690.9 646.5
vp9_avg_8tap_smooth_8v_neon: 245.0 205.5
vp9_avg_8tap_smooth_64h_neon: 11273.2 11280.1
vp9_avg_8tap_smooth_64hv_neon: 22980.6 22184.1
vp9_avg_8tap_smooth_64v_neon: 11549.7 10781.1
vp9_put4_neon: 18.0 17.2
vp9_put8_neon: 40.2 37.7
vp9_put16_neon: 97.4 99.5
vp9_put32_neon/armv8: 346.0 307.4
vp9_put64_neon/armv8: 1319.0 1107.5
vp9_put_8tap_smooth_4h_neon: 126.7 118.2
vp9_put_8tap_smooth_4hv_neon: 465.7 434.0
vp9_put_8tap_smooth_4v_neon: 113.0 86.5
vp9_put_8tap_smooth_8h_neon: 229.7 221.6
vp9_put_8tap_smooth_8hv_neon: 658.9 621.3
vp9_put_8tap_smooth_8v_neon: 215.0 187.5
vp9_put_8tap_smooth_64h_neon: 10636.7 10627.8
vp9_put_8tap_smooth_64hv_neon: 21076.8 21026.9
vp9_put_8tap_smooth_64v_neon: 9635.0 9632.4
These are generally about as fast as the corresponding ARM
routines on the same CPU (at least on the A53), in most cases
marginally faster.
The speedup vs C code is pretty much the same as for the 32 bit
case; on the A53 it's around 6-13x for the larger 8tap filters.
The exact speedup varies a little, since the C versions generally
don't end up exactly as slow/fast as on 32 bit.
This is an adapted cherry-pick from libav commit 383d96aa22.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
100 lines
3.9 KiB
C
100 lines
3.9 KiB
C
/*
 * VP9 compatible video decoder
 *
 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
 * Copyright (C) 2013 Clément Bœsch <u pkh me>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
|
|
|
#include "libavutil/avassert.h"
|
|
#include "libavutil/common.h"
|
|
#include "vp9dsp.h"
|
|
|
|
/*
 * 8-tap interpolation kernels, laid out as [filter type][entry][tap].
 *
 * There are three kernel families (regular, sharp, smooth), each holding
 * 16 entries of 8 signed 16-bit coefficients.  The coefficients of every
 * entry add up to 128, and entry 0 of each family is the pass-through
 * kernel (a single 128 at tap index 3).
 *
 * NOTE(review): the entry index is presumably the fractional sample
 * position in 1/16 steps -- confirm against the callers.
 */
const DECLARE_ALIGNED(16, int16_t, ff_vp9_subpel_filters)[3][16][8] = {
    [FILTER_8TAP_REGULAR] = {
        {  0,   0,   0, 128,   0,   0,   0,   0 },
        {  0,   1,  -5, 126,   8,  -3,   1,   0 },
        { -1,   3, -10, 122,  18,  -6,   2,   0 },
        { -1,   4, -13, 118,  27,  -9,   3,  -1 },
        { -1,   4, -16, 112,  37, -11,   4,  -1 },
        { -1,   5, -18, 105,  48, -14,   4,  -1 },
        { -1,   5, -19,  97,  58, -16,   5,  -1 },
        { -1,   6, -19,  88,  68, -18,   5,  -1 },
        { -1,   6, -19,  78,  78, -19,   6,  -1 },
        { -1,   5, -18,  68,  88, -19,   6,  -1 },
        { -1,   5, -16,  58,  97, -19,   5,  -1 },
        { -1,   4, -14,  48, 105, -18,   5,  -1 },
        { -1,   4, -11,  37, 112, -16,   4,  -1 },
        { -1,   3,  -9,  27, 118, -13,   4,  -1 },
        {  0,   2,  -6,  18, 122, -10,   3,  -1 },
        {  0,   1,  -3,   8, 126,  -5,   1,   0 },
    },
    [FILTER_8TAP_SHARP] = {
        {  0,   0,   0, 128,   0,   0,   0,   0 },
        { -1,   3,  -7, 127,   8,  -3,   1,   0 },
        { -2,   5, -13, 125,  17,  -6,   3,  -1 },
        { -3,   7, -17, 121,  27, -10,   5,  -2 },
        { -4,   9, -20, 115,  37, -13,   6,  -2 },
        { -4,  10, -23, 108,  48, -16,   8,  -3 },
        { -4,  10, -24, 100,  59, -19,   9,  -3 },
        { -4,  11, -24,  90,  70, -21,  10,  -4 },
        { -4,  11, -23,  80,  80, -23,  11,  -4 },
        { -4,  10, -21,  70,  90, -24,  11,  -4 },
        { -3,   9, -19,  59, 100, -24,  10,  -4 },
        { -3,   8, -16,  48, 108, -23,  10,  -4 },
        { -2,   6, -13,  37, 115, -20,   9,  -4 },
        { -2,   5, -10,  27, 121, -17,   7,  -3 },
        { -1,   3,  -6,  17, 125, -13,   5,  -2 },
        {  0,   1,  -3,   8, 127,  -7,   3,  -1 },
    },
    [FILTER_8TAP_SMOOTH] = {
        {  0,   0,   0, 128,   0,   0,   0,   0 },
        { -3,  -1,  32,  64,  38,   1,  -3,   0 },
        { -2,  -2,  29,  63,  41,   2,  -3,   0 },
        { -2,  -2,  26,  63,  43,   4,  -4,   0 },
        { -2,  -3,  24,  62,  46,   5,  -4,   0 },
        { -2,  -3,  21,  60,  49,   7,  -4,   0 },
        { -1,  -4,  18,  59,  51,   9,  -4,   0 },
        { -1,  -4,  16,  57,  53,  12,  -4,  -1 },
        { -1,  -4,  14,  55,  55,  14,  -4,  -1 },
        { -1,  -4,  12,  53,  57,  16,  -4,  -1 },
        {  0,  -4,   9,  51,  59,  18,  -4,  -1 },
        {  0,  -4,   7,  49,  60,  21,  -3,  -2 },
        {  0,  -4,   5,  46,  62,  24,  -3,  -2 },
        {  0,  -4,   4,  43,  63,  26,  -2,  -2 },
        {  0,  -3,   2,  41,  63,  29,  -2,  -2 },
        {  0,  -3,   1,  38,  64,  32,  -1,  -3 },
    },
};
|
|
|
|
|
|
/**
 * Fill in a VP9 DSP context for the given bit depth.
 *
 * The generic initializer matching bpp (8, 10 or 12) runs first; any other
 * value trips av_assert0().  After that, each architecture-specific
 * initializer whose ARCH_* condition is non-zero is invoked on the same
 * context, in the order AArch64, ARM, x86, MIPS.
 *
 * @param dsp      context handed to every initializer
 * @param bpp      bits per sample; must be 8, 10 or 12
 * @param bitexact forwarded to the x86 initializer only
 */
av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact)
{
    switch (bpp) {
    case 8:
        ff_vp9dsp_init_8(dsp);
        break;
    case 10:
        ff_vp9dsp_init_10(dsp);
        break;
    default:
        /* Anything that is not 8 or 10 has to be 12. */
        av_assert0(bpp == 12);
        ff_vp9dsp_init_12(dsp);
        break;
    }

    /* Architecture-specific hooks run after the generic setup above. */
    if (ARCH_AARCH64) {
        ff_vp9dsp_init_aarch64(dsp, bpp);
    }
    if (ARCH_ARM) {
        ff_vp9dsp_init_arm(dsp, bpp);
    }
    if (ARCH_X86) {
        ff_vp9dsp_init_x86(dsp, bpp, bitexact);
    }
    if (ARCH_MIPS) {
        ff_vp9dsp_init_mips(dsp, bpp);
    }
}
|