mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2024-12-26 17:32:06 +00:00
avfilter/vf_bwdif: Add neon for filter_edge
Adds clip and spatial macros for aarch64 neon. Exports C filter_edge needed for tail fixup of neon code. Adds neon for filter_edge. Signed-off-by: John Cox <jc@kynesim.co.uk> Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
7ed7c00f55
commit
8130df83e0
@ -24,10 +24,29 @@
|
||||
#include "libavfilter/bwdif.h"
|
||||
#include "libavutil/aarch64/cpu.h"
|
||||
|
||||
void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
|
||||
int w, int prefs, int mrefs, int prefs2, int mrefs2,
|
||||
int parity, int clip_max, int spat);
|
||||
|
||||
void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
|
||||
int prefs3, int mrefs3, int parity, int clip_max);
|
||||
|
||||
|
||||
/*
 * filter_edge callback installed by ff_bwdif_init_aarch64().
 *
 * The row is split in two: the leading part goes to
 * ff_bwdif_filter_edge_neon() and whatever is left goes to
 * ff_bwdif_filter_edge_c().  The NEON share is w rounded down to a
 * multiple of 16 when clip_max is 255; for any other clip_max it is 0,
 * so the C routine ends up handling the whole row.
 *
 * All other arguments are forwarded unchanged to both routines.
 */
static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
                               int parity, int clip_max, int spat)
{
    int neon_w = 0;
    int rest_w;

    if (clip_max == 255)
        neon_w = w & ~15;
    rest_w = w - neon_w;

    /* Called even when neon_w is 0; the NEON routine returns at once for w <= 0. */
    ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, neon_w,
                              prefs, mrefs, prefs2, mrefs2,
                              parity, clip_max, spat);

    /* Nothing left over for the C fallback. */
    if (rest_w <= 0)
        return;

    /* Step every buffer pointer past the bytes the NEON call already covered. */
    ff_bwdif_filter_edge_c((char *)dst1  + neon_w,
                           (char *)prev1 + neon_w,
                           (char *)cur1  + neon_w,
                           (char *)next1 + neon_w,
                           rest_w, prefs, mrefs, prefs2, mrefs2,
                           parity, clip_max, spat);
}
|
||||
|
||||
static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
|
||||
int prefs3, int mrefs3, int parity, int clip_max)
|
||||
{
|
||||
@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
|
||||
return;
|
||||
|
||||
s->filter_intra = filter_intra_helper;
|
||||
s->filter_edge = filter_edge_helper;
|
||||
}
|
||||
|
||||
|
@ -66,6 +66,79 @@
|
||||
umlsl2 \a3\().4s, \s1\().8h, \k
|
||||
.endm
|
||||
|
||||
// int b = m2s1 - m1;
|
||||
// int f = p2s1 - p1;
|
||||
// int dc = c0s1 - m1;
|
||||
// int de = c0s1 - p1;
|
||||
// int sp_max = FFMIN(p1 - c0s1, m1 - c0s1);
|
||||
// sp_max = FFMIN(sp_max, FFMAX(-b,-f));
|
||||
// int sp_min = FFMIN(c0s1 - p1, c0s1 - m1);
|
||||
// sp_min = FFMIN(sp_min, FFMAX(b,f));
|
||||
// diff = diff == 0 ? 0 : FFMAX3(diff, sp_min, sp_max);
|
||||
// Spatial check on 16 unsigned byte lanes.
// Every difference is formed with uqsub, so a negative result clamps to 0.
// diff is updated in place; t0-t3 are scratch registers and are overwritten.
.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3
|
||||
uqsub \t0\().16b, \p1\().16b, \c0s1\().16b // t0 = p1 - c0s1
|
||||
uqsub \t2\().16b, \m1\().16b, \c0s1\().16b // t2 = m1 - c0s1
|
||||
umin \t2\().16b, \t0\().16b, \t2\().16b // t2 = min(p1 - c0s1, m1 - c0s1)
|
||||
|
||||
uqsub \t1\().16b, \m1\().16b, \m2s1\().16b // t1 = m1 - m2s1 (that is -b)
|
||||
uqsub \t3\().16b, \p1\().16b, \p2s1\().16b // t3 = p1 - p2s1 (that is -f)
|
||||
umax \t3\().16b, \t3\().16b, \t1\().16b // t3 = max(-b, -f)
|
||||
umin \t3\().16b, \t3\().16b, \t2\().16b // t3 = sp_max
|
||||
|
||||
uqsub \t0\().16b, \c0s1\().16b, \p1\().16b // t0 = c0s1 - p1
|
||||
uqsub \t2\().16b, \c0s1\().16b, \m1\().16b // t2 = c0s1 - m1
|
||||
umin \t2\().16b, \t0\().16b, \t2\().16b // t2 = min(c0s1 - p1, c0s1 - m1)
|
||||
|
||||
uqsub \t1\().16b, \m2s1\().16b, \m1\().16b // t1 = m2s1 - m1 (that is b)
|
||||
uqsub \t0\().16b, \p2s1\().16b, \p1\().16b // t0 = p2s1 - p1 (that is f)
|
||||
umax \t0\().16b, \t0\().16b, \t1\().16b // t0 = max(b, f)
|
||||
umin \t2\().16b, \t2\().16b, \t0\().16b // t2 = sp_min
|
||||
|
||||
cmeq \t1\().16b, \diff\().16b, #0 // t1 = all-ones in lanes where diff == 0
|
||||
umax \diff\().16b, \diff\().16b, \t3\().16b // diff = max(diff, sp_max)
|
||||
umax \diff\().16b, \diff\().16b, \t2\().16b // diff = max(diff, sp_min)
|
||||
bic \diff\().16b, \diff\().16b, \t1\().16b // force lanes that started at 0 back to 0
|
||||
.endm
|
||||
|
||||
// i0 = s0;
|
||||
// if (i0 > d0 + diff0)
|
||||
// i0 = d0 + diff0;
|
||||
// else if (i0 < d0 - diff0)
|
||||
// i0 = d0 - diff0;
|
||||
//
|
||||
// i0 = s0 is safe
|
||||
// Clamps each byte lane of s0 to the range [d0 - diff, d0 + diff] and writes
// the result to i0. Both bounds use unsigned saturating arithmetic.
// t0 and t1 are scratch registers and are overwritten.
.macro DIFF_CLIP i0, s0, d0, diff, t0, t1
|
||||
uqadd \t0\().16b, \d0\().16b, \diff\().16b // t0 = upper bound d0 + diff
|
||||
uqsub \t1\().16b, \d0\().16b, \diff\().16b // t1 = lower bound d0 - diff
|
||||
umin \i0\().16b, \s0\().16b, \t0\().16b // i0 = min(s0, upper bound)
|
||||
umax \i0\().16b, \i0\().16b, \t1\().16b // i0 = max(i0, lower bound)
|
||||
.endm
|
||||
|
||||
// i0 = FFABS(m1 - p1) > td0 ? i1 : i2;
|
||||
// DIFF_CLIP
|
||||
//
|
||||
// i0 = i1 is safe
|
||||
// Per byte lane: picks i1 where |m1 - p1| > td0 (unsigned compare) and i2
// elsewhere, then clamps the pick to d0 +/- diff via DIFF_CLIP into i0.
// t0, t1 and t2 are scratch registers and are overwritten.
.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2
|
||||
uabd \t0\().16b, \m1\().16b, \p1\().16b // t0 = |m1 - p1|
|
||||
cmhi \t0\().16b, \t0\().16b, \td0\().16b // t0 = all-ones in lanes where t0 > td0
|
||||
bsl \t0\().16b, \i1\().16b, \i2\().16b // t0 = mask ? i1 : i2
|
||||
DIFF_CLIP \i0, \t0, \d0, \diff, \t1, \t2 // i0 = selection clamped to d0 +/- diff
|
||||
.endm
|
||||
|
||||
// Saves d8-d15 in a fresh 64-byte stack area; POP_VREGS undoes this.
// NOTE(review): d8-d15 are the SIMD registers a callee must preserve under
// AAPCS64 - confirm against the procedure call standard.
.macro PUSH_VREGS
|
||||
stp d8, d9, [sp, #-64]! // pre-decrement sp by 64, store at the new sp
|
||||
stp d10, d11, [sp, #16]
|
||||
stp d12, d13, [sp, #32]
|
||||
stp d14, d15, [sp, #48]
|
||||
.endm
|
||||
|
||||
// Restores d8-d15 from the 64-byte stack area laid out by PUSH_VREGS and
// releases that area.
.macro POP_VREGS
|
||||
ldp d14, d15, [sp, #48]
|
||||
ldp d12, d13, [sp, #32]
|
||||
ldp d10, d11, [sp, #16]
|
||||
ldp d8, d9, [sp], #64 // load from sp, then post-increment sp by 64
|
||||
.endm
|
||||
|
||||
.macro LDR_COEFFS d, t0
|
||||
movrel \t0, coeffs, 0
|
||||
ld1 {\d\().8h}, [\t0]
|
||||
@ -81,6 +154,110 @@ const coeffs, align=4 // align 4 means align on 2^4 boundary
|
||||
.hword 5077, 981 // sp[0] = v0.h[6]
|
||||
endconst
|
||||
|
||||
// ============================================================================
|
||||
//
|
||||
// void ff_bwdif_filter_edge_neon(
|
||||
// void *dst1, // x0
|
||||
// void *prev1, // x1
|
||||
// void *cur1, // x2
|
||||
// void *next1, // x3
|
||||
// int w, // w4
|
||||
// int prefs, // w5
|
||||
// int mrefs, // w6
|
||||
// int prefs2, // w7
|
||||
// int mrefs2, // [sp, #0]
|
||||
// int parity, // [sp, #SP_INT]
|
||||
// int clip_max, // [sp, #SP_INT*2] unused
|
||||
// int spat); // [sp, #SP_INT*3]
|
||||
|
||||
// Edge-line filter working on unsigned bytes, 16 per loop iteration.
// Returns immediately when w <= 0. Each iteration loads, filters and stores
// a full 16-byte vector and subtracts 16 from w, so a w that is not a
// multiple of 16 still causes a whole extra vector to be read and written.
// NOTE(review): SP_INT is defined outside this view; presumably the stack
// slot size of an int argument - confirm.
function ff_bwdif_filter_edge_neon, export=1
|
||||
// Sanity check w
|
||||
cmp w4, #0
|
||||
ble 99f
|
||||
|
||||
// #define prev2 cur
|
||||
// const uint8_t * restrict next2 = parity ? prev : next;
|
||||
|
||||
ldr w8, [sp, #0] // mrefs2
|
||||
|
||||
ldr w17, [sp, #SP_INT] // parity
|
||||
ldr w16, [sp, #SP_INT*3] // spat
|
||||
cmp w17, #0
|
||||
csel x17, x1, x3, ne // x17 = next2: prev if parity != 0, else next (parity value no longer needed)
|
||||
|
||||
// for (x = 0; x < w; x++) {
|
||||
|
||||
10:
|
||||
// int m1 = cur[mrefs];
|
||||
// int d = (prev2[0] + next2[0]) >> 1;
|
||||
// int p1 = cur[prefs];
|
||||
// int temporal_diff0 = FFABS(prev2[0] - next2[0]);
|
||||
// int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
|
||||
// int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
|
||||
// int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2);
|
||||
ldr q31, [x2] // prev2[0] (prev2 is cur)
|
||||
ldr q21, [x17] // next2[0]
|
||||
uhadd v16.16b, v31.16b, v21.16b // d0 = v16
|
||||
uabd v17.16b, v31.16b, v21.16b // td0 = v17
|
||||
ldr q24, [x2, w6, sxtw] // m1 = v24
|
||||
ldr q22, [x2, w5, sxtw] // p1 = v22
|
||||
|
||||
ldr q0, [x1, w6, sxtw] // prev[mrefs]
|
||||
ldr q2, [x1, w5, sxtw] // prev[prefs]
|
||||
ldr q1, [x3, w6, sxtw] // next[mrefs]
|
||||
ldr q3, [x3, w5, sxtw] // next[prefs]
|
||||
|
||||
ushr v29.16b, v17.16b, #1 // v29 = temporal_diff0 >> 1
|
||||
|
||||
uabd v31.16b, v0.16b, v24.16b
|
||||
uabd v30.16b, v2.16b, v22.16b
|
||||
uhadd v0.16b, v31.16b, v30.16b // td1 = q0
|
||||
|
||||
uabd v31.16b, v1.16b, v24.16b
|
||||
uabd v30.16b, v3.16b, v22.16b
|
||||
uhadd v1.16b, v31.16b, v30.16b // td2 = q1
|
||||
|
||||
umax v0.16b, v0.16b, v29.16b
|
||||
umax v0.16b, v0.16b, v1.16b // diff = v0
|
||||
|
||||
// if (spat) {
|
||||
// SPAT_CHECK()
|
||||
// }
|
||||
// i0 = (m1 + p1) >> 1;
|
||||
cbz w16, 1f // spat == 0: skip the spatial check
|
||||
|
||||
ldr q31, [x2, w8, sxtw] // prev2[mrefs2]
|
||||
ldr q18, [x17, w8, sxtw] // next2[mrefs2]
|
||||
ldr q30, [x2, w7, sxtw] // prev2[prefs2]
|
||||
ldr q19, [x17, w7, sxtw] // next2[prefs2]
|
||||
uhadd v18.16b, v18.16b, v31.16b // m2s1 = v18, halving add of the mrefs2 pair
|
||||
uhadd v19.16b, v19.16b, v30.16b // p2s1 = v19, halving add of the prefs2 pair
|
||||
|
||||
SPAT_CHECK v0, v18, v24, v16, v22, v19, v31, v30, v29, v28
|
||||
|
||||
1:
|
||||
uhadd v2.16b, v22.16b, v24.16b // v2 = (p1 + m1) >> 1
|
||||
|
||||
// i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30
|
||||
DIFF_CLIP v2, v2, v16, v0, v31, v30
|
||||
|
||||
// dst[0] = av_clip(interpol, 0, clip_max);
|
||||
str q2, [x0], #16 // store 16 result bytes and advance dst
|
||||
|
||||
// dst++;
|
||||
// cur++;
|
||||
// }
|
||||
subs w4, w4, #16 // 16 bytes done; sets the flags used by bgt below
|
||||
add x1, x1, #16
|
||||
add x2, x2, #16
|
||||
add x3, x3, #16
|
||||
add x17, x17, #16
|
||||
bgt 10b // loop while the remaining width is > 0
|
||||
|
||||
99:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// ============================================================================
|
||||
//
|
||||
// void ff_bwdif_filter_intra_neon(
|
||||
|
@ -41,6 +41,10 @@ void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
|
||||
void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth);
|
||||
void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth);
|
||||
|
||||
void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
|
||||
int w, int prefs, int mrefs, int prefs2, int mrefs2,
|
||||
int parity, int clip_max, int spat);
|
||||
|
||||
void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
|
||||
int prefs3, int mrefs3, int parity, int clip_max);
|
||||
|
||||
|
@ -150,7 +150,7 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
|
||||
FILTER2()
|
||||
}
|
||||
|
||||
static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
|
||||
void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
|
||||
int w, int prefs, int mrefs, int prefs2, int mrefs2,
|
||||
int parity, int clip_max, int spat)
|
||||
{
|
||||
@ -364,7 +364,7 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
|
||||
} else {
|
||||
s->filter_intra = ff_bwdif_filter_intra_c;
|
||||
s->filter_line = filter_line_c;
|
||||
s->filter_edge = filter_edge;
|
||||
s->filter_edge = ff_bwdif_filter_edge_c;
|
||||
}
|
||||
|
||||
#if ARCH_X86
|
||||
|
Loading…
Reference in New Issue
Block a user