/*
 * bwdif aarch64 NEON optimisations
 *
 * Copyright (c) 2023 John Cox
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// Space taken on the stack by an int (32-bit)
#ifdef __APPLE__
.set    SP_INT, 4
#else
.set    SP_INT, 8
#endif

// Narrow four 32-bit accumulators to 16 unsigned bytes with saturation.
// Each sqshrun shifts right by (n - 8) into 16-bit lanes; uzp2 then keeps
// the high byte of each halfword (a further >> 8), so the net effect is
// b = sat_u8(sN >> n) for all 16 lanes.
.macro SQSHRUNN b, s0, s1, s2, s3, n
        sqshrun         \s0\().4h, \s0\().4s, #\n - 8
        sqshrun2        \s0\().8h, \s1\().4s, #\n - 8
        sqshrun         \s1\().4h, \s2\().4s, #\n - 8
        sqshrun2        \s1\().8h, \s3\().4s, #\n - 8
        uzp2            \b\().16b, \s0\().16b, \s1\().16b
.endm

// Signed multiply-long: {a0..a3}.4s = {s0,s1}.8h * k (k is a 16-bit scalar lane)
.macro SMULL4K a0, a1, a2, a3, s0, s1, k
        smull           \a0\().4s, \s0\().4h, \k
        smull2          \a1\().4s, \s0\().8h, \k
        smull           \a2\().4s, \s1\().4h, \k
        smull2          \a3\().4s, \s1\().8h, \k
.endm

// Unsigned multiply-long: {a0..a3}.4s = {s0,s1}.8h * k
.macro UMULL4K a0, a1, a2, a3, s0, s1, k
        umull           \a0\().4s, \s0\().4h, \k
        umull2          \a1\().4s, \s0\().8h, \k
        umull           \a2\().4s, \s1\().4h, \k
        umull2          \a3\().4s, \s1\().8h, \k
.endm

// Unsigned multiply-accumulate-long: {a0..a3}.4s += {s0,s1}.8h * k
.macro UMLAL4K a0, a1, a2, a3, s0, s1, k
        umlal           \a0\().4s, \s0\().4h, \k
        umlal2          \a1\().4s, \s0\().8h, \k
        umlal           \a2\().4s, \s1\().4h, \k
        umlal2          \a3\().4s, \s1\().8h, \k
.endm

// Unsigned multiply-subtract-long: {a0..a3}.4s -= {s0,s1}.8h * k
.macro UMLSL4K a0, a1, a2, a3, s0, s1, k
        umlsl           \a0\().4s, \s0\().4h, \k
        umlsl2          \a1\().4s, \s0\().8h, \k
        umlsl           \a2\().4s, \s1\().4h, \k
        umlsl2          \a3\().4s, \s1\().8h, \k
.endm

// Spatial check on 16 unsigned byte lanes; uqsub (saturating subtract)
// clamps each difference at 0, which stands in for the signed FFMIN/FFMAX
// logic of the C reference below:
//
// int b = m2s1 - m1;
// int f = p2s1 - p1;
// int dc = c0s1 - m1;
// int de = c0s1 - p1;
// int sp_max = FFMIN(p1 - c0s1, m1 - c0s1);
// sp_max = FFMIN(sp_max, FFMAX(-b,-f));
// int sp_min = FFMIN(c0s1 - p1, c0s1 - m1);
// sp_min = FFMIN(sp_min, FFMAX(b,f));
// diff = diff == 0 ? 0 : FFMAX3(diff, sp_min, sp_max);
.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3
        uqsub           \t0\().16b, \p1\().16b, \c0s1\().16b
        uqsub           \t2\().16b, \m1\().16b, \c0s1\().16b
        umin            \t2\().16b, \t0\().16b, \t2\().16b

        uqsub           \t1\().16b, \m1\().16b, \m2s1\().16b
        uqsub           \t3\().16b, \p1\().16b, \p2s1\().16b
        umax            \t3\().16b, \t3\().16b, \t1\().16b
        umin            \t3\().16b, \t3\().16b, \t2\().16b

        uqsub           \t0\().16b, \c0s1\().16b, \p1\().16b
        uqsub           \t2\().16b, \c0s1\().16b, \m1\().16b
        umin            \t2\().16b, \t0\().16b, \t2\().16b

        uqsub           \t1\().16b, \m2s1\().16b, \m1\().16b
        uqsub           \t0\().16b, \p2s1\().16b, \p1\().16b
        umax            \t0\().16b, \t0\().16b, \t1\().16b
        umin            \t2\().16b, \t2\().16b, \t0\().16b

        // diff == 0 lanes are forced back to 0 by the bic at the end
        cmeq            \t1\().16b, \diff\().16b, #0
        umax            \diff\().16b, \diff\().16b, \t3\().16b
        umax            \diff\().16b, \diff\().16b, \t2\().16b
        bic             \diff\().16b, \diff\().16b, \t1\().16b
.endm

// Clip s0 into [d0 - diff, d0 + diff] (per byte lane, unsigned saturating):
//
// i0 = s0;
// if (i0 > d0 + diff0)
//     i0 = d0 + diff0;
// else if (i0 < d0 - diff0)
//     i0 = d0 - diff0;
//
// i0 = s0 is safe
.macro DIFF_CLIP i0, s0, d0, diff, t0, t1
        uqadd           \t0\().16b, \d0\().16b, \diff\().16b
        uqsub           \t1\().16b, \d0\().16b, \diff\().16b
        umin            \i0\().16b, \s0\().16b, \t0\().16b
        umax            \i0\().16b, \i0\().16b, \t1\().16b
.endm

// Select i1 or i2 per lane depending on the temporal difference, then clip:
//
// i0 = FFABS(m1 - p1) > td0 ? i1 : i2;
// DIFF_CLIP
//
// i0 = i1 is safe
.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2
        uabd            \t0\().16b, \m1\().16b, \p1\().16b
        cmhi            \t0\().16b, \t0\().16b, \td0\().16b
        bsl             \t0\().16b, \i1\().16b, \i2\().16b
        DIFF_CLIP       \i0, \t0, \d0, \diff, \t1, \t2
.endm

// Save the callee-saved low halves of v8-v15 (AAPCS64 requires d8-d15
// to be preserved across calls)
.macro PUSH_VREGS
        stp             d8,  d9,  [sp, #-64]!
        stp             d10, d11, [sp, #16]
        stp             d12, d13, [sp, #32]
        stp             d14, d15, [sp, #48]
.endm

// Restore the registers saved by PUSH_VREGS
.macro POP_VREGS
        ldp             d14, d15, [sp, #48]
        ldp             d12, d13, [sp, #32]
        ldp             d10, d11, [sp, #16]
        ldp             d8,  d9,  [sp], #64
.endm

// Load the coefficient table into \d (clobbers \t0 as address scratch)
.macro LDR_COEFFS d, t0
        movrel          \t0, coeffs, 0
        ld1             {\d\().8h}, [\t0]
.endm

// static const uint16_t coef_lf[2] = { 4309, 213 };
// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
// static const uint16_t coef_sp[2] = { 5077, 981 };

const coeffs, align=4   // align 4 means align on 2^4 boundary
        .hword          4309 * 4, 213 * 4               // lf[0]*4 = v0.h[0]
        .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
        .hword          5077, 981                       // sp[0] = v0.h[6]
endconst

// ===========================================================================
//
// Filter three output lines per pass: dst[0] and dst[d_stride*2] are
// deinterpolated, dst[d_stride] is copied straight from cur[prefs].
// Processes 16 pixels per loop iteration; w is assumed to be a multiple
// that makes the final (possibly overreading) iteration safe for the
// caller — TODO confirm against the C-side wrapper.
//
// void ff_bwdif_filter_line3_neon(
//      void * dst1,            // x0
//      int d_stride,           // w1
//      const void * prev1,     // x2
//      const void * cur1,      // x3
//      const void * next1,     // x4
//      int s_stride,           // w5
//      int w,                  // w6
//      int parity,             // w7
//      int clip_max);          // [sp, #0] (Ignored)

function ff_bwdif_filter_line3_neon, export=1
        // Sanity check w
        cmp             w6, #0
        ble             99f

        LDR_COEFFS      v0, x17

// #define prev2 cur
// const uint8_t * restrict next2 = parity ? prev : next;
        cmp             w7, #0
        csel            x17, x2, x4, ne

        // We want all the V registers - save all the ones we must
        PUSH_VREGS

        // Some rearrangement of initial values for nice layout of refs in regs
        mov             w10, w6                 // w10 = loop count
        neg             w9,  w5                 // w9  = mref
        lsl             w8,  w9, #1             // w8  = mref2
        add             w7,  w9, w9, LSL #1     // w7  = mref3
        lsl             w6,  w9, #2             // w6  = mref4
        mov             w11, w5                 // w11 = pref
        lsl             w12, w5, #1             // w12 = pref2
        add             w13, w5, w5, LSL #1     // w13 = pref3
        lsl             w14, w5, #2             // w14 = pref4
        add             w15, w5, w5, LSL #2     // w15 = pref5
        add             w16, w14, w12           // w16 = pref6
        lsl             w5,  w1, #1             // w5  = d_stride * 2

//  for (x = 0; x < w; x++) {
//      int diff0, diff2;
//      int d0, d2;
//      int temporal_diff0, temporal_diff2;
//
//      int i1, i2;
//      int j1, j2;
//      int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;

10:
//      c0 = prev2[0] + next2[0];            // c0 = v20, v21
//      d0  = c0 >> 1;                       // d0 = v10
//      temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
        ldr             q31, [x3]
        ldr             q21, [x17]
        uhadd           v10.16b, v31.16b, v21.16b
        uabd            v11.16b, v31.16b, v21.16b
        uaddl           v20.8h,  v21.8b,  v31.8b
        uaddl2          v21.8h,  v21.16b, v31.16b

        ldr             q31, [x3, w6, sxtw]
        ldr             q23, [x17, w6, sxtw]

//      i1 = coef_hf[0] * c0;                // i1 = v2-v5
        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[2]

        ldr             q30, [x3, w14, sxtw]
        ldr             q25, [x17, w14, sxtw]

//      m4 = prev2[mrefs4] + next2[mrefs4];  // m4 = v22,v23
        uaddl           v22.8h,  v23.8b,  v31.8b
        uaddl2          v23.8h,  v23.16b, v31.16b

//      p4 = prev2[prefs4] + next2[prefs4];  // p4 = v24,v25, (p4 >> 1) = v12
        uhadd           v12.16b, v25.16b, v30.16b
        uaddl           v24.8h,  v25.8b,  v30.8b
        uaddl2          v25.8h,  v25.16b, v30.16b

//      j1 = -coef_hf[1] * (c0 + p4);        // j1 = v6-v9  (-c0:v20,v21)
        add             v20.8h,  v20.8h,  v24.8h
        add             v21.8h,  v21.8h,  v25.8h
        SMULL4K         v6, v7, v8, v9, v20, v21, v0.h[5]

//      m3 = cur[mrefs3];                    // m3 = v20
        ldr             q20, [x3, w7, sxtw]

//      p3 = cur[prefs3];                    // p3 = v21
        ldr             q21, [x3, w13, sxtw]

//      i1 += coef_hf[2] * (m4 + p4);        // (-m4:v22,v23) (-p4:v24,v25)
        add             v22.8h,  v22.8h,  v24.8h
        add             v23.8h,  v23.8h,  v25.8h
        UMLAL4K         v2, v3, v4, v5, v22, v23, v0.h[4]

        ldr             q29, [x3, w8, sxtw]
        ldr             q23, [x17, w8, sxtw]

//      i1 -= coef_lf[1] * 4 * (m3 + p3);    // -
        uaddl           v30.8h,  v20.8b,  v21.8b
        uaddl2          v31.8h,  v20.16b, v21.16b

        ldr             q28, [x3, w16, sxtw]
        ldr             q25, [x17, w16, sxtw]

        UMLSL4K         v2, v3, v4, v5, v30, v31, v0.h[1]

//      m2 = prev2[mrefs2] + next2[mrefs2];  // m2 = v22,v23, (m2 >> 1) = v13
        uhadd           v13.16b, v23.16b, v29.16b
        uaddl           v22.8h,  v23.8b,  v29.8b
        uaddl2          v23.8h,  v23.16b, v29.16b

        ldr             q31, [x3, w12, sxtw]
        ldr             q27, [x17, w12, sxtw]

//      p6 = prev2[prefs6] + next2[prefs6];  // p6 = v24,v25
        uaddl           v24.8h,  v25.8b,  v28.8b
        uaddl2          v25.8h,  v25.16b, v28.16b

//      j1 += coef_hf[2] * (m2 + p6);        // (-p6:v24,v25)
        add             v24.8h,  v24.8h,  v22.8h
        add             v25.8h,  v25.8h,  v23.8h
        UMLAL4K         v6, v7, v8, v9, v24, v25, v0.h[4]

//      m1 = cur[mrefs];                     // m1 = v24
        ldr             q24, [x3, w9, sxtw]

//      p5 = cur[prefs5];                    // p5 = v25
        ldr             q25, [x3, w15, sxtw]

//      p2 = prev2[prefs2] + next2[prefs2];  // p2 = v26, v27
//      temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
//      d2  = p2 >> 1;                       // d2 = v15
        uabd            v14.16b, v31.16b, v27.16b
        uhadd           v15.16b, v31.16b, v27.16b
        uaddl           v26.8h,  v27.8b,  v31.8b
        uaddl2          v27.8h,  v27.16b, v31.16b

//      j1 += coef_hf[0] * p2;               // -
        UMLAL4K         v6, v7, v8, v9, v26, v27, v0.h[2]

//      i1 -= coef_hf[1] * (m2 + p2);        // (-m2:v22,v23*) (-p2:v26*,v27*)
        add             v22.8h,  v22.8h,  v26.8h
        add             v23.8h,  v23.8h,  v27.8h
        UMLSL4K         v2, v3, v4, v5, v22, v23, v0.h[3]

//      p1 = cur[prefs];                     // p1 = v22
        ldr             q22, [x3, w11, sxtw]

//      j1 -= coef_lf[1] * 4 * (m1 + p5);    // -
        uaddl           v26.8h,  v24.8b,  v25.8b
        uaddl2          v27.8h,  v24.16b, v25.16b
        UMLSL4K         v6, v7, v8, v9, v26, v27, v0.h[1]

//      j2 = (coef_sp[0] * (p1 + p3) - coef_sp[1] * (m1 + p5)) >> 13; // (-p5:v25*) j2=v16
        uaddl           v18.8h,  v22.8b,  v21.8b
        uaddl2          v19.8h,  v22.16b, v21.16b
        UMULL4K         v28, v29, v30, v31, v18, v19, v0.h[6]

        uaddl           v18.8h,  v24.8b,  v25.8b
        uaddl2          v19.8h,  v24.16b, v25.16b
        UMLSL4K         v28, v29, v30, v31, v18, v19, v0.h[7]

        SQSHRUNN        v16, v28, v29, v30, v31, 13

//      i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
        uaddl           v18.8h,  v22.8b,  v24.8b
        uaddl2          v19.8h,  v22.16b, v24.16b
        UMULL4K         v28, v29, v30, v31, v18, v19, v0.h[6]

        uaddl           v18.8h,  v20.8b,  v21.8b
        uaddl2          v19.8h,  v20.16b, v21.16b
        UMLSL4K         v28, v29, v30, v31, v18, v19, v0.h[7]

        SQSHRUNN        v17, v28, v29, v30, v31, 13

//      i1 += coef_lf[0] * 4 * (m1 + p1);    // p1 = v22, m1 = v24
        uaddl           v26.8h,  v24.8b,  v22.8b
        uaddl2          v27.8h,  v24.16b, v22.16b
        UMLAL4K         v2, v3, v4, v5, v26, v27, v0.h[0]

        ldr             q31, [x2, w9, sxtw]
        ldr             q29, [x4, w9, sxtw]

//      j1 += coef_lf[0] * 4 * (p1 + p3);    // p1 = v22, p3 = v21
        uaddl           v26.8h,  v21.8b,  v22.8b
        uaddl2          v27.8h,  v21.16b, v22.16b
        UMLAL4K         v6, v7, v8, v9, v26, v27, v0.h[0]

        ldr             q30, [x2, w11, sxtw]
        ldr             q28, [x4, w11, sxtw]

//      i1 >>= 15;                           // i1 = v2, -v3, -v4*, -v5*
        SQSHRUNN        v2, v2, v3, v4, v5, 15

//      j1 >>= 15;                           // j1 = v3, -v6*, -v7*, -v8*, -v9*
        SQSHRUNN        v3, v6, v7, v8, v9, 15

//      {
//          int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
//          int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
        uabd            v30.16b, v22.16b, v30.16b
        uabd            v31.16b, v24.16b, v31.16b
        uabd            v28.16b, v22.16b, v28.16b
        uabd            v29.16b, v24.16b, v29.16b
        uhadd           v31.16b, v31.16b, v30.16b
        uhadd           v29.16b, v29.16b, v28.16b

        ldr             q27, [x2, w13, sxtw]
        ldr             q26, [x4, w13, sxtw]

//          diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
        ushr            v18.16b, v11.16b, #1
        umax            v18.16b, v18.16b, v31.16b
        umax            v18.16b, v18.16b, v29.16b
//      }  // v28, v30 preserved for next block

//      {  // tdiff2 = v14
//          int t1 =(FFABS(prev[prefs] - p1) + FFABS(prev[prefs3] - p3)) >> 1;
//          int t2 =(FFABS(next[prefs] - p1) + FFABS(next[prefs3] - p3)) >> 1;
        uabd            v31.16b, v21.16b, v27.16b
        uabd            v29.16b, v21.16b, v26.16b
        uhadd           v31.16b, v31.16b, v30.16b
        uhadd           v29.16b, v29.16b, v28.16b

//          diff2 = FFMAX3(temporal_diff2 >> 1, t1, t2); // diff2=v19
        ushr            v19.16b, v14.16b, #1
        umax            v19.16b, v19.16b, v31.16b
        umax            v19.16b, v19.16b, v29.16b
//      }

        // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
        SPAT_CHECK      v18, v13, v24, v10, v22, v15, v31, v30, v29, v28

        // diff2 = v19, d0 = v10, p1 = v22, d2 = v15, p3 = v21, (p4 >> 1) = v12
        SPAT_CHECK      v19, v10, v22, v15, v21, v12, v31, v30, v29, v28

        // j1 = v3, j2 = v16, p1 = v22, d2 = v15, p3 = v21, td2 = v14, diff2 = v19
        INTERPOL        v3, v3, v16, v22, v15, v21, v14, v19, v31, v30, v29

//      dst[d_stride * 2] = av_clip_uint8(interpol);
        str             q3, [x0, w5, sxtw]

//      dst[d_stride] = p1;
        str             q22, [x0, w1, sxtw]

        // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18
        INTERPOL        v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29

//      dst[0] = av_clip_uint8(interpol);
        str             q2, [x0], #16

//  }
//
//  dst  += 16;
//  cur  += 16;
//  prev += 16;
//  prev2+= 16;
//  next += 16;   (16 pixels per iteration)
        subs            w10, w10, #16
        add             x2,  x2,  #16
        add             x3,  x3,  #16
        add             x4,  x4,  #16
        add             x17, x17, #16
        bgt             10b

        POP_VREGS
99:
        ret
endfunc

// ===========================================================================
//
// Single-line variant of the filter above: same arithmetic for i1/i2 and
// the spatial check, but only dst[0] is produced (no j1/j2 path).
// Processes 16 pixels per loop iteration.
//
// void filter_line(
//      void *dst1,     // x0
//      void *prev1,    // x1
//      void *cur1,     // x2
//      void *next1,    // x3
//      int w,          // w4
//      int prefs,      // w5
//      int mrefs,      // w6
//      int prefs2,     // w7
//      int mrefs2,     // [sp, #0]
//      int prefs3,     // [sp, #SP_INT]
//      int mrefs3,     // [sp, #SP_INT*2]
//      int prefs4,     // [sp, #SP_INT*3]
//      int mrefs4,     // [sp, #SP_INT*4]
//      int parity,     // [sp, #SP_INT*5]
//      int clip_max)   // [sp, #SP_INT*6]

function ff_bwdif_filter_line_neon, export=1
        // Sanity check w
        cmp             w4, #0
        ble             99f

        // Rearrange regs to be the same as line3 for ease of debug!
        mov             w10, w4                 // w10 = loop count
        mov             w9,  w6                 // w9  = mref
        mov             w12, w7                 // w12 = pref2
        mov             w11, w5                 // w11 = pref
        ldr             w8,  [sp, #0]           // w8  = mref2
        ldr             w7,  [sp, #SP_INT*2]    // w7  = mref3
        ldr             w6,  [sp, #SP_INT*4]    // w6  = mref4
        ldr             w13, [sp, #SP_INT]      // w13 = pref3
        ldr             w14, [sp, #SP_INT*3]    // w14 = pref4

        mov             x4,  x3
        mov             x3,  x2
        mov             x2,  x1

        LDR_COEFFS      v0, x17

// #define prev2 cur
// const uint8_t * restrict next2 = parity ? prev : next;
        ldr             w17, [sp, #SP_INT*5]    // parity
        cmp             w17, #0
        csel            x17, x2, x4, ne

        PUSH_VREGS

//  for (x = 0; x < w; x++) {
//      int diff0, diff2;
//      int d0, d2;
//      int temporal_diff0, temporal_diff2;
//
//      int i1, i2;
//      int j1, j2;
//      int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;

10:
//      c0 = prev2[0] + next2[0];            // c0 = v20, v21
//      d0  = c0 >> 1;                       // d0 = v10
//      temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
        ldr             q31, [x3]
        ldr             q21, [x17]
        uhadd           v10.16b, v31.16b, v21.16b
        uabd            v11.16b, v31.16b, v21.16b
        uaddl           v20.8h,  v21.8b,  v31.8b
        uaddl2          v21.8h,  v21.16b, v31.16b

        ldr             q31, [x3, w6, sxtw]
        ldr             q23, [x17, w6, sxtw]

//      i1 = coef_hf[0] * c0;                // i1 = v2-v5
        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[2]

        ldr             q30, [x3, w14, sxtw]
        ldr             q25, [x17, w14, sxtw]

//      m4 = prev2[mrefs4] + next2[mrefs4];  // m4 = v22,v23
        uaddl           v22.8h,  v23.8b,  v31.8b
        uaddl2          v23.8h,  v23.16b, v31.16b

//      p4 = prev2[prefs4] + next2[prefs4];  // p4 = v24,v25, (p4 >> 1) = v12
        uhadd           v12.16b, v25.16b, v30.16b
        uaddl           v24.8h,  v25.8b,  v30.8b
        uaddl2          v25.8h,  v25.16b, v30.16b

//      m3 = cur[mrefs3];                    // m3 = v20
        ldr             q20, [x3, w7, sxtw]

//      p3 = cur[prefs3];                    // p3 = v21
        ldr             q21, [x3, w13, sxtw]

//      i1 += coef_hf[2] * (m4 + p4);        // (-m4:v22,v23) (-p4:v24,v25)
        add             v22.8h,  v22.8h,  v24.8h
        add             v23.8h,  v23.8h,  v25.8h
        UMLAL4K         v2, v3, v4, v5, v22, v23, v0.h[4]

        ldr             q29, [x3, w8, sxtw]
        ldr             q23, [x17, w8, sxtw]

//      i1 -= coef_lf[1] * 4 * (m3 + p3);    // -
        uaddl           v30.8h,  v20.8b,  v21.8b
        uaddl2          v31.8h,  v20.16b, v21.16b

        UMLSL4K         v2, v3, v4, v5, v30, v31, v0.h[1]

//      m2 = prev2[mrefs2] + next2[mrefs2];  // m2 = v22,v23, (m2 >> 1) = v13
        uhadd           v13.16b, v23.16b, v29.16b
        uaddl           v22.8h,  v23.8b,  v29.8b
        uaddl2          v23.8h,  v23.16b, v29.16b

        ldr             q31, [x3, w12, sxtw]
        ldr             q27, [x17, w12, sxtw]

//      m1 = cur[mrefs];                     // m1 = v24
        ldr             q24, [x3, w9, sxtw]

//      p2 = prev2[prefs2] + next2[prefs2];  // p2 = v26, v27
//      temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
//      d2  = p2 >> 1;                       // d2 = v15
        uabd            v14.16b, v31.16b, v27.16b
        uhadd           v15.16b, v31.16b, v27.16b
        uaddl           v26.8h,  v27.8b,  v31.8b
        uaddl2          v27.8h,  v27.16b, v31.16b

//      i1 -= coef_hf[1] * (m2 + p2);        // (-m2:v22,v23*) (-p2:v26*,v27*)
        add             v22.8h,  v22.8h,  v26.8h
        add             v23.8h,  v23.8h,  v27.8h
        UMLSL4K         v2, v3, v4, v5, v22, v23, v0.h[3]

//      p1 = cur[prefs];                     // p1 = v22
        ldr             q22, [x3, w11, sxtw]

//      i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
        uaddl           v18.8h,  v22.8b,  v24.8b
        uaddl2          v19.8h,  v22.16b, v24.16b
        UMULL4K         v28, v29, v30, v31, v18, v19, v0.h[6]

        uaddl           v18.8h,  v20.8b,  v21.8b
        uaddl2          v19.8h,  v20.16b, v21.16b
        UMLSL4K         v28, v29, v30, v31, v18, v19, v0.h[7]

        SQSHRUNN        v17, v28, v29, v30, v31, 13

//      i1 += coef_lf[0] * 4 * (m1 + p1);    // p1 = v22, m1 = v24
        uaddl           v26.8h,  v24.8b,  v22.8b
        uaddl2          v27.8h,  v24.16b, v22.16b
        UMLAL4K         v2, v3, v4, v5, v26, v27, v0.h[0]

        ldr             q31, [x2, w9, sxtw]
        ldr             q29, [x4, w9, sxtw]

        ldr             q30, [x2, w11, sxtw]
        ldr             q28, [x4, w11, sxtw]

//      i1 >>= 15;                           // i1 = v2, -v3, -v4*, -v5*
        SQSHRUNN        v2, v2, v3, v4, v5, 15

//      {
//          int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
//          int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
        uabd            v30.16b, v22.16b, v30.16b
        uabd            v31.16b, v24.16b, v31.16b
        uabd            v28.16b, v22.16b, v28.16b
        uabd            v29.16b, v24.16b, v29.16b
        uhadd           v31.16b, v31.16b, v30.16b
        uhadd           v29.16b, v29.16b, v28.16b

//          diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
        ushr            v18.16b, v11.16b, #1
        umax            v18.16b, v18.16b, v31.16b
        umax            v18.16b, v18.16b, v29.16b
//      }

        // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
        SPAT_CHECK      v18, v13, v24, v10, v22, v15, v31, v30, v29, v28

        // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18
        INTERPOL        v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29

//      dst[0] = av_clip_uint8(interpol);
        str             q2, [x0], #16

//  }
//
//  dst  += 16;
//  cur  += 16;
//  prev += 16;
//  prev2+= 16;
//  next += 16;   (16 pixels per iteration)
        subs            w10, w10, #16
        add             x2,  x2,  #16
        add             x3,  x3,  #16
        add             x4,  x4,  #16
        add             x17, x17, #16
        bgt             10b

        POP_VREGS
99:
        ret
endfunc

// ============================================================================
//
// Edge-line filter: simple (m1 + p1) >> 1 average, clipped against the
// temporal diff, with the SPAT_CHECK applied only when spat is non-zero.
// Processes 16 pixels per loop iteration; clip_max is unused (8-bit only).
//
// void ff_bwdif_filter_edge_neon(
//      void *dst1,     // x0
//      void *prev1,    // x1
//      void *cur1,     // x2
//      void *next1,    // x3
//      int w,          // w4
//      int prefs,      // w5
//      int mrefs,      // w6
//      int prefs2,     // w7
//      int mrefs2,     // [sp, #0]
//      int parity,     // [sp, #SP_INT]
//      int clip_max,   // [sp, #SP_INT*2]  unused
//      int spat);      // [sp, #SP_INT*3]

function ff_bwdif_filter_edge_neon, export=1
        // Sanity check w
        cmp             w4, #0
        ble             99f

// #define prev2 cur
// const uint8_t * restrict next2 = parity ? prev : next;
        ldr             w8,  [sp, #0]           // mrefs2
        ldr             w17, [sp, #SP_INT]      // parity
        ldr             w16, [sp, #SP_INT*3]    // spat
        cmp             w17, #0
        csel            x17, x1, x3, ne

//  for (x = 0; x < w; x++) {

10:
//      int m1 = cur[mrefs];
//      int d = (prev2[0] + next2[0]) >> 1;
//      int p1 = cur[prefs];
//      int temporal_diff0 = FFABS(prev2[0] - next2[0]);
//      int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
//      int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
//      int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2);
        ldr             q31, [x2]
        ldr             q21, [x17]
        uhadd           v16.16b, v31.16b, v21.16b   // d0 = v16
        uabd            v17.16b, v31.16b, v21.16b   // td0 = v17
        ldr             q24, [x2, w6, sxtw]         // m1 = v24
        ldr             q22, [x2, w5, sxtw]         // p1 = v22
        ldr             q0,  [x1, w6, sxtw]         // prev[mrefs]
        ldr             q2,  [x1, w5, sxtw]         // prev[prefs]
        ldr             q1,  [x3, w6, sxtw]         // next[mrefs]
        ldr             q3,  [x3, w5, sxtw]         // next[prefs]
        ushr            v29.16b, v17.16b, #1
        uabd            v31.16b, v0.16b, v24.16b
        uabd            v30.16b, v2.16b, v22.16b
        uhadd           v0.16b,  v31.16b, v30.16b   // td1 = q0
        uabd            v31.16b, v1.16b, v24.16b
        uabd            v30.16b, v3.16b, v22.16b
        uhadd           v1.16b,  v31.16b, v30.16b   // td2 = q1
        umax            v0.16b,  v0.16b, v29.16b
        umax            v0.16b,  v0.16b, v1.16b     // diff = v0

//      if (spat) {
//          SPAT_CHECK()
//      }
//      i0 = (m1 + p1) >> 1;
        cbz             w16, 1f

        ldr             q31, [x2, w8, sxtw]
        ldr             q18, [x17, w8, sxtw]
        ldr             q30, [x2, w7, sxtw]
        ldr             q19, [x17, w7, sxtw]
        uhadd           v18.16b, v18.16b, v31.16b
        uhadd           v19.16b, v19.16b, v30.16b
        SPAT_CHECK      v0, v18, v24, v16, v22, v19, v31, v30, v29, v28

1:
        uhadd           v2.16b, v22.16b, v24.16b

        // i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30
        DIFF_CLIP       v2, v2, v16, v0, v31, v30

//      dst[0] = av_clip(interpol, 0, clip_max);
        str             q2, [x0], #16

//      dst++;
//      cur++;
//  }     (16 pixels per iteration)
        subs            w4,  w4,  #16
        add             x1,  x1,  #16
        add             x2,  x2,  #16
        add             x3,  x3,  #16
        add             x17, x17, #16
        bgt             10b
99:
        ret
endfunc

// ============================================================================
//
// Intra (spatial-only) filter: 4-tap vertical interpolation from cur only.
// Processes 16 pixels per loop iteration; parity and clip_max are unused.
//
// void ff_bwdif_filter_intra_neon(
//      void *dst1,     // x0
//      void *cur1,     // x1
//      int w,          // w2
//      int prefs,      // w3
//      int mrefs,      // w4
//      int prefs3,     // w5
//      int mrefs3,     // w6
//      int parity,     // w7     unused
//      int clip_max)   // [sp, #0]  unused

function ff_bwdif_filter_intra_neon, export=1
        // Sanity check w
        cmp             w2, #0
        ble             99f

        LDR_COEFFS      v0, x17

//  for (x = 0; x < w; x++) {
10:
//      interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13;
        ldr             q31, [x1, w4, sxtw]
        ldr             q30, [x1, w3, sxtw]
        ldr             q29, [x1, w6, sxtw]
        ldr             q28, [x1, w5, sxtw]

        uaddl           v20.8h,  v31.8b,  v30.8b
        uaddl2          v21.8h,  v31.16b, v30.16b
        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[6]

        uaddl           v20.8h,  v29.8b,  v28.8b
        uaddl2          v21.8h,  v29.16b, v28.16b
        UMLSL4K         v2, v3, v4, v5, v20, v21, v0.h[7]

//      dst[0] = av_clip(interpol, 0, clip_max);
        SQSHRUNN        v2, v2, v3, v4, v5, 13
        str             q2, [x0], #16

//      dst++;
//      cur++;
//  }     (16 pixels per iteration)
        subs            w2, w2, #16
        add             x1, x1, #16
        bgt             10b
99:
        ret
endfunc