lavc/aarch64: h264qpel, add 10-bit lowpass_8_10 based functions

Benchmarks                         A53      A55     A72     A76
avg_h264_qpel_8_mc01_10_c:        936.5    924.0   656.0   504.7
avg_h264_qpel_8_mc01_10_neon:     234.7    202.0   120.7    63.2
avg_h264_qpel_8_mc02_10_c:        921.0    920.0   669.2   493.7
avg_h264_qpel_8_mc02_10_neon:     202.0    173.2   102.7    58.5
avg_h264_qpel_8_mc03_10_c:        936.5    924.0   656.0   509.5
avg_h264_qpel_8_mc03_10_neon:     236.2    203.7   120.0    63.2
avg_h264_qpel_8_mc10_10_c:       1441.0   1437.7   806.7   478.5
avg_h264_qpel_8_mc10_10_neon:     325.7    324.0   153.7    94.2
avg_h264_qpel_8_mc11_10_c:       2160.7   2148.2  1366.7   906.7
avg_h264_qpel_8_mc11_10_neon:     492.0    464.0   242.5   134.5
avg_h264_qpel_8_mc13_10_c:       2157.0   2138.2  1357.0   908.2
avg_h264_qpel_8_mc13_10_neon:     494.0    467.2   242.0   140.0
avg_h264_qpel_8_mc20_10_c:       1433.5   1410.0   785.2   486.0
avg_h264_qpel_8_mc20_10_neon:     293.7    289.7   138.0    91.5
avg_h264_qpel_8_mc30_10_c:       1458.5   1461.7   813.7   483.2
avg_h264_qpel_8_mc30_10_neon:     341.7    339.2   154.0    95.2
avg_h264_qpel_8_mc31_10_c:       2194.7   2197.2  1358.7   928.0
avg_h264_qpel_8_mc31_10_neon:     520.0    495.0   245.5   142.5
avg_h264_qpel_8_mc33_10_c:       2188.0   2205.5  1356.7   910.7
avg_h264_qpel_8_mc33_10_neon:     521.0    494.5   245.7   145.7
avg_h264_qpel_16_mc01_10_c:      3717.2   3595.0  2610.0  2012.0
avg_h264_qpel_16_mc01_10_neon:    920.5    791.5   483.2   240.5
avg_h264_qpel_16_mc02_10_c:      3684.0   3633.0  2659.0  1919.7
avg_h264_qpel_16_mc02_10_neon:    790.7    678.2   409.2   217.0
avg_h264_qpel_16_mc03_10_c:      3726.5   3596.0  2606.7  2010.0
avg_h264_qpel_16_mc03_10_neon:    922.0    792.5   483.2   239.7
avg_h264_qpel_16_mc10_10_c:      5912.0   5803.2  3241.5  1916.7
avg_h264_qpel_16_mc10_10_neon:   1267.5   1277.2   616.5   365.0
avg_h264_qpel_16_mc11_10_c:      8599.2   8482.5  5338.0  3616.2
avg_h264_qpel_16_mc11_10_neon:   1913.0   1827.0   956.2   542.2
avg_h264_qpel_16_mc13_10_c:      8643.7   8488.5  5388.0  3628.5
avg_h264_qpel_16_mc13_10_neon:   1914.7   1828.7   969.2   530.5
avg_h264_qpel_16_mc20_10_c:      5719.5   5641.0  3147.0  1946.2
avg_h264_qpel_16_mc20_10_neon:   1139.5   1150.0   539.5   344.0
avg_h264_qpel_16_mc30_10_c:      5930.0   5872.5  3267.5  1918.0
avg_h264_qpel_16_mc30_10_neon:   1331.5   1341.2   616.5   369.5
avg_h264_qpel_16_mc31_10_c:      8758.7   8697.7  5353.0  3630.7
avg_h264_qpel_16_mc31_10_neon:   2018.7   1941.7   982.2   574.7
avg_h264_qpel_16_mc33_10_c:      8683.2   8675.2  5339.2  3634.7
avg_h264_qpel_16_mc33_10_neon:   2019.7   1940.2   994.5   566.0
put_h264_qpel_8_mc01_10_c:        854.2    843.0   599.2   478.0
put_h264_qpel_8_mc01_10_neon:     192.7    168.0   101.7    56.7
put_h264_qpel_8_mc02_10_c:        766.5    760.0   550.2   441.0
put_h264_qpel_8_mc02_10_neon:     160.0    139.2    88.7    53.0
put_h264_qpel_8_mc03_10_c:        854.2    843.0   599.2   479.0
put_h264_qpel_8_mc03_10_neon:     194.2    169.7   102.0    56.2
put_h264_qpel_8_mc10_10_c:       1352.7   1353.7   749.7   446.7
put_h264_qpel_8_mc10_10_neon:     289.7    294.2   135.5    88.5
put_h264_qpel_8_mc11_10_c:       2080.0   2066.2  1309.5   876.7
put_h264_qpel_8_mc11_10_neon:     450.0    429.7   229.7   131.2
put_h264_qpel_8_mc13_10_c:       2074.7   2060.2  1294.5   870.5
put_h264_qpel_8_mc13_10_neon:     452.5    434.5   226.5   130.0
put_h264_qpel_8_mc20_10_c:       1221.5   1216.0   684.5   399.7
put_h264_qpel_8_mc20_10_neon:     257.7    262.5   121.2    78.7
put_h264_qpel_8_mc30_10_c:       1379.0   1374.7   757.2   449.5
put_h264_qpel_8_mc30_10_neon:     305.7    310.2   135.5    86.5
put_h264_qpel_8_mc31_10_c:       2109.2   2119.7  1299.5   878.0
put_h264_qpel_8_mc31_10_neon:     478.0    458.5   226.0   137.2
put_h264_qpel_8_mc33_10_c:       2101.5   2115.2  1306.5   887.0
put_h264_qpel_8_mc33_10_neon:     479.0    458.7   229.7   141.7
put_h264_qpel_16_mc01_10_c:      3485.7   3396.7  2460.5  1914.5
put_h264_qpel_16_mc01_10_neon:    752.5    665.5   397.0   213.2
put_h264_qpel_16_mc02_10_c:      3103.5   3023.2  2154.7  1720.7
put_h264_qpel_16_mc02_10_neon:    622.7    551.2   347.7   196.2
put_h264_qpel_16_mc03_10_c:      3486.2   3394.0  2436.5  1917.7
put_h264_qpel_16_mc03_10_neon:    754.0    666.5   397.0   215.7
put_h264_qpel_16_mc10_10_c:      5533.0   5488.5  2989.0  1783.0
put_h264_qpel_16_mc10_10_neon:   1123.5   1165.2   535.2   334.7
put_h264_qpel_16_mc11_10_c:      8437.7   8281.2  5209.0  3510.7
put_h264_qpel_16_mc11_10_neon:   1745.0   1697.0   878.5   513.5
put_h264_qpel_16_mc13_10_c:      8567.7   8468.0  5221.5  3528.0
put_h264_qpel_16_mc13_10_neon:   1751.7   1698.2   889.2   507.0
put_h264_qpel_16_mc20_10_c:      4907.5   4885.0  2786.2  1607.5
put_h264_qpel_16_mc20_10_neon:    995.5   1034.5   475.5   307.0
put_h264_qpel_16_mc30_10_c:      5579.7   5537.7  3045.2  1789.5
put_h264_qpel_16_mc30_10_neon:   1187.5   1231.2   532.5   334.5
put_h264_qpel_16_mc31_10_c:      8677.2   8672.5  5204.2  3516.0
put_h264_qpel_16_mc31_10_neon:   1850.7   1813.2   893.0   545.2
put_h264_qpel_16_mc33_10_c:      8688.7   8671.2  5223.2  3512.0
put_h264_qpel_16_mc33_10_neon:   1851.7   1814.2   908.5   535.2

Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Mikhail Nitenko 2023-12-04 14:00:35 +04:00 committed by Martin Storsjö
parent f89cff96d0
commit 0f745b74ec
2 changed files with 621 additions and 2 deletions

View File

@ -95,12 +95,55 @@ void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t str
void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
/*
 * Install the AArch64 NEON H.264 quarter-pel motion compensation functions
 * into the put/avg function tables of c.
 *
 * Nothing is installed unless have_neon(cpu_flags) is true. The 8-bit
 * functions are used for bit_depth <= 8 and the _neon_10 functions for
 * bit_depth == 10.
 *
 * Table layout as used by the assignments below: the first index selects the
 * block size (0 = qpel16, 1 = qpel8) and the second index is x + 4 * y for
 * the mcXY variant (mc10 -> 1, mc01 -> 4, mc33 -> 15).
 */
av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
{
const int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags) && !high_bit_depth) {
if (have_neon(cpu_flags) && bit_depth <= 8) {
c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
@ -168,5 +211,49 @@ av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
} else if (have_neon(cpu_flags) && bit_depth == 10) {
/*
 * 10-bit: only the variants that have a _neon_10 implementation are
 * installed. Indices 0, 6, 9, 10, 11 and 14 (mc00, mc21, mc12, mc22, mc32
 * and mc23) are not written in this branch, so those entries keep whatever
 * value c held on entry.
 */
/* put, 16x16 */
c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon_10;
c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon_10;
c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon_10;
c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon_10;
c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon_10;
c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon_10;
c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon_10;
c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon_10;
c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon_10;
c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon_10;
/* put, 8x8 */
c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon_10;
c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon_10;
c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon_10;
c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon_10;
c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon_10;
c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon_10;
c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon_10;
c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon_10;
c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon_10;
c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon_10;
/* avg, 16x16 */
c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon_10;
c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon_10;
c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon_10;
c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon_10;
c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon_10;
c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon_10;
c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon_10;
c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon_10;
c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon_10;
c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon_10;
/* avg, 8x8 */
c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon_10;
c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon_10;
c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon_10;
c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon_10;
c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon_10;
c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon_10;
c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon_10;
c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon_10;
c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon_10;
c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon_10;
}
}

View File

@ -933,3 +933,535 @@ endfunc
h264_qpel16 put
h264_qpel16 avg
//trashes v0-v5
// Horizontal 6-tap lowpass over two rows of 16-bit samples, 8 outputs per row.
//   r0:r1  row 0 as 16 consecutive 16-bit samples (r0 = samples 0-7, r1 = 8-15)
//   r2:r3  row 1, same layout
//   d0, d1 filtered row 0 and row 1
// For output lane i of a row with samples s[]:
//   t   = s[i] + s[i+5] + v6.h[1] * (s[i+2] + s[i+3]) - v6.h[0] * (s[i+1] + s[i+4])
//   out = min((t + 16) >> 5, 1023)
// The subtraction saturates at 0 (uqsub) and all arithmetic is done in
// unsigned 16-bit lanes. v6.h[0] and v6.h[1] must be set up by the caller;
// the entry points in this file run lowpass_const first (presumably loading
// 5 and 20, the H.264 tap weights - confirm against lowpass_const).
// d0 may be the same register as r0, and d1 the same as r1: each source pair
// is read for the last time before the matching destination is written.
.macro lowpass_8_10 r0, r1, r2, r3, d0, d1
// row 0: v2 = s[i+2] + s[i+3] (byte offsets 4 and 6 into r0:r1)
ext v2.16b, \r0\().16b, \r1\().16b, #4
ext v3.16b, \r0\().16b, \r1\().16b, #6
add v2.8h, v2.8h, v3.8h
// row 0: v4 = s[i+1] + s[i+4]
ext v4.16b, \r0\().16b, \r1\().16b, #2
ext v5.16b, \r0\().16b, \r1\().16b, #8
add v4.8h, v4.8h, v5.8h
// row 0: d0 = s[i] + s[i+5]; this is the last read of r0 and r1
ext v1.16b, \r0\().16b, \r1\().16b, #10
add \d0\().8h, \r0\().8h, v1.8h
// from here on the row 1 extraction is interleaved with the row 0 arithmetic
ext v0.16b, \r2\().16b, \r3\().16b, #4
// row 0: d0 += v6.h[1] * v2
mla \d0\().8h, v2.8h, v6.h[1]
ext v1.16b, \r2\().16b, \r3\().16b, #6
// row 1: v0 = s[i+2] + s[i+3]
add v0.8h, v0.8h, v1.8h
ext v1.16b, \r2\().16b, \r3\().16b, #2
// row 0: d0 = saturating (d0 - v6.h[0] * v4), then rounding shift right by 5
mul v5.8h, v4.8h, v6.h[0]
uqsub \d0\().8h, \d0\().8h, v5.8h
urshr \d0\().8h, \d0\().8h, #5
// row 1: v1 = s[i+1] + s[i+4]
ext v3.16b, \r2\().16b, \r3\().16b, #8
add v1.8h, v1.8h, v3.8h
// row 1: d1 = s[i] + s[i+5], then the same multiply-accumulate / subtract
ext v2.16b, \r2\().16b, \r3\().16b, #10
add \d1\().8h, \r2\().8h, v2.8h
mla \d1\().8h, v0.8h, v6.h[1]
mul v5.8h, v1.8h, v6.h[0]
uqsub \d1\().8h, \d1\().8h, v5.8h
mvni v5.8h, #0xFC, lsl #8 // 1023 for clipping
urshr \d1\().8h, \d1\().8h, #5
// clamp both rows to the 10-bit maximum held in v5
umin \d0\().8h, \d0\().8h, v5.8h
umin \d1\().8h, \d1\().8h, v5.8h
.endm
//trashes v0-v4
// Vertical 6-tap lowpass: two output rows from seven consecutive input rows,
// each register holding 8 x 16-bit samples.
//   r0-r6  input rows, top to bottom
//   d0     min((r0 + r5 + v6.h[1]*(r2 + r3) - v6.h[0]*(r1 + r4) + 16) >> 5, 1023)
//   d1     the same expression with every row moved down by one (r1 ... r6)
// The subtraction saturates at 0 (uqsub); arithmetic is in unsigned 16-bit
// lanes. v6.h[0] and v6.h[1] must be set up by the caller.
// The callers in this file pass d0 = r0 and d1 = r1; this works because r1 is
// read for v4 before d1 is written.
.macro lowpass_8_10_v r0, r1, r2, r3, r4, r5, r6, d0, d1
// centre-tap sums: v2 for the first output row, v0 for the second
add v2.8h, \r2\().8h, \r3\().8h
add v0.8h, \r3\().8h, \r4\().8h
// inner-tap sums: v4 for the first output row, v1 for the second
add v4.8h, \r1\().8h, \r4\().8h
add v1.8h, \r2\().8h, \r5\().8h
// outer-tap sums go straight into the destinations
add \d0\().8h, \r0\().8h, \r5\().8h
add \d1\().8h, \r1\().8h, \r6\().8h
// add v6.h[1] times the centre sums
mla \d0\().8h, v2.8h, v6.h[1]
mla \d1\().8h, v0.8h, v6.h[1]
// v2 and v0 are reused for v6.h[0] times the inner sums
mul v2.8h, v4.8h, v6.h[0]
mul v0.8h, v1.8h, v6.h[0]
uqsub \d0\().8h, \d0\().8h, v2.8h
uqsub \d1\().8h, \d1\().8h, v0.8h
mvni v0.8h, #0xFC, lsl #8 // 1023 for clipping
// rounding shift right by 5, then clamp to the value in v0
urshr \d0\().8h, \d0\().8h, #5
urshr \d1\().8h, \d1\().8h, #5
umin \d0\().8h, \d0\().8h, v0.8h
umin \d1\().8h, \d1\().8h, v0.8h
.endm
// 16-wide horizontal lowpass of 16-bit samples into a packed buffer.
// Runs the 8-wide kernel twice, 16 rows each time: first on the left 8
// columns, then on the right 8 columns. The destination pitch is fixed at
// 16 bytes (8 samples), and x0 is not rewound between the two passes, so the
// right half is stored directly after the left half.
//   x0  destination (advanced by the kernel)
//   x1  source, x2 source stride in bytes
//   clobbers x3, x4, x12 plus whatever the kernel clobbers
function put_h264_qpel16_h_lowpass_neon_packed_10
        mov             x4,  x30                // keep the return address across bl
        mov             x3,  #16                // packed destination pitch
        mov             x12, #32                // kernel row counter: 16 rows
        bl              put_h264_qpel8_h_lowpass_neon_10
        mov             x30, x4
        mov             x12, #32                // 16 rows again for the right half
        sub             x1,  x1,  x2, lsl #4    // back up 16 source rows
        add             x1,  x1,  #16           // and step 8 samples to the right
        b               put_h264_qpel8_h_lowpass_neon_10       // tail call
endfunc
// Horizontal lowpass kernels for 16-bit samples; type is put or avg.
// Register interface of both functions, as used by the code below:
//   x0   destination, x3 destination stride in bytes
//   x1   source, already moved left by the caller; x2 source stride in bytes
//   x12  row counter for the 8-wide kernel, decremented by 4 for every two
//        rows, so it must be twice the row count (16 -> 8 rows, 32 -> 16 rows)
//   v6   filter constants consumed by lowpass_8_10
// The 8-wide kernel advances x0 and x1 past the processed rows and uses
// v0-v5, v16, v17, v20, v28 and v29. With type avg the result is
// additionally averaged (urhadd) with what is already stored at x0.
.macro h264_qpel_h_lowpass_10 type
function \type\()_h264_qpel16_h_lowpass_neon_10
// 16-wide version: filter the left 8 columns for 16 rows, rewind, then fall
// through into the 8-wide kernel for the right 8 columns. x13 holds the
// return address across the bl.
mov x13, x30
mov x12, #32
bl \type\()_h264_qpel8_h_lowpass_neon_10
// rewind both pointers by 16 rows and step 16 bytes (8 samples) right
sub x0, x0, x3, lsl #4
sub x1, x1, x2, lsl #4
add x0, x0, #16
add x1, x1, #16
mov x12, #32
mov x30, x13
// no ret or branch here: execution continues in the function placed next
endfunc
function \type\()_h264_qpel8_h_lowpass_neon_10
// each iteration loads two rows of 16 samples and emits two rows of 8
1: ld1 {v28.8h, v29.8h}, [x1], x2
ld1 {v16.8h, v17.8h}, [x1], x2
// flags set here are consumed by b.ne at the end of the loop; nothing in
// between writes the flags
subs x12, x12, #4
lowpass_8_10 v28, v29, v16, v17, v28, v20
.ifc \type,avg
// read the two destination rows, average, then step x0 back to the first one
ld1 {v2.8h}, [x0], x3
ld1 {v3.8h}, [x0]
urhadd v28.8h, v28.8h, v2.8h
urhadd v20.8h, v20.8h, v3.8h
sub x0, x0, x3
.endif
st1 {v28.8h}, [x0], x3
st1 {v20.8h}, [x0], x3
b.ne 1b
ret
endfunc
.endm
h264_qpel_h_lowpass_10 put
h264_qpel_h_lowpass_10 avg
// Horizontal lowpass averaged with a second source ("l2"); type is put or avg.
// Register interface, as used by the code below:
//   x0   destination
//   x1   source for the filter, already moved left by the caller
//   x3   second source, read 8 samples per row
//   x2   stride in bytes shared by x0, x1 and x3
//   x12  row counter for the 8-wide kernel, twice the row count
//   v6   filter constants consumed by lowpass_8_10
// Output = urhadd(filtered row, row from x3); with type avg that value is
// additionally averaged (urhadd) with what is already stored at x0.
// The 8-wide kernel uses v0-v5, v16, v17 and v26-v29.
.macro h264_qpel_h_lowpass_l2_10 type
function \type\()_h264_qpel16_h_lowpass_l2_neon_10
// 16-wide version: left 8 columns for 16 rows, rewind all three pointers,
// then fall through into the 8-wide kernel for the right 8 columns.
mov x13, x30
mov x12, #32
bl \type\()_h264_qpel8_h_lowpass_l2_neon_10
// rewind by 16 rows and step 16 bytes (8 samples) right
sub x0, x0, x2, lsl #4
sub x1, x1, x2, lsl #4
sub x3, x3, x2, lsl #4
add x0, x0, #16
add x1, x1, #16
add x3, x3, #16
mov x12, #32
mov x30, x13
// no ret or branch here: execution continues in the function placed next
endfunc
function \type\()_h264_qpel8_h_lowpass_l2_neon_10
// two rows per iteration: filter input from x1, second source from x3
1: ld1 {v26.8h, v27.8h}, [x1], x2
ld1 {v16.8h, v17.8h}, [x1], x2
ld1 {v28.8h}, [x3], x2
ld1 {v29.8h}, [x3], x2
// flags set here are consumed by b.ne at the end of the loop
subs x12, x12, #4
// results overwrite the row 0 inputs (d0 = r0, d1 = r1)
lowpass_8_10 v26, v27, v16, v17, v26, v27
urhadd v26.8h, v26.8h, v28.8h
urhadd v27.8h, v27.8h, v29.8h
.ifc \type,avg
// read the two destination rows, average, then step x0 back to the first one
ld1 {v2.8h}, [x0], x2
ld1 {v3.8h}, [x0]
urhadd v26.8h, v26.8h, v2.8h
urhadd v27.8h, v27.8h, v3.8h
sub x0, x0, x2
.endif
st1 {v26.8h}, [x0], x2
st1 {v27.8h}, [x0], x2
b.ne 1b
ret
endfunc
.endm
h264_qpel_h_lowpass_l2_10 put
h264_qpel_h_lowpass_l2_10 avg
// 16x16 vertical lowpass of 16-bit samples into a packed buffer.
// Runs the 8x8 10-bit kernel four times: upper-left, lower-left, upper-right,
// lower-right. The destination pitch is fixed at 16 bytes (8 samples), and
// x0 is never rewound, so the four 8x8 results are stored back to back.
//   x0  destination (advanced by the kernel)
//   x1  source, already moved up by the caller; x3 source stride in bytes
//   v6  filter constants consumed by lowpass_8_10_v
//   clobbers x2, x4 plus whatever the kernel clobbers
// This must use the _10 kernel with 16-byte pitch and column step: samples
// are two bytes wide, matching put_h264_qpel16_h_lowpass_neon_packed_10.
function put_h264_qpel16_v_lowpass_neon_packed_10
        mov             x4,  x30                // keep the return address across bl
        mov             x2,  #16                // packed pitch: 8 samples * 2 bytes
        bl              put_h264_qpel8_v_lowpass_neon_10
        sub             x1,  x1,  x3, lsl #2    // kernel leaves x1 12 rows down; go to row 8
        bl              put_h264_qpel8_v_lowpass_neon_10
        sub             x1,  x1,  x3, lsl #4    // back to the first source row
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #16           // step 8 samples to the right
        bl              put_h264_qpel8_v_lowpass_neon_10
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon_10       // tail call
endfunc
// Vertical lowpass kernels for 16-bit samples; type is put or avg.
// Register interface, as used by the code below:
//   x0  destination, x2 destination stride in bytes
//   x1  source, already moved up by the caller; x3 source stride in bytes
//   v6  filter constants consumed by lowpass_8_10_v
// The 8x8 kernel reads 13 source rows and writes 8 rows. On return x0 points
// past the 8 written rows and x1 points at the 13th source row (12 rows
// below where it started). With type avg the result is additionally averaged
// (urhadd) with what is already stored at x0.
// Uses v0-v4 and v16-v28 (plus v29-v31 for avg).
.macro h264_qpel_v_lowpass_10 type
function \type\()_h264_qpel16_v_lowpass_neon_10
// 16x16 as four 8x8 blocks: upper-left, lower-left, upper-right, then fall
// through into the 8x8 kernel for lower-right. x4 holds the return address.
mov x4, x30
bl \type\()_h264_qpel8_v_lowpass_neon_10
// x1 is 12 rows down; back up 4 so the next block starts 8 rows below
sub x1, x1, x3, lsl #2
bl \type\()_h264_qpel8_v_lowpass_neon_10
// destination: back up 16 rows, step 16 bytes (8 samples) right
sub x0, x0, x2, lsl #4
add x0, x0, #16
// source: back up 20 rows to the first row, step 16 bytes right
sub x1, x1, x3, lsl #4
sub x1, x1, x3, lsl #2
add x1, x1, #16
bl \type\()_h264_qpel8_v_lowpass_neon_10
sub x1, x1, x3, lsl #2
mov x30, x4
// no ret or branch here: execution continues in the function placed next
endfunc
function \type\()_h264_qpel8_v_lowpass_neon_10
// load 13 rows; the last load does not advance x1
ld1 {v16.8h}, [x1], x3
ld1 {v17.8h}, [x1], x3
ld1 {v18.8h}, [x1], x3
ld1 {v19.8h}, [x1], x3
ld1 {v20.8h}, [x1], x3
ld1 {v21.8h}, [x1], x3
ld1 {v22.8h}, [x1], x3
ld1 {v23.8h}, [x1], x3
ld1 {v24.8h}, [x1], x3
ld1 {v25.8h}, [x1], x3
ld1 {v26.8h}, [x1], x3
ld1 {v27.8h}, [x1], x3
ld1 {v28.8h}, [x1]
// each call produces two output rows in place of its first two input rows;
// those two inputs are not needed by the calls that follow
lowpass_8_10_v v16, v17, v18, v19, v20, v21, v22, v16, v17
lowpass_8_10_v v18, v19, v20, v21, v22, v23, v24, v18, v19
lowpass_8_10_v v20, v21, v22, v23, v24, v25, v26, v20, v21
lowpass_8_10_v v22, v23, v24, v25, v26, v27, v28, v22, v23
.ifc \type,avg
// average with the 8 rows already in the destination, then rewind x0
ld1 {v24.8h}, [x0], x2
ld1 {v25.8h}, [x0], x2
ld1 {v26.8h}, [x0], x2
urhadd v16.8h, v16.8h, v24.8h
ld1 {v27.8h}, [x0], x2
urhadd v17.8h, v17.8h, v25.8h
ld1 {v28.8h}, [x0], x2
urhadd v18.8h, v18.8h, v26.8h
ld1 {v29.8h}, [x0], x2
urhadd v19.8h, v19.8h, v27.8h
ld1 {v30.8h}, [x0], x2
urhadd v20.8h, v20.8h, v28.8h
ld1 {v31.8h}, [x0], x2
urhadd v21.8h, v21.8h, v29.8h
urhadd v22.8h, v22.8h, v30.8h
urhadd v23.8h, v23.8h, v31.8h
sub x0, x0, x2, lsl #3
.endif
// store the 8 output rows
st1 {v16.8h}, [x0], x2
st1 {v17.8h}, [x0], x2
st1 {v18.8h}, [x0], x2
st1 {v19.8h}, [x0], x2
st1 {v20.8h}, [x0], x2
st1 {v21.8h}, [x0], x2
st1 {v22.8h}, [x0], x2
st1 {v23.8h}, [x0], x2
ret
endfunc
.endm
h264_qpel_v_lowpass_10 put
h264_qpel_v_lowpass_10 avg
// Vertical lowpass averaged with a second source ("l2"); type is put or avg.
// Register interface, as used by the code below:
//   x0   destination, x3 destination stride in bytes
//   x1   source for the filter, already moved up by the caller; stride x3
//   x12  second source, x2 its stride in bytes
//   v6   filter constants consumed by lowpass_8_10_v
// Note that here x3 is shared by source and destination while x2 belongs to
// the second source, unlike the plain vertical kernels.
// Output = urhadd(row from x12, filtered row); with type avg that value is
// additionally averaged (urhadd) with what is already stored at x0.
// On return from the 8x8 kernel x0 and x12 point past 8 rows and x1 points
// 12 rows below where it started. Uses v0-v4 and v16-v31.
.macro h264_qpel_v_lowpass_l2_10 type
function \type\()_h264_qpel16_v_lowpass_l2_neon_10
// 16x16 as four 8x8 blocks: upper-left, lower-left, upper-right, then fall
// through into the 8x8 kernel for lower-right. x4 holds the return address.
mov x4, x30
bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
// x1 is 12 rows down; back up 4 so the next block starts 8 rows below
sub x1, x1, x3, lsl #2
bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
// destination and second source: back up 16 rows, step 16 bytes right
sub x0, x0, x3, lsl #4
sub x12, x12, x2, lsl #4
add x0, x0, #16
add x12, x12, #16
// filter source: back up 20 rows to the first row, step 16 bytes right
sub x1, x1, x3, lsl #4
sub x1, x1, x3, lsl #2
add x1, x1, #16
bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
sub x1, x1, x3, lsl #2
mov x30, x4
// no ret or branch here: execution continues in the function placed next
endfunc
function \type\()_h264_qpel8_v_lowpass_l2_neon_10
// load 13 rows; the last load does not advance x1
ld1 {v16.8h}, [x1], x3
ld1 {v17.8h}, [x1], x3
ld1 {v18.8h}, [x1], x3
ld1 {v19.8h}, [x1], x3
ld1 {v20.8h}, [x1], x3
ld1 {v21.8h}, [x1], x3
ld1 {v22.8h}, [x1], x3
ld1 {v23.8h}, [x1], x3
ld1 {v24.8h}, [x1], x3
ld1 {v25.8h}, [x1], x3
ld1 {v26.8h}, [x1], x3
ld1 {v27.8h}, [x1], x3
ld1 {v28.8h}, [x1]
// each call produces two output rows in place of its first two input rows
lowpass_8_10_v v16, v17, v18, v19, v20, v21, v22, v16, v17
lowpass_8_10_v v18, v19, v20, v21, v22, v23, v24, v18, v19
lowpass_8_10_v v20, v21, v22, v23, v24, v25, v26, v20, v21
lowpass_8_10_v v22, v23, v24, v25, v26, v27, v28, v22, v23
// average the 8 filtered rows with 8 rows of the second source
ld1 {v24.8h}, [x12], x2
ld1 {v25.8h}, [x12], x2
ld1 {v26.8h}, [x12], x2
ld1 {v27.8h}, [x12], x2
ld1 {v28.8h}, [x12], x2
urhadd v16.8h, v24.8h, v16.8h
urhadd v17.8h, v25.8h, v17.8h
ld1 {v29.8h}, [x12], x2
urhadd v18.8h, v26.8h, v18.8h
urhadd v19.8h, v27.8h, v19.8h
ld1 {v30.8h}, [x12], x2
urhadd v20.8h, v28.8h, v20.8h
urhadd v21.8h, v29.8h, v21.8h
ld1 {v31.8h}, [x12], x2
urhadd v22.8h, v30.8h, v22.8h
urhadd v23.8h, v31.8h, v23.8h
.ifc \type,avg
// average with the 8 rows already in the destination, then rewind x0
ld1 {v24.8h}, [x0], x3
ld1 {v25.8h}, [x0], x3
ld1 {v26.8h}, [x0], x3
urhadd v16.8h, v16.8h, v24.8h
ld1 {v27.8h}, [x0], x3
urhadd v17.8h, v17.8h, v25.8h
ld1 {v28.8h}, [x0], x3
urhadd v18.8h, v18.8h, v26.8h
ld1 {v29.8h}, [x0], x3
urhadd v19.8h, v19.8h, v27.8h
ld1 {v30.8h}, [x0], x3
urhadd v20.8h, v20.8h, v28.8h
ld1 {v31.8h}, [x0], x3
urhadd v21.8h, v21.8h, v29.8h
urhadd v22.8h, v22.8h, v30.8h
urhadd v23.8h, v23.8h, v31.8h
sub x0, x0, x3, lsl #3
.endif
// store the 8 output rows
st1 {v16.8h}, [x0], x3
st1 {v17.8h}, [x0], x3
st1 {v18.8h}, [x0], x3
st1 {v19.8h}, [x0], x3
st1 {v20.8h}, [x0], x3
st1 {v21.8h}, [x0], x3
st1 {v22.8h}, [x0], x3
st1 {v23.8h}, [x0], x3
ret
endfunc
.endm
h264_qpel_v_lowpass_l2_10 put
h264_qpel_v_lowpass_l2_10 avg
// Exported 8x8 quarter-pel entry points for 16-bit samples; type is put or avg.
// All of them are entered with x0 = dst, x1 = src, x2 = stride in bytes.
// Samples are two bytes wide, so a step of 4 bytes is two samples and a step
// of 2 bytes is one sample.
//
// Register use inside this macro:
//   x14       return address for the entry points that use bl
//   x8, x9    saved dst and the source used for the vertical pass (mcX1/mcX3)
//   x11       saved sp while the 128-byte temporary (8 rows * 16 bytes) is live
//   x12       row counter for the horizontal kernels (16 = 8 rows), or the
//             second-source pointer for the vertical l2 kernel
//
// The labels ending in mc01_10 and mc11_10 are shared tails that the other
// entry points branch into after setting up their own x12 or x1/x8/x9.
.macro h264_qpel8_10 type
// mc10: horizontal filter averaged with the sample at src
function ff_\type\()_h264_qpel8_mc10_neon_10, export=1
        lowpass_const   w3
        mov             x12, #16
        mov             x3,  x1                 // second source = src
        sub             x1,  x1,  #4            // filter input starts 2 samples left
        b               \type\()_h264_qpel8_h_lowpass_l2_neon_10
endfunc

// mc20: plain horizontal filter
function ff_\type\()_h264_qpel8_mc20_neon_10, export=1
        lowpass_const   w3
        mov             x3,  x2                 // dst stride = src stride
        mov             x12, #16
        sub             x1,  x1,  #4
        b               \type\()_h264_qpel8_h_lowpass_neon_10
endfunc

// mc30: horizontal filter averaged with the sample one to the right of src
function ff_\type\()_h264_qpel8_mc30_neon_10, export=1
        lowpass_const   w3
        mov             x12, #16
        add             x3,  x1,  #2            // second source = src + 1 sample
        sub             x1,  x1,  #4
        b               \type\()_h264_qpel8_h_lowpass_l2_neon_10
endfunc

// mc01: vertical filter averaged with the row at src
function ff_\type\()_h264_qpel8_mc01_neon_10, export=1
        mov             x12, x1                 // second source = src
        mov             x14, x30
\type\()_h264_qpel8_mc01_10:
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1    // filter input starts 2 rows up
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
        ret             x14
endfunc

// mc11: horizontal filter into a stack temporary, then vertical filter
// averaged with that temporary
function ff_\type\()_h264_qpel8_mc11_neon_10, export=1
        mov             x14, x30
        mov             x9,  x1
        mov             x8,  x0
\type\()_h264_qpel8_mc11_10:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #128
        // horizontal pass: x1 -> temporary at sp, pitch 16 bytes
        sub             x1,  x1,  #4
        mov             x12, #16
        mov             x3,  #16
        mov             x0,  sp
        bl              put_h264_qpel8_h_lowpass_neon_10
        // vertical pass: x9 -> dst, averaged with the temporary
        sub             x1,  x9,  x2, lsl #1
        mov             x3,  x2
        mov             x2,  #16                // pitch of the temporary; set after x2 is read
        mov             x12, sp
        mov             x0,  x8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
        mov             sp,  x11
        ret             x14
endfunc

// mc31: as mc11 with the vertical pass taken one sample to the right
function ff_\type\()_h264_qpel8_mc31_neon_10, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #2            // x1 itself stays at src
        b               \type\()_h264_qpel8_mc11_10
endfunc

// mc02: plain vertical filter
function ff_\type\()_h264_qpel8_mc02_neon_10, export=1
        mov             x14, x30
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1
        bl              \type\()_h264_qpel8_v_lowpass_neon_10
        ret             x14
endfunc

// mc03: vertical filter averaged with the row below src
function ff_\type\()_h264_qpel8_mc03_neon_10, export=1
        add             x12, x1,  x2            // second source = src + 1 row
        mov             x14, x30
        b               \type\()_h264_qpel8_mc01_10
endfunc

// mc13: as mc11 with the horizontal pass taken one row down
function ff_\type\()_h264_qpel8_mc13_neon_10, export=1
        mov             x9,  x1
        add             x1,  x1,  x2
        mov             x8,  x0
        mov             x14, x30
        b               \type\()_h264_qpel8_mc11_10
endfunc

// mc33: horizontal pass one row down, vertical pass one sample to the right
function ff_\type\()_h264_qpel8_mc33_neon_10, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #2
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11_10
endfunc
.endm

        h264_qpel8_10   put
        h264_qpel8_10   avg
// Exported 16x16 quarter-pel entry points for 16-bit samples; type is put or avg.
// All of them are entered with x0 = dst, x1 = src, x2 = stride in bytes.
// Samples are two bytes wide, so a step of 4 bytes is two samples and a step
// of 2 bytes is one sample.
//
// Register use inside this macro:
//   x14       return address for the entry points that use bl
//   x8, x9    saved dst and the source used for the vertical pass (mcX1/mcX3)
//   x11       saved sp while the 512-byte temporary (16 rows * 32 bytes) is live
//   x12       second-source pointer for the vertical l2 kernel (the 16-wide
//             horizontal kernels load their own row counter)
//
// The labels ending in mc01_10 and mc11_10 are shared tails that the other
// entry points branch into after setting up their own x12 or x1/x8/x9.
.macro h264_qpel16_10 type
// mc10: horizontal filter averaged with the sample at src
function ff_\type\()_h264_qpel16_mc10_neon_10, export=1
        lowpass_const   w3
        mov             x3,  x1                 // second source = src
        sub             x1,  x1,  #4            // filter input starts 2 samples left
        b               \type\()_h264_qpel16_h_lowpass_l2_neon_10
endfunc

// mc20: plain horizontal filter
function ff_\type\()_h264_qpel16_mc20_neon_10, export=1
        lowpass_const   w3
        mov             x3,  x2                 // dst stride = src stride
        sub             x1,  x1,  #4
        b               \type\()_h264_qpel16_h_lowpass_neon_10
endfunc

// mc30: horizontal filter averaged with the sample one to the right of src
function ff_\type\()_h264_qpel16_mc30_neon_10, export=1
        lowpass_const   w3
        add             x3,  x1,  #2            // second source = src + 1 sample
        sub             x1,  x1,  #4
        b               \type\()_h264_qpel16_h_lowpass_l2_neon_10
endfunc

// mc01: vertical filter averaged with the row at src
function ff_\type\()_h264_qpel16_mc01_neon_10, export=1
        mov             x12, x1                 // second source = src
        mov             x14, x30
\type\()_h264_qpel16_mc01_10:
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1    // filter input starts 2 rows up
        mov             x3,  x2
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon_10
        ret             x14
endfunc

// mc11: horizontal filter into a stack temporary, then vertical filter
// averaged with that temporary
function ff_\type\()_h264_qpel16_mc11_neon_10, export=1
        mov             x14, x30
        mov             x9,  x1
        mov             x8,  x0
\type\()_h264_qpel16_mc11_10:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #512
        // horizontal pass: x1 -> temporary at sp, pitch 32 bytes
        sub             x1,  x1,  #4
        mov             x3,  #32
        mov             x0,  sp
        bl              put_h264_qpel16_h_lowpass_neon_10
        // vertical pass: x9 -> dst, averaged with the temporary
        sub             x1,  x9,  x2, lsl #1
        mov             x3,  x2
        mov             x2,  #32                // pitch of the temporary; set after x2 is read
        mov             x12, sp
        mov             x0,  x8
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon_10
        mov             sp,  x11
        ret             x14
endfunc

// mc31: as mc11 with the vertical pass taken one sample to the right
function ff_\type\()_h264_qpel16_mc31_neon_10, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #2            // x1 itself stays at src
        b               \type\()_h264_qpel16_mc11_10
endfunc

// mc02: plain vertical filter
function ff_\type\()_h264_qpel16_mc02_neon_10, export=1
        mov             x14, x30
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1
        bl              \type\()_h264_qpel16_v_lowpass_neon_10
        ret             x14
endfunc

// mc03: vertical filter averaged with the row below src
function ff_\type\()_h264_qpel16_mc03_neon_10, export=1
        add             x12, x1,  x2            // second source = src + 1 row
        mov             x14, x30
        b               \type\()_h264_qpel16_mc01_10
endfunc

// mc13: as mc11 with the horizontal pass taken one row down
function ff_\type\()_h264_qpel16_mc13_neon_10, export=1
        mov             x9,  x1
        add             x1,  x1,  x2
        mov             x8,  x0
        mov             x14, x30
        b               \type\()_h264_qpel16_mc11_10
endfunc

// mc33: horizontal pass one row down, vertical pass one sample to the right
function ff_\type\()_h264_qpel16_mc33_neon_10, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #2
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc11_10
endfunc
.endm

        h264_qpel16_10  put
        h264_qpel16_10  avg