arm: Avoid using .dn register aliases

clang now (in the upcoming 5.0 version) is capable of building our
arm assembly without relying on gas-preprocessor, although clang/LLVM
doesn't support .dn register aliases.

However, the VC1 MC assembly was only built and used if the chosen
assembler supported the .dn directive, which was always the case as
long as gas-preprocessor was used.

This means that VC1 decoding got a speed regression on clang 5.0,
unless the user manually chose to use gas-preprocessor again.

By avoiding using the .dn register aliases, we can build the VC1 MC
assembly with the latest clang version.

Support for the .dn/.qn directives in clang/LLVM isn't actively planned,
see https://bugs.llvm.org/show_bug.cgi?id=18199.

This partially reverts 896a5bff64.

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Martin Storsjö 2017-05-09 10:25:44 +03:00
parent 6ccf76aec7
commit d7320ca3ed
3 changed files with 5 additions and 25 deletions

5
configure vendored
View File

@ -1662,7 +1662,6 @@ SYSTEM_FUNCS="
TOOLCHAIN_FEATURES=" TOOLCHAIN_FEATURES="
as_arch_directive as_arch_directive
as_dn_directive
as_fpu_directive as_fpu_directive
as_func as_func
as_object_arch as_object_arch
@ -4379,10 +4378,6 @@ EOF
check_as <<EOF && enable as_arch_directive check_as <<EOF && enable as_arch_directive
.arch armv7-a .arch armv7-a
EOF
check_as <<EOF && enable as_dn_directive
ra .dn d0.i16
.unreq ra
EOF EOF
check_as <<EOF && enable as_fpu_directive check_as <<EOF && enable as_fpu_directive
.fpu neon .fpu neon

View File

@ -22,8 +22,6 @@
#include "libavcodec/vc1dsp.h" #include "libavcodec/vc1dsp.h"
#include "vc1dsp.h" #include "vc1dsp.h"
#include "config.h"
void ff_vc1_inv_trans_8x8_neon(int16_t *block); void ff_vc1_inv_trans_8x8_neon(int16_t *block);
void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
@ -93,7 +91,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_pixels8x8_neon; dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_pixels8x8_neon;
if (HAVE_AS_DN_DIRECTIVE) {
dsp->put_vc1_mspel_pixels_tab[ 1] = ff_put_vc1_mspel_mc10_neon; dsp->put_vc1_mspel_pixels_tab[ 1] = ff_put_vc1_mspel_mc10_neon;
dsp->put_vc1_mspel_pixels_tab[ 2] = ff_put_vc1_mspel_mc20_neon; dsp->put_vc1_mspel_pixels_tab[ 2] = ff_put_vc1_mspel_mc20_neon;
dsp->put_vc1_mspel_pixels_tab[ 3] = ff_put_vc1_mspel_mc30_neon; dsp->put_vc1_mspel_pixels_tab[ 3] = ff_put_vc1_mspel_mc30_neon;
@ -109,7 +106,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
dsp->put_vc1_mspel_pixels_tab[13] = ff_put_vc1_mspel_mc13_neon; dsp->put_vc1_mspel_pixels_tab[13] = ff_put_vc1_mspel_mc13_neon;
dsp->put_vc1_mspel_pixels_tab[14] = ff_put_vc1_mspel_mc23_neon; dsp->put_vc1_mspel_pixels_tab[14] = ff_put_vc1_mspel_mc23_neon;
dsp->put_vc1_mspel_pixels_tab[15] = ff_put_vc1_mspel_mc33_neon; dsp->put_vc1_mspel_pixels_tab[15] = ff_put_vc1_mspel_mc33_neon;
}
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon; dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;

View File

@ -663,7 +663,6 @@ function ff_vc1_inv_trans_4x4_neon, export=1
bx lr bx lr
endfunc endfunc
#if HAVE_AS_DN_DIRECTIVE
@ The absolute value of multiplication constants from vc1_mspel_filter and vc1_mspel_{ver,hor}_filter_16bits. @ The absolute value of multiplication constants from vc1_mspel_filter and vc1_mspel_{ver,hor}_filter_16bits.
@ The sign is embedded in the code below that carries out the multiplication (mspel_filter{,.16}). @ The sign is embedded in the code below that carries out the multiplication (mspel_filter{,.16}).
#define MSPEL_MODE_1_MUL_CONSTANTS 4 53 18 3 #define MSPEL_MODE_1_MUL_CONSTANTS 4 53 18 3
@ -689,22 +688,18 @@ endfunc
@ Setup constants in registers for a subsequent use of mspel_filter{,.16}. @ Setup constants in registers for a subsequent use of mspel_filter{,.16}.
.macro mspel_constants typesize reg_a reg_b reg_c reg_d filter_a filter_b filter_c filter_d reg_add filter_add_register .macro mspel_constants typesize reg_a reg_b reg_c reg_d filter_a filter_b filter_c filter_d reg_add filter_add_register
@ Define double-word register aliases. Typesize should be i8 or i16. @ Typesize should be i8 or i16.
ra .dn \reg_a\().\typesize
rb .dn \reg_b\().\typesize
rc .dn \reg_c\().\typesize
rd .dn \reg_d\().\typesize
@ Only set the register if the value is not 1 and unique @ Only set the register if the value is not 1 and unique
.if \filter_a != 1 .if \filter_a != 1
vmov ra, #\filter_a @ ra = filter_a vmov.\typesize \reg_a, #\filter_a @ reg_a = filter_a
.endif .endif
vmov rb, #\filter_b @ rb = filter_b vmov.\typesize \reg_b, #\filter_b @ reg_b = filter_b
.if \filter_b != \filter_c .if \filter_b != \filter_c
vmov rc, #\filter_c @ rc = filter_c vmov.\typesize \reg_c, #\filter_c @ reg_c = filter_c
.endif .endif
.if \filter_d != 1 .if \filter_d != 1
vmov rd, #\filter_d @ rd = filter_d vmov.\typesize \reg_d, #\filter_d @ reg_d = filter_d
.endif .endif
@ vdup to double the size of typesize @ vdup to double the size of typesize
.ifc \typesize,i8 .ifc \typesize,i8
@ -712,11 +707,6 @@ endfunc
.else .else
vdup.32 \reg_add, \filter_add_register @ reg_add = filter_add_register vdup.32 \reg_add, \filter_add_register @ reg_add = filter_add_register
.endif .endif
.unreq ra
.unreq rb
.unreq rc
.unreq rd
.endm .endm
@ After mspel_constants has been used, do the filtering. @ After mspel_constants has been used, do the filtering.
@ -987,7 +977,6 @@ PUT_VC1_MSPEL_MC_V_ONLY(2)
PUT_VC1_MSPEL_MC_V_ONLY(3) PUT_VC1_MSPEL_MC_V_ONLY(3)
#undef PUT_VC1_MSPEL_MC_V_ONLY #undef PUT_VC1_MSPEL_MC_V_ONLY
#endif
function ff_put_pixels8x8_neon, export=1 function ff_put_pixels8x8_neon, export=1
vld1.64 {d0}, [r1], r2 vld1.64 {d0}, [r1], r2