diff --git a/Changelog b/Changelog index beb2d2615d..389ca6c4db 100644 --- a/Changelog +++ b/Changelog @@ -36,6 +36,7 @@ version 4.2: - derain filter - deesser filter - mov muxer writes tracks with unspecified language instead of English by default +- add support for using clang to compile CUDA kernels version 4.1: diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h new file mode 100644 index 0000000000..dbe50f8711 --- /dev/null +++ b/compat/cuda/cuda_runtime.h @@ -0,0 +1,131 @@ +/* + * Minimum CUDA compatibility definitions header + * + * Copyright (c) 2019 Rodger Combs + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AV_COMPAT_CUDA_CUDA_RUNTIME_H +#define AV_COMPAT_CUDA_CUDA_RUNTIME_H + +// Common macros +#define __global__ __attribute__((global)) +#define __device__ __attribute__((device)) +#define __device_builtin__ __attribute__((device_builtin)) +#define __align__(N) __attribute__((aligned(N))) +#define __inline__ __inline__ __attribute__((always_inline)) + +#define max(a, b) ((a) > (b) ? (a) : (b)) +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define abs(x) ((x) < 0 ? -(x) : (x)) + +#define atomicAdd(a, b) (__atomic_fetch_add(a, b, __ATOMIC_SEQ_CST)) + +// Basic typedefs +typedef __device_builtin__ unsigned long long cudaTextureObject_t; + +typedef struct __device_builtin__ __align__(2) uchar2 +{ + unsigned char x, y; +} uchar2; + +typedef struct __device_builtin__ __align__(4) ushort2 +{ + unsigned short x, y; +} ushort2; + +typedef struct __device_builtin__ uint3 +{ + unsigned int x, y, z; +} uint3; + +typedef struct uint3 dim3; + +typedef struct __device_builtin__ __align__(8) int2 +{ + int x, y; +} int2; + +typedef struct __device_builtin__ __align__(4) uchar4 +{ + unsigned char x, y, z, w; +} uchar4; + +typedef struct __device_builtin__ __align__(8) ushort4 +{ + unsigned char x, y, z, w; +} ushort4; + +typedef struct __device_builtin__ __align__(16) int4 +{ + int x, y, z, w; +} int4; + +// Accessors for special registers +#define GETCOMP(reg, comp) \ + asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \ + ret.comp = tmp; + +#define GET(name, reg) static inline __device__ uint3 name() {\ + uint3 ret; \ + unsigned tmp; \ + GETCOMP(reg, x) \ + GETCOMP(reg, y) \ + GETCOMP(reg, z) \ + return ret; \ +} + +GET(getBlockIdx, ctaid) +GET(getBlockDim, ntid) +GET(getThreadIdx, tid) + +// Instead of externs for these registers, we turn access to them into calls into trivial ASM +#define blockIdx (getBlockIdx()) +#define blockDim (getBlockDim()) +#define threadIdx (getThreadIdx()) + +// Basic initializers (simple macros rather than inline functions) +#define make_uchar2(a, b) ((uchar2){.x = a, .y = b}) +#define make_ushort2(a, b) ((ushort2){.x = a, .y = b}) +#define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d}) +#define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d}) + +// Conversions from the tex instruction's 4-register output to various types +#define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);} + +TEX2D(unsigned char, a & 0xFF) +TEX2D(unsigned short, a & 0xFFFF) +TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF)) +TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF)) +TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF)) +TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF)) + +// Template calling tex instruction and converting the output to the selected type +template +static inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y) +{ + T ret; + unsigned ret1, ret2, ret3, ret4; + asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" : + "=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) : + "l"(texObject), "f"(x), "f"(y)); + conv(&ret, ret1, ret2, ret3, ret4); + return ret; +} + +#endif diff --git a/configure b/configure index 5a4f507246..bddc05b850 100755 --- a/configure +++ b/configure @@ -322,6 +322,7 @@ External library support: --disable-amf disable AMF video encoding code [autodetect] --disable-audiotoolbox disable Apple AudioToolbox code [autodetect] --enable-cuda-nvcc enable Nvidia CUDA compiler [no] + --disable-cuda-llvm disable CUDA compilation using clang [autodetect] --disable-cuvid disable Nvidia CUVID support [autodetect] --disable-d3d11va disable Microsoft Direct3D 11 video acceleration code [autodetect] --disable-dxva2 disable Microsoft DirectX 9 video acceleration code [autodetect] @@ -370,7 +371,7 @@ Toolchain options: --cxx=CXX use C compiler CXX [$cxx_default] --objcc=OCC use ObjC compiler OCC [$cc_default] --dep-cc=DEPCC use dependency generator DEPCC [$cc_default] - --nvcc=NVCC use Nvidia CUDA compiler NVCC [$nvcc_default] + --nvcc=NVCC use Nvidia CUDA compiler NVCC or clang [$nvcc_default] --ld=LD use linker LD [$ld_default] --pkg-config=PKGCONFIG use pkg-config tool PKGCONFIG [$pkg_config_default] --pkg-config-flags=FLAGS pass additional flags to pkgconf [] @@ -1038,12 +1039,16 @@ test_nvcc(){ tmpcu_=$TMPCU tmpo_=$TMPO [ -x "$(command -v cygpath)" ] && tmpcu_=$(cygpath -m $tmpcu_) && tmpo_=$(cygpath -m $tmpo_) - test_cmd $nvcc -ptx $NVCCFLAGS "$@" $NVCC_C $(nvcc_o $tmpo_) $tmpcu_ + test_cmd $nvcc $nvccflags "$@" $NVCC_C $(nvcc_o $tmpo_) $tmpcu_ } check_nvcc() { log check_nvcc "$@" - test_nvcc <= 0x0a090500" || die "ERROR: Decklink API version must be >= 10.9.5."; } } @@ -6701,16 +6734,6 @@ if [ -z "$optflags" ]; then fi fi -if [ -z "$nvccflags" ]; then - nvccflags=$nvccflags_default -fi - -if enabled x86_64 || enabled ppc64 || enabled aarch64; then - nvccflags="$nvccflags -m64" -else - nvccflags="$nvccflags -m32" -fi - check_optflags(){ check_cflags "$@" enabled lto && check_ldflags "$@" diff --git a/ffbuild/common.mak b/ffbuild/common.mak index d2b33320c0..7355508ea0 100644 --- a/ffbuild/common.mak +++ b/ffbuild/common.mak @@ -38,7 +38,6 @@ OBJCCFLAGS = $(CPPFLAGS) $(CFLAGS) $(OBJCFLAGS) ASFLAGS := $(CPPFLAGS) $(ASFLAGS) CXXFLAGS := $(CPPFLAGS) $(CFLAGS) $(CXXFLAGS) X86ASMFLAGS += $(IFLAGS:%=%/) -I$($@ -%.ptx: %.cu +%.ptx: %.cu $(SRC_PATH)/compat/cuda/cuda_runtime.h $(COMPILE_NVCC) %.ptx.c: %.ptx