ffmpeg/compat/cuda/cuda_runtime.h

/*
 * Minimum CUDA compatibility definitions header
 *
 * Copyright (c) 2019 rcombs
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef COMPAT_CUDA_CUDA_RUNTIME_H
#define COMPAT_CUDA_CUDA_RUNTIME_H

// Common macros
#define __global__ __attribute__((global))
#define __device__ __attribute__((device))
#define __device_builtin__ __attribute__((device_builtin))
#define __align__(N) __attribute__((aligned(N)))
#define __inline__ __inline__ __attribute__((always_inline))

#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))
#define abs(x) ((x) < 0 ? -(x) : (x))

#define atomicAdd(a, b) (__atomic_fetch_add(a, b, __ATOMIC_SEQ_CST))

// Basic typedefs
typedef __device_builtin__ unsigned long long cudaTextureObject_t;

typedef struct __device_builtin__ __align__(2) uchar2
{
    unsigned char x, y;
} uchar2;

typedef struct __device_builtin__ __align__(4) ushort2
{
    unsigned short x, y;
} ushort2;

typedef struct __device_builtin__ __align__(8) float2
{
    float x, y;
} float2;

typedef struct __device_builtin__ __align__(8) int2
{
    int x, y;
} int2;

typedef struct __device_builtin__ uint3
{
    unsigned int x, y, z;
} uint3;

typedef struct uint3 dim3;

typedef struct __device_builtin__ __align__(4) uchar4
{
    unsigned char x, y, z, w;
} uchar4;

typedef struct __device_builtin__ __align__(8) ushort4
{
    unsigned short x, y, z, w;
} ushort4;

typedef struct __device_builtin__ __align__(16) int4
{
    int x, y, z, w;
} int4;

typedef struct __device_builtin__ __align__(16) float4
{
    float x, y, z, w;
} float4;

// Accessors for special registers
#define GETCOMP(reg, comp) \
    asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \
    ret.comp = tmp;

#define GET(name, reg) static inline __device__ uint3 name() {\
    uint3 ret; \
    unsigned tmp; \
    GETCOMP(reg, x) \
    GETCOMP(reg, y) \
    GETCOMP(reg, z) \
    return ret; \
}

GET(getBlockIdx, ctaid)
GET(getBlockDim, ntid)
GET(getThreadIdx, tid)

// Instead of externs for these registers, we turn access to them into calls into trivial ASM
#define blockIdx (getBlockIdx())
#define blockDim (getBlockDim())
#define threadIdx (getThreadIdx())

// Basic initializers (simple macros rather than inline functions)
#define make_int2(a, b) ((int2){.x = a, .y = b})
#define make_uchar2(a, b) ((uchar2){.x = a, .y = b})
#define make_ushort2(a, b) ((ushort2){.x = a, .y = b})
#define make_float2(a, b) ((float2){.x = a, .y = b})
#define make_int4(a, b, c, d) ((int4){.x = a, .y = b, .z = c, .w = d})
#define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d})
#define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d})
#define make_float4(a, b, c, d) ((float4){.x = a, .y = b, .z = c, .w = d})

// Conversions from the tex instruction's 4-register output to various types
#define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);}

TEX2D(unsigned char, a & 0xFF)
TEX2D(unsigned short, a & 0xFFFF)
TEX2D(float, a)
TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF))
TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF))
TEX2D(float2, make_float2(a, b))
TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF))
TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF))
TEX2D(float4, make_float4(a, b, c, d))

// Template calling tex instruction and converting the output to the selected type
template<typename T>
inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
{
  T ret;
  unsigned ret1, ret2, ret3, ret4;
  asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
      "=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) :
      "l"(texObject), "f"(x), "f"(y));
  conv(&ret, ret1, ret2, ret3, ret4);
  return ret;
}

template<>
inline __device__ float4 tex2D<float4>(cudaTextureObject_t texObject, float x, float y)
{
    float4 ret;
    asm("tex.2d.v4.f32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
        "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) :
        "l"(texObject), "f"(x), "f"(y));
    return ret;
}

template<>
inline __device__ float tex2D<float>(cudaTextureObject_t texObject, float x, float y)
{
    return tex2D<float4>(texObject, x, y).x;
}

template<>
inline __device__ float2 tex2D<float2>(cudaTextureObject_t texObject, float x, float y)
{
    float4 ret = tex2D<float4>(texObject, x, y);
    return make_float2(ret.x, ret.y);
}

// Math helper functions
static inline __device__ float floorf(float a) { return __builtin_floorf(a); }
static inline __device__ float floor(float a) { return __builtin_floorf(a); }
static inline __device__ double floor(double a) { return __builtin_floor(a); }
static inline __device__ float ceilf(float a) { return __builtin_ceilf(a); }
static inline __device__ float ceil(float a) { return __builtin_ceilf(a); }
static inline __device__ double ceil(double a) { return __builtin_ceil(a); }
static inline __device__ float truncf(float a) { return __builtin_truncf(a); }
static inline __device__ float trunc(float a) { return __builtin_truncf(a); }
static inline __device__ double trunc(double a) { return __builtin_trunc(a); }
static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
static inline __device__ float fabs(float a) { return __builtin_fabsf(a); }
static inline __device__ double fabs(double a) { return __builtin_fabs(a); }

static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }
static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); }

#endif /* COMPAT_CUDA_CUDA_RUNTIME_H */
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00			`/*`
			`* Minimum CUDA compatibility definitions header`
			`*`
All: update names in copyright headers 2021-01-20 07:02:56 +00:00			`* Copyright (c) 2019 rcombs`
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00			`*`
			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* FFmpeg is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with FFmpeg; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

compat/cuda: Change inclusion guards cuda_runtime.h as well as dynlink_loader.h used nonstandard inclusion guards with an AV_ prefix, although these files are not in an libav*/ path. So change the inclusion guards and adapt the ref file of the source fate test accordingly. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com> Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-08-05 01:09:41 +00:00			`#ifndef COMPAT_CUDA_CUDA_RUNTIME_H`
			`#define COMPAT_CUDA_CUDA_RUNTIME_H`
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00
			`// Common macros`
			`#define __global__ __attribute__((global))`
			`#define __device__ __attribute__((device))`
			`#define __device_builtin__ __attribute__((device_builtin))`
			`#define __align__(N) __attribute__((aligned(N)))`
			`#define __inline__ __inline__ __attribute__((always_inline))`

			`#define max(a, b) ((a) > (b) ? (a) : (b))`
			`#define min(a, b) ((a) < (b) ? (a) : (b))`
			`#define abs(x) ((x) < 0 ? -(x) : (x))`

			`#define atomicAdd(a, b) (__atomic_fetch_add(a, b, __ATOMIC_SEQ_CST))`

			`// Basic typedefs`
			`typedef __device_builtin__ unsigned long long cudaTextureObject_t;`

			`typedef struct __device_builtin__ __align__(2) uchar2`
			`{`
			`unsigned char x, y;`
			`} uchar2;`

			`typedef struct __device_builtin__ __align__(4) ushort2`
			`{`
			`unsigned short x, y;`
			`} ushort2;`

avfilter/scale_cuda: add bicubic interpolation 2020-10-31 19:22:33 +00:00			`typedef struct __device_builtin__ __align__(8) float2`
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00			`{`
avfilter/scale_cuda: add bicubic interpolation 2020-10-31 19:22:33 +00:00			`float x, y;`
			`} float2;`
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00
			`typedef struct __device_builtin__ __align__(8) int2`
			`{`
			`int x, y;`
			`} int2;`

avfilter/scale_cuda: add bicubic interpolation 2020-10-31 19:22:33 +00:00			`typedef struct __device_builtin__ uint3`
			`{`
			`unsigned int x, y, z;`
			`} uint3;`

			`typedef struct uint3 dim3;`

build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00			`typedef struct __device_builtin__ __align__(4) uchar4`
			`{`
			`unsigned char x, y, z, w;`
			`} uchar4;`

			`typedef struct __device_builtin__ __align__(8) ushort4`
			`{`
compat/cuda: correct ushort4 to use ushort Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2021-02-12 15:58:25 +00:00			`unsigned short x, y, z, w;`
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00			`} ushort4;`

			`typedef struct __device_builtin__ __align__(16) int4`
			`{`
			`int x, y, z, w;`
			`} int4;`

avfilter/scale_cuda: add bicubic interpolation 2020-10-31 19:22:33 +00:00			`typedef struct __device_builtin__ __align__(16) float4`
			`{`
			`float x, y, z, w;`
			`} float4;`

build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00			`// Accessors for special registers`
			`#define GETCOMP(reg, comp) \`
			`asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \`
			`ret.comp = tmp;`

			`#define GET(name, reg) static inline __device__ uint3 name() {\`
			`uint3 ret; \`
			`unsigned tmp; \`
			`GETCOMP(reg, x) \`
			`GETCOMP(reg, y) \`
			`GETCOMP(reg, z) \`
			`return ret; \`
			`}`

			`GET(getBlockIdx, ctaid)`
			`GET(getBlockDim, ntid)`
			`GET(getThreadIdx, tid)`

			`// Instead of externs for these registers, we turn access to them into calls into trivial ASM`
			`#define blockIdx (getBlockIdx())`
			`#define blockDim (getBlockDim())`
			`#define threadIdx (getThreadIdx())`

			`// Basic initializers (simple macros rather than inline functions)`
avfilter/scale_cuda: add bicubic interpolation 2020-10-31 19:22:33 +00:00			`#define make_int2(a, b) ((int2){.x = a, .y = b})`
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00			`#define make_uchar2(a, b) ((uchar2){.x = a, .y = b})`
			`#define make_ushort2(a, b) ((ushort2){.x = a, .y = b})`
avfilter/scale_cuda: add bicubic interpolation 2020-10-31 19:22:33 +00:00			`#define make_float2(a, b) ((float2){.x = a, .y = b})`
			`#define make_int4(a, b, c, d) ((int4){.x = a, .y = b, .z = c, .w = d})`
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00			`#define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d})`
			`#define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d})`
avfilter/scale_cuda: add bicubic interpolation 2020-10-31 19:22:33 +00:00			`#define make_float4(a, b, c, d) ((float4){.x = a, .y = b, .z = c, .w = d})`
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00
			`// Conversions from the tex instruction's 4-register output to various types`
			`#define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);}`

			`TEX2D(unsigned char, a & 0xFF)`
			`TEX2D(unsigned short, a & 0xFFFF)`
avfilter/scale_cuda: add bicubic interpolation 2020-10-31 19:22:33 +00:00			`TEX2D(float, a)`
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00			`TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF))`
			`TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF))`
avfilter/scale_cuda: add bicubic interpolation 2020-10-31 19:22:33 +00:00			`TEX2D(float2, make_float2(a, b))`
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00			`TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF))`
			`TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF))`
avfilter/scale_cuda: add bicubic interpolation 2020-10-31 19:22:33 +00:00			`TEX2D(float4, make_float4(a, b, c, d))`
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00
			`// Template calling tex instruction and converting the output to the selected type`
avfilter/scale_cuda: add bicubic interpolation 2020-10-31 19:22:33 +00:00			`template<typename T>`
			`inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)`
build: add support for building CUDA files with clang This avoids using the CUDA SDK at all; instead, we provide a minimal reimplementation of the basic functionality that lavfi actually uses. It generates very similar code to what NVCC produces. The header contains no implementation code derived from the SDK. The function and type declarations are derived from the SDK only to the extent required to build a compatible implementation. This is generally accepted to qualify as fair use. Because this option does not require the proprietary SDK, it does not require the "--enable-nonfree" flag in configure. Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-07-30 07:51:42 +00:00			`{`
			`T ret;`
			`unsigned ret1, ret2, ret3, ret4;`
			`asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :`
			`"=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) :`
			`"l"(texObject), "f"(x), "f"(y));`
			`conv(&ret, ret1, ret2, ret3, ret4);`
			`return ret;`
			`}`

avfilter/scale_cuda: add bicubic interpolation 2020-10-31 19:22:33 +00:00			`template<>`
			`inline __device__ float4 tex2D<float4>(cudaTextureObject_t texObject, float x, float y)`
			`{`
			`float4 ret;`
			`asm("tex.2d.v4.f32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :`
			`"=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) :`
			`"l"(texObject), "f"(x), "f"(y));`
			`return ret;`
			`}`

			`template<>`
			`inline __device__ float tex2D<float>(cudaTextureObject_t texObject, float x, float y)`
			`{`
			`return tex2D<float4>(texObject, x, y).x;`
			`}`

			`template<>`
			`inline __device__ float2 tex2D<float2>(cudaTextureObject_t texObject, float x, float y)`
			`{`
			`float4 ret = tex2D<float4>(texObject, x, y);`
			`return make_float2(ret.x, ret.y);`
			`}`

			`// Math helper functions`
			`static inline __device__ float floorf(float a) { return __builtin_floorf(a); }`
			`static inline __device__ float floor(float a) { return __builtin_floorf(a); }`
			`static inline __device__ double floor(double a) { return __builtin_floor(a); }`
			`static inline __device__ float ceilf(float a) { return __builtin_ceilf(a); }`
			`static inline __device__ float ceil(float a) { return __builtin_ceilf(a); }`
			`static inline __device__ double ceil(double a) { return __builtin_ceil(a); }`
			`static inline __device__ float truncf(float a) { return __builtin_truncf(a); }`
			`static inline __device__ float trunc(float a) { return __builtin_truncf(a); }`
			`static inline __device__ double trunc(double a) { return __builtin_trunc(a); }`
			`static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }`
			`static inline __device__ float fabs(float a) { return __builtin_fabsf(a); }`
			`static inline __device__ double fabs(double a) { return __builtin_fabs(a); }`

avfilter/scale_cuda: add lanczos algorithm 2020-11-04 00:43:00 +00:00			`static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }`
			`static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); }`

compat/cuda: Change inclusion guards cuda_runtime.h as well as dynlink_loader.h used nonstandard inclusion guards with an AV_ prefix, although these files are not in an libav*/ path. So change the inclusion guards and adapt the ref file of the source fate test accordingly. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com> Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org> 2019-08-05 01:09:41 +00:00			`#endif /* COMPAT_CUDA_CUDA_RUNTIME_H */`