compat/cuda: switch from powf to __powf intrinsic

The powf builtin causes crashes on older clang, so manually implement
the (faster) __powf intrinsic and use that instead.
The code it generates is identical to that of nvcc.
This commit is contained in:
Timo Rothenpieler 2022-09-03 19:49:53 +02:00
parent 73fada029c
commit 416923346a
2 changed files with 5 additions and 5 deletions

View File

@ -182,11 +182,11 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
/* Device-side fabs() overload for float; forwards to __builtin_fabsf. */
static inline __device__ float fabs(float a)
{
    return __builtin_fabsf(a);
}
/* Device-side fabs() overload for double; forwards to __builtin_fabs. */
static inline __device__ double fabs(double a)
{
    return __builtin_fabs(a);
}
/* Device-side sqrtf(); forwards to the compiler's __builtin_sqrtf. */
static inline __device__ float sqrtf(float a)
{
    return __builtin_sqrtf(a);
}
/* Device-side powf(): a raised to the power y, via __builtin_powf. */
static inline __device__ float powf(float a, float y)
{
    return __builtin_powf(a, y);
}
/* Device-side __saturatef(); forwards to the NVVM saturate builtin. */
static inline __device__ float __saturatef(float a)
{
    return __nvvm_saturate_f(a);
}
/* Device-side __sinf(): approximate sine through __nvvm_sin_approx_f. */
static inline __device__ float __sinf(float a)
{
    return __nvvm_sin_approx_f(a);
}
/* Device-side __cosf(): approximate cosine through __nvvm_cos_approx_f. */
static inline __device__ float __cosf(float a)
{
    return __nvvm_cos_approx_f(a);
}
/*
 * Device-side __expf(): approximate e^a.
 * The argument is rescaled by log2(e) and handed to the approximate
 * base-2 exponential builtin, i.e. e^a == 2^(a * log2(e)).
 */
static inline __device__ float __expf(float a)
{
    /* log2(e), produced by the double builtins and then narrowed to float. */
    const float log2_e = (float)__builtin_log2(__builtin_exp(1));

    return __nvvm_ex2_approx_f(a * log2_e);
}
/*
 * Device-side __powf(): approximate a^b, built as 2^(b * log2(a)) from the
 * approximate base-2 logarithm and exponential builtins.
 * NOTE(review): log2 of a negative base is not a real number, so a negative
 * `a` presumably yields NaN here -- confirm callers only pass a >= 0.
 */
static inline __device__ float __powf(float a, float b)
{
    const float log2_a = __nvvm_lg2_approx_f(a);

    return __nvvm_ex2_approx_f(log2_a * b);
}
#endif /* COMPAT_CUDA_CUDA_RUNTIME_H */

View File

@ -34,9 +34,9 @@ extern "C"
/*
 * Squared Euclidean distance between the x, y and z components of two
 * float4 values; the w component is not used.
 *
 * Each square is formed by plain multiplication rather than a pow() call:
 * a power evaluated as exp2(log2(a) * b) has no defined real result for a
 * negative base, and the component differences here can be negative.
 * Every component contributes exactly once to the sum.
 */
__device__ static inline float norm_squared(float4 first_yuv, float4 second_yuv)
{
    float dx = first_yuv.x - second_yuv.x;
    float dy = first_yuv.y - second_yuv.y;
    float dz = first_yuv.z - second_yuv.z;

    return dx * dx + dy * dy + dz * dz;
}
@ -52,7 +52,7 @@ __device__ static inline float calculate_w(int x, int y, int r, int c,
float sigma_space, float sigma_color)
{
float first_term, second_term;
first_term = (powf(x - r, 2) + powf(y - c, 2)) / (2 * sigma_space * sigma_space);
first_term = (__powf(x - r, 2) + __powf(y - c, 2)) / (2 * sigma_space * sigma_space);
second_term = norm_squared(pixel_value, neighbor_value) / (2 * sigma_color * sigma_color);
return __expf(-first_term - second_term);
}