diff --git a/video/decode/gpu_memcpy_sse4.h b/video/decode/gpu_memcpy_sse4.h index c441ff38fe..160209bdc5 100644 --- a/video/decode/gpu_memcpy_sse4.h +++ b/video/decode/gpu_memcpy_sse4.h @@ -19,14 +19,18 @@ * Taken from the QuickSync decoder by Eric Gur */ -#include +#ifndef GPU_MEMCPY_SSE4_H_ +#define GPU_MEMCPY_SSE4_H_ + +#pragma GCC push_options +#pragma GCC target("sse4.1") +#include // gpu_memcpy is a memcpy style function that copied data very fast from a // GPU tiled memory (write back) // Performance tip: page offset (12 lsb) of both addresses should be different // optimally use a 2K offset between them. -__attribute__((target("sse4"))) static inline void -*gpu_memcpy(void *restrict d, const void *restrict s, size_t size) +static inline void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size) { static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16 @@ -127,3 +131,6 @@ __attribute__((target("sse4"))) static inline void return d; } + +#pragma GCC pop_options +#endif