Blender V5.0
device/oneapi/compat.h
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2021-2022 Intel Corporation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#pragma once
6
7#define __KERNEL_GPU__
8#define __KERNEL_ONEAPI__
9#define __KERNEL_64_BIT__
10
11#ifdef WITH_EMBREE_GPU
12# define __KERNEL_GPU_RAYTRACING__
13#endif
14
15#define CCL_NAMESPACE_BEGIN
16#define CCL_NAMESPACE_END
17
18#include <cstdint>
19#include <math.h>
20
21#ifndef __NODES_MAX_GROUP__
22# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
23#endif
24#ifndef __NODES_FEATURES__
25# define __NODES_FEATURES__ NODE_FEATURE_ALL
26#endif
27
28/* This one does not have an abstraction.
29 * It's used by other devices directly.
30 */
31
32#define __device__
33
34/* Qualifier wrappers for different names on different devices */
35
36#define ccl_device inline
37#define ccl_device_extern extern "C"
38#define ccl_global
39#define ccl_always_inline __attribute__((always_inline))
40#define ccl_device_inline __attribute__((always_inline))
41#define ccl_noinline __attribute__((noinline))
42#define ccl_inline_constant const constexpr
43#define ccl_device_constant static constexpr
44#define ccl_static_constexpr static constexpr
45#define ccl_device_forceinline __attribute__((always_inline))
46#define ccl_device_noinline __attribute__((noinline))
47#define ccl_device_noinline_cpu ccl_device
48#define ccl_device_inline_method ccl_device
49#define ccl_device_template_spec template<> ccl_device_inline
50#define ccl_restrict __restrict__
51#define ccl_optional_struct_init
52#define ccl_private
53#define ccl_ray_data ccl_private
54#define ccl_gpu_shared
55#define ATTR_FALLTHROUGH __attribute__((fallthrough))
56#define ccl_constant const
57#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
58#define ccl_align(n) __attribute__((aligned(n)))
59#define kernel_assert(cond)
60#define ccl_may_alias
61
62/* clang-format off */
63
64/* kernel.h adapters */
65#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
66#define ccl_gpu_kernel_threads(block_num_threads)
67
68#ifndef WITH_ONEAPI_SYCL_HOST_TASK
69# define __ccl_gpu_kernel_signature(name, ...) \
70void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
71 size_t kernel_global_size, \
72 size_t kernel_local_size, \
73 sycl::handler &cgh, \
74 __VA_ARGS__) { \
75 (void)(kg); \
76 cgh.parallel_for( \
77 sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
78 [=](sycl::nd_item<1> item) {
79
80# define ccl_gpu_kernel_signature __ccl_gpu_kernel_signature
81
82# define ccl_gpu_kernel_postfix \
83 }); \
84 }
85#else
86/* Additional anonymous lambda is required to handle all "return" statements in the kernel code */
87# define ccl_gpu_kernel_signature(name, ...) \
88void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
89 size_t kernel_global_size, \
90 size_t kernel_local_size, \
91 sycl::handler &cgh, \
92 __VA_ARGS__) { \
93 (void)(kg); \
94 (kernel_local_size); \
95 cgh.host_task( \
96 [=]() {\
97 for (size_t gid = (size_t)0; gid < kernel_global_size; gid++) { \
98 kg->nd_item_local_id_0 = 0; \
99 kg->nd_item_local_range_0 = 1; \
100 kg->nd_item_group_id_0 = gid; \
101 kg->nd_item_group_range_0 = kernel_global_size; \
102 kg->nd_item_global_id_0 = gid; \
103 kg->nd_item_global_range_0 = kernel_global_size; \
104 auto kernel = [=]() {
105
106# define ccl_gpu_kernel_postfix \
107 }; \
108 kernel(); \
109 } \
110 }); \
111}
112#endif
113
114#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x
115#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n))
116
117#define ccl_gpu_kernel_lambda(func, ...) \
118 struct KernelLambda \
119 { \
120 KernelLambda(const ONEAPIKernelContext *_kg) : kg(_kg) {} \
121 ccl_private const ONEAPIKernelContext *kg; \
122 __VA_ARGS__; \
123 int operator()(const int state) const { return (func); } \
124 } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
125
126/* GPU thread, block, grid size and index */
127
128#ifndef WITH_ONEAPI_SYCL_HOST_TASK
129# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_local_id(0))
130# define ccl_gpu_block_dim_x (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_local_range(0))
131# define ccl_gpu_block_idx_x (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group(0))
132# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group_range(0))
133# define ccl_gpu_warp_size (sycl::ext::oneapi::this_work_item::get_sub_group().get_local_range()[0])
134# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
135
136# define ccl_gpu_global_id_x() (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0))
137# define ccl_gpu_global_size_x() (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_range(0))
138
139/* GPU warp synchronization */
140# define ccl_gpu_syncthreads() sycl::ext::oneapi::this_work_item::get_nd_item<1>().barrier()
141# define ccl_gpu_local_syncthreads() sycl::ext::oneapi::this_work_item::get_nd_item<1>().barrier(sycl::access::fence_space::local_space)
142# ifdef __SYCL_DEVICE_ONLY__
143# define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::this_work_item::get_sub_group(), predicate).count())
144# else
145# define ccl_gpu_ballot(predicate) (predicate ? 1 : 0)
146# endif
147#else
148# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
149# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
150# define ccl_gpu_block_idx_x (kg->nd_item_group_id_0)
151# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
152# define ccl_gpu_warp_size (1)
153# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
154
155# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
156# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
157
158# define ccl_gpu_syncthreads()
159# define ccl_gpu_local_syncthreads()
160# define ccl_gpu_ballot(predicate) (predicate ? 1 : 0)
161#endif
162
163/* Debug defines */
164#if defined(__SYCL_DEVICE_ONLY__)
165# define CCL_ONEAPI_CONSTANT __attribute__((opencl_constant))
166#else
167# define CCL_ONEAPI_CONSTANT
168#endif
169
170#define sycl_printf(format, ...) { \
171 static const CCL_ONEAPI_CONSTANT char fmt[] = format; \
172 sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ ); \
173 }
174
175#define sycl_printf_(format) { \
176 static const CCL_ONEAPI_CONSTANT char fmt[] = format; \
177 sycl::ext::oneapi::experimental::printf(fmt); \
178 }
179
180/* GPU texture objects */
181
182/* clang-format on */
183
184/* Types */
185
186/* It's not possible to use sycl types like sycl::float3, sycl::int3, etc
187 * because these types have different interfaces from blender version. */
188
189using uchar = unsigned char;
190using sycl::half;
191
192/* math functions */
194{
195 return sycl::bit_cast<float>(x);
196}
197ccl_device_forceinline unsigned int __float_as_uint(const float x)
198{
199 return sycl::bit_cast<unsigned int>(x);
200}
202{
203 return sycl::bit_cast<float>(x);
204}
206{
207 return sycl::bit_cast<int>(x);
208}
209
210#define fabsf(x) sycl::fabs((x))
211#define copysignf(x, y) sycl::copysign((x), (y))
212#define asinf(x) sycl::asin((x))
213#define acosf(x) sycl::acos((x))
214#define atanf(x) sycl::atan((x))
215#define floorf(x) sycl::floor((x))
216#define ceilf(x) sycl::ceil((x))
217#define roundf(x) sycl::round((x))
218#define sinhf(x) sycl::sinh((x))
219#define coshf(x) sycl::cosh((x))
220#define tanhf(x) sycl::tanh((x))
221#define hypotf(x, y) sycl::hypot((x), (y))
222#define atan2f(x, y) sycl::atan2((x), (y))
223#define fmaxf(x, y) sycl::fmax((x), (y))
224#define fminf(x, y) sycl::fmin((x), (y))
225#define fmodf(x, y) sycl::fmod((x), (y))
226#define lgammaf(x) sycl::lgamma((x))
227#define ldexpf(x, y) sycl::ldexp((x), (y))
228
229#define cosf(x) sycl::native::cos(((float)(x)))
230#define sinf(x) sycl::native::sin(((float)(x)))
231#define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y)))
232#define tanf(x) sycl::native::tan(((float)(x)))
233#define logf(x) sycl::native::log(((float)(x)))
234#define expf(x) sycl::native::exp(((float)(x)))
235#define sqrtf(x) sycl::native::sqrt(((float)(x)))
236
237#define __forceinline __attribute__((always_inline))
238
239/* Types */
240#include "util/half.h"
241#include "util/types.h"
242
243static_assert(
244 sizeof(sycl::ext::oneapi::experimental::sampled_image_handle::raw_image_handle_type) ==
245 sizeof(uint64_t));
248
249template<typename T>
251 const float x,
252 const float y)
253{
254 /* Generic implementation not possible due to limitation with SYCL bindless sampled images
255 * not being able to read in a format, which is different from the supported data type of
256 * the texture.
257 * But looks it looks like this is not a problem at the moment. */
258 static_assert(false);
259 return T();
260}
261
262template<>
264 const float x,
265 const float y)
266{
267 sycl::ext::oneapi::experimental::sampled_image_handle image(
268 (sycl::ext::oneapi::experimental::sampled_image_handle::raw_image_handle_type)texobj);
269 return sycl::ext::oneapi::experimental::sample_image<float>(image, sycl::float2{x, y});
270}
271
272template<>
274 const ccl_gpu_tex_object_2D texobj, const float x, const float y)
275{
276 sycl::ext::oneapi::experimental::sampled_image_handle image(
277 (sycl::ext::oneapi::experimental::sampled_image_handle::raw_image_handle_type)texobj);
278 return sycl::ext::oneapi::experimental::sample_image<float4, sycl::vec<float, 4>>(
279 image, sycl::float2{x, y});
280}
281
282template<typename T>
284 const float x,
285 const float y,
286 const float z)
287{
288 /* A generic implementation is not possible due to limitations with SYCL bindless sampled images
289 * not being able to read in a format that is different from the supported data type of
290 * the texture.
291 * However, it looks like this is not a problem at the moment, but I am leaving a static
292 * assert in order to easily detect if it becomes a problem in the future. */
293 static_assert(false);
294 return T();
295}
296
297template<>
299 const float x,
300 const float y,
301 const float z)
302{
303 sycl::ext::oneapi::experimental::sampled_image_handle image(
304 (sycl::ext::oneapi::experimental::sampled_image_handle::raw_image_handle_type)texobj);
305 return sycl::ext::oneapi::experimental::sample_image<float>(image, sycl::float3{x, y, z});
306}
307
308template<>
310 const ccl_gpu_tex_object_3D texobj, const float x, const float y, const float z)
311{
312 sycl::ext::oneapi::experimental::sampled_image_handle image(
313 (sycl::ext::oneapi::experimental::sampled_image_handle::raw_image_handle_type)texobj);
314 return sycl::ext::oneapi::experimental::sample_image<float4, sycl::vec<float, 4>>(
315 image, sycl::float3{x, y, z});
316}
unsigned char uchar
unsigned long long int uint64_t
SIMD_FORCE_INLINE const btScalar & z() const
Return the z value.
Definition btQuadWord.h:117
#define ccl_device_forceinline
CUtexObject ccl_gpu_tex_object_2D
#define __int_as_float(x)
#define __float_as_int(x)
#define __float_as_uint(x)
#define __uint_as_float(x)
ccl_device_forceinline float4 ccl_gpu_tex_object_read_2D< float4 >(const ccl_gpu_tex_object_2D texobj, const float x, const float y)
ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object_2D texobj, const float x, const float y)
uint64_t ccl_gpu_tex_object_3D
ccl_device_forceinline float ccl_gpu_tex_object_read_3D< float >(const ccl_gpu_tex_object_3D texobj, const float x, const float y, const float z)
ccl_device_forceinline float4 ccl_gpu_tex_object_read_3D< float4 >(const ccl_gpu_tex_object_3D texobj, const float x, const float y, const float z)
ccl_device_forceinline float ccl_gpu_tex_object_read_2D< float >(const ccl_gpu_tex_object_2D texobj, const float x, const float y)
ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D texobj, const float x, const float y, const float z)
#define T