/* Blender 5.0 — intern/cycles/util/atomic.h (Cycles atomic-operation abstraction). */
/* SPDX-FileCopyrightText: 2014-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#pragma once

#ifndef __KERNEL_GPU__

/* CPU rendering: use the atomic ops header from Blender proper. */
# include "atomic_ops.h" // IWYU pragma: export

/* Map the Cycles atomic names onto Blender's atomic_ops primitives. */
# define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))
# define atomic_compare_and_swap_float(p, old_val, new_val) \
    atomic_cas_float((p), (old_val), (new_val))

/* Increment/decrement in terms of fetch-add (note: -1 wraps for unsigned). */
# define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
# define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1)

/* No shared/local memory distinction and no barriers on the CPU. */
# define CCL_LOCAL_MEM_FENCE 0
# define ccl_barrier(flags) ((void)0)
21
#else /* __KERNEL_GPU__ */

/* oneAPI defines a dedicated shared-memory variant below; the other GPU
 * backends reuse the global-memory implementation. */
# ifndef __KERNEL_ONEAPI__
# define atomic_fetch_and_add_uint32_shared atomic_fetch_and_add_uint32
# endif

# if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)

/* atomicAdd returns the previous value, so add `x` once more to get
 * add-and-fetch (updated value) semantics. */
# define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))

/* fetch-and-* variants return the value held before the operation. */
# define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int *)(p), (unsigned int)(x))
# define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int *)(p), (unsigned int)(x))
# define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
# define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
# define atomic_fetch_and_or_uint32(p, x) atomicOr((unsigned int *)(p), (unsigned int)(x))
37
38ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,
39 const float old_val,
40 const float new_val)
41{
42 union {
43 unsigned int int_value;
44 float float_value;
45 } new_value, prev_value, result;
46 prev_value.float_value = old_val;
47 new_value.float_value = new_val;
48 result.int_value = atomicCAS((unsigned int *)dest, prev_value.int_value, new_value.int_value);
49 return result.float_value;
50}
51
/* No fence flag is needed on CUDA/HIP; barriers map to __syncthreads(). */
# define CCL_LOCAL_MEM_FENCE
# define ccl_barrier(flags) __syncthreads()

# endif /* __KERNEL_CUDA__ || __KERNEL_HIP__ */
56
57# ifdef __KERNEL_METAL__
58
59// global address space versions
60ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_source,
61 const float operand)
62{
63# if __METAL_VERSION__ >= 300
64 return atomic_fetch_add_explicit(
65 (ccl_global atomic_float *)_source, operand, memory_order_relaxed);
66# else
67 volatile ccl_global atomic_int *source = (ccl_global atomic_int *)_source;
68 union {
69 int int_value;
70 float float_value;
71 } new_value, prev_value;
72 prev_value.int_value = atomic_load_explicit(source, memory_order_relaxed);
73 do {
74 new_value.float_value = prev_value.float_value + operand;
75 } while (!atomic_compare_exchange_weak_explicit(source,
76 &prev_value.int_value,
77 new_value.int_value,
78 memory_order_relaxed,
79 memory_order_relaxed));
80
81 return new_value.float_value;
82# endif
83}
84
85template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(device T *p, const int x)
86{
87 return atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed);
88}
89
90template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(device T *p, const int x)
91{
92 return atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed);
93}
94
96{
97 return atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
98}
99
100template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(device T *p)
101{
102 return atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
103}
104
105template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(device T *p, const int x)
106{
107 return atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed);
108}
109
110template<class T>
112{
113 return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
114}
115
116template<class T>
117ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(threadgroup T *p, const int x)
118{
119 return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
120}
121
122template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(threadgroup T *p)
123{
124 return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
125}
126
127template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(threadgroup T *p)
128{
129 return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
130}
131
132template<class T>
134{
135 return atomic_fetch_or_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
136}
137
139 const float old_val,
140 const float new_val)
141{
142# if __METAL_VERSION__ >= 300
143 float prev_value = old_val;
144 atomic_compare_exchange_weak_explicit((ccl_global atomic_float *)dest,
145 &prev_value,
146 new_val,
147 memory_order_relaxed,
148 memory_order_relaxed);
149 return prev_value;
150# else
151 int prev_value;
152 prev_value = __float_as_int(old_val);
153 atomic_compare_exchange_weak_explicit((ccl_global atomic_int *)dest,
154 &prev_value,
155 __float_as_int(new_val),
156 memory_order_relaxed,
157 memory_order_relaxed);
158 return __int_as_float(prev_value);
159# endif
160}
161
/* Relaxed atomic load/store helpers. */
# define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed)
# define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed)

/* Threadgroup (shared) memory load/store on 32-bit ints. */
# define atomic_store_local(p, x) \
    atomic_store_explicit((ccl_gpu_shared atomic_int *)p, x, memory_order_relaxed)
# define atomic_load_local(p) \
    atomic_load_explicit((ccl_gpu_shared atomic_int *)p, memory_order_relaxed)

/* Barriers map onto Metal threadgroup barriers. */
# define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup
# define ccl_barrier(flags) threadgroup_barrier(flags)

# endif /* __KERNEL_METAL__ */
174
175# ifdef __KERNEL_ONEAPI__
176
177ccl_device_inline float atomic_add_and_fetch_float(ccl_global float *p, const float x)
178{
179 sycl::atomic_ref<float,
180 sycl::memory_order::relaxed,
181 sycl::memory_scope::device,
182 sycl::access::address_space::ext_intel_global_device_space>
183 atomic(*p);
184 return atomic.fetch_add(x);
185}
186
188 float old_val,
189 float new_val)
190{
191 sycl::atomic_ref<float,
192 sycl::memory_order::relaxed,
193 sycl::memory_scope::device,
194 sycl::access::address_space::ext_intel_global_device_space>
195 atomic(*source);
196 atomic.compare_exchange_weak(old_val, new_val);
197 return old_val;
198}
199
200ccl_device_inline unsigned int atomic_fetch_and_add_uint32(ccl_global unsigned int *p,
201 unsigned int x)
202{
203 sycl::atomic_ref<unsigned int,
204 sycl::memory_order::relaxed,
205 sycl::memory_scope::device,
206 sycl::access::address_space::ext_intel_global_device_space>
207 atomic(*p);
208 return atomic.fetch_add(x);
209}
210
212{
213 sycl::atomic_ref<int,
214 sycl::memory_order::relaxed,
215 sycl::memory_scope::device,
216 sycl::access::address_space::ext_intel_global_device_space>
217 atomic(*p);
218 return atomic.fetch_add(x);
219}
220
221ccl_device_inline int atomic_fetch_and_add_uint32_shared(int *p, const int x)
222{
223 sycl::atomic_ref<int,
224 sycl::memory_order::relaxed,
225 sycl::memory_scope::device,
226 sycl::access::address_space::local_space>
227 atomic(*p);
228 return atomic.fetch_add(x);
229}
230
231ccl_device_inline unsigned int atomic_fetch_and_sub_uint32(ccl_global unsigned int *p,
232 unsigned int x)
233{
234 sycl::atomic_ref<unsigned int,
235 sycl::memory_order::relaxed,
236 sycl::memory_scope::device,
237 sycl::access::address_space::ext_intel_global_device_space>
238 atomic(*p);
239 return atomic.fetch_sub(x);
240}
241
242ccl_device_inline int atomic_fetch_and_sub_uint32(ccl_global int *p, const int x)
243{
244 sycl::atomic_ref<int,
245 sycl::memory_order::relaxed,
246 sycl::memory_scope::device,
247 sycl::access::address_space::ext_intel_global_device_space>
248 atomic(*p);
249 return atomic.fetch_sub(x);
250}
251
252ccl_device_inline unsigned int atomic_fetch_and_inc_uint32(ccl_global unsigned int *p)
253{
254 return atomic_fetch_and_add_uint32(p, 1);
255}
256
258{
259 return atomic_fetch_and_add_uint32(p, 1);
260}
261
262ccl_device_inline unsigned int atomic_fetch_and_dec_uint32(ccl_global unsigned int *p)
263{
264 return atomic_fetch_and_sub_uint32(p, 1);
265}
266
268{
269 return atomic_fetch_and_sub_uint32(p, 1);
270}
271
272ccl_device_inline unsigned int atomic_fetch_and_or_uint32(ccl_global unsigned int *p,
273 unsigned int x)
274{
275 sycl::atomic_ref<unsigned int,
276 sycl::memory_order::relaxed,
277 sycl::memory_scope::device,
278 sycl::access::address_space::ext_intel_global_device_space>
279 atomic(*p);
280 return atomic.fetch_or(x);
281}
282
284{
285 sycl::atomic_ref<int,
286 sycl::memory_order::relaxed,
287 sycl::memory_scope::device,
288 sycl::access::address_space::ext_intel_global_device_space>
289 atomic(*p);
290 return atomic.fetch_or(x);
291}
292
293ccl_device_inline void atomic_store_local(int *p, const int x)
294{
295 sycl::atomic_ref<int,
296 sycl::memory_order::relaxed,
297 sycl::memory_scope::device,
298 sycl::access::address_space::local_space>
299 atomic(*p);
300 atomic.store(x);
301}
302
303ccl_device_inline int atomic_load_local(int *p)
304{
305 sycl::atomic_ref<int,
306 sycl::memory_order::relaxed,
307 sycl::memory_scope::device,
308 sycl::access::address_space::local_space>
309 atomic(*p);
310 return atomic.load();
311}
312
313# endif /* __KERNEL_ONEAPI__ */
314
315#endif /* __KERNEL_GPU__ */
/* Cross-references from the generated documentation (not part of the code):
 * - atomic_fetch_and_dec_uint32, atomic_fetch_and_inc_uint32,
 *   atomic_compare_and_swap_float, atomic_add_and_fetch_float:
 *   CPU macro definitions earlier in this header.
 * - atomic_ops.h: wrapper around system-specific atomic primitives, with
 *   some extensions (faked atomic operations); provides
 *   atomic_fetch_and_or_uint32 / atomic_fetch_and_add_uint32 for the CPU.
 * - ccl_device_inline, ccl_global, __int_as_float, __float_as_int:
 *   Cycles compatibility macros defined elsewhere. */