#  define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))
#  define atomic_compare_and_swap_float(p, old_val, new_val) \
    atomic_cas_float((p), (old_val), (new_val))

#  define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
#  define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1)

#  define CCL_LOCAL_MEM_FENCE 0
#  define ccl_barrier(flags) ((void)0)
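/* Note on naming (illustrative sketch, not part of this header): "add_and_fetch"
 * returns the value after the addition, while "fetch_and_add" returns the value
 * before it. The pointers `total` and `count` below are hypothetical.
 *
 *   const float new_total = atomic_add_and_fetch_float(total, 1.0f);
 *   const uint32_t old_count = atomic_fetch_and_add_uint32(count, 1);
 */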
#else /* __KERNEL_GPU__ */

#  ifndef __KERNEL_ONEAPI__
#    define atomic_fetch_and_add_uint32_shared atomic_fetch_and_add_uint32
#  endif
#  if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)

#    define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))

#    define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int *)(p), (unsigned int)(x))
#    define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int *)(p), (unsigned int)(x))
#    define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
#    define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
#    define atomic_fetch_and_or_uint32(p, x) atomicOr((unsigned int *)(p), (unsigned int)(x))

ccl_device_inline float atomic_compare_and_swap_float(ccl_global float *dest,
                                                      const float old_val,
                                                      const float new_val)
{
  /* CUDA/HIP only provide an integer atomicCAS, so compare and swap the raw float bits. */
  union {
    unsigned int int_value;
    float float_value;
  } new_value, prev_value, result;
  prev_value.float_value = old_val;
  new_value.float_value = new_val;
  result.int_value = atomicCAS((unsigned int *)dest, prev_value.int_value, new_value.int_value);
  return result.float_value;
}

#    define CCL_LOCAL_MEM_FENCE
#    define ccl_barrier(flags) __syncthreads()

#  endif /* __KERNEL_CUDA__ || __KERNEL_HIP__ */
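/* Illustrative sketch (not part of this header): atomic_compare_and_swap_float
 * returns the value stored before the swap attempt, so other float atomics can
 * be built on top of it with a retry loop. `atomic_max_float` below is a
 * hypothetical helper, ignoring NaN handling.
 *
 *   ccl_device_inline float atomic_max_float(ccl_global float *p, const float value)
 *   {
 *     float prev = *p;
 *     while (prev < value) {
 *       const float seen = atomic_compare_and_swap_float(p, prev, value);
 *       if (seen == prev) {
 *         break;
 *       }
 *       prev = seen;
 *     }
 *     return prev;
 *   }
 */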
#  ifdef __KERNEL_METAL__

ccl_device_inline float atomic_add_and_fetch_float(ccl_global float *_source, const float operand)
{
#    if __METAL_VERSION__ >= 300
  return atomic_fetch_add_explicit(
             (ccl_global atomic_float *)_source, operand, memory_order_relaxed) +
         operand;
#    else
  /* Before Metal 3.0 there is no atomic_float: emulate the addition with a
   * compare-exchange loop on the reinterpreted integer bits. */
  ccl_global atomic_int *source = (ccl_global atomic_int *)_source;
  union {
    int int_value;
    float float_value;
  } new_value, prev_value;
  prev_value.int_value = atomic_load_explicit(source, memory_order_relaxed);
  do {
    new_value.float_value = prev_value.float_value + operand;
  } while (!atomic_compare_exchange_weak_explicit(source,
                                                  &prev_value.int_value,
                                                  new_value.int_value,
                                                  memory_order_relaxed,
                                                  memory_order_relaxed));
  return new_value.float_value;
#    endif
}
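/* The pre-3.0 path above is the usual "CAS loop" idiom for building a float
 * atomic out of an integer compare-exchange. A host-side C++ analogue of the
 * same pattern (illustrative only, not used by Cycles) would look like:
 *
 *   #include <atomic>
 *   #include <cstring>
 *
 *   inline float add_and_fetch_float(std::atomic<int> *dest, float operand)
 *   {
 *     int prev_bits = dest->load(std::memory_order_relaxed);
 *     float prev, next;
 *     int next_bits;
 *     do {
 *       std::memcpy(&prev, &prev_bits, sizeof(float));
 *       next = prev + operand;
 *       std::memcpy(&next_bits, &next, sizeof(float));
 *     } while (!dest->compare_exchange_weak(
 *         prev_bits, next_bits, std::memory_order_relaxed, std::memory_order_relaxed));
 *     return next;
 *   }
 *
 * On failure compare_exchange_weak reloads the expected bits with the value
 * currently stored, so the loop always recomputes the sum against the latest
 * contents.
 */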
ccl_device_inline uint atomic_fetch_and_add_uint32(ccl_global uint *p, const uint x)
{
  return atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}

ccl_device_inline uint atomic_fetch_and_sub_uint32(ccl_global uint *p, const uint x)
{
  return atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}

ccl_device_inline uint atomic_fetch_and_inc_uint32(ccl_global uint *p)
{
  return atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}

ccl_device_inline uint atomic_fetch_and_dec_uint32(ccl_global uint *p)
{
  return atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}

ccl_device_inline uint atomic_fetch_and_or_uint32(ccl_global uint *p, const uint x)
{
  return atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}

/* Overloads of the same operations for threadgroup (shared) memory. */

ccl_device_inline uint atomic_fetch_and_add_uint32(ccl_gpu_shared uint *p, const uint x)
{
  return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}

ccl_device_inline uint atomic_fetch_and_sub_uint32(ccl_gpu_shared uint *p, const uint x)
{
  return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}

ccl_device_inline uint atomic_fetch_and_inc_uint32(ccl_gpu_shared uint *p)
{
  return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}

ccl_device_inline uint atomic_fetch_and_dec_uint32(ccl_gpu_shared uint *p)
{
  return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}

ccl_device_inline uint atomic_fetch_and_or_uint32(ccl_gpu_shared uint *p, const uint x)
{
  return atomic_fetch_or_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
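/* Illustrative sketch (not part of this header): fetch-and-increment is what makes
 * "append to a shared array" safe across threads, since each caller receives a
 * unique slot. `queue`, `num_queued` and `item` are hypothetical.
 *
 *   const uint slot = atomic_fetch_and_inc_uint32(num_queued);
 *   queue[slot] = item;
 */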
ccl_device_inline float atomic_compare_and_swap_float(ccl_global float *dest,
                                                      const float old_val,
                                                      const float new_val)
{
#    if __METAL_VERSION__ >= 300
  float prev_value = old_val;
  atomic_compare_exchange_weak_explicit(
      (ccl_global atomic_float *)dest, &prev_value, new_val, memory_order_relaxed, memory_order_relaxed);
  return prev_value;
#    else
  int prev_value = __float_as_int(old_val);
  atomic_compare_exchange_weak_explicit(
      (ccl_global atomic_int *)dest, &prev_value, __float_as_int(new_val), memory_order_relaxed, memory_order_relaxed);
  return __int_as_float(prev_value);
#    endif
}
#    define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed)
#    define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed)

#    define atomic_store_local(p, x) \
      atomic_store_explicit((ccl_gpu_shared atomic_int *)p, x, memory_order_relaxed)
#    define atomic_load_local(p) \
      atomic_load_explicit((ccl_gpu_shared atomic_int *)p, memory_order_relaxed)

#    define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup
#    define ccl_barrier(flags) threadgroup_barrier(flags)

#  endif /* __KERNEL_METAL__ */
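/* Illustrative sketch (not part of this header): with `flag` pointing into
 * threadgroup/local memory and `thread_index` the thread's index within the
 * group (both hypothetical), one thread publishes a value and the rest read it
 * after a barrier:
 *
 *   if (thread_index == 0) {
 *     atomic_store_local(flag, 1);
 *   }
 *   ccl_barrier(CCL_LOCAL_MEM_FENCE);
 *   const int ready = atomic_load_local(flag);
 */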
#  ifdef __KERNEL_ONEAPI__

ccl_device_inline float atomic_add_and_fetch_float(ccl_global float *p, const float x)
{
  sycl::atomic_ref<float,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_add(x) + x;
}
ccl_device_inline float atomic_compare_and_swap_float(ccl_global float *source,
                                                      float old_val,
                                                      const float new_val)
{
  sycl::atomic_ref<float,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*source);
  atomic.compare_exchange_weak(old_val, new_val);
  return old_val;
}
ccl_device_inline unsigned int atomic_fetch_and_add_uint32(ccl_global unsigned int *p,
                                                           const unsigned int x)
{
  sycl::atomic_ref<unsigned int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_add(x);
}
ccl_device_inline int atomic_fetch_and_add_uint32(ccl_global int *p, const int x)
{
  sycl::atomic_ref<int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_add(x);
}
ccl_device_inline int atomic_fetch_and_add_uint32_shared(ccl_gpu_shared int *p, const int x)
{
  sycl::atomic_ref<int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::local_space>
      atomic(*p);
  return atomic.fetch_add(x);
}
ccl_device_inline unsigned int atomic_fetch_and_sub_uint32(ccl_global unsigned int *p,
                                                           const unsigned int x)
{
  sycl::atomic_ref<unsigned int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_sub(x);
}

ccl_device_inline int atomic_fetch_and_sub_uint32(ccl_global int *p, const int x)
{
  sycl::atomic_ref<int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_sub(x);
}

ccl_device_inline unsigned int atomic_fetch_and_dec_uint32(ccl_global unsigned int *p)
{
  return atomic_fetch_and_sub_uint32(p, 1);
}

ccl_device_inline int atomic_fetch_and_dec_uint32(ccl_global int *p)
{
  return atomic_fetch_and_sub_uint32(p, 1);
}
ccl_device_inline unsigned int atomic_fetch_and_or_uint32(ccl_global unsigned int *p,
                                                          const unsigned int x)
{
  sycl::atomic_ref<unsigned int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_or(x);
}
ccl_device_inline int atomic_fetch_and_or_uint32(ccl_global int *p, const int x)
{
  sycl::atomic_ref<int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_or(x);
}

ccl_device_inline void atomic_store_local(ccl_gpu_shared int *p, const int x)
{
  sycl::atomic_ref<int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::local_space>
      atomic(*p);
  atomic.store(x);
}

ccl_device_inline int atomic_load_local(ccl_gpu_shared int *p)
{
  sycl::atomic_ref<int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::local_space>
      atomic(*p);
  return atomic.load();
}

#  endif /* __KERNEL_ONEAPI__ */

#endif /* __KERNEL_GPU__ */
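/* Illustrative sketch (not part of this header): because every backend above
 * exposes the same names, device code can accumulate into a render buffer
 * without backend-specific branches. `buffer`, `pixel_index` and `value` are
 * hypothetical.
 *
 *   ccl_device void accumulate_sample(ccl_global float *buffer, const int pixel_index, const float value)
 *   {
 *     atomic_add_and_fetch_float(buffer + pixel_index, value);
 *   }
 */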