17# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
19# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
26#ifdef __KERNEL_ONEAPI__
28template<
typename IsActiveOp>
32 IsActiveOp is_active_op)
34# ifdef WITH_ONEAPI_SYCL_HOST_TASK
36 for (
int state_index = 0; state_index <
num_states; state_index++) {
37 if (is_active_op(state_index))
38 indices[write_index++] = state_index;
40 *num_indices = write_index;
44 const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
45 const uint blocksize = item_id.get_local_range(0);
48 sycl::access::address_space::local_space>
49 ptr = sycl::ext::oneapi::group_local_memory<
51 int *warp_offset = *
ptr;
58 const uint thread_index = item_id.get_local_id(0);
59 const uint thread_warp = item_id.get_sub_group().get_local_id();
61 const uint warp_index = item_id.get_sub_group().get_group_id();
62 const uint num_warps = item_id.get_sub_group().get_group_range()[0];
64 const uint state_index = item_id.get_global_id(0);
67 const uint is_active = (state_index <
num_states) ? is_active_op(state_index) : 0;
69# ifndef __KERNEL_METAL__
70template<
typename IsActiveOp>
80 const int thread_index,
81 const uint state_index,
83 const int thread_warp,
86 threadgroup
int *warp_offset)
89 IsActiveOp is_active_op)
93# ifndef __KERNEL_METAL__
106 const uint is_active = (state_index <
num_states) ? is_active_op(state_index) : 0;
110#ifdef __KERNEL_ONEAPI__
111 const uint thread_offset = sycl::exclusive_scan_over_group(
112 item_id.get_sub_group(), is_active, std::plus<>());
119#ifdef __KERNEL_ONEAPI__
120 if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) {
124 warp_offset[warp_index] = thread_offset + is_active;
127#ifdef __KERNEL_ONEAPI__
137 if (thread_index == blocksize - 1) {
140 for (
int i = 0; i < num_warps; i++) {
141 int num_active = warp_offset[i];
142 warp_offset[i] = offset;
143 offset += num_active;
146 const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
150#ifdef __KERNEL_ONEAPI__
160 const uint block_offset = warp_offset[num_warps];
161 indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
165#ifdef __KERNEL_METAL__
167# define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
168 const uint is_active = (ccl_gpu_global_id_x() < num_states) ? \
169 is_active_op(ccl_gpu_global_id_x()) : \
171 gpu_parallel_active_index_array_impl(num_states, \
182 (threadgroup int *)threadgroup_array)
183#elif defined(__KERNEL_ONEAPI__)
185# define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
186 gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
190# define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
191 gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
ATOMIC_INLINE uint32_t atomic_fetch_and_add_uint32(uint32_t *p, uint32_t x)
#define ccl_gpu_block_dim_x
#define ccl_gpu_thread_idx_x
#define ccl_gpu_syncthreads()
#define ccl_gpu_warp_size
#define ccl_gpu_thread_mask(thread_warp)
#define ccl_gpu_ballot(predicate)
#define ccl_gpu_block_idx_x
#define CCL_NAMESPACE_END
#define ccl_gpu_local_syncthreads()
#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE
__device__ void gpu_parallel_active_index_array_impl(const uint num_states, ccl_global int *indices, ccl_global int *num_indices, IsActiveOp is_active_op)
ccl_device_inline uint popcount(uint x)