#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)

ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
                                                     const uint partition_size,
                                                     const uint max_shaders,
                                                     const uint queued_kernel,
                                                     ccl_global ushort *d_queued_kernel,
                                                     ccl_global uint *d_shader_sort_key,
                                                     ccl_global int *partition_key_offsets,
                                                     ccl_gpu_shared int *buckets,
                                                     const ushort local_id,
                                                     const ushort local_size,
                                                     const uint grid_id)
{
  /* Zero the bucket sizes. */
  if (local_id < max_shaders) {
    atomic_store_local(&buckets[local_id], 0);
  }

# ifdef __KERNEL_ONEAPI__
  /* Only local memory (buckets) has been written so far, so the faster local
   * barrier is sufficient. */
  ccl_gpu_local_syncthreads();
# else
  ccl_gpu_syncthreads();
# endif

  /* Determine bucket sizes within this partition. */
  const uint partition_start = partition_size * uint(grid_id);
  const uint partition_end = min(num_states, partition_start + partition_size);

  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
       state_index += uint(local_size))
  {
    ushort kernel_index = d_queued_kernel[state_index];
    if (kernel_index == queued_kernel) {
      uint key = d_shader_sort_key[state_index] % max_shaders;
      atomic_fetch_and_add_uint32_shared(&buckets[key], 1);
    }
  }
# ifdef __KERNEL_ONEAPI__
  ccl_gpu_local_syncthreads();
# else
  ccl_gpu_syncthreads();
# endif

  /* A single thread turns the bucket sizes into an exclusive prefix sum,
   * giving each key's starting offset within the partition. */
  if (local_id == 0) {
    int offset = 0;
    for (int i = 0; i < max_shaders; i++) {
      partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
      offset = offset + atomic_load_local(&buckets[i]);
    }

    /* Store the total number of active states in this partition. */
    partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
  }
}
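/* Illustrative only, not part of Cycles: a sequential CPU reference of the
 * bucket pass above, spelling out the partition_key_offsets layout. Each
 * partition owns max_shaders + 1 ints: an exclusive prefix offset per key,
 * followed by the partition's active-state total. The function name is a
 * made-up example; it assumes Cycles' uint/ushort typedefs and min() helper
 * are in scope. */
static inline void cpu_reference_bucket_pass(const uint num_states,
                                             const uint partition_size,
                                             const uint max_shaders,
                                             const uint queued_kernel,
                                             const ushort *d_queued_kernel,
                                             const uint *d_shader_sort_key,
                                             int *partition_key_offsets,
                                             const uint grid_id)
{
  const uint partition_start = partition_size * grid_id;
  const uint partition_end = min(num_states, partition_start + partition_size);

  int offset = 0;
  for (uint key = 0; key < max_shaders; key++) {
    /* Exclusive prefix offset of this key within the partition. */
    partition_key_offsets[key + grid_id * (max_shaders + 1)] = offset;
    for (uint i = partition_start; i < partition_end; i++) {
      if (d_queued_kernel[i] == queued_kernel && (d_shader_sort_key[i] % max_shaders) == key) {
        offset++;
      }
    }
  }
  /* Total number of active states in this partition. */
  partition_key_offsets[max_shaders + grid_id * (max_shaders + 1)] = offset;
}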
ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
                                                    const uint partition_size,
                                                    const uint max_shaders,
                                                    const uint queued_kernel,
                                                    const int num_states_limit,
                                                    ccl_global int *indices,
                                                    ccl_global ushort *d_queued_kernel,
                                                    ccl_global uint *d_shader_sort_key,
                                                    ccl_global int *partition_key_offsets,
                                                    ccl_gpu_shared int *local_offset,
                                                    const ushort local_id,
                                                    const ushort local_size,
                                                    const uint grid_id)
{
  /* Calculate each key's global write offset: the sum of the active-state
   * totals of all preceding partitions, plus this partition's per-key prefix
   * offset. */
  if (local_id < max_shaders) {
    int partition_offset = 0;
    for (int i = 0; i < uint(grid_id); i++) {
      int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
      partition_offset += partition_key_count;
    }

    ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
    atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
  }
# ifdef __KERNEL_ONEAPI__
  ccl_gpu_local_syncthreads();
# else
  ccl_gpu_syncthreads();
# endif

  /* Scatter the indices of the active states, grouped by sort key. */
  const uint partition_start = partition_size * uint(grid_id);
  const uint partition_end = min(num_states, partition_start + partition_size);

  ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * max_shaders);

  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
       state_index += uint(local_size))
  {
    ushort kernel_index = d_queued_kernel[state_index];
    if (kernel_index == queued_kernel) {
      uint key = d_shader_sort_key[state_index] % max_shaders;
      int index = atomic_fetch_and_add_uint32_shared(&local_offset[key], 1);
      if (index < num_states_limit) {
        indices[index] = state_index;
      }
    }
  }
}

#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */
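/* Together the two passes implement a partitioned counting sort of active
 * state indices by shader sort key: the bucket pass builds a per-partition
 * histogram and prefix sum, and the write pass converts those into global
 * offsets and scatters the state indices into `indices`. The launch setup
 * (one workgroup per partition, with local memory sized to max_shaders) is
 * host-side and not shown in this snippet. */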
/* Supporting declarations carried with this snippet: stub definitions that
 * let the code parse standalone. */
ATOMIC_INLINE uint32_t atomic_fetch_and_add_uint32(uint32_t *p, uint32_t x);

#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) /* Sentinel: state has no key. */
#define ccl_device_inline
#define ccl_gpu_syncthreads()
#define ccl_gpu_local_syncthreads()
#define CCL_NAMESPACE_END

CCL_NAMESPACE_BEGIN

template<typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint state_index,
                                                const uint num_states,
                                                const int num_states_limit,
                                                ccl_global int *indices,
                                                ccl_global int *num_indices,
                                                ccl_global int *key_counter,
                                                ccl_global int *key_prefix_sum,
                                                GetKeyOp get_key_op)
{
  const int key = (state_index < num_states) ? get_key_op(state_index) :
                                               GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;

  if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
    const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
    if (index < num_states_limit) {
      /* Assign state index. */
      indices[index] = state_index;
    }
  }
}
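/* A hypothetical GetKeyOp for illustration (not from the Cycles sources): the
 * template parameter accepts any callable that maps a state index to a sort
 * key, or returns GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY to skip the state.
 * The struct and field names below are assumptions for this sketch. */
struct ExampleShaderKeyOp {
  ccl_global const ushort *queued_kernel;
  ccl_global const uint *shader_sort_key;
  ushort kernel;
  uint max_shaders;

  __device__ int operator()(const uint state_index) const
  {
    /* States queued for another kernel are excluded from the sort. */
    return (queued_kernel[state_index] == kernel) ?
               int(shader_sort_key[state_index] % max_shaders) :
               GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
  }
};
/* A per-thread kernel body would then call, e.g.:
 *   gpu_parallel_sorted_index_array(state_index, num_states, num_states_limit,
 *                                   indices, num_indices, key_counter,
 *                                   key_prefix_sum, ExampleShaderKeyOp{...});
 */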