#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
#  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
#else
#  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
#endif

#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024

#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)

ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
                                                     const uint partition_size,
                                                     const uint max_shaders,
                                                     const uint queued_kernel,
                                                     ccl_global ushort *d_queued_kernel,
                                                     ccl_global uint *d_shader_sort_key,
                                                     ccl_global int *partition_key_offsets,
                                                     ccl_gpu_shared int *buckets,
                                                     const ushort local_id,
                                                     const ushort local_size,
                                                     const uint grid_id)
{
  /* Zero the bucket sizes. */
  if (local_id < max_shaders) {
    atomic_store_local(&buckets[local_id], 0);
  }

#  ifdef __KERNEL_ONEAPI__
  /* Only local memory writes (buckets) matter here, so the faster local
   * barrier is sufficient. */
  ccl_gpu_local_syncthreads();
#  else
  ccl_gpu_syncthreads();
#  endif

  /* Determine bucket sizes within the partitions. */

  const uint partition_start = partition_size * uint(grid_id);
  const uint partition_end = min(num_states, partition_start + partition_size);

  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
       state_index += uint(local_size))
  {
    ushort kernel_index = d_queued_kernel[state_index];
    if (kernel_index == queued_kernel) {
      uint key = d_shader_sort_key[state_index] % max_shaders;
      atomic_fetch_and_add_uint32_shared(&buckets[key], 1);
    }
  }

#  ifdef __KERNEL_ONEAPI__
  /* Only local memory reads (buckets) matter here, so the faster local
   * barrier is sufficient. */
  ccl_gpu_local_syncthreads();
#  else
  ccl_gpu_syncthreads();
#  endif

  /* Calculate the partition's local offsets from the prefix sum of bucket sizes. */

  if (local_id == 0) {
    int offset = 0;
    for (int i = 0; i < max_shaders; i++) {
      partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
      offset = offset + atomic_load_local(&buckets[i]);
    }

    /* Store the number of active states in this partition. */
    partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
  }
}
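
/* Worked example (editor's illustration, not part of the original header):
 * with max_shaders = 4, a partition whose shared buckets end up as
 * {3, 0, 5, 2} stores the exclusive prefix sum followed by its total into a
 * (max_shaders + 1)-wide slice of partition_key_offsets:
 *
 *   partition_key_offsets[grid_id * 5 + 0..4] = {0, 3, 3, 8, 10}
 *
 * The trailing entry (10, the active-state count of this partition) is what
 * gpu_parallel_sort_write_pass sums over to turn these per-partition offsets
 * into global output positions. */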

ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
                                                    const uint partition_size,
                                                    const uint max_shaders,
                                                    const uint queued_kernel,
                                                    const int num_states_limit,
                                                    ccl_global int *indices,
                                                    ccl_global ushort *d_queued_kernel,
                                                    ccl_global uint *d_shader_sort_key,
                                                    ccl_global int *partition_key_offsets,
                                                    ccl_gpu_shared int *local_offset,
                                                    const ushort local_id,
                                                    const ushort local_size,
                                                    const uint grid_id)
{
  /* Calculate each partition's global offset from the number of active states
   * in every preceding partition. */

  if (local_id < max_shaders) {
    int partition_offset = 0;
    for (int i = 0; i < uint(grid_id); i++) {
      int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
      partition_offset += partition_key_count;
    }

    ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
    atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
  }

#  ifdef __KERNEL_ONEAPI__
  ccl_gpu_local_syncthreads();
#  else
  ccl_gpu_syncthreads();
#  endif

  /* Write the sorted active indices. */

  const uint partition_start = partition_size * uint(grid_id);
  const uint partition_end = min(num_states, partition_start + partition_size);

  ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * max_shaders);

  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
       state_index += uint(local_size))
  {
    ushort kernel_index = d_queued_kernel[state_index];
    if (kernel_index == queued_kernel) {
      uint key = d_shader_sort_key[state_index] % max_shaders;
      int index = atomic_fetch_and_add_uint32_shared(&local_offset[key], 1);
      if (index < num_states_limit) {
        indices[index] = state_index;
      }
    }
  }
}

#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */
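
/* Sketch of how the two passes chain together (editor's illustration;
 * `launch` and `num_partitions` are hypothetical names, the real dispatch
 * lives in the device queue implementations). The bucket pass must complete
 * for every partition before any write pass runs, because the write pass
 * reads the active-state counts of all preceding partitions:
 *
 *   const uint num_partitions = (num_states + partition_size - 1) / partition_size;
 *   launch(gpu_parallel_sort_bucket_pass, num_partitions, GPU_PARALLEL_SORT_BLOCK_SIZE, ...);
 *   // device-wide barrier between the two kernel launches
 *   launch(gpu_parallel_sort_write_pass, num_partitions, GPU_PARALLEL_SORT_BLOCK_SIZE, ...);
 */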

template<typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint state_index,
                                                const uint num_states,
                                                const int num_states_limit,
                                                ccl_global int *indices,
                                                ccl_global int *num_indices,
                                                ccl_global int *key_counter,
                                                ccl_global int *key_prefix_sum,
                                                GetKeyOp get_key_op)
{
  const int key = (state_index < num_states) ? get_key_op(state_index) :
                                               GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;

  if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
    const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
    if (index < num_states_limit) {
      /* Assign state index. */
      indices[index] = state_index;
    }
    else {
      /* Can't process this state now; increment the key counter again so the
       * state is handled in a later iteration. */
      atomic_fetch_and_add_uint32(&key_counter[key], 1);
    }
  }
}
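
/* Usage sketch (editor's illustration; the buffer names are assumptions, not
 * the actual integrator code). GetKeyOp is any callable mapping a state index
 * to its sort key, so the caller decides what "inactive" means by returning
 * GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY:
 *
 *   auto get_key_op = [=](const uint state_index) {
 *     return (d_queued_kernel[state_index] == queued_kernel) ?
 *                int(d_shader_sort_key[state_index] % max_shaders) :
 *                GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
 *   };
 *   gpu_parallel_sorted_index_array(state_index, num_states, num_states_limit,
 *                                   indices, num_indices, key_counter,
 *                                   key_prefix_sum, get_key_op);
 */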