parallel_sorted_index.h
/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#pragma once

CCL_NAMESPACE_BEGIN

/* Given an array of states, build an array of indices for which the states
 * are active and sorted by a given key. The prefix sum of the number of active
 * states per key must have already been computed.
 *
 * TODO: there may be ways to optimize this to avoid this many atomic ops? */

#include "util/atomic.h"

#ifdef __HIP__
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
#else
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024

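/* Illustrative serial sketch of the indexing scheme described in the file comment
 * above. This is not part of the Cycles kernels; the function and parameter names
 * are hypothetical, and inactive states are marked here with a negative key. Given
 * per-state keys and an exclusive prefix sum of the per-key counts, active state
 * indices are scattered into an index array sorted by key, which is what the GPU
 * code below does with atomics instead of a serial loop. */
static inline void reference_sorted_index_cpu(const int num_states,
                                              const int *keys,
                                              int *key_prefix_sum,
                                              int *indices)
{
  for (int state_index = 0; state_index < num_states; state_index++) {
    const int key = keys[state_index];
    if (key >= 0) {
      /* key_prefix_sum[key] is the next free slot for this key; advance it. */
      indices[key_prefix_sum[key]++] = state_index;
    }
  }
}
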
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)

ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
                                                     const uint partition_size,
                                                     const uint max_shaders,
                                                     const uint queued_kernel,
                                                     ccl_global ushort *d_queued_kernel,
                                                     ccl_global uint *d_shader_sort_key,
                                                     ccl_global int *partition_key_offsets,
                                                     ccl_gpu_shared int *buckets,
                                                     const ushort local_id,
                                                     const ushort local_size,
                                                     const uint grid_id)
{
  /* Zero the bucket sizes. */
  if (local_id < max_shaders) {
    atomic_store_local(&buckets[local_id], 0);
  }

# ifdef __KERNEL_ONEAPI__
  /* NOTE(@nsirgien): For us here only local memory writing (buckets) is important,
   * so faster local barriers can be used. */
  ccl_gpu_local_syncthreads();
# else
  ccl_gpu_syncthreads();
# endif

  /* Determine bucket sizes within the partitions. */

  const uint partition_start = partition_size * uint(grid_id);
  const uint partition_end = min(num_states, partition_start + partition_size);

  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
       state_index += uint(local_size))
  {
    ushort kernel_index = d_queued_kernel[state_index];
    if (kernel_index == queued_kernel) {
      uint key = d_shader_sort_key[state_index] % max_shaders;
      atomic_fetch_and_add_uint32_shared(&buckets[key], 1);
    }
  }

# ifdef __KERNEL_ONEAPI__
  /* NOTE(@nsirgien): For us here only local memory writing (buckets) is important,
   * so faster local barriers can be used. */
  ccl_gpu_local_syncthreads();
# else
  ccl_gpu_syncthreads();
# endif

  /* Calculate the partition's local offsets from the prefix sum of bucket sizes. */

  if (local_id == 0) {
    int offset = 0;
    for (int i = 0; i < max_shaders; i++) {
      partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
      offset = offset + atomic_load_local(&buckets[i]);
    }

    /* Store the number of active states in this partition. */
    partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
  }
}

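/* Illustrative serial model of gpu_parallel_sort_bucket_pass() above (hypothetical
 * helper, not part of the kernel): for a single partition, count the active states
 * per key, then convert the counts into exclusive offsets, storing the partition's
 * total number of active states in the extra (max_shaders) slot. */
static inline void reference_bucket_pass_cpu(const uint partition_start,
                                             const uint partition_end,
                                             const uint max_shaders,
                                             const ushort queued_kernel,
                                             const ushort *queued_kernels,
                                             const uint *shader_sort_keys,
                                             int *key_offsets /* max_shaders + 1 entries */)
{
  for (uint i = 0; i <= max_shaders; i++) {
    key_offsets[i] = 0;
  }

  /* Bucket sizes. */
  for (uint state_index = partition_start; state_index < partition_end; state_index++) {
    if (queued_kernels[state_index] == queued_kernel) {
      key_offsets[shader_sort_keys[state_index] % max_shaders]++;
    }
  }

  /* Exclusive prefix sum; the final slot holds the number of active states. */
  int offset = 0;
  for (uint i = 0; i < max_shaders; i++) {
    const int count = key_offsets[i];
    key_offsets[i] = offset;
    offset += count;
  }
  key_offsets[max_shaders] = offset;
}
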
ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
                                                    const uint partition_size,
                                                    const uint max_shaders,
                                                    const uint queued_kernel,
                                                    const int num_states_limit,
                                                    ccl_global int *indices,
                                                    ccl_global ushort *d_queued_kernel,
                                                    ccl_global uint *d_shader_sort_key,
                                                    ccl_global int *partition_key_offsets,
                                                    ccl_gpu_shared int *local_offset,
                                                    const ushort local_id,
                                                    const ushort local_size,
                                                    const uint grid_id)
{
  /* Calculate each partition's global offset from the prefix sum of the active state counts per
   * partition. */

  if (local_id < max_shaders) {
    int partition_offset = 0;
    for (int i = 0; i < uint(grid_id); i++) {
      int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
      partition_offset += partition_key_count;
    }

    ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
    atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
  }

# ifdef __KERNEL_ONEAPI__
  /* NOTE(@nsirgien): For us here only local memory writing (local_offset) is important,
   * so faster local barriers can be used. */
  ccl_gpu_local_syncthreads();
# else
  ccl_gpu_syncthreads();
# endif

  /* Write the sorted active indices. */

  const uint partition_start = partition_size * uint(grid_id);
  const uint partition_end = min(num_states, partition_start + partition_size);

  ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * max_shaders);

  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
       state_index += uint(local_size))
  {
    ushort kernel_index = d_queued_kernel[state_index];
    if (kernel_index == queued_kernel) {
      uint key = d_shader_sort_key[state_index] % max_shaders;
      int index = atomic_fetch_and_add_uint32_shared(&local_offset[key], 1);
      if (index < num_states_limit) {
        indices[index] = state_index;
      }
    }
  }
}

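/* Illustrative serial model of gpu_parallel_sort_write_pass() above (hypothetical
 * helper, not part of the kernel): the partition's global offset is the sum of the
 * active-state totals of all preceding partitions; each active state is then written
 * to indices[] at its key's running offset within that partition. */
static inline void reference_write_pass_cpu(const uint partition_start,
                                            const uint partition_end,
                                            const uint grid_id,
                                            const uint max_shaders,
                                            const ushort queued_kernel,
                                            const int num_states_limit,
                                            const ushort *queued_kernels,
                                            const uint *shader_sort_keys,
                                            const int *partition_key_offsets,
                                            int *local_offset, /* max_shaders scratch entries */
                                            int *indices)
{
  /* Global offset of this partition from the totals stored by the bucket pass. */
  int partition_offset = 0;
  for (uint i = 0; i < grid_id; i++) {
    partition_offset += partition_key_offsets[max_shaders + i * (max_shaders + 1)];
  }

  /* Per-key write cursors: local exclusive offset plus the partition's global offset. */
  const int *key_offsets = partition_key_offsets + grid_id * (max_shaders + 1);
  for (uint key = 0; key < max_shaders; key++) {
    local_offset[key] = key_offsets[key] + partition_offset;
  }

  /* Scatter the active state indices, dropping any beyond num_states_limit. */
  for (uint state_index = partition_start; state_index < partition_end; state_index++) {
    if (queued_kernels[state_index] == queued_kernel) {
      const uint key = shader_sort_keys[state_index] % max_shaders;
      const int index = local_offset[key]++;
      if (index < num_states_limit) {
        indices[index] = state_index;
      }
    }
  }
}
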
#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */

template<typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint state_index,
                                                const uint num_states,
                                                const int num_states_limit,
                                                ccl_global int *indices,
                                                ccl_global int *num_indices,
                                                ccl_global int *key_counter,
                                                ccl_global int *key_prefix_sum,
                                                GetKeyOp get_key_op)
{
  const int key = (state_index < num_states) ? get_key_op(state_index) :
                                               GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;

  if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
    const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
    if (index < num_states_limit) {
      /* Assign state index. */
      indices[index] = state_index;
    }
    else {
      /* Can't process this state now, increase the counter again so that
       * it will be handled in another iteration. */
      atomic_fetch_and_add_uint32(&key_counter[key], 1);
    }
  }
}
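
/* Hypothetical sketch of a GetKeyOp (not from the Cycles sources): any callable that
 * maps a state index to its sort key can be passed as get_key_op, for example a small
 * functor reading a per-state shader sort key; each GPU thread would presumably call
 * gpu_parallel_sorted_index_array() with its own state_index. The names below are
 * illustrative only. */
struct ExampleShaderKeyOp {
  ccl_global const uint *shader_sort_key;
  uint max_shaders;

  int operator()(const uint state_index) const
  {
    /* Map the state to its key, wrapped to the number of shader buckets. */
    return int(shader_sort_key[state_index] % max_shaders);
  }
};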

CCL_NAMESPACE_END