parallel_sorted_index.h (Blender 5.0)
/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#pragma once

CCL_NAMESPACE_BEGIN

/* Given an array of states, build an array of indices for which the states
 * are active and sorted by a given key. The prefix sum of the number of active
 * states per key must have already been computed.
 *
 * TODO: there may be ways to optimize this to avoid this many atomic ops? */

#include "util/atomic.h"

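The comment above describes the whole scheme: count the active states per key, take an exclusive prefix sum of those counts, then let every active state claim the next slot for its key so the output indices come out grouped by key. The serial C++ snippet below is an editorial illustration of that idea only, not part of the kernel; all names and values in it are made up for the example.

#include <cstdio>
#include <vector>

int main()
{
  /* Worked example: 6 states, 3 keys, state 1 inactive (key -1). The per-key counts
   * are {2, 1, 2}, so the exclusive prefix sum is {0, 2, 3}; each active state then
   * takes the next slot for its key. */
  const std::vector<int> key_of_state = {2, -1, 0, 2, 1, 0};
  std::vector<int> slot = {0, 2, 3}; /* Running prefix sum, bumped per write. */
  std::vector<int> indices(5, -1);

  for (int state_index = 0; state_index < (int)key_of_state.size(); state_index++) {
    const int key = key_of_state[state_index];
    if (key >= 0) {
      indices[slot[key]++] = state_index;
    }
  }

  for (int i : indices) {
    printf("%d ", i); /* Prints "2 5 4 0 3": key 0 states first, then key 1, then key 2. */
  }
  printf("\n");
  return 0;
}
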
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)

ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
                                                     const uint partition_size,
                                                     const uint max_shaders,
                                                     const uint queued_kernel,
                                                     ccl_global ushort *d_queued_kernel,
                                                     ccl_global uint *d_shader_sort_key,
                                                     ccl_global int *partition_key_offsets,
                                                     ccl_gpu_shared int *buckets,
                                                     const ushort local_id,
                                                     const ushort local_size,
                                                     const uint grid_id)
{
  /* Zero the bucket sizes. */
  if (local_id < max_shaders) {
    atomic_store_local(&buckets[local_id], 0);
  }

#  ifdef __KERNEL_ONEAPI__
  /* NOTE(@nsirgien): For us here only local memory writing (buckets) is important,
   * so faster local barriers can be used. */
  ccl_gpu_local_syncthreads();
#  else
  ccl_gpu_syncthreads();
#  endif

  /* Determine bucket sizes within the partitions. */

  const uint partition_start = partition_size * uint(grid_id);
  const uint partition_end = min(num_states, partition_start + partition_size);

  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
       state_index += uint(local_size))
  {
    ushort kernel_index = d_queued_kernel[state_index];
    if (kernel_index == queued_kernel) {
      uint key = d_shader_sort_key[state_index] % max_shaders;
      atomic_fetch_and_add_uint32_shared(&buckets[key], 1);
    }
  }

#  ifdef __KERNEL_ONEAPI__
  /* NOTE(@nsirgien): For us here only local memory writing (buckets) is important,
   * so faster local barriers can be used. */
  ccl_gpu_local_syncthreads();
#  else
  ccl_gpu_syncthreads();
#  endif

  /* Calculate the partition's local offsets from the prefix sum of bucket sizes. */

  if (local_id == 0) {
    int offset = 0;
    for (int i = 0; i < max_shaders; i++) {
      partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
      offset = offset + atomic_load_local(&buckets[i]);
    }

    /* Store the number of active states in this partition. */
    partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
  }
}

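In this local-atomic path, each workgroup owns one partition of the state array: gpu_parallel_sort_bucket_pass counts, in shared memory, how many states queued for queued_kernel fall into each of the max_shaders keys, and thread 0 then writes an exclusive prefix sum of those counts into this partition's slice of partition_key_offsets, which holds max_shaders + 1 ints per partition (the extra slot is the partition's total number of active states). Below is a serial, single-threaded C++ sketch of what one partition produces; the name bucket_pass_serial and the std::vector containers are illustrative assumptions, not Cycles code.

#include <algorithm>
#include <cstdint>
#include <vector>

/* Hypothetical serial equivalent of one bucket-pass workgroup (grid_id = partition index).
 * It fills this partition's slice of partition_key_offsets, which is assumed to be sized
 * num_partitions * (max_shaders + 1). */
void bucket_pass_serial(const uint32_t grid_id,
                        const uint32_t num_states,
                        const uint32_t partition_size,
                        const uint32_t max_shaders,
                        const uint16_t queued_kernel,
                        const std::vector<uint16_t> &d_queued_kernel,
                        const std::vector<uint32_t> &d_shader_sort_key,
                        std::vector<int> &partition_key_offsets)
{
  /* Count states queued for this kernel per key, as the shared-memory buckets do. */
  std::vector<int> buckets(max_shaders, 0);

  const uint32_t partition_start = partition_size * grid_id;
  const uint32_t partition_end = std::min(num_states, partition_start + partition_size);

  for (uint32_t state_index = partition_start; state_index < partition_end; state_index++) {
    if (d_queued_kernel[state_index] == queued_kernel) {
      buckets[d_shader_sort_key[state_index] % max_shaders] += 1;
    }
  }

  /* Exclusive prefix sum of the bucket sizes; the extra last slot stores the
   * partition's total number of active states. */
  int *key_offsets = partition_key_offsets.data() + grid_id * (max_shaders + 1);
  int offset = 0;
  for (uint32_t i = 0; i < max_shaders; i++) {
    key_offsets[i] = offset;
    offset += buckets[i];
  }
  key_offsets[max_shaders] = offset;
}
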
ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
                                                    const uint partition_size,
                                                    const uint max_shaders,
                                                    const uint queued_kernel,
                                                    const int num_states_limit,
                                                    ccl_global int *indices,
                                                    ccl_global ushort *d_queued_kernel,
                                                    ccl_global uint *d_shader_sort_key,
                                                    ccl_global int *partition_key_offsets,
                                                    ccl_gpu_shared int *local_offset,
                                                    const ushort local_id,
                                                    const ushort local_size,
                                                    const uint grid_id)
{
  /* Calculate each partition's global offset from the prefix sum of the active state counts per
   * partition. */

  if (local_id < max_shaders) {
    int partition_offset = 0;
    for (int i = 0; i < uint(grid_id); i++) {
      int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
      partition_offset += partition_key_count;
    }

    ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
    atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
  }

#  ifdef __KERNEL_ONEAPI__
  /* NOTE(@nsirgien): For us here only local memory writing (local_offset) is important,
   * so faster local barriers can be used. */
  ccl_gpu_local_syncthreads();
#  else
  ccl_gpu_syncthreads();
#  endif

  /* Write the sorted active indices. */

  const uint partition_start = partition_size * uint(grid_id);
  const uint partition_end = min(num_states, partition_start + partition_size);

  ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * max_shaders);

  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
       state_index += uint(local_size))
  {
    ushort kernel_index = d_queued_kernel[state_index];
    if (kernel_index == queued_kernel) {
      uint key = d_shader_sort_key[state_index] % max_shaders;
      int index = atomic_fetch_and_add_uint32_shared(&local_offset[key], 1);
      if (index < num_states_limit) {
        indices[index] = state_index;
      }
    }
  }
}

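gpu_parallel_sort_write_pass then turns those per-partition prefix sums into final write positions: a partition's global offset is the sum of the active-state totals of all earlier partitions, each key's write cursor starts at that global offset plus the key's partition-local offset, and every queued state is scattered to the next cursor position for its key. The output is therefore grouped by key within each partition, with partitions kept in order. Below is a serial C++ sketch of one partition's write pass under the same illustrative assumptions as the bucket-pass sketch above; it is not the actual kernel.

#include <algorithm>
#include <cstdint>
#include <vector>

/* Hypothetical serial equivalent of one write-pass workgroup, run after
 * bucket_pass_serial() above has filled partition_key_offsets for every partition.
 * indices is assumed to hold at least num_states_limit elements. */
void write_pass_serial(const uint32_t grid_id,
                       const uint32_t num_states,
                       const uint32_t partition_size,
                       const uint32_t max_shaders,
                       const uint16_t queued_kernel,
                       const int num_states_limit,
                       const std::vector<uint16_t> &d_queued_kernel,
                       const std::vector<uint32_t> &d_shader_sort_key,
                       const std::vector<int> &partition_key_offsets,
                       std::vector<int> &indices)
{
  /* Global offset of this partition: sum of the active-state totals of all
   * earlier partitions (their last partition_key_offsets slot). */
  int partition_offset = 0;
  for (uint32_t i = 0; i < grid_id; i++) {
    partition_offset += partition_key_offsets[max_shaders + i * (max_shaders + 1)];
  }

  /* Per-key write cursors: partition-local key offset plus the partition's global offset. */
  const int *key_offsets = partition_key_offsets.data() + grid_id * (max_shaders + 1);
  std::vector<int> local_offset(max_shaders);
  for (uint32_t key = 0; key < max_shaders; key++) {
    local_offset[key] = key_offsets[key] + partition_offset;
  }

  const uint32_t partition_start = partition_size * grid_id;
  const uint32_t partition_end = std::min(num_states, partition_start + partition_size);

  /* Scatter the active state indices; on the GPU the cursor bump is a shared-memory atomic. */
  for (uint32_t state_index = partition_start; state_index < partition_end; state_index++) {
    if (d_queued_kernel[state_index] == queued_kernel) {
      const uint32_t key = d_shader_sort_key[state_index] % max_shaders;
      const int index = local_offset[key]++;
      if (index < num_states_limit) {
        indices[index] = (int)state_index;
      }
    }
  }
}
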
#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */

template<typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint state_index,
                                                const uint num_states,
                                                const int num_states_limit,
                                                ccl_global int *indices,
                                                ccl_global int *num_indices,
                                                ccl_global int *key_counter,
                                                ccl_global int *key_prefix_sum,
                                                GetKeyOp get_key_op)
{
  const int key = (state_index < num_states) ? get_key_op(state_index) :
                                               GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;

  if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
    const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
    if (index < num_states_limit) {
      /* Assign state index. */
      indices[index] = state_index;
    }
    else {
      /* Can't process this state now, increase the counter again so that
       * it will be handled in another iteration. */
      atomic_fetch_and_add_uint32(&key_counter[key], 1);
    }
  }
}

CCL_NAMESPACE_END
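gpu_parallel_sorted_index_array is the simpler, global-atomic variant: each thread handles one state, claims an output slot by atomically incrementing key_prefix_sum[key], and either writes its state index or, when the slot lands past num_states_limit, adds itself back to key_counter[key] so a later iteration picks the state up. A serial C++ model of that bookkeeping follows; it is an editorial sketch with hypothetical names, and it assumes the caller already filled key_prefix_sum with the exclusive prefix sum of active-state counts, as the comment at the top of the file requires.

#include <vector>

/* Hypothetical serial model of the overflow handling. key_prefix_sum must already hold
 * the exclusive prefix sum of active-state counts per key, and indices is assumed to
 * hold at least num_states_limit elements. A key of -1 marks an inactive state here
 * (the kernel uses GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY instead). */
void scatter_with_limit(const std::vector<int> &key_of_state,
                        const int num_states_limit,
                        std::vector<int> &key_prefix_sum,
                        std::vector<int> &key_counter,
                        std::vector<int> &indices)
{
  for (int state_index = 0; state_index < (int)key_of_state.size(); state_index++) {
    const int key = key_of_state[state_index];
    if (key < 0) {
      continue;
    }
    /* Claim the next output slot for this key; the kernel does this with
     * atomic_fetch_and_add_uint32 so many threads can claim slots concurrently. */
    const int index = key_prefix_sum[key]++;
    if (index < num_states_limit) {
      indices[index] = state_index; /* State is scheduled in this iteration. */
    }
    else {
      key_counter[key]++; /* Did not fit: count it again so a later iteration retries it. */
    }
  }
}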