Blender V4.3
parallel_active_index.h
/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#pragma once

CCL_NAMESPACE_BEGIN

/* Given an array of states, build an array of indices for which the states
 * are active.
 *
 * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */
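
/* Example of the overall behavior: with num_states = 8 and active flags
 * {1, 0, 1, 1, 0, 0, 1, 0}, the kernel appends {0, 2, 3, 6} to `indices` and
 * atomically adds 4 to `*num_indices`. Order is preserved within a block;
 * each block claims its base offset via the atomic. */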

#include "util/atomic.h"

#ifdef __HIP__
#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
#else
#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif

/* TODO: abstract more device differences, define `ccl_gpu_local_syncthreads`,
 * `ccl_gpu_thread_warp`, `ccl_gpu_warp_index`, `ccl_gpu_num_warps` for all devices
 * and keep device specific code in `compat.h`. */

#ifdef __KERNEL_ONEAPI__

template<typename IsActiveOp>
__device__ void gpu_parallel_active_index_array_impl(const uint num_states,
                                                     ccl_global int *ccl_restrict indices,
                                                     ccl_global int *ccl_restrict num_indices,
                                                     IsActiveOp is_active_op)
{
#  ifdef WITH_ONEAPI_SYCL_HOST_TASK
  int write_index = 0;
  for (int state_index = 0; state_index < num_states; state_index++) {
    if (is_active_op(state_index))
      indices[write_index++] = state_index;
  }
  *num_indices = write_index;
  return;
#  endif /* WITH_ONEAPI_SYCL_HOST_TASK */

  const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
  const uint blocksize = item_id.get_local_range(0);

  sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1],
                  sycl::access::address_space::local_space>
      ptr = sycl::ext::oneapi::group_local_memory<
          int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group());
  int *warp_offset = *ptr;
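  /* The local array is sized blocksize + 1, which covers one counter per warp
   * plus one extra slot: `warp_offset[num_warps]` later holds this block's base
   * offset into `indices`. */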

  /* NOTE(@nsirgien): Here we calculate the same values as in the non-oneAPI branch
   * below, but faster for DPC++: CUDA already seems to convert the "%", "/", "*"
   * based calculations below into something faster, but DPC++ does not, so it is
   * better to query the needed parameters directly. Switching from this computation
   * to the one below causes a 2.5x performance slowdown. */
  const uint thread_index = item_id.get_local_id(0);
  const uint thread_warp = item_id.get_sub_group().get_local_id();

  const uint warp_index = item_id.get_sub_group().get_group_id();
  const uint num_warps = item_id.get_sub_group().get_group_range()[0];

  const uint state_index = item_id.get_global_id(0);

  /* Test if state corresponding to this thread is active. */
  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
#else /* !__KERNEL_ONEAPI__ */
#  ifndef __KERNEL_METAL__
template<typename IsActiveOp>
__device__
#  endif
    void
    gpu_parallel_active_index_array_impl(const uint num_states,
                                         ccl_global int *indices,
                                         ccl_global int *num_indices,
#  ifdef __KERNEL_METAL__
                                         const uint is_active,
                                         const uint blocksize,
                                         const int thread_index,
                                         const uint state_index,
                                         const int ccl_gpu_warp_size,
                                         const int thread_warp,
                                         const int warp_index,
                                         const int num_warps,
                                         threadgroup int *warp_offset)
{
#  else
                                         IsActiveOp is_active_op)
{
  extern ccl_gpu_shared int warp_offset[];

#  ifndef __KERNEL_METAL__
  const uint blocksize = ccl_gpu_block_dim_x;
#  endif

  const uint thread_index = ccl_gpu_thread_idx_x;
  const uint thread_warp = thread_index % ccl_gpu_warp_size;

  const uint warp_index = thread_index / ccl_gpu_warp_size;
  const uint num_warps = blocksize / ccl_gpu_warp_size;

  const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
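
  /* For example, with blocksize 512 and a warp size of 32 there are 16 warps
   * per block, and thread_index 70 maps to lane 6 of warp 2. */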

  /* Test if state corresponding to this thread is active. */
  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
#  endif
#endif /* !__KERNEL_ONEAPI__ */
  /* For each thread within a warp compute how many other active states precede it. */
#ifdef __KERNEL_ONEAPI__
  const uint thread_offset = sycl::exclusive_scan_over_group(
      item_id.get_sub_group(), is_active, std::plus<>());
#else
  const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
                                      ccl_gpu_thread_mask(thread_warp));
#endif
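
  /* For example, if the warp's ballot bits are 0b01001101 (lanes 0, 2, 3 and 6
   * active, lane 0 in the lowest bit), lane 6 masks off itself and all higher
   * lanes, and popcount(0b00001101) = 3 makes it the fourth active state. */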

  /* Last thread in warp stores number of active states for each warp. */
#ifdef __KERNEL_ONEAPI__
  if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) {
#else
  if (thread_warp == ccl_gpu_warp_size - 1) {
#endif
    warp_offset[warp_index] = thread_offset + is_active;
  }
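  /* For the last lane, thread_offset + is_active equals the warp's total
   * number of active states. */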

#ifdef __KERNEL_ONEAPI__
  /* NOTE(@nsirgien): For us here only local memory writing (warp_offset) is important,
   * so faster local barriers can be used. */
  ccl_gpu_local_syncthreads();
#else
  ccl_gpu_syncthreads();
#endif

  /* Last thread in block converts per-warp sizes to offsets, increments global size of
   * index array and gets offset to write to. */
  if (thread_index == blocksize - 1) {
    /* TODO: parallelize this. */
    int offset = 0;
    for (int i = 0; i < num_warps; i++) {
      int num_active = warp_offset[i];
      warp_offset[i] = offset;
      offset += num_active;
    }

    const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
    warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
  }
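
  /* For example, per-warp counts {3, 1, 0, 2} become exclusive prefix sums
   * {0, 3, 4, 4}, and the block reserves offset = 6 entries of the global
   * index array through the atomic add. */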

#ifdef __KERNEL_ONEAPI__
  /* NOTE(@nsirgien): For us here only local memory writing (warp_offset) is important,
   * so faster local barriers can be used. */
  ccl_gpu_local_syncthreads();
#else
  ccl_gpu_syncthreads();
#endif

  /* Write to index array. */
  if (is_active) {
    const uint block_offset = warp_offset[num_warps];
    indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
  }
}

#ifdef __KERNEL_METAL__

#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
    const uint is_active = (ccl_gpu_global_id_x() < num_states) ? \
                               is_active_op(ccl_gpu_global_id_x()) : \
                               0; \
    gpu_parallel_active_index_array_impl(num_states, \
                                         indices, \
                                         num_indices, \
                                         is_active, \
                                         metal_local_size, \
                                         metal_local_id, \
                                         metal_global_id, \
                                         simdgroup_size, \
                                         simd_lane_index, \
                                         simd_group_index, \
                                         num_simd_groups, \
                                         (threadgroup int *)threadgroup_array)
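
/* On Metal, `is_active` and the thread/warp geometry are computed at the call
 * site and passed to the implementation as explicit parameters, using the
 * `metal_*` and `simd*` values provided by the Metal backend. */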
#elif defined(__KERNEL_ONEAPI__)

#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)

#else

#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)

#endif

CCL_NAMESPACE_END
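
For reference, a minimal host-side sketch of the same compaction semantics, mirroring the serial WITH_ONEAPI_SYCL_HOST_TASK fallback above (standalone illustrative code, not part of the Cycles sources; all names are hypothetical):

#include <cstdint>
#include <cstdio>

/* Serial reference: gather the indices of all active states, in order. */
template<typename IsActiveOp>
void active_index_array_reference(const uint32_t num_states,
                                  int *indices,
                                  int *num_indices,
                                  IsActiveOp is_active_op)
{
  int write_index = 0;
  for (uint32_t state_index = 0; state_index < num_states; state_index++) {
    if (is_active_op(state_index)) {
      indices[write_index++] = int(state_index);
    }
  }
  *num_indices = write_index;
}

int main()
{
  const uint32_t num_states = 8;
  const int flags[8] = {1, 0, 1, 1, 0, 0, 1, 0};
  int indices[8];
  int num_indices = 0;

  active_index_array_reference(
      num_states, indices, &num_indices, [&](uint32_t i) { return flags[i] != 0; });

  printf("active indices:"); /* Prints: active indices: 0 2 3 6 */
  for (int i = 0; i < num_indices; i++) {
    printf(" %d", indices[i]);
  }
  printf("\n");
  return 0;
}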