parallel_active_index.h (Blender 5.0)
/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#pragma once

CCL_NAMESPACE_BEGIN

/* Given an array of states, build an array of indices for which the states
 * are active.
 *
 * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */

#include "kernel/device/gpu/block_sizes.h"
#include "util/atomic.h"
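
/* For reference: the warp-level code below computes the same result as this
 * serial stream-compaction loop (which is also the WITH_ONEAPI_SYCL_HOST_TASK
 * fallback further down):
 *
 *   int write_index = 0;
 *   for (int state_index = 0; state_index < num_states; state_index++) {
 *     if (is_active_op(state_index)) {
 *       indices[write_index++] = state_index;
 *     }
 *   }
 *   *num_indices = write_index;
 */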

/* TODO: abstract more device differences, define `ccl_gpu_local_syncthreads`,
 * `ccl_gpu_thread_warp`, `ccl_gpu_warp_index`, `ccl_gpu_num_warps` for all devices
 * and keep device specific code in `compat.h`. */

#ifdef __KERNEL_ONEAPI__

template<typename IsActiveOp>
void gpu_parallel_active_index_array_impl(const uint num_states,
                                          ccl_global int *ccl_restrict indices,
                                          ccl_global int *ccl_restrict num_indices,
                                          IsActiveOp is_active_op)
{
#  ifdef WITH_ONEAPI_SYCL_HOST_TASK
  int write_index = 0;
  for (int state_index = 0; state_index < num_states; state_index++) {
    if (is_active_op(state_index)) {
      indices[write_index++] = state_index;
    }
  }
  *num_indices = write_index;
  return;
#  endif /* WITH_ONEAPI_SYCL_HOST_TASK */

  const sycl::nd_item<1> &item_id = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
  const uint blocksize = item_id.get_local_range(0);

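  /* Per-warp counts/offsets are stored in local (shared) memory. On oneAPI
   * this is allocated through SYCL group-local memory; it corresponds to the
   * `extern ccl_gpu_shared int warp_offset[]` declaration used by the other
   * device backends below. */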
  sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1],
                  sycl::access::address_space::local_space>
      ptr = sycl::ext::oneapi::group_local_memory<
          int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group());
  int *warp_offset = *ptr;

  /* NOTE(@nsirgien): This computes the same values as the "%", "/", "*" based
   * calculations in the non-oneAPI path below, but faster for DPC++: CUDA
   * already seems to convert those calculations into something faster, but
   * DPC++ does not, so it is better to query the needed parameters directly.
   * Switching to the computation below causes a 2.5x performance slowdown. */
  const uint thread_index = item_id.get_local_id(0);
  const uint thread_warp = item_id.get_sub_group().get_local_id();

  const uint warp_index = item_id.get_sub_group().get_group_id();
  const uint num_warps = item_id.get_sub_group().get_group_range()[0];

  const uint state_index = item_id.get_global_id(0);

  /* Test if state corresponding to this thread is active. */
  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
#else /* !__KERNEL_ONEAPI__ */
#  ifndef __KERNEL_METAL__
template<typename IsActiveOp>
__device__
#  endif
    void
    gpu_parallel_active_index_array_impl(const uint num_states,
                                         ccl_global int *indices,
                                         ccl_global int *num_indices,
#  ifdef __KERNEL_METAL__
                                         const uint is_active,
                                         const uint blocksize,
                                         const int thread_index,
                                         const uint state_index,
                                         const int ccl_gpu_warp_size,
                                         const int thread_warp,
                                         const int warp_index,
                                         const int num_warps,
                                         threadgroup int *warp_offset)
{
#  else
                                         IsActiveOp is_active_op)
{
  extern ccl_gpu_shared int warp_offset[];

#    ifndef __KERNEL_METAL__
  const uint blocksize = ccl_gpu_block_dim_x;
#    endif

  const uint thread_index = ccl_gpu_thread_idx_x;
  const uint thread_warp = thread_index % ccl_gpu_warp_size;

  const uint warp_index = thread_index / ccl_gpu_warp_size;
  const uint num_warps = blocksize / ccl_gpu_warp_size;
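  /* For example, with `ccl_gpu_warp_size` 32 and `blocksize` 128, thread 70
   * has `thread_warp` 6 (70 % 32), `warp_index` 2 (70 / 32), and the block
   * holds `num_warps` 4 (128 / 32). */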

  const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;

  /* Test if state corresponding to this thread is active. */
  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
#  endif
#endif /* !__KERNEL_ONEAPI__ */
  /* For each thread within a warp compute how many other active states precede it. */
#ifdef __KERNEL_ONEAPI__
  const uint thread_offset = sycl::exclusive_scan_over_group(
      item_id.get_sub_group(), is_active, std::plus<>());
#else
  const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
                                      ccl_gpu_thread_mask(thread_warp));
#endif
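  /* Worked example for the ballot/popcount variant: if the warp's `is_active`
   * flags are 1,0,1,1 for lanes 0..3, `ccl_gpu_ballot` yields mask 0b1101;
   * lane 3 masks it with `ccl_gpu_thread_mask(3)` = 0b0111, and
   * popcount(0b0101) = 2 active lanes precede it. This is an exclusive prefix
   * sum over the warp, which is exactly what the SYCL scan above computes. */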

  /* Last thread in warp stores number of active states for each warp. */
#ifdef __KERNEL_ONEAPI__
  if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) {
#else
  if (thread_warp == ccl_gpu_warp_size - 1) {
#endif
    warp_offset[warp_index] = thread_offset + is_active;
  }

#ifdef __KERNEL_ONEAPI__
  /* NOTE(@nsirgien): Only the local memory write (warp_offset) matters here,
   * so the faster local barrier can be used. */
  ccl_gpu_local_syncthreads();
#else
  ccl_gpu_syncthreads();
#endif

  /* Last thread in block converts per-warp sizes to offsets, increments global size of
   * index array and gets offset to write to. */
  if (thread_index == blocksize - 1) {
    /* TODO: parallelize this. */
    int offset = 0;
    for (int i = 0; i < num_warps; i++) {
      int num_active = warp_offset[i];
      warp_offset[i] = offset;
      offset += num_active;
    }

    const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
    warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
  }
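  /* Example: per-warp active counts {3, 0, 5, 2} become exclusive offsets
   * {0, 3, 3, 8}, and the atomic add reserves a contiguous range of
   * 3 + 0 + 5 + 2 = 10 slots in the global index array; the previous value of
   * `num_indices` (the start of that range) ends up in warp_offset[num_warps]. */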

#ifdef __KERNEL_ONEAPI__
  /* NOTE(@nsirgien): Only the local memory write (warp_offset) matters here,
   * so the faster local barrier can be used. */
  ccl_gpu_local_syncthreads();
#else
  ccl_gpu_syncthreads();
#endif

  /* Write to index array. */
  if (is_active) {
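    /* Final position = this block's reserved start in the global array
     * + this warp's offset within the block + this thread's offset within the warp. */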
    const uint block_offset = warp_offset[num_warps];
    indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
  }
}

#ifdef __KERNEL_METAL__

#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
    const uint is_active = (ccl_gpu_global_id_x() < num_states) ? \
                               is_active_op(ccl_gpu_global_id_x()) : \
                               0; \
    gpu_parallel_active_index_array_impl(num_states, \
                                         indices, \
                                         num_indices, \
                                         is_active, \
                                         metal_local_size, \
                                         metal_local_id, \
                                         metal_global_id, \
                                         simdgroup_size, \
                                         simd_lane_index, \
                                         simd_group_index, \
                                         num_simd_groups, \
                                         (threadgroup int *)threadgroup_array)
#elif defined(__KERNEL_ONEAPI__)

#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)

#else

#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)

#endif
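
/* Usage sketch (illustrative only; `state_is_active` is a hypothetical
 * predicate): every thread of the block calls the macro with the same
 * arguments, e.g.
 *
 *   gpu_parallel_active_index_array(num_states, indices, num_indices,
 *                                   [&](const int state_index) {
 *                                     return state_is_active(state_index);
 *                                   });
 *
 * The launch must reserve `sizeof(int) * (num_warps + 1)` bytes of shared
 * memory, as noted at the top of this file. */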

CCL_NAMESPACE_END