Blender V4.3
task_range.cc
/* SPDX-FileCopyrightText: 2023 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

#include <cstdlib>

#include "MEM_guardedalloc.h"

#include "BLI_array.hh"
#include "BLI_lazy_threading.hh"
#include "BLI_offset_indices.hh"
#include "BLI_task.h"
#include "BLI_task.hh"
#include "BLI_threads.h"
#include "BLI_vector.hh"

#include "atomic_ops.h"

#ifdef WITH_TBB
#  include <tbb/blocked_range.h>
#  include <tbb/enumerable_thread_specific.h>
#  include <tbb/parallel_for.h>
#  include <tbb/parallel_reduce.h>
#endif

#ifdef WITH_TBB

/* Functor for running TBB parallel_for and parallel_reduce. */
struct RangeTask {
  TaskParallelRangeFunc func;
  void *userdata;
  const TaskParallelSettings *settings;

  void *userdata_chunk;

  /* Root constructor. */
  RangeTask(TaskParallelRangeFunc func, void *userdata, const TaskParallelSettings *settings)
      : func(func), userdata(userdata), settings(settings)
  {
    init_chunk(settings->userdata_chunk);
  }

  /* Copy constructor. */
  RangeTask(const RangeTask &other)
      : func(other.func), userdata(other.userdata), settings(other.settings)
  {
    init_chunk(settings->userdata_chunk);
  }

  /* Splitting constructor for parallel reduce. */
  RangeTask(RangeTask &other, tbb::split /*unused*/)
      : func(other.func), userdata(other.userdata), settings(other.settings)
  {
    init_chunk(settings->userdata_chunk);
  }

  ~RangeTask()
  {
    if (settings->func_free != nullptr) {
      settings->func_free(userdata, userdata_chunk);
    }
    MEM_SAFE_FREE(userdata_chunk);
  }

  void init_chunk(void *from_chunk)
  {
    if (from_chunk) {
      userdata_chunk = MEM_mallocN(settings->userdata_chunk_size, "RangeTask");
      memcpy(userdata_chunk, from_chunk, settings->userdata_chunk_size);
    }
    else {
      userdata_chunk = nullptr;
    }
  }

  void operator()(const tbb::blocked_range<int> &r) const
  {
    TaskParallelTLS tls;
    tls.userdata_chunk = userdata_chunk;
    for (int i = r.begin(); i != r.end(); ++i) {
      func(userdata, i, &tls);
    }
  }

  void join(const RangeTask &other)
  {
    settings->func_reduce(userdata, userdata_chunk, other.userdata_chunk);
  }
};

#endif
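/* Illustrative note (not part of the original file): when TBB splits the iteration range for
 * `parallel_reduce`, it uses the splitting constructor above, so every branch of the split gets
 * its own `userdata_chunk` copied from `settings->userdata_chunk`. Once two branches finish,
 * TBB calls `join()`, which forwards to `settings->func_reduce` to fold the right branch's
 * partial result into the left one. A rough trace for a single split might look like:
 *
 *   RangeTask a(func, userdata, settings);      // root task, owns chunk A
 *   RangeTask b(a, tbb::split());               // split copy, owns chunk B
 *   a(tbb::blocked_range<int>(0, 50));          // left half accumulates into chunk A
 *   b(tbb::blocked_range<int>(50, 100));        // right half accumulates into chunk B
 *   a.join(b);                                  // func_reduce(userdata, chunk A, chunk B)
 */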

void BLI_task_parallel_range(const int start,
                             const int stop,
                             void *userdata,
                             TaskParallelRangeFunc func,
                             const TaskParallelSettings *settings)
{
#ifdef WITH_TBB
  /* Multithreading. */
  if (settings->use_threading && BLI_task_scheduler_num_threads() > 1) {
    RangeTask task(func, userdata, settings);
    const size_t grainsize = std::max(settings->min_iter_per_thread, 1);
    const tbb::blocked_range<int> range(start, stop, grainsize);

    blender::lazy_threading::send_hint();

    if (settings->func_reduce) {
      parallel_reduce(range, task);
      if (settings->userdata_chunk) {
        memcpy(settings->userdata_chunk, task.userdata_chunk, settings->userdata_chunk_size);
      }
    }
    else {
      parallel_for(range, task);
    }
    return;
  }
#endif

  /* Single threaded. Nothing to reduce as everything is accumulated into the
   * main userdata chunk directly. */
  TaskParallelTLS tls;
  tls.userdata_chunk = settings->userdata_chunk;
  for (int i = start; i < stop; i++) {
    func(userdata, i, &tls);
  }
  if (settings->func_free != nullptr) {
    settings->func_free(userdata, settings->userdata_chunk);
  }
}
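/* Illustrative usage sketch (not part of the original file). It follows the common caller
 * pattern for this API: default settings from `BLI_parallel_range_settings_defaults()` in
 * BLI_task.h, a per-thread chunk holding a partial result, and a reduce callback that merges
 * chunks. The iteration callback matches TaskParallelRangeFunc; the reduce callback is assumed
 * to match TaskParallelReduceFunc (userdata, chunk_join, chunk). `SumChunk`, `sum_iter_fn`,
 * `sum_reduce_fn` and `sum_range_example` are made-up names for this example. */
struct SumChunk {
  int total;
};

static void sum_iter_fn(void *__restrict /*userdata*/,
                        const int iter,
                        const TaskParallelTLS *__restrict tls)
{
  /* Each thread accumulates into its own copy of the chunk. */
  SumChunk *chunk = static_cast<SumChunk *>(tls->userdata_chunk);
  chunk->total += iter;
}

static void sum_reduce_fn(const void *__restrict /*userdata*/,
                          void *__restrict chunk_join,
                          void *__restrict chunk)
{
  /* Fold one thread's partial sum into another chunk. */
  static_cast<SumChunk *>(chunk_join)->total += static_cast<const SumChunk *>(chunk)->total;
}

static int sum_range_example(const int n)
{
  SumChunk chunk = {0};

  TaskParallelSettings settings;
  BLI_parallel_range_settings_defaults(&settings);
  settings.userdata_chunk = &chunk;
  settings.userdata_chunk_size = sizeof(chunk);
  settings.func_reduce = sum_reduce_fn;

  BLI_task_parallel_range(0, n, nullptr, sum_iter_fn, &settings);
  /* All per-thread chunks have been reduced back into `chunk` at this point. */
  return chunk.total;
}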

int BLI_task_parallel_thread_id(const TaskParallelTLS * /*tls*/)
{
#ifdef WITH_TBB
  /* Get a unique thread ID for texture nodes. In the future we should get rid
   * of the thread ID and change texture evaluation to not require per-thread
   * storage that can't be efficiently allocated on the stack. */
  static tbb::enumerable_thread_specific<int> tbb_thread_id(-1);
  static int tbb_thread_id_counter = 0;

  int &thread_id = tbb_thread_id.local();
  if (thread_id == -1) {
    thread_id = atomic_fetch_and_add_int32(&tbb_thread_id_counter, 1);
    if (thread_id >= BLENDER_MAX_THREADS) {
      BLI_assert_msg(0, "Maximum number of threads exceeded for sculpting");
      thread_id = thread_id % BLENDER_MAX_THREADS;
    }
  }
  return thread_id;
#else
  return 0;
#endif
}
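/* Illustrative sketch (not part of the original file): the index returned above stays below
 * BLENDER_MAX_THREADS, so a typical caller uses it to address per-thread scratch storage that
 * was allocated once up front. `per_thread_totals_fn` is a made-up name for this example. */
static void per_thread_totals_fn(void *__restrict userdata,
                                 const int iter,
                                 const TaskParallelTLS *__restrict tls)
{
  /* `userdata` points to an int[BLENDER_MAX_THREADS] array owned by the caller. */
  int *per_thread_totals = static_cast<int *>(userdata);
  const int thread_id = BLI_task_parallel_thread_id(tls);
  per_thread_totals[thread_id] += iter;
}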

namespace blender::threading::detail {

#ifdef WITH_TBB
static void parallel_for_impl_static_size(const IndexRange range,
                                          const int64_t grain_size,
                                          const FunctionRef<void(IndexRange)> function)
{
  tbb::parallel_for(tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
                    [function](const tbb::blocked_range<int64_t> &subrange) {
                      function(IndexRange(subrange.begin(), subrange.size()));
                    });
}
#endif /* WITH_TBB */

#ifdef WITH_TBB
static void parallel_for_impl_individual_size_lookup(
    const IndexRange range,
    const int64_t grain_size,
    const FunctionRef<void(IndexRange)> function,
    const TaskSizeHints_IndividualLookup &size_hints)
{
  /* Shouldn't be too small, because then there is more overhead when the individual tasks are
   * small. Also shouldn't be too large because then the serial code to split up tasks causes extra
   * overhead. */
  const int64_t outer_grain_size = std::min<int64_t>(grain_size, 512);
  threading::parallel_for(range, outer_grain_size, [&](const IndexRange sub_range) {
    /* Compute the size of every task in the current range. */
    Array<int64_t, 1024> task_sizes(sub_range.size());
    size_hints.lookup_individual_sizes(sub_range, task_sizes);

    /* Split range into multiple segments that have a size that approximates the grain size. */
    Vector<int64_t, 256> offsets_vec;
    offsets_vec.append(0);
    int64_t counter = 0;
    for (const int64_t i : sub_range.index_range()) {
      counter += task_sizes[i];
      if (counter >= grain_size) {
        offsets_vec.append(i + 1);
        counter = 0;
      }
    }
    if (offsets_vec.last() < sub_range.size()) {
      offsets_vec.append(sub_range.size());
    }
    const OffsetIndices<int64_t> offsets = offsets_vec.as_span();

    /* Run the dynamically split tasks in parallel. */
    threading::parallel_for(offsets.index_range(), 1, [&](const IndexRange offsets_range) {
      for (const int64_t i : offsets_range) {
        const IndexRange actual_range = offsets[i].shift(sub_range.start());
        function(actual_range);
      }
    });
  });
}

#endif /* WITH_TBB */
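/* Worked example (not part of the original file): with `grain_size = 512` and per-task sizes
 * {10, 300, 5, 900, 20} for a five-element `sub_range`, the loop above accumulates 10, 310, 315
 * and 1215, crossing the grain size only at index 3. `offsets_vec` therefore becomes {0, 4, 5},
 * and the nested parallel_for runs two segments: tasks [0, 4) as one chunk and task [4, 5) as
 * another, each shifted by `sub_range.start()` before `function` is called. */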

static void parallel_for_impl_accumulated_size_lookup(
    const IndexRange range,
    const int64_t grain_size,
    const FunctionRef<void(IndexRange)> function,
    const TaskSizeHints_AccumulatedLookup &size_hints)
{
  BLI_assert(!range.is_empty());
  if (range.size() == 1) {
    /* Can't subdivide further. */
    function(range);
    return;
  }
  const int64_t total_size = size_hints.lookup_accumulated_size(range);
  if (total_size <= grain_size) {
    function(range);
    return;
  }
  const int64_t middle = range.size() / 2;
  const IndexRange left_range = range.take_front(middle);
  const IndexRange right_range = range.drop_front(middle);
  threading::parallel_invoke(
      [&]() {
        parallel_for_impl_accumulated_size_lookup(left_range, grain_size, function, size_hints);
      },
      [&]() {
        parallel_for_impl_accumulated_size_lookup(right_range, grain_size, function, size_hints);
      });
}
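/* Worked example (not part of the original file): for a range of 8 tasks whose accumulated size
 * is 4000 and `grain_size = 1000`, the total exceeds the grain size, so the range is bisected
 * into [0, 4) and [4, 8) and both halves run via parallel_invoke. Each half looks up its own
 * accumulated size again: a half at or below 1000 is handed to `function` directly, while a
 * heavier half keeps splitting until it fits the grain size or consists of a single task. */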

void parallel_for_impl(const IndexRange range,
                       const int64_t grain_size,
                       const FunctionRef<void(IndexRange)> function,
                       const TaskSizeHints &size_hints)
{
#ifdef WITH_TBB
  lazy_threading::send_hint();
  switch (size_hints.type) {
    case TaskSizeHints::Type::Static: {
      const int64_t task_size = static_cast<const detail::TaskSizeHints_Static &>(size_hints).size;
      const int64_t final_grain_size = task_size == 1 ?
                                           grain_size :
                                           std::max<int64_t>(1, grain_size / task_size);
      parallel_for_impl_static_size(range, final_grain_size, function);
      break;
    }
    case TaskSizeHints::Type::IndividualLookup: {
      parallel_for_impl_individual_size_lookup(
          range,
          grain_size,
          function,
          static_cast<const detail::TaskSizeHints_IndividualLookup &>(size_hints));
      break;
    }
    case TaskSizeHints::Type::AccumulatedLookup: {
      parallel_for_impl_accumulated_size_lookup(
          range,
          grain_size,
          function,
          static_cast<const detail::TaskSizeHints_AccumulatedLookup &>(size_hints));
      break;
    }
  }

#else
  UNUSED_VARS(grain_size, size_hints);
  function(range);
#endif
}
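/* Worked example (not part of the original file): with the default static hint of size 1 (see
 * `threading::parallel_for` in BLI_task.hh) the caller's grain size is passed through unchanged.
 * If the caller instead hints that every task has a static size of 8 and passes
 * `grain_size = 4096`, the TBB grain size becomes std::max(1, 4096 / 8) = 512, so roughly 512
 * iterations are grouped per TBB task rather than 4096. */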

void memory_bandwidth_bound_task_impl(const FunctionRef<void()> function)
{
#ifdef WITH_TBB
  /* This is the maximum number of threads that may perform these memory bandwidth bound tasks at
   * the same time. Often fewer threads are already enough to use up the full bandwidth capacity.
   * Additional threads usually have a negligible benefit and can even make performance worse.
   *
   * It's better to use fewer threads here so that the CPU cores can do other tasks at the same
   * time which may be more compute intensive. */
  const int num_threads = 8;
  if (num_threads >= BLI_task_scheduler_num_threads()) {
    /* Avoid overhead of using a task arena when it would not have any effect anyway. */
    function();
    return;
  }
  static tbb::task_arena arena{num_threads};

  /* Make sure the lazy threading hints are sent now, because they shouldn't be sent out of an
   * isolated region. */
  lazy_threading::send_hint();
  lazy_threading::ReceiverIsolation isolation;

  arena.execute(function);
#else
  function();
#endif
}

} // namespace blender::threading::detail
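/* Illustrative usage sketch (not part of the original file). It assumes the public wrapper
 * `blender::threading::memory_bandwidth_bound_task(approximate_bytes_touched, fn)` from
 * BLI_task.hh, which forwards to `memory_bandwidth_bound_task_impl()` above so that copy-like,
 * bandwidth-limited work is confined to the small task arena. `copy_example` and the spans are
 * made up for this example. */
static void copy_example(blender::MutableSpan<float> dst, const blender::Span<float> src)
{
  using namespace blender;
  threading::memory_bandwidth_bound_task(src.size_in_bytes() + dst.size_in_bytes(), [&]() {
    threading::parallel_for(src.index_range(), 4096, [&](const IndexRange range) {
      dst.slice(range).copy_from(src.slice(range));
    });
  });
}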