Blender V4.3
BLI_task.hh
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2023 Blender Authors
2 *
3 * SPDX-License-Identifier: GPL-2.0-or-later */
4
5#pragma once
6
11#ifdef WITH_TBB
12/* Quiet top level deprecation message, unrelated to API usage here. */
13# if defined(WIN32) && !defined(NOMINMAX)
14/* TBB includes Windows.h which will define min/max macros causing issues
15 * when we try to use std::min and std::max later on. */
16# define NOMINMAX
17# define TBB_MIN_MAX_CLEANUP
18# endif
19# include <tbb/blocked_range.h>
20# include <tbb/parallel_for.h>
21# include <tbb/parallel_for_each.h>
22# include <tbb/parallel_invoke.h>
23# include <tbb/parallel_reduce.h>
24# include <tbb/task_arena.h>
25# ifdef WIN32
26/* We cannot keep this defined, since other parts of the code deal with this on their own, leading
27 * to multiple define warnings unless we un-define this, however we can only undefine this if we
28 * were the ones that made the definition earlier. */
29# ifdef TBB_MIN_MAX_CLEANUP
30# undef NOMINMAX
31# endif
32# endif
33#endif
34
35#include "BLI_function_ref.hh"
36#include "BLI_index_range.hh"
37#include "BLI_lazy_threading.hh"
38#include "BLI_span.hh"
40#include "BLI_utildefines.h"
41
42namespace blender {
43
/**
 * Strongly typed wrapper for the grain size of a parallel loop, so call sites
 * read e.g. `GrainSize(512)` instead of a bare magic number.
 */
struct GrainSize {
  /* The grain size value passed to the constructor. Restored: the extraction
   * dropped this member declaration, which the constructor initializes. */
  int64_t value;

  explicit constexpr GrainSize(const int64_t grain_size) : value(grain_size) {}
};
52
53} // namespace blender
54
55namespace blender::threading {
56
/**
 * Invoke #function once for every element of #range — potentially in parallel
 * when TBB is available, as a plain sequential loop otherwise.
 */
template<typename Range, typename Function>
inline void parallel_for_each(Range &&range, const Function &function)
{
#ifdef WITH_TBB
  tbb::parallel_for_each(range, function);
#else
  /* Sequential fallback: visit each element in iteration order. */
  for (auto &&item : range) {
    function(item);
  }
#endif
}
68
69namespace detail {
71 int64_t grain_size,
72 FunctionRef<void(IndexRange)> function,
73 const TaskSizeHints &size_hints);
75} // namespace detail
76
94template<typename Function>
95inline void parallel_for(const IndexRange range,
96 const int64_t grain_size,
97 const Function &function,
98 const TaskSizeHints &size_hints = detail::TaskSizeHints_Static(1))
99{
100 if (range.is_empty()) {
101 return;
102 }
103 /* Invoking tbb for small workloads has a large overhead. */
104 if (use_single_thread(size_hints, range, grain_size)) {
105 function(range);
106 return;
107 }
108 detail::parallel_for_impl(range, grain_size, function, size_hints);
109}
110
115inline IndexRange align_sub_range(const IndexRange unaligned_range,
116 const int64_t alignment,
117 const IndexRange global_range)
118{
119 const int64_t global_begin = global_range.start();
120 const int64_t global_end = global_range.one_after_last();
121 const int64_t alignment_mask = ~(alignment - 1);
122
123 const int64_t unaligned_begin = unaligned_range.start();
124 const int64_t unaligned_end = unaligned_range.one_after_last();
125 const int64_t aligned_begin = std::max(global_begin, unaligned_begin & alignment_mask);
126 const int64_t aligned_end = unaligned_end == global_end ?
127 unaligned_end :
128 std::max(global_begin, unaligned_end & alignment_mask);
129 const IndexRange aligned_range = IndexRange::from_begin_end(aligned_begin, aligned_end);
130 return aligned_range;
131}
132
140template<typename Function>
141inline void parallel_for_aligned(const IndexRange range,
142 const int64_t grain_size,
143 const int64_t alignment,
144 const Function &function)
145{
146 parallel_for(range, grain_size, [&](const IndexRange unaligned_range) {
147 const IndexRange aligned_range = align_sub_range(unaligned_range, alignment, range);
148 function(aligned_range);
149 });
150}
151
152template<typename Value, typename Function, typename Reduction>
153inline Value parallel_reduce(IndexRange range,
154 int64_t grain_size,
155 const Value &identity,
156 const Function &function,
157 const Reduction &reduction)
158{
159#ifdef WITH_TBB
160 if (range.size() >= grain_size) {
162 return tbb::parallel_reduce(
163 tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
164 identity,
165 [&](const tbb::blocked_range<int64_t> &subrange, const Value &ident) {
166 return function(IndexRange(subrange.begin(), subrange.size()), ident);
167 },
168 reduction);
169 }
170#else
171 UNUSED_VARS(grain_size, reduction);
172#endif
173 return function(range, identity);
174}
175
176template<typename Value, typename Function, typename Reduction>
177inline Value parallel_reduce_aligned(const IndexRange range,
178 const int64_t grain_size,
179 const int64_t alignment,
180 const Value &identity,
181 const Function &function,
182 const Reduction &reduction)
183{
185 range,
186 grain_size,
187 identity,
188 [&](const IndexRange unaligned_range, const Value &ident) {
189 const IndexRange aligned_range = align_sub_range(unaligned_range, alignment, range);
190 function(aligned_range, ident);
191 },
192 reduction);
193}
194
/**
 * Execute all given callables, potentially concurrently when TBB is available.
 */
template<typename... Funcs> inline void parallel_invoke(Funcs &&...funcs)
{
#ifdef WITH_TBB
  tbb::parallel_invoke(std::forward<Funcs>(funcs)...);
#else
  /* Sequential fallback: a fold expression calls each in argument order. */
  (funcs(), ...);
#endif
}
207
/**
 * Execute all given callables, in parallel only when #use_threading is true,
 * so the caller can disable threading dynamically (e.g. for tiny workloads).
 */
template<typename... Funcs>
inline void parallel_invoke(const bool use_threading, Funcs &&...funcs)
{
  if (!use_threading) {
    /* Run everything inline, in argument order. */
    (funcs(), ...);
    return;
  }
  /* NOTE(review): the extraction appears to have dropped a line here
   * (original line 217), possibly a lazy-threading hint — confirm upstream. */
  parallel_invoke(std::forward<Funcs>(funcs)...);
}
224
/**
 * Run #function in an isolated task region (wraps
 * #tbb::this_task_arena::isolate; without TBB this is a plain direct call).
 */
template<typename Function> inline void isolate_task(const Function &function)
{
#ifdef WITH_TBB
  /* Restored: the extraction dropped this line (original line 229).
   * Presumably pauses lazy-threading receivers for the duration of the
   * isolated region — confirm against BLI_lazy_threading.hh. */
  lazy_threading::ReceiverIsolation isolation;
  tbb::this_task_arena::isolate(function);
#else
  function();
#endif
}
235
242template<typename Function>
243inline void memory_bandwidth_bound_task(const int64_t approximate_bytes_touched,
244 const Function &function)
245{
246 /* Don't limit threading when all touched memory can stay in the CPU cache, because there a much
247 * higher memory bandwidth is available compared to accessing RAM. This value is supposed to be
248 * on the order of the L3 cache size. Accessing that value is not quite straight forward and even
249 * if it was, it's not clear if using the exact cache size would be beneficial because there is
250 * often more stuff going on the CPU at the same time. */
251 if (approximate_bytes_touched <= 8 * 1024 * 1024) {
252 function();
253 return;
254 }
256}
257
258} // namespace blender::threading
#define UNUSED_VARS(...)
constexpr int64_t one_after_last() const
static constexpr IndexRange from_begin_end(const int64_t begin, const int64_t end)
constexpr int64_t start() const
IndexRange range
void parallel_for_impl(IndexRange range, int64_t grain_size, FunctionRef< void(IndexRange)> function, const TaskSizeHints &size_hints)
void memory_bandwidth_bound_task_impl(FunctionRef< void()> function)
void isolate_task(const Function &function)
Definition BLI_task.hh:226
void parallel_invoke(Functions &&...functions)
Definition BLI_task.hh:199
void parallel_for_each(Range &&range, const Function &function)
Definition BLI_task.hh:58
void parallel_for(const IndexRange range, const int64_t grain_size, const Function &function, const TaskSizeHints &size_hints=detail::TaskSizeHints_Static(1))
Definition BLI_task.hh:95
bool use_single_thread(const TaskSizeHints &size_hints, const IndexRange range, const int64_t threshold)
void memory_bandwidth_bound_task(const int64_t approximate_bytes_touched, const Function &function)
Definition BLI_task.hh:243
void parallel_for_aligned(const IndexRange range, const int64_t grain_size, const int64_t alignment, const Function &function)
Definition BLI_task.hh:141
Value parallel_reduce(IndexRange range, int64_t grain_size, const Value &identity, const Function &function, const Reduction &reduction)
Definition BLI_task.hh:153
Value parallel_reduce_aligned(const IndexRange range, const int64_t grain_size, const int64_t alignment, const Value &identity, const Function &function, const Reduction &reduction)
Definition BLI_task.hh:177
IndexRange align_sub_range(const IndexRange unaligned_range, const int64_t alignment, const IndexRange global_range)
Definition BLI_task.hh:115
__int64 int64_t
Definition stdint.h:89
constexpr GrainSize(const int64_t grain_size)
Definition BLI_task.hh:50