#ifdef WITH_TBB
#  include <tbb/blocked_range.h>
#  include <tbb/enumerable_thread_specific.h>
#  include <tbb/parallel_for.h>
#  include <tbb/parallel_reduce.h>
#endif
/* Functor for running TBB parallel_for and parallel_reduce. */
struct RangeTask {
  TaskParallelRangeFunc func;
  void *userdata;
  const TaskParallelSettings *settings;
  void *userdata_chunk;

  RangeTask(TaskParallelRangeFunc func, void *userdata, const TaskParallelSettings *settings)
      : func(func), userdata(userdata), settings(settings)
  {
    init_chunk(settings->userdata_chunk);
  }

  RangeTask(const RangeTask &other)
      : func(other.func), userdata(other.userdata), settings(other.settings)
  {
    init_chunk(settings->userdata_chunk);
  }

  /* Splitting constructor for parallel reduce. */
  RangeTask(RangeTask &other, tbb::split /*unused*/)
      : func(other.func), userdata(other.userdata), settings(other.settings)
  {
    init_chunk(settings->userdata_chunk);
  }

  ~RangeTask()
  {
    if (settings->func_free != nullptr) {
      settings->func_free(userdata, userdata_chunk);
    }
    MEM_SAFE_FREE(userdata_chunk);
  }

  void init_chunk(void *from_chunk)
  {
    if (from_chunk) {
      userdata_chunk = MEM_mallocN(settings->userdata_chunk_size, "RangeTask");
      memcpy(userdata_chunk, from_chunk, settings->userdata_chunk_size);
    }
    else {
      userdata_chunk = nullptr;
    }
  }

  void operator()(const tbb::blocked_range<int> &r) const
  {
    TaskParallelTLS tls;
    tls.userdata_chunk = userdata_chunk;
    for (int i = r.begin(); i != r.end(); ++i) {
      func(userdata, i, &tls);
    }
  }

  void join(const RangeTask &other)
  {
    settings->func_reduce(userdata, userdata_chunk, other.userdata_chunk);
  }
};
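/* How TBB drives this functor (illustrative sketch, not code from this file):
 * parallel_reduce() splits the iteration range recursively. Every split constructs a new
 * RangeTask through the splitting constructor above, so each branch gets a private copy of
 * the userdata chunk; operator() then accumulates into that copy, and join() merges pairs
 * of results via settings->func_reduce. Roughly:
 *
 *   RangeTask right(left, tbb::split{});  // stolen half gets its own chunk
 *   left(sub_range_a);                    // both halves may run concurrently
 *   right(sub_range_b);
 *   left.join(right);                     // merge right's chunk back into left's
 */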
void BLI_task_parallel_range(const int start,
                             const int stop,
                             void *userdata,
                             TaskParallelRangeFunc func,
                             const TaskParallelSettings *settings)
{
#ifdef WITH_TBB
  if (settings->use_threading && BLI_task_scheduler_num_threads() > 1) {
    RangeTask task(func, userdata, settings);
    const size_t grainsize = std::max(settings->min_iter_per_thread, 1);
    const tbb::blocked_range<int> range(start, stop, grainsize);

    if (settings->func_reduce) {
      parallel_reduce(range, task);
      if (settings->userdata_chunk) {
        /* Copy the result of the reduction back into the caller's chunk. */
        memcpy(settings->userdata_chunk, task.userdata_chunk, settings->userdata_chunk_size);
      }
    }
    else {
      parallel_for(range, task);
    }
    return;
  }
#endif

  /* Single threaded. Nothing to reduce as everything is accumulated into the
   * main userdata chunk directly. */
  TaskParallelTLS tls;
  tls.userdata_chunk = settings->userdata_chunk;
  for (int i = start; i < stop; i++) {
    func(userdata, i, &tls);
  }
  if (settings->func_free != nullptr) {
    settings->func_free(userdata, settings->userdata_chunk);
  }
}
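/* Usage sketch (hypothetical caller, not part of this file). The reduction path above
 * assumes the caller provides a userdata chunk template plus a reduce callback:
 *
 *   struct SumTLS { int total; };
 *
 *   static void accumulate(void *__restrict, const int iter, const TaskParallelTLS *__restrict tls)
 *   {
 *     static_cast<SumTLS *>(tls->userdata_chunk)->total += iter;
 *   }
 *   static void reduce(const void *__restrict, void *__restrict join, void *__restrict chunk)
 *   {
 *     static_cast<SumTLS *>(join)->total += static_cast<SumTLS *>(chunk)->total;
 *   }
 *
 *   SumTLS sum = {0};
 *   TaskParallelSettings settings;
 *   BLI_parallel_range_settings_defaults(&settings);
 *   settings.userdata_chunk = &sum;
 *   settings.userdata_chunk_size = sizeof(sum);
 *   settings.func_reduce = reduce;
 *   BLI_task_parallel_range(0, 1000, nullptr, accumulate, &settings);
 *   // sum.total == 499500 afterwards, however the range was split.
 */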
int BLI_task_parallel_thread_id(const TaskParallelTLS * /*tls*/)
{
  static tbb::enumerable_thread_specific<int> tbb_thread_id(-1);
  static int tbb_thread_id_counter = 0;

  int &thread_id = tbb_thread_id.local();
  if (thread_id == -1) {
    /* First time this worker thread is seen: assign the next free ID. */
    thread_id = atomic_fetch_and_add_int32(&tbb_thread_id_counter, 1);
    if (thread_id >= BLENDER_MAX_THREADS) {
      BLI_assert_msg(0, "Maximum number of threads exceeded for sculpting");
      thread_id = thread_id % BLENDER_MAX_THREADS;
    }
  }
  return thread_id;
}
static void parallel_for_impl_static_size(const IndexRange range,
                                          const int64_t grain_size,
                                          const FunctionRef<void(IndexRange)> function)
{
  tbb::parallel_for(tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
                    [function](const tbb::blocked_range<int64_t> &subrange) {
                      function(IndexRange(subrange.begin(), subrange.size()));
                    });
}
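/* This is the code path behind threading::parallel_for() with the default static size hint:
 * TBB carves the range into sub-ranges of roughly #grain_size elements, and the lambda maps
 * each tbb::blocked_range back to an IndexRange. Caller-side sketch:
 *
 *   threading::parallel_for(IndexRange(10000), 4096, [&](const IndexRange sub_range) {
 *     for (const int64_t i : sub_range) {
 *       // Process element i; neighboring elements usually land in the same task.
 *     }
 *   });
 */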
static void parallel_for_impl_individual_size_lookup(
    const IndexRange range,
    const int64_t grain_size,
    const FunctionRef<void(IndexRange)> function,
    const TaskSizeHints_IndividualLookup &size_hints)
{
  /* Cap the size of the outer chunks so the task-size buffers below stay small. */
  const int64_t outer_grain_size = std::min<int64_t>(grain_size, 512);
  parallel_for_impl_static_size(range, outer_grain_size, [&](const IndexRange sub_range) {
    /* Gather the size of every task in the current chunk. */
    Array<int64_t, 1024> task_sizes(sub_range.size());
    size_hints.lookup_individual_sizes(sub_range, task_sizes);

    /* Group consecutive tasks so that each group accumulates roughly #grain_size work. */
    Vector<int64_t, 256> offsets_vec;
    offsets_vec.append(0);
    int64_t counter = 0;
    for (const int64_t i : sub_range.index_range()) {
      counter += task_sizes[i];
      if (counter >= grain_size) {
        offsets_vec.append(i + 1);
        counter = 0;
      }
    }
    if (offsets_vec.last() < sub_range.size()) {
      offsets_vec.append(sub_range.size());
    }
    const OffsetIndices<int64_t> offsets = offsets_vec.as_span();

    /* Run the groups in parallel, one group per task. */
    parallel_for_impl_static_size(offsets.index_range(), 1, [&](const IndexRange offsets_range) {
      for (const int64_t i : offsets_range) {
        const IndexRange actual_range = offsets[i].shift(sub_range.start());
        function(actual_range);
      }
    });
  });
}
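/* Worked example of the grouping above (made-up sizes): with grain_size = 100 and per-task
 * sizes {60, 60, 10, 300, 5}, the counter crosses 100 after task 1 and after task 3, and the
 * trailing task is flushed at the end, so offsets_vec = {0, 2, 4, 5}. The resulting groups
 * [0, 2), [2, 4) and [4, 5) batch cheap tasks together while an expensive task ends its
 * group early, keeping the work per group near grain_size. */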
static void parallel_for_impl_accumulated_size_lookup(
    const IndexRange range,
    const int64_t grain_size,
    const FunctionRef<void(IndexRange)> function,
    const TaskSizeHints_AccumulatedLookup &size_hints)
{
  if (range.size() == 1) {
    function(range);
    return;
  }
  /* If the accumulated size of all tasks is within the grain size, run the range serially. */
  const int64_t total_size = size_hints.lookup_accumulated_size(range);
  if (total_size <= grain_size) {
    function(range);
    return;
  }
  /* Otherwise split the range in half and process both halves in parallel. */
  const int64_t middle = range.size() / 2;
  threading::parallel_invoke(
      [&]() {
        parallel_for_impl_accumulated_size_lookup(
            range.take_front(middle), grain_size, function, size_hints);
      },
      [&]() {
        parallel_for_impl_accumulated_size_lookup(
            range.drop_front(middle), grain_size, function, size_hints);
      });
}
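/* Illustrative recursion (made-up sizes): for a range [0, 8) with grain_size = 100 whose
 * halves accumulate to 180 and 60, the first split runs [4, 8) serially (60 <= 100) while
 * [0, 4) is split again. Each level costs only one lookup_accumulated_size() call per half,
 * so cheap regions are settled quickly without a per-element size buffer. */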
void parallel_for_impl(const IndexRange range,
                       const int64_t grain_size,
                       const FunctionRef<void(IndexRange)> function,
                       const TaskSizeHints &size_hints)
{
  lazy_threading::send_hint();
  switch (size_hints.type) {
    case TaskSizeHints::Type::Static: {
      const int64_t task_size = static_cast<const TaskSizeHints_Static &>(size_hints).size;
      const int64_t final_grain_size = task_size == 1 ?
                                           grain_size :
                                           std::max<int64_t>(1, grain_size / task_size);
      parallel_for_impl_static_size(range, final_grain_size, function);
      break;
    }
    case TaskSizeHints::Type::IndividualLookup: {
      parallel_for_impl_individual_size_lookup(
          range, grain_size, function, static_cast<const TaskSizeHints_IndividualLookup &>(size_hints));
      break;
    }
    case TaskSizeHints::Type::AccumulatedLookup: {
      parallel_for_impl_accumulated_size_lookup(
          range, grain_size, function, static_cast<const TaskSizeHints_AccumulatedLookup &>(size_hints));
      break;
    }
  }
}
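/* Dispatch sketch: the inline threading::parallel_for() wrapper forwards to this function
 * with detail::TaskSizeHints_Static(1) as the default hint. A caller that knows each element
 * costs roughly the same, larger amount of work can pass a bigger static hint so the grain
 * size shrinks accordingly (hypothetical numbers):
 *
 *   // Each element does ~16 units of work, so groups of ~4096 / 16 = 256 elements.
 *   threading::parallel_for(range, 4096, do_work, detail::TaskSizeHints_Static(16));
 */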
void memory_bandwidth_bound_task_impl(const FunctionRef<void()> function)
{
  /* Limit the number of threads: a memory-bandwidth-bound task stops scaling once the
   * memory bus is saturated, and extra workers only add scheduling overhead. */
  const int num_threads = 8;
  /* A task arena restricts how many TBB workers may participate in #function. */
  static tbb::task_arena arena{num_threads};
  /* Make sure worker threads actually wake up to help with the work. */
  lazy_threading::send_hint();
  arena.execute(function);
}
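/* Usage sketch (assuming the thin inline wrapper in BLI_task.hh forwards here; calling the
 * impl directly would look the same):
 *
 *   memory_bandwidth_bound_task_impl(
 *       [&]() { memcpy(dst.data(), src.data(), src.size() * sizeof(float)); });
 */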