Blender V5.0
path_trace_work_cpu.cpp
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#include "integrator/path_trace_work_cpu.h"

#include "device/cpu/kernel.h"
#include "device/device.h"

#ifdef WITH_CYCLES_DEBUG
#  include "kernel/film/write.h"
#endif

#include "kernel/integrator/path_state.h"

#include "integrator/pass_accessor_cpu.h"
#include "integrator/path_trace_display.h"

#include "scene/scene.h"
#include "session/buffers.h"

#include "util/tbb.h"
#include "util/time.h"

CCL_NAMESPACE_BEGIN

/* Create TBB arena for execution of path tracing and rendering tasks. */
static inline tbb::task_arena local_tbb_arena_create(const Device *device)
{
  /* TODO: limit this to number of threads of CPU device, it may be smaller than
   * the system number of threads when we reduce the number of CPU threads in
   * CPU + GPU rendering to dedicate some cores to handling the GPU device. */
  return tbb::task_arena(device->info.cpu_threads);
}
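
/* Illustrative sketch (not part of the original file): a bounded `tbb::task_arena`
 * caps how many workers may join loops executed inside it, which is what the TODO
 * above is about. Assuming a hypothetical cap of 4 threads:
 *
 *   tbb::task_arena arena(4);
 *   arena.execute([] {
 *     tbb::parallel_for(0, 1024, [](const int i) {
 *       // Loop body runs on at most 4 worker threads.
 *     });
 *   });
 */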

/* Get ThreadKernelGlobalsCPU for the current thread. */
static ThreadKernelGlobalsCPU *kernel_thread_globals_get(
    vector<ThreadKernelGlobalsCPU> &kernel_thread_globals)
{
  const int thread_index = tbb::this_task_arena::current_thread_index();
  DCHECK_GE(thread_index, 0);
  DCHECK_LT(thread_index, kernel_thread_globals.size());

  return &kernel_thread_globals[thread_index];
}
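
/* Illustrative sketch (not part of the original file) of the lock-free per-thread
 * lookup used above: within an arena, `tbb::this_task_arena::current_thread_index()`
 * returns a slot index below the arena concurrency, so a vector with one entry per
 * slot can be indexed without synchronization. `Payload` is a hypothetical type:
 *
 *   tbb::task_arena arena(tbb::task_arena::automatic);
 *   std::vector<Payload> per_thread(arena.max_concurrency());
 *   arena.execute([&] {
 *     tbb::parallel_for(0, n, [&](const int i) {
 *       Payload &p = per_thread[tbb::this_task_arena::current_thread_index()];
 *       // Mutate `p` without locks; no other worker shares this slot.
 *     });
 *   });
 */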

PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
                                   Film *film,
                                   DeviceScene *device_scene,
                                   const bool *cancel_requested_flag)
    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
      kernels_(Device::get_cpu_kernels())
{
  DCHECK_EQ(device->info.type, DEVICE_CPU);
}

void PathTraceWorkCPU::init_execution()
{
  /* Cache per-thread kernel globals. */
  device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
}

void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
                                      const int start_sample,
                                      const int samples_num,
                                      const int sample_offset)
{
  const int64_t image_width = effective_buffer_params_.width;
  const int64_t image_height = effective_buffer_params_.height;
  const int64_t total_pixels_num = image_width * image_height;

  if (device_->profiler.active()) {
    for (ThreadKernelGlobalsCPU &kernel_globals : kernel_thread_globals_) {
      kernel_globals.start_profiling();
    }
  }

  tbb::task_arena local_arena = local_tbb_arena_create(device_);
  local_arena.execute([&]() {
    parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) {
      if (is_cancel_requested()) {
        return;
      }

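      /* Row-major decomposition of the flat index: e.g. with image_width = 4,
       * work_index = 6 yields (x, y) = (2, 1). */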
      const int y = work_index / image_width;
      const int x = work_index - y * image_width;

      KernelWorkTile work_tile;
      work_tile.x = effective_buffer_params_.full_x + x;
      work_tile.y = effective_buffer_params_.full_y + y;
      work_tile.w = 1;
      work_tile.h = 1;
      work_tile.start_sample = start_sample;
      work_tile.sample_offset = sample_offset;
      work_tile.num_samples = 1;
      work_tile.offset = effective_buffer_params_.offset;
      work_tile.stride = effective_buffer_params_.stride;
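      /* The tile covers a single pixel; render_samples_full_pipeline() below loops
       * over all samples_num samples for it. */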

      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);

      render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
    });
  });
  if (device_->profiler.active()) {
    for (ThreadKernelGlobalsCPU &kernel_globals : kernel_thread_globals_) {
      kernel_globals.stop_profiling();
    }
  }

  statistics.occupancy = 1.0f;
}

void PathTraceWorkCPU::render_samples_full_pipeline(ThreadKernelGlobalsCPU *kernel_globals,
                                                    const KernelWorkTile &work_tile,
                                                    const int samples_num)
{
  const bool has_bake = device_scene_->data.bake.use;

  IntegratorStateCPU integrator_states[2];

  IntegratorStateCPU *state = &integrator_states[0];
  IntegratorStateCPU *shadow_catcher_state = nullptr;

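  /* A second, independent integrator state lets shadow-catcher paths be traced
   * alongside the main path below. */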
  if (device_scene_->data.integrator.has_shadow_catcher) {
    shadow_catcher_state = &integrator_states[1];
    path_state_init_queues(shadow_catcher_state);
  }

  KernelWorkTile sample_work_tile = work_tile;
  float *render_buffer = buffers_->buffer.data();

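  /* Lap timer feeding the optional per-pixel render-time debug pass written at the
   * end of each sample iteration. */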
  fast_timer render_timer;

  for (int sample = 0; sample < samples_num; ++sample) {
    if (is_cancel_requested()) {
      break;
    }

    if (has_bake) {
      if (!kernels_.integrator_init_from_bake(
              kernel_globals, state, &sample_work_tile, render_buffer))
      {
        break;
      }
    }
    else {
      if (!kernels_.integrator_init_from_camera(
              kernel_globals, state, &sample_work_tile, render_buffer))
      {
        break;
      }
    }

#if defined(WITH_PATH_GUIDING)
    if (kernel_globals->data.integrator.train_guiding) {
      assert(kernel_globals->opgl_path_segment_storage);
      assert(kernel_globals->opgl_path_segment_storage->GetNumSegments() == 0);

      kernels_.integrator_megakernel(kernel_globals, state, render_buffer);

      /* Push the generated sample data to the global sample data storage. */
      guiding_push_sample_data_to_global_storage(kernel_globals, state, render_buffer);

      /* No training for shadow catcher paths. */
      if (shadow_catcher_state) {
        kernel_globals->data.integrator.train_guiding = false;
        kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
        kernel_globals->data.integrator.train_guiding = true;
      }
    }
    else
#endif
    {
      kernels_.integrator_megakernel(kernel_globals, state, render_buffer);
      if (shadow_catcher_state) {
        kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
      }
    }

    if (kernel_globals->data.film.pass_render_time != PASS_UNUSED) {
      uint64_t time;
      if (render_timer.lap(time)) {
        ccl_global float *buffer = render_buffer + (uint64_t)state->path.render_pixel_index *
                                                       kernel_globals->data.film.pass_stride;
        *(buffer + kernel_globals->data.film.pass_render_time) += float(time);
      }
    }
    ++sample_work_tile.start_sample;
  }
}

void PathTraceWorkCPU::copy_to_display(PathTraceDisplay *display,
                                       PassMode pass_mode,
                                       const int num_samples)
{
  half4 *rgba_half = display->map_texture_buffer();
  if (!rgba_half) {
    /* TODO(sergey): Look into using copy_to_display() if mapping failed. Might be needed for
     * some implementations of PathTraceDisplay which cannot map memory? */
    return;
  }

  const KernelFilm &kfilm = device_scene_->data.film;

  const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
  if (pass_access_info.type == PASS_NONE) {
    return;
  }

  const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);

  PassAccessor::Destination destination = get_display_destination_template(display, pass_mode);
  destination.pixels_half_rgba = rgba_half;

  tbb::task_arena local_arena = local_tbb_arena_create(device_);
  local_arena.execute([&]() {
    pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
  });

  display->unmap_texture_buffer();
}

void PathTraceWorkCPU::destroy_gpu_resources(PathTraceDisplay * /*display*/) {}

bool PathTraceWorkCPU::copy_render_buffers_from_device()
{
  return buffers_->copy_from_device();
}

bool PathTraceWorkCPU::copy_render_buffers_to_device()
{
  buffers_->buffer.copy_to_device();
  return true;
}

bool PathTraceWorkCPU::zero_render_buffers()
{
  buffers_->zero();
  return true;
}

int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(const float threshold,
                                                                     bool reset)
{
  const int full_x = effective_buffer_params_.full_x;
  const int full_y = effective_buffer_params_.full_y;
  const int width = effective_buffer_params_.width;
  const int height = effective_buffer_params_.height;
  const int offset = effective_buffer_params_.offset;
  const int stride = effective_buffer_params_.stride;

  float *render_buffer = buffers_->buffer.data();

  uint num_active_pixels = 0;

  tbb::task_arena local_arena = local_tbb_arena_create(device_);

  /* Check convergence and do the x-filter in a single `parallel_for`, to reduce threading
   * overhead. */
  local_arena.execute([&]() {
    parallel_for(full_y, full_y + height, [&](int y) {
      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();

      bool row_converged = true;
      uint num_row_pixels_active = 0;
      for (int x = 0; x < width; ++x) {
        if (!kernels_.adaptive_sampling_convergence_check(
                kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride))
        {
          ++num_row_pixels_active;
          row_converged = false;
        }
      }

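      /* Accumulate per row and publish once: one atomic add per row instead of one
       * per active pixel. */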
      atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);

      if (!row_converged) {
        kernels_.adaptive_sampling_filter_x(
            kernel_globals, render_buffer, y, full_x, width, offset, stride);
      }
    });
  });

  if (num_active_pixels) {
    local_arena.execute([&]() {
      parallel_for(full_x, full_x + width, [&](int x) {
        ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
        kernels_.adaptive_sampling_filter_y(
            kernel_globals, render_buffer, x, full_y, height, offset, stride);
      });
    });
  }

  return num_active_pixels;
}

void PathTraceWorkCPU::cryptomatte_postproces()
{
  const int width = effective_buffer_params_.width;
  const int height = effective_buffer_params_.height;

  float *render_buffer = buffers_->buffer.data();

  tbb::task_arena local_arena = local_tbb_arena_create(device_);

  /* Postprocess cryptomatte passes one row per task, to reduce threading overhead. */
  local_arena.execute([&]() {
    parallel_for(0, height, [&](int y) {
      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
      int pixel_index = y * width;

      for (int x = 0; x < width; ++x, ++pixel_index) {
        kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
      }
    });
  });
}

void PathTraceWorkCPU::denoise_volume_guiding_buffers()
{
  const int min_x = effective_buffer_params_.full_x;
  const int min_y = effective_buffer_params_.full_y;
  const int max_x = effective_buffer_params_.width + min_x;
  const int max_y = effective_buffer_params_.height + min_y;
  const int offset = effective_buffer_params_.offset;
  const int stride = effective_buffer_params_.stride;

  float *render_buffer = buffers_->buffer.data();

  tbb::task_arena local_arena = local_tbb_arena_create(device_);

  const blocked_range2d<int> range(min_x, max_x, min_y, max_y);
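  /* The first index pair (rows) spans x and the second (cols) spans y, matching the
   * rows()/cols() loops below. */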

  /* Filter in x direction. */
  local_arena.execute([&]() {
    parallel_for(range, [&](const blocked_range2d<int> r) {
      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
      for (int y = r.cols().begin(); y < r.cols().end(); ++y) {
        for (int x = r.rows().begin(); x < r.rows().end(); ++x) {
          kernels_.volume_guiding_filter_x(
              kernel_globals, render_buffer, y, x, min_x, max_x, offset, stride);
        }
      }
    });
  });

  /* Filter in y direction. Unlike `filter_x`, the inner loop of `filter_y` runs serially
   * inside the kernel, to avoid the need for intermediate buffers. */
  local_arena.execute([&]() {
    parallel_for(min_x, max_x, [&](int x) {
      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
      kernels_.volume_guiding_filter_y(
          kernel_globals, render_buffer, x, min_y, max_y, offset, stride);
    });
  });
}

#if defined(WITH_PATH_GUIDING)
/* NOTE: It seems that this is called before every rendering iteration/progression and not
 * once per render. Maybe we can find a way to call it only once per render. */
void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field,
                                                   void *sample_data_storage,
                                                   const bool train)
{
  /* Link the global guiding structures (e.g., Field and SampleStorage) to the per-thread
   * kernel globals. */
  for (int thread_index = 0; thread_index < kernel_thread_globals_.size(); thread_index++) {
    ThreadKernelGlobalsCPU &kg = kernel_thread_globals_[thread_index];
    openpgl::cpp::Field *field = (openpgl::cpp::Field *)guiding_field;

    /* Allocate sampling distributions. */
    kg.opgl_guiding_field = field;

#  if PATH_GUIDING_LEVEL >= 4
    if (kg.opgl_surface_sampling_distribution) {
      kg.opgl_surface_sampling_distribution.reset();
    }
    if (kg.opgl_volume_sampling_distribution) {
      kg.opgl_volume_sampling_distribution.reset();
    }

    if (field) {
      kg.opgl_surface_sampling_distribution =
          make_unique<openpgl::cpp::SurfaceSamplingDistribution>(field);
      kg.opgl_volume_sampling_distribution = make_unique<openpgl::cpp::VolumeSamplingDistribution>(
          field);
    }
#  endif

    /* Reserve storage for training. */
    kg.data.integrator.train_guiding = train;
    kg.opgl_sample_data_storage = (openpgl::cpp::SampleStorage *)sample_data_storage;

    if (train) {
      kg.opgl_path_segment_storage->Reserve(kg.data.integrator.transparent_max_bounce +
                                            kg.data.integrator.max_bounce + 3);
      kg.opgl_path_segment_storage->Clear();
    }
  }
}

void PathTraceWorkCPU::guiding_push_sample_data_to_global_storage(ThreadKernelGlobalsCPU *kg,
                                                                  IntegratorStateCPU *state,
                                                                  ccl_global float *ccl_restrict
                                                                      render_buffer)
{
#  ifdef WITH_CYCLES_DEBUG
  if (LOG_IS_ON(LOG_LEVEL_DEBUG)) {
    /* Check if the generated path segments contain valid values. */
    const bool validSegments = kg->opgl_path_segment_storage->ValidateSegments();
    if (!validSegments) {
      LOG_DEBUG << "Guiding: invalid path segments!";
    }
  }

  /* Write a debug render pass to validate that it matches the combined pass. */
  pgl_vec3f pgl_final_color = kg->opgl_path_segment_storage->CalculatePixelEstimate(false);
  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
  float3 final_color = make_float3(pgl_final_color.x, pgl_final_color.y, pgl_final_color.z);
  if (kernel_data.film.pass_guiding_color != PASS_UNUSED) {
    film_write_pass_float3(buffer + kernel_data.film.pass_guiding_color, final_color);
  }
#  else
  (void)state;
  (void)render_buffer;
#  endif

  /* Convert the path segment representation of the random walk into radiance samples. */
#  if PATH_GUIDING_LEVEL >= 2
  const bool use_direct_light = kernel_data.integrator.use_guiding_direct_light;
  const bool use_mis_weights = kernel_data.integrator.use_guiding_mis_weights;
  kg->opgl_path_segment_storage->PrepareSamples(use_mis_weights, use_direct_light, false);
#  endif

#  ifdef WITH_CYCLES_DEBUG
  /* Check if the training/radiance samples generated by the path segment storage are valid. */
  if (LOG_IS_ON(LOG_LEVEL_DEBUG)) {
    const bool validSamples = kg->opgl_path_segment_storage->ValidateSamples();
    if (!validSamples) {
      LOG_DEBUG
          << "Guiding: path segment storage generated/contains invalid radiance/training samples!";
    }
  }
#  endif

#  if PATH_GUIDING_LEVEL >= 3
  /* Push radiance samples from the current random walk/path to the global sample storage. */
  size_t num_samples = 0;
  const openpgl::cpp::SampleData *samples = kg->opgl_path_segment_storage->GetSamples(num_samples);
  kg->opgl_sample_data_storage->AddSamples(samples, num_samples);
#  endif

  /* Clear storage for the current path, to be ready for the next path. */
  kg->opgl_path_segment_storage->Clear();
}
#endif

CCL_NAMESPACE_END