Blender V4.5
path_trace_work_cpu.cpp
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#include "integrator/path_trace_work_cpu.h"

#include "device/cpu/kernel.h"
#include "device/device.h"

#ifdef WITH_CYCLES_DEBUG
#  include "kernel/film/write.h"
#endif

#include "kernel/integrator/path_state.h"

#include "integrator/pass_accessor_cpu.h"
#include "integrator/path_trace_display.h"

#include "scene/scene.h"
#include "session/buffers.h"

#include "util/tbb.h"

CCL_NAMESPACE_BEGIN
/* Create TBB arena for execution of path tracing and rendering tasks. */
static inline tbb::task_arena local_tbb_arena_create(const Device *device)
{
  /* TODO: limit this to number of threads of CPU device, it may be smaller than
   * the system number of threads when we reduce the number of CPU threads in
   * CPU + GPU rendering to dedicate some cores to handling the GPU device. */
  return tbb::task_arena(device->info.cpu_threads);
}
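
/* NOTE: throughout this file the arena is used as
 *
 *   tbb::task_arena local_arena = local_tbb_arena_create(device_);
 *   local_arena.execute([&]() { parallel_for(...); });
 *
 * which caps the concurrency of the enclosed `parallel_for` at
 * `device->info.cpu_threads` instead of the global TBB default. */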

/* Get ThreadKernelGlobalsCPU for the current thread. */
static ThreadKernelGlobalsCPU *kernel_thread_globals_get(
    vector<ThreadKernelGlobalsCPU> &kernel_thread_globals)
{
  const int thread_index = tbb::this_task_arena::current_thread_index();
  DCHECK_GE(thread_index, 0);
  DCHECK_LT(thread_index, kernel_thread_globals.size());

  return &kernel_thread_globals[thread_index];
}
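
/* NOTE: `current_thread_index()` is only meaningful while executing inside the
 * arena, and is used here to index the per-thread globals cached by
 * `PathTraceWorkCPU::init_execution()` below. */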

PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
                                   Film *film,
                                   DeviceScene *device_scene,
                                   const bool *cancel_requested_flag)
    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
      kernels_(Device::get_cpu_kernels())
{
  DCHECK_EQ(device->info.type, DEVICE_CPU);
}

void PathTraceWorkCPU::init_execution()
{
  /* Cache per-thread kernel globals. */
  device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
}
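
/* NOTE: the cached globals are looked up per task via `kernel_thread_globals_get()`
 * above, so mutable kernel state (profiling counters, guiding structures) stays
 * private to each worker thread. */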

void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
                                      const int start_sample,
                                      const int samples_num,
                                      const int sample_offset)
{
  const int64_t image_width = effective_buffer_params_.width;
  const int64_t image_height = effective_buffer_params_.height;
  const int64_t total_pixels_num = image_width * image_height;

  if (device_->profiler.active()) {
    for (ThreadKernelGlobalsCPU &kernel_globals : kernel_thread_globals_) {
      kernel_globals.start_profiling();
    }
  }

  tbb::task_arena local_arena = local_tbb_arena_create(device_);
  local_arena.execute([&]() {
    parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) {
      if (is_cancel_requested()) {
        return;
      }

      const int y = work_index / image_width;
      const int x = work_index - y * image_width;

      KernelWorkTile work_tile;
      work_tile.x = effective_buffer_params_.full_x + x;
      work_tile.y = effective_buffer_params_.full_y + y;
      work_tile.w = 1;
      work_tile.h = 1;
      work_tile.start_sample = start_sample;
      work_tile.sample_offset = sample_offset;
      work_tile.num_samples = 1;
      work_tile.offset = effective_buffer_params_.offset;
      work_tile.stride = effective_buffer_params_.stride;

      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);

      render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
    });
  });
  if (device_->profiler.active()) {
    for (ThreadKernelGlobalsCPU &kernel_globals : kernel_thread_globals_) {
      kernel_globals.stop_profiling();
    }
  }

  statistics.occupancy = 1.0f;
}
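
/* NOTE: work is distributed at pixel granularity (one 1x1 work tile per
 * `parallel_for` item), and each item renders all `samples_num` samples for its
 * pixel sequentially. Occupancy is reported as 1.0 since the CPU threads stay
 * fully busy under this scheme. */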

void PathTraceWorkCPU::render_samples_full_pipeline(ThreadKernelGlobalsCPU *kernel_globals,
                                                    const KernelWorkTile &work_tile,
                                                    const int samples_num)
{
  const bool has_bake = device_scene_->data.bake.use;

  IntegratorStateCPU integrator_states[2];

  IntegratorStateCPU *state = &integrator_states[0];
  IntegratorStateCPU *shadow_catcher_state = nullptr;

  if (device_scene_->data.integrator.has_shadow_catcher) {
    shadow_catcher_state = &integrator_states[1];
    path_state_init_queues(shadow_catcher_state);
  }
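
  /* NOTE: two integrator states are allocated up front: `state` drives the main
   * path, while `shadow_catcher_state` is only used when the scene contains a
   * shadow catcher, in which case the megakernel runs a second time on the
   * split-off catcher path (see below). */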

  KernelWorkTile sample_work_tile = work_tile;
  float *render_buffer = buffers_->buffer.data();

  for (int sample = 0; sample < samples_num; ++sample) {
    if (is_cancel_requested()) {
      break;
    }

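    /* Initialize the path for this sample, either from a bake coordinate or
     * from the camera. A failed initialization (presumably no more work for
     * this pixel) ends the sample loop. */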
    if (has_bake) {
      if (!kernels_.integrator_init_from_bake(
              kernel_globals, state, &sample_work_tile, render_buffer))
      {
        break;
      }
    }
    else {
      if (!kernels_.integrator_init_from_camera(
              kernel_globals, state, &sample_work_tile, render_buffer))
      {
        break;
      }
    }

#ifdef WITH_PATH_GUIDING
    if (kernel_globals->data.integrator.train_guiding) {
      assert(kernel_globals->opgl_path_segment_storage);
      assert(kernel_globals->opgl_path_segment_storage->GetNumSegments() == 0);

      kernels_.integrator_megakernel(kernel_globals, state, render_buffer);

      /* Push the generated sample data to the global sample data storage. */
      guiding_push_sample_data_to_global_storage(kernel_globals, state, render_buffer);

      /* No training for shadow catcher paths. */
      if (shadow_catcher_state) {
        kernel_globals->data.integrator.train_guiding = false;
        kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
        kernel_globals->data.integrator.train_guiding = true;
      }
    }
    else
#endif
    {
      kernels_.integrator_megakernel(kernel_globals, state, render_buffer);
      if (shadow_catcher_state) {
        kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
      }
    }
    ++sample_work_tile.start_sample;
  }
}

void PathTraceWorkCPU::copy_to_display(PathTraceDisplay *display,
                                       PassMode pass_mode,
                                       const int num_samples)
{
  half4 *rgba_half = display->map_texture_buffer();
  if (!rgba_half) {
    /* TODO(sergey): Look into using copy_to_display() if mapping failed. Might be needed for
     * some implementations of PathTraceDisplay which cannot map memory? */
    return;
  }

  const KernelFilm &kfilm = device_scene_->data.film;

  const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
  if (pass_access_info.type == PASS_NONE) {
    return;
  }

  const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);

  PassAccessor::Destination destination = get_display_destination_template(display);
  destination.pixels_half_rgba = rgba_half;

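  /* Convert the selected pass into half-float RGBA directly in the mapped
   * display memory, reusing the same thread arena as rendering. */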
  tbb::task_arena local_arena = local_tbb_arena_create(device_);
  local_arena.execute([&]() {
    pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
  });

  display->unmap_texture_buffer();
}

void PathTraceWorkCPU::destroy_gpu_resources(PathTraceDisplay * /*display*/) {}

bool PathTraceWorkCPU::copy_render_buffers_from_device()
{
  return buffers_->copy_from_device();
}

bool PathTraceWorkCPU::copy_render_buffers_to_device()
{
  buffers_->buffer.copy_to_device();
  return true;
}

bool PathTraceWorkCPU::zero_render_buffers()
{
  buffers_->zero();
  return true;
}

int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(const float threshold,
                                                                     bool reset)
{
  const int full_x = effective_buffer_params_.full_x;
  const int full_y = effective_buffer_params_.full_y;
  const int width = effective_buffer_params_.width;
  const int height = effective_buffer_params_.height;
  const int offset = effective_buffer_params_.offset;
  const int stride = effective_buffer_params_.stride;

  float *render_buffer = buffers_->buffer.data();

  uint num_active_pixels = 0;

  tbb::task_arena local_arena = local_tbb_arena_create(device_);

  /* Check convergence and do the x-filter in a single `parallel_for`, to reduce threading
   * overhead. */
  local_arena.execute([&]() {
    parallel_for(full_y, full_y + height, [&](int y) {
      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();

      bool row_converged = true;
      uint num_row_pixels_active = 0;
      for (int x = 0; x < width; ++x) {
        if (!kernels_.adaptive_sampling_convergence_check(
                kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride))
        {
          ++num_row_pixels_active;
          row_converged = false;
        }
      }

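      /* One atomic add per row, rather than per pixel, keeps contention low. */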
      atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);

      if (!row_converged) {
        kernels_.adaptive_sampling_filter_x(
            kernel_globals, render_buffer, y, full_x, width, offset, stride);
      }
    });
  });

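  /* The separable filter's y-pass only needs to run when at least one pixel
   * failed the convergence test; a fully converged buffer skips it entirely. */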
  if (num_active_pixels) {
    local_arena.execute([&]() {
      parallel_for(full_x, full_x + width, [&](int x) {
        ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
        kernels_.adaptive_sampling_filter_y(
            kernel_globals, render_buffer, x, full_y, height, offset, stride);
      });
    });
  }

  return num_active_pixels;
}

void PathTraceWorkCPU::cryptomatte_postproces()
{
  const int width = effective_buffer_params_.width;
  const int height = effective_buffer_params_.height;

  float *render_buffer = buffers_->buffer.data();

  tbb::task_arena local_arena = local_tbb_arena_create(device_);

  /* Post-process the cryptomatte samples of every pixel, handling one row per
   * `parallel_for` item to reduce threading overhead. */
  local_arena.execute([&]() {
    parallel_for(0, height, [&](int y) {
      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
      int pixel_index = y * width;

      for (int x = 0; x < width; ++x, ++pixel_index) {
        kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
      }
    });
  });
}

#ifdef WITH_PATH_GUIDING
/* NOTE: It seems that this is called before every rendering iteration/progression and not once
 * per rendering. Maybe we can find a way to call it only once per rendering. */
void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field,
                                                   void *sample_data_storage,
                                                   const bool train)
{
  /* Link the global guiding structures (e.g. Field and SampleStorage) to the per-thread
   * kernel globals. */
  for (int thread_index = 0; thread_index < kernel_thread_globals_.size(); thread_index++) {
    ThreadKernelGlobalsCPU &kg = kernel_thread_globals_[thread_index];
    openpgl::cpp::Field *field = (openpgl::cpp::Field *)guiding_field;

    /* Allocate sampling distributions. */
    kg.opgl_guiding_field = field;

#  if PATH_GUIDING_LEVEL >= 4
    if (kg.opgl_surface_sampling_distribution) {
      kg.opgl_surface_sampling_distribution.reset();
    }
    if (kg.opgl_volume_sampling_distribution) {
      kg.opgl_volume_sampling_distribution.reset();
    }

    if (field) {
      kg.opgl_surface_sampling_distribution =
          make_unique<openpgl::cpp::SurfaceSamplingDistribution>(field);
      kg.opgl_volume_sampling_distribution = make_unique<openpgl::cpp::VolumeSamplingDistribution>(
          field);
    }
#  endif

    /* Reserve storage for training. */
    kg.data.integrator.train_guiding = train;
    kg.opgl_sample_data_storage = (openpgl::cpp::SampleStorage *)sample_data_storage;

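    /* NOTE: the reserve size below appears to budget for the worst case of one
     * stored segment per bounce plus one per transparent bounce, with a few
     * extra entries of headroom. */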
    if (train) {
      kg.opgl_path_segment_storage->Reserve(kg.data.integrator.transparent_max_bounce +
                                            kg.data.integrator.max_bounce + 3);
      kg.opgl_path_segment_storage->Clear();
    }
  }
}

void PathTraceWorkCPU::guiding_push_sample_data_to_global_storage(ThreadKernelGlobalsCPU *kg,
                                                                  IntegratorStateCPU *state,
                                                                  ccl_global float *ccl_restrict
                                                                      render_buffer)
{
#  ifdef WITH_CYCLES_DEBUG
  if (VLOG_WORK_IS_ON) {
    /* Check if the generated path segments contain valid values. */
    const bool validSegments = kg->opgl_path_segment_storage->ValidateSegments();
    if (!validSegments) {
      VLOG_WORK << "Guiding: invalid path segments!";
    }
  }

  /* Write debug render pass to validate that it matches the combined pass. */
  pgl_vec3f pgl_final_color = kg->opgl_path_segment_storage->CalculatePixelEstimate(false);
  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
  float3 final_color = make_float3(pgl_final_color.x, pgl_final_color.y, pgl_final_color.z);
  if (kernel_data.film.pass_guiding_color != PASS_UNUSED) {
    film_write_pass_float3(buffer + kernel_data.film.pass_guiding_color, final_color);
  }
#  else
  (void)state;
  (void)render_buffer;
#  endif

  /* Convert the path segment representation of the random walk into radiance samples. */
#  if PATH_GUIDING_LEVEL >= 2
  const bool use_direct_light = kernel_data.integrator.use_guiding_direct_light;
  const bool use_mis_weights = kernel_data.integrator.use_guiding_mis_weights;
  kg->opgl_path_segment_storage->PrepareSamples(use_mis_weights, use_direct_light, false);
#  endif

#  ifdef WITH_CYCLES_DEBUG
  /* Check if the training/radiance samples generated by the path segment storage are valid. */
  if (VLOG_WORK_IS_ON) {
    const bool validSamples = kg->opgl_path_segment_storage->ValidateSamples();
    if (!validSamples) {
      VLOG_WORK
          << "Guiding: path segment storage generated/contains invalid radiance/training samples!";
    }
  }
#  endif

#  if PATH_GUIDING_LEVEL >= 3
  /* Push radiance samples from the current random walk/path to the global sample storage. */
  size_t num_samples = 0;
  const openpgl::cpp::SampleData *samples = kg->opgl_path_segment_storage->GetSamples(num_samples);
  kg->opgl_sample_data_storage->AddSamples(samples, num_samples);
#  endif

  /* Clear storage for the current path, to be ready for the next path. */
  kg->opgl_path_segment_storage->Clear();
}
#endif

CCL_NAMESPACE_END