path_trace_work_cpu.cpp (Blender 4.3)
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#include "integrator/path_trace_work_cpu.h"

#include "device/cpu/kernel.h"
#include "device/device.h"

#include "kernel/film/write.h"
#include "kernel/integrator/path_state.h"

#include "integrator/pass_accessor_cpu.h"
#include "integrator/path_trace_display.h"

#include "scene/scene.h"
#include "session/buffers.h"

#include "util/atomic.h"
#include "util/log.h"
#include "util/tbb.h"

CCL_NAMESPACE_BEGIN

/* Create TBB arena for execution of path tracing and rendering tasks. */
static inline tbb::task_arena local_tbb_arena_create(const Device *device)
{
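  /* Note: constructing the arena with an explicit concurrency limit caps how many
   * TBB worker threads may join work executed through it, so the parallel loops
   * below use the CPU device's thread count rather than the global TBB default. */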
  /* TODO: limit this to number of threads of CPU device, it may be smaller than
   * the system number of threads when we reduce the number of CPU threads in
   * CPU + GPU rendering to dedicate some cores to handling the GPU device. */
  return tbb::task_arena(device->info.cpu_threads);
}

/* Get CPUKernelThreadGlobals for the current thread. */
static inline CPUKernelThreadGlobals *kernel_thread_globals_get(
    vector<CPUKernelThreadGlobals> &kernel_thread_globals)
{
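  /* Each worker thread in the arena owns one entry of `kernel_thread_globals`, so
   * kernels can mutate per-thread state without locking; the arena-local thread
   * index selects the entry belonging to the calling thread. */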
  const int thread_index = tbb::this_task_arena::current_thread_index();
  DCHECK_GE(thread_index, 0);
  DCHECK_LE(thread_index, kernel_thread_globals.size());

  return &kernel_thread_globals[thread_index];
}

PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
                                   Film *film,
                                   DeviceScene *device_scene,
                                   bool *cancel_requested_flag)
    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
      kernels_(Device::get_cpu_kernels())
{
  DCHECK_EQ(device->info.type, DEVICE_CPU);
}

void PathTraceWorkCPU::init_execution()
{
  /* Cache per-thread kernel globals. */
  device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
}

void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
                                      int start_sample,
                                      int samples_num,
                                      int sample_offset)
{
  const int64_t image_width = effective_buffer_params_.width;
  const int64_t image_height = effective_buffer_params_.height;
  const int64_t total_pixels_num = image_width * image_height;

  if (device_->profiler.active()) {
    for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
      kernel_globals.start_profiling();
    }
  }

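  /* One parallel work item per pixel: each item below renders all samples_num
   * samples for its pixel back-to-back through the full integrator pipeline. */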
  tbb::task_arena local_arena = local_tbb_arena_create(device_);
  local_arena.execute([&]() {
    parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) {
      if (is_cancel_requested()) {
        return;
      }

      const int y = work_index / image_width;
      const int x = work_index - y * image_width;

      KernelWorkTile work_tile;
      work_tile.x = effective_buffer_params_.full_x + x;
      work_tile.y = effective_buffer_params_.full_y + y;
      work_tile.w = 1;
      work_tile.h = 1;
      work_tile.start_sample = start_sample;
      work_tile.sample_offset = sample_offset;
      work_tile.num_samples = 1;
      work_tile.offset = effective_buffer_params_.offset;
      work_tile.stride = effective_buffer_params_.stride;

      CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);

      render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
    });
  });
  if (device_->profiler.active()) {
    for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
      kernel_globals.stop_profiling();
    }
  }

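  /* The CPU backend always reports full occupancy; there is no equivalent of the
   * GPU's partially filled work batches to account for. */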
  statistics.occupancy = 1.0f;
}

void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobalsCPU *kernel_globals,
                                                    const KernelWorkTile &work_tile,
                                                    const int samples_num)
{
  const bool has_bake = device_scene_->data.bake.use;

  IntegratorStateCPU integrator_states[2];

  IntegratorStateCPU *state = &integrator_states[0];
  IntegratorStateCPU *shadow_catcher_state = nullptr;

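  /* The second integrator state is only used for scenes with a shadow catcher:
   * when the main path hits one, the integrator splits the path into this state,
   * and a second megakernel invocation below finishes the split path. */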
  if (device_scene_->data.integrator.has_shadow_catcher) {
    shadow_catcher_state = &integrator_states[1];
    path_state_init_queues(shadow_catcher_state);
  }

  KernelWorkTile sample_work_tile = work_tile;
  float *render_buffer = buffers_->buffer.data();

  for (int sample = 0; sample < samples_num; ++sample) {
    if (is_cancel_requested()) {
      break;
    }

    if (has_bake) {
      if (!kernels_.integrator_init_from_bake(
              kernel_globals, state, &sample_work_tile, render_buffer))
      {
        break;
      }
    }
    else {
      if (!kernels_.integrator_init_from_camera(
              kernel_globals, state, &sample_work_tile, render_buffer))
      {
        break;
      }
    }

    kernels_.integrator_megakernel(kernel_globals, state, render_buffer);

#ifdef WITH_PATH_GUIDING
    if (kernel_globals->data.integrator.train_guiding) {
      /* Push the generated sample data to the global sample data storage. */
      guiding_push_sample_data_to_global_storage(kernel_globals, state, render_buffer);
    }
#endif

    if (shadow_catcher_state) {
      kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
    }

    ++sample_work_tile.start_sample;
  }
}

void PathTraceWorkCPU::copy_to_display(PathTraceDisplay *display,
                                       PassMode pass_mode,
                                       int num_samples)
{
  half4 *rgba_half = display->map_texture_buffer();
  if (!rgba_half) {
    /* TODO(sergey): Look into using copy_to_display() if mapping failed. Might be needed for
     * some implementations of PathTraceDisplay which cannot map memory? */
    return;
  }

  const KernelFilm &kfilm = device_scene_->data.film;

  const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
  if (pass_access_info.type == PASS_NONE) {
    return;
  }

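  /* The pass accessor converts raw render-buffer values of the selected pass into
   * display pixels, applying exposure and averaging over the sample count. */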
  const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);

  PassAccessor::Destination destination = get_display_destination_template(display);
  destination.pixels_half_rgba = rgba_half;

  tbb::task_arena local_arena = local_tbb_arena_create(device_);
  local_arena.execute([&]() {
    pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
  });

  display->unmap_texture_buffer();
}

void PathTraceWorkCPU::destroy_gpu_resources(PathTraceDisplay * /*display*/) {}

bool PathTraceWorkCPU::copy_render_buffers_from_device()
{
  return buffers_->copy_from_device();
}

bool PathTraceWorkCPU::copy_render_buffers_to_device()
{
  buffers_->buffer.copy_to_device();
  return true;
}

bool PathTraceWorkCPU::zero_render_buffers()
{
  buffers_->zero();
  return true;
}

int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
{
  const int full_x = effective_buffer_params_.full_x;
  const int full_y = effective_buffer_params_.full_y;
  const int width = effective_buffer_params_.width;
  const int height = effective_buffer_params_.height;
  const int offset = effective_buffer_params_.offset;
  const int stride = effective_buffer_params_.stride;

  float *render_buffer = buffers_->buffer.data();

  uint num_active_pixels = 0;

  tbb::task_arena local_arena = local_tbb_arena_create(device_);

  /* Check convergence and do x-filter in a single `parallel_for`, to reduce threading overhead. */
  local_arena.execute([&]() {
    parallel_for(full_y, full_y + height, [&](int y) {
      CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];

      bool row_converged = true;
      uint num_row_pixels_active = 0;
      for (int x = 0; x < width; ++x) {
        if (!kernels_.adaptive_sampling_convergence_check(
                kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride))
        {
          ++num_row_pixels_active;
          row_converged = false;
        }
      }

      atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);

      if (!row_converged) {
        kernels_.adaptive_sampling_filter_x(
            kernel_globals, render_buffer, y, full_x, width, offset, stride);
      }
    });
  });

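  /* The x-filter above runs per row inside the convergence pass; the matching
   * y-filter below runs per column, and only when at least one pixel is still
   * active, so fully converged buffers skip the extra pass entirely. */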
  if (num_active_pixels) {
    local_arena.execute([&]() {
      parallel_for(full_x, full_x + width, [&](int x) {
        CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
        kernels_.adaptive_sampling_filter_y(
            kernel_globals, render_buffer, x, full_y, height, offset, stride);
      });
    });
  }

  return num_active_pixels;
}

void PathTraceWorkCPU::cryptomatte_postproces()
{
  const int width = effective_buffer_params_.width;
  const int height = effective_buffer_params_.height;

  float *render_buffer = buffers_->buffer.data();

  tbb::task_arena local_arena = local_tbb_arena_create(device_);

  /* Postprocess cryptomatte passes for every pixel, parallelized over rows. */
  local_arena.execute([&]() {
    parallel_for(0, height, [&](int y) {
      CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
      int pixel_index = y * width;

      for (int x = 0; x < width; ++x, ++pixel_index) {
        kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
      }
    });
  });
}

#ifdef WITH_PATH_GUIDING
/* NOTE: It seems that this is called before every rendering iteration/progression and not once
 * per rendering. Maybe we can find a way to call it only once per rendering. */
void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field,
                                                   void *sample_data_storage,
                                                   const bool train)
{
  /* Linking the global guiding structures (e.g., Field and SampleStorage) to the per-thread
   * kernel globals. */
  for (int thread_index = 0; thread_index < kernel_thread_globals_.size(); thread_index++) {
    CPUKernelThreadGlobals &kg = kernel_thread_globals_[thread_index];
    openpgl::cpp::Field *field = (openpgl::cpp::Field *)guiding_field;

    /* Allocate sampling distributions. */
    kg.opgl_guiding_field = field;

#  if PATH_GUIDING_LEVEL >= 4
    if (kg.opgl_surface_sampling_distribution) {
      delete kg.opgl_surface_sampling_distribution;
      kg.opgl_surface_sampling_distribution = nullptr;
    }
    if (kg.opgl_volume_sampling_distribution) {
      delete kg.opgl_volume_sampling_distribution;
      kg.opgl_volume_sampling_distribution = nullptr;
    }

    if (field) {
      kg.opgl_surface_sampling_distribution = new openpgl::cpp::SurfaceSamplingDistribution(field);
      kg.opgl_volume_sampling_distribution = new openpgl::cpp::VolumeSamplingDistribution(field);
    }
#  endif

    /* Reserve storage for training. */
    kg.data.integrator.train_guiding = train;
    kg.opgl_sample_data_storage = (openpgl::cpp::SampleStorage *)sample_data_storage;

    if (train) {
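      /* Upper bound on segments a single path can produce: one per bounce, both
       * transparent and regular, plus a few extra for camera/background segments. */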
      kg.opgl_path_segment_storage->Reserve(kg.data.integrator.transparent_max_bounce +
                                            kg.data.integrator.max_bounce + 3);
      kg.opgl_path_segment_storage->Clear();
    }
  }
}

void PathTraceWorkCPU::guiding_push_sample_data_to_global_storage(
    KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *ccl_restrict render_buffer)
{
#  ifdef WITH_CYCLES_DEBUG
  if (VLOG_WORK_IS_ON) {
    /* Check if the generated path segments contain valid values. */
    const bool validSegments = kg->opgl_path_segment_storage->ValidateSegments();
    if (!validSegments) {
      VLOG_WORK << "Guiding: invalid path segments!";
    }
  }

  /* Write debug render pass to validate it matches combined pass. */
  pgl_vec3f pgl_final_color = kg->opgl_path_segment_storage->CalculatePixelEstimate(false);
  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
  float3 final_color = make_float3(pgl_final_color.x, pgl_final_color.y, pgl_final_color.z);
  if (kernel_data.film.pass_guiding_color != PASS_UNUSED) {
    film_write_pass_float3(buffer + kernel_data.film.pass_guiding_color, final_color);
  }
#  else
  (void)state;
  (void)render_buffer;
#  endif

  /* Convert the path segment representation of the random walk into radiance samples. */
#  if PATH_GUIDING_LEVEL >= 2
  const bool use_direct_light = kernel_data.integrator.use_guiding_direct_light;
  const bool use_mis_weights = kernel_data.integrator.use_guiding_mis_weights;
  kg->opgl_path_segment_storage->PrepareSamples(use_mis_weights, use_direct_light, false);
#  endif

#  ifdef WITH_CYCLES_DEBUG
  /* Check if the training/radiance samples generated by the path segment storage are valid. */
  if (VLOG_WORK_IS_ON) {
    const bool validSamples = kg->opgl_path_segment_storage->ValidateSamples();
    if (!validSamples) {
      VLOG_WORK
          << "Guiding: path segment storage generated/contains invalid radiance/training samples!";
    }
  }
#  endif

#  if PATH_GUIDING_LEVEL >= 3
  /* Push radiance samples from current random walk/path to the global sample storage. */
  size_t num_samples = 0;
  const openpgl::cpp::SampleData *samples = kg->opgl_path_segment_storage->GetSamples(num_samples);
  kg->opgl_sample_data_storage->AddSamples(samples, num_samples);
#  endif

  /* Clear storage for the current path, to be ready for the next path. */
  kg->opgl_path_segment_storage->Clear();
}
#endif

CCL_NAMESPACE_END