Blender V5.0
path_trace_work_gpu.cpp
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
6
7#include "device/device.h"
8
11
12#include "scene/scene.h"
13#include "session/buffers.h"
14
15#include "util/log.h"
16#include "util/string.h"
17
19#include "kernel/types.h"
20
22
23static size_t estimate_single_state_size(const uint kernel_features)
24{
25 size_t state_size = 0;
26
27#define KERNEL_STRUCT_BEGIN(name) \
28 for (int array_index = 0;; array_index++) {
29
30#ifdef __INTEGRATOR_GPU_PACKED_STATE__
31# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
32 state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
33# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature)
34# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
35 KERNEL_STRUCT_BEGIN(parent_struct) \
36 KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
37#else
38# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
39 state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
40# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
41# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
42#endif
43
44#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
45 state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
46#define KERNEL_STRUCT_END(name) \
47 (void)array_index; \
48 break; \
49 }
50#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
51 if (array_index >= gpu_array_size - 1) { \
52 break; \
53 } \
54 }
55/* TODO(sergey): Look into better estimation for fields which depend on scene features. Maybe
56 * maximum state calculation should happen as `alloc_work_memory()`, so that we can react to an
57 * updated scene state here.
58 * Until then use a common value. Currently this size is only used for logging, but it is fragile to
59 * rely on this. */
60#define KERNEL_STRUCT_VOLUME_STACK_SIZE 4
61
63
65
66#undef KERNEL_STRUCT_BEGIN
67#undef KERNEL_STRUCT_BEGIN_PACKED
68#undef KERNEL_STRUCT_MEMBER
69#undef KERNEL_STRUCT_MEMBER_PACKED
70#undef KERNEL_STRUCT_ARRAY_MEMBER
71#undef KERNEL_STRUCT_END
72#undef KERNEL_STRUCT_END_ARRAY
73#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
74
75 return state_size;
76}
77
79 Film *film,
80 DeviceScene *device_scene,
81 const bool *cancel_requested_flag)
82 : PathTraceWork(device, film, device_scene, cancel_requested_flag),
83 queue_(device->gpu_queue_create()),
85 integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
86 integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
88 device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
90 device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
92 device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
94 device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
95 integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
97 device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
98 queued_paths_(device, "queued_paths", MEM_READ_WRITE),
99 num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
100 work_tiles_(device, "work_tiles", MEM_READ_WRITE),
101 display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
105{
106 memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
107}
108
110{
111 /* IntegrateState allocated as structure of arrays. */
112
113 /* Check if we already allocated memory for the required features. */
114 const int requested_volume_stack_size = device_scene_->data.volume_stack_size;
115 const uint kernel_features = device_scene_->data.kernel_features;
116 if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features &&
117 integrator_state_soa_volume_stack_size_ >= requested_volume_stack_size)
118 {
119 return;
120 }
123 requested_volume_stack_size);
124
125 /* Determine the number of path states. Deferring this for as long as possible allows the
126 * back-end to make better decisions about memory availability. */
127 if (max_num_paths_ == 0) {
128 const size_t single_state_size = estimate_single_state_size(kernel_features);
129
130 max_num_paths_ = queue_->num_concurrent_states(single_state_size);
131 min_num_active_main_paths_ = queue_->num_concurrent_busy_states(single_state_size);
132
133 /* Limit number of active paths to the half of the overall state. This is due to the logic in
134 * the path compaction which relies on the fact that regeneration does not happen sooner than
135 * half of the states are available again. */
137 }
138
139 /* Allocate a device only memory buffer before for each struct member, and then
140 * write the pointers into a struct that resides in constant memory.
141 *
142 * TODO: store float3 in separate XYZ arrays. */
143#define KERNEL_STRUCT_BEGIN(name) \
144 for (int array_index = 0;; array_index++) {
145#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
146 if ((kernel_features & (feature)) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
147 string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
148 shadow ? "shadow_" : ""); \
149 auto array = make_unique<device_only_memory<type>>(device_, name_str.c_str()); \
150 array->alloc_to_device(max_num_paths_); \
151 memcpy(&integrator_state_gpu_.parent_struct.name, \
152 &array->device_pointer, \
153 sizeof(array->device_pointer)); \
154 integrator_state_soa_.emplace_back(std::move(array)); \
155 }
156#ifdef __INTEGRATOR_GPU_PACKED_STATE__
157# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) \
158 if ((kernel_features & (feature))) { \
159 string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
160 shadow ? "shadow_" : ""); \
161 LOG_TRACE << "Skipping " << name_str \
162 << " -- data is packed inside integrator_state_" #parent_struct "_packed"; \
163 }
164# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
165 KERNEL_STRUCT_BEGIN(parent_struct) \
166 KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
167#else
168# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
169# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
170#endif
171
172#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
173 if ((kernel_features & (feature)) && \
174 (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) \
175 { \
176 string name_str = string_printf( \
177 "%sintegrator_state_" #name "_%d", shadow ? "shadow_" : "", array_index); \
178 auto array = make_unique<device_only_memory<type>>(device_, name_str.c_str()); \
179 array->alloc_to_device(max_num_paths_); \
180 memcpy(&integrator_state_gpu_.parent_struct[array_index].name, \
181 &array->device_pointer, \
182 sizeof(array->device_pointer)); \
183 integrator_state_soa_.emplace_back(std::move(array)); \
184 }
185#define KERNEL_STRUCT_END(name) \
186 (void)array_index; \
187 break; \
188 }
189#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
190 if (array_index >= gpu_array_size - 1) { \
191 break; \
192 } \
193 }
194#define KERNEL_STRUCT_VOLUME_STACK_SIZE (integrator_state_soa_volume_stack_size_)
195
196 bool shadow = false;
198 shadow = true;
200
201#undef KERNEL_STRUCT_BEGIN
202#undef KERNEL_STRUCT_BEGIN_PACKED
203#undef KERNEL_STRUCT_MEMBER
204#undef KERNEL_STRUCT_MEMBER_PACKED
205#undef KERNEL_STRUCT_ARRAY_MEMBER
206#undef KERNEL_STRUCT_END
207#undef KERNEL_STRUCT_END_ARRAY
208#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
209
211 size_t total_soa_size = 0;
212 for (auto &&soa_memory : integrator_state_soa_) {
213 total_soa_size += soa_memory->memory_size();
214 }
215
216 LOG_TRACE << "GPU SoA state size: " << string_human_readable_size(total_soa_size);
217 }
218}
219
221{
222 if (integrator_queue_counter_.size() == 0) {
224 integrator_queue_counter_.zero_to_device();
225 integrator_queue_counter_.copy_from_device();
227 integrator_queue_counter_.device_pointer;
228 }
229
230 /* Allocate data for active path index arrays. */
231 if (num_queued_paths_.size() == 0) {
232 num_queued_paths_.alloc(1);
233 num_queued_paths_.zero_to_device();
234 }
235
236 if (queued_paths_.size() == 0) {
238 /* TODO: this could be skipped if we had a function to just allocate on device. */
239 queued_paths_.zero_to_device();
240 }
241}
242
244{
245 num_sort_partitions_ = queue_->num_sort_partitions(max_num_paths_,
246 device_scene_->data.max_shaders);
247
248 integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
250
251 if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
252 /* Allocate array for partitioned shader sorting using local atomics. */
253 const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
254 if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
257 }
258 integrator_state_gpu_.sort_partition_key_offsets =
260 }
261 else {
262 /* Allocate arrays for shader sorting. */
263 const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
264 if (integrator_shader_sort_counter_.size() < sort_buckets) {
265 integrator_shader_sort_counter_.alloc(sort_buckets);
266 integrator_shader_sort_counter_.zero_to_device();
268 (int *)integrator_shader_sort_counter_.device_pointer;
269
270 integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
271 integrator_shader_sort_prefix_sum_.zero_to_device();
272 }
273
274 if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
275 if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
279 (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
280 }
281 }
282
283 if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
284 if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
285 integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
288 (int *)integrator_shader_mnee_sort_counter_.device_pointer;
289 }
290 }
291 }
292}
293
295{
296 if (integrator_next_shadow_path_index_.size() == 0) {
298 integrator_next_shadow_path_index_.zero_to_device();
299
300 integrator_state_gpu_.next_shadow_path_index =
301 (int *)integrator_next_shadow_path_index_.device_pointer;
302 }
303
304 if (integrator_next_main_path_index_.size() == 0) {
307 integrator_next_main_path_index_.zero_to_device();
308
309 integrator_state_gpu_.next_main_path_index =
310 (int *)integrator_next_main_path_index_.device_pointer;
311 }
312}
313
321
323{
324 queue_->init_execution();
325
326 /* Copy to device side struct in constant memory. */
327 device_->const_copy_to(
328 "integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
329}
330
332 const int start_sample,
333 const int samples_num,
334 const int sample_offset)
335{
336 /* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to
337 * add more work (because tiles are smaller, so there is higher chance that more paths will
338 * become busy after adding new tiles). This is especially important for the shadow catcher which
339 * schedules work in halves of available number of paths. */
340 work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
341 work_tile_scheduler_.set_accelerated_rt(
342 (device_->get_bvh_layout_mask(device_scene_->data.kernel_features) & BVH_LAYOUT_OPTIX) != 0);
344 start_sample,
345 samples_num,
346 sample_offset,
347 device_scene_->data.integrator.scrambling_distance);
348
350
351 int num_iterations = 0;
352 uint64_t num_busy_accum = 0;
353
354 /* TODO: set a hard limit in case of undetected kernel failures? */
355 while (true) {
356 /* Enqueue work from the scheduler, on start or when there are not enough
357 * paths to keep the device occupied. */
358 bool finished;
359 if (enqueue_work_tiles(finished)) {
360 /* Copy stats from the device. */
361 queue_->copy_from_device(integrator_queue_counter_);
362
363 if (!queue_->synchronize()) {
364 break; /* Stop on error. */
365 }
366 }
367
368 if (is_cancel_requested()) {
369 break;
370 }
371
372 /* Stop if no more work remaining. */
373 if (finished) {
374 break;
375 }
376
377 /* Enqueue one of the path iteration kernels. */
379 /* Copy stats from the device. */
380 queue_->copy_from_device(integrator_queue_counter_);
381
382 if (!queue_->synchronize()) {
383 break; /* Stop on error. */
384 }
385 }
386
387 if (is_cancel_requested()) {
388 break;
389 }
390
391 num_busy_accum += num_active_main_paths_paths();
392 ++num_iterations;
393 }
394
395 if (num_iterations) {
396 statistics.occupancy = float(num_busy_accum) / num_iterations / max_num_paths_;
397 }
398 else {
399 statistics.occupancy = 0.0f;
400 }
401}
402
404{
405 const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
406
407 int max_num_queued = 0;
409
410 for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
411 if (queue_counter->num_queued[i] > max_num_queued) {
412 kernel = (DeviceKernel)i;
413 max_num_queued = queue_counter->num_queued[i];
414 }
415 }
416
417 return kernel;
418}
419
421{
423
425 queue_->zero_to_device(integrator_queue_counter_);
426 if (integrator_shader_sort_counter_.size() != 0) {
428 }
429 if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE &&
431 {
433 }
434 if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE &&
436 {
438 }
439
440 /* Tile enqueue needs to know the number of active paths, which is based on this counter. Zero the
441 * counter on the host side because `zero_to_device()` is not doing it. */
442 if (integrator_queue_counter_.host_pointer) {
443 memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
444 }
445}
446
448{
449 /* Find kernel to execute, with max number of queued paths. */
450 const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
451
452 int num_active_paths = 0;
453 for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
454 num_active_paths += queue_counter->num_queued[i];
455 }
456
457 if (num_active_paths == 0) {
458 return false;
459 }
460
461 /* Find kernel to execute, with max number of queued paths. */
462 const DeviceKernel kernel = get_most_queued_kernel();
463 if (kernel == DEVICE_KERNEL_NUM) {
464 return false;
465 }
466
467 /* For kernels that add shadow paths, check if there is enough space available.
468 * If not, schedule shadow kernels first to clear out the shadow paths. */
469 int num_paths_limit = INT_MAX;
470
471 if (kernel_creates_shadow_paths(kernel)) {
473
474 const int available_shadow_paths = max_num_paths_ -
476 if (available_shadow_paths < queue_counter->num_queued[kernel]) {
479 return true;
480 }
483 return true;
484 }
485 }
486 else if (kernel_creates_ao_paths(kernel)) {
487 /* AO kernel creates two shadow paths, so limit number of states to schedule. */
488 num_paths_limit = available_shadow_paths / 2;
489 }
490 }
491
492 /* Schedule kernel with maximum number of queued items. */
493 enqueue_path_iteration(kernel, num_paths_limit);
494
495 /* Update next shadow path index for kernels that can add shadow paths. */
496 if (kernel_creates_shadow_paths(kernel)) {
498 }
499
500 return true;
501}
502
503void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit)
504{
505 device_ptr d_path_index = 0;
506
507 /* Create array of path indices for which this kernel is queued to be executed. */
509
511 const int num_queued = queue_counter->num_queued[kernel];
512
513 if (kernel_uses_sorting(kernel)) {
514 /* Compute array of active paths, sorted by shader. */
515 work_size = num_queued;
516 d_path_index = queued_paths_.device_pointer;
517
518 compute_sorted_queued_paths(kernel, num_paths_limit);
519 }
520 else if (num_queued < work_size) {
521 work_size = num_queued;
522 d_path_index = queued_paths_.device_pointer;
523
524 if (kernel_is_shadow_path(kernel)) {
525 /* Compute array of active shadow paths for specific kernel. */
527 }
528 else {
529 /* Compute array of active paths for specific kernel. */
531 }
532 }
533
534 work_size = min(work_size, num_paths_limit);
535
537
538 switch (kernel) {
540 /* Closest ray intersection kernels with integrator state and render buffer. */
541 const DeviceKernelArguments args(
542 &d_path_index, &buffers_->buffer.device_pointer, &work_size);
543
544 queue_->enqueue(kernel, work_size, args);
545 break;
546 }
547
552 /* Ray intersection kernels with integrator state. */
553 const DeviceKernelArguments args(&d_path_index, &work_size);
554
555 queue_->enqueue(kernel, work_size, args);
556 break;
557 }
567 /* Shading kernels with integrator state and render buffer. */
568 const DeviceKernelArguments args(
569 &d_path_index, &buffers_->buffer.device_pointer, &work_size);
570
571 queue_->enqueue(kernel, work_size, args);
572 break;
573 }
574 default:
575 LOG_FATAL << "Unhandled kernel " << device_kernel_as_string(kernel)
576 << " used for path iteration, should never happen.";
577 break;
578 }
579}
580
582 const int num_paths_limit)
583{
584 int d_queued_kernel = queued_kernel;
585
586 /* Launch kernel to fill the active paths arrays. */
587 if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
588 const int work_size = kernel_max_active_main_path_index(queued_kernel);
589 device_ptr d_queued_paths = queued_paths_.device_pointer;
590
591 int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;
592
593 const DeviceKernelArguments args(
594 &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
595
598 args);
601 args);
602 return;
603 }
604
605 device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
606 device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
607 assert(d_counter != 0 && d_prefix_sum != 0);
608
609 /* Compute prefix sum of number of active paths with each shader. */
610 {
611 const int work_size = 1;
612 int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
613
614 const DeviceKernelArguments args(&d_counter, &d_prefix_sum, &sort_buckets);
615
617 }
618
619 queue_->zero_to_device(num_queued_paths_);
620
621 /* Launch kernel to fill the active paths arrays. */
622 {
623 /* TODO: this could be smaller for terminated paths based on amount of work we want
624 * to schedule, and also based on num_paths_limit.
625 *
626 * Also, when the number paths is limited it may be better to prefer paths from the
627 * end of the array since compaction would need to do less work. */
628 const int work_size = kernel_max_active_main_path_index(queued_kernel);
629
630 device_ptr d_queued_paths = queued_paths_.device_pointer;
631 device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
632
634 &num_paths_limit,
635 &d_queued_paths,
636 &d_num_queued_paths,
637 &d_counter,
638 &d_prefix_sum,
639 &d_queued_kernel);
640
642 }
643}
644
646{
647 int d_queued_kernel = queued_kernel;
648
649 /* Launch kernel to fill the active paths arrays. */
650 const int work_size = kernel_max_active_main_path_index(queued_kernel);
651 device_ptr d_queued_paths = queued_paths_.device_pointer;
652 device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
653
654 const DeviceKernelArguments args(
655 &work_size, &d_queued_paths, &d_num_queued_paths, &d_queued_kernel);
656
657 queue_->zero_to_device(num_queued_paths_);
658 queue_->enqueue(kernel, work_size, args);
659}
660
661void PathTraceWorkGPU::compact_main_paths(const int num_active_paths)
662{
663 /* Early out if there is nothing that needs to be compacted. */
664 if (num_active_paths == 0) {
666 return;
667 }
668
669 const int min_compact_paths = 32;
670 if (max_active_main_path_index_ == num_active_paths ||
671 max_active_main_path_index_ < min_compact_paths)
672 {
673 return;
674 }
675
676 /* Compact. */
677 compact_paths(num_active_paths,
682
683 /* Adjust max active path index now we know which part of the array is actually used. */
684 max_active_main_path_index_ = num_active_paths;
685}
686
688{
690 const int num_active_paths =
693
694 /* Early out if there is nothing that needs to be compacted. */
695 if (num_active_paths == 0) {
696 if (integrator_next_shadow_path_index_.data()[0] != 0) {
699 }
700 return;
701 }
702
703 /* Compact if we can reduce the space used by half. Not always since
704 * compaction has a cost. */
705 const float max_overhead_factor = 2.0f;
706 const int min_compact_paths = 32;
707 const int num_total_paths = integrator_next_shadow_path_index_.data()[0];
708 if (num_total_paths < num_active_paths * max_overhead_factor ||
709 num_total_paths < min_compact_paths)
710 {
711 return;
712 }
713
714 /* Compact. */
715 compact_paths(num_active_paths,
716 num_total_paths,
720
721 /* Adjust max active path index now we know which part of the array is actually used. */
722 integrator_next_shadow_path_index_.data()[0] = num_active_paths;
724}
725
726void PathTraceWorkGPU::compact_paths(const int num_active_paths,
727 const int max_active_path_index,
728 DeviceKernel terminated_paths_kernel,
729 DeviceKernel compact_paths_kernel,
730 DeviceKernel compact_kernel)
731{
732 /* Compact fragmented path states into the start of the array, moving any paths
733 * with index higher than the number of active paths into the gaps. */
734 device_ptr d_compact_paths = queued_paths_.device_pointer;
735 device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
736
737 /* Create array with terminated paths that we can write to. */
738 {
739 /* TODO: can the work size be reduced here? */
740 int offset = num_active_paths;
741 const int work_size = num_active_paths;
742
743 const DeviceKernelArguments args(&work_size, &d_compact_paths, &d_num_queued_paths, &offset);
744
745 queue_->zero_to_device(num_queued_paths_);
746 queue_->enqueue(terminated_paths_kernel, work_size, args);
747 }
748
749 /* Create array of paths that we need to compact, where the path index is bigger
750 * than the number of active paths. */
751 {
752 const int work_size = max_active_path_index;
753
754 const DeviceKernelArguments args(
755 &work_size, &d_compact_paths, &d_num_queued_paths, &num_active_paths);
756
757 queue_->zero_to_device(num_queued_paths_);
758 queue_->enqueue(compact_paths_kernel, work_size, args);
759 }
760
761 queue_->copy_from_device(num_queued_paths_);
762 queue_->synchronize();
763
764 const int num_compact_paths = num_queued_paths_.data()[0];
765
766 /* Move paths into gaps. */
767 if (num_compact_paths > 0) {
768 int work_size = num_compact_paths;
769 int active_states_offset = 0;
770 int terminated_states_offset = num_active_paths;
771
772 const DeviceKernelArguments args(
773 &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size);
774
775 queue_->enqueue(compact_kernel, work_size, args);
776 }
777}
778
780{
781 /* If there are existing paths wait them to go to intersect closest kernel, which will align the
782 * wavefront of the existing and newly added paths. */
783 /* TODO: Check whether counting new intersection kernels here will have a positive effect on the
784 * performance. */
785 const DeviceKernel kernel = get_most_queued_kernel();
787 return false;
788 }
789
790 const int num_active_paths = num_active_main_paths_paths();
791
792 /* Don't schedule more work if canceling. */
793 if (is_cancel_requested()) {
794 if (num_active_paths == 0) {
795 finished = true;
796 }
797 return false;
798 }
799
800 finished = false;
801
802 vector<KernelWorkTile> work_tiles;
803
804 int max_num_camera_paths = max_num_paths_;
805 int num_predicted_splits = 0;
806
807 if (has_shadow_catcher()) {
808 /* When there are shadow catchers in the scene bounce from them will split the state. So we
809 * make sure there is enough space in the path states array to fit split states.
810 *
811 * Basically, when adding N new paths we ensure that there is 2*N available path states, so
812 * that all the new paths can be split.
813 *
814 * Note that it is possible that some of the current states can still split, so need to make
815 * sure there is enough space for them as well. */
816
817 /* Number of currently in-flight states which can still split. */
818 const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
819
820 const int num_available_paths = max_num_paths_ - num_active_paths;
821 const int num_new_paths = num_available_paths / 2;
822 max_num_camera_paths = max(num_active_paths,
823 num_active_paths + num_new_paths - num_scheduled_possible_split);
824 num_predicted_splits += num_scheduled_possible_split + num_new_paths;
825 }
826
827 /* Schedule when we're out of paths or there are too few paths to keep the
828 * device occupied. */
829 int num_paths = num_active_paths;
830 if (num_paths == 0 || num_paths < min_num_active_main_paths_) {
831 /* Get work tiles until the maximum number of path is reached. */
832 while (num_paths < max_num_camera_paths) {
833 KernelWorkTile work_tile;
834 if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
835 work_tiles.push_back(work_tile);
836 num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
837 }
838 else {
839 break;
840 }
841 }
842
843 /* If we couldn't get any more tiles, we're done. */
844 if (work_tiles.empty() && num_paths == 0) {
845 finished = true;
846 return false;
847 }
848 }
849
850 /* Initialize paths from work tiles. */
851 if (work_tiles.empty()) {
852 return false;
853 }
854
855 /* Compact state array when number of paths becomes small relative to the
856 * known maximum path index, which makes computing active index arrays slow. */
857 compact_main_paths(num_active_paths);
858
859 if (has_shadow_catcher()) {
860 integrator_next_main_path_index_.data()[0] = num_paths;
862 }
863
866 work_tiles.data(),
867 work_tiles.size(),
868 num_active_paths,
869 num_predicted_splits);
870
871 return true;
872}
873
875 const KernelWorkTile work_tiles[],
876 const int num_work_tiles,
877 const int num_active_paths,
878 const int num_predicted_splits)
879{
880 /* Copy work tiles to device. */
881 if (work_tiles_.size() < num_work_tiles) {
882 work_tiles_.alloc(num_work_tiles);
883 }
884
885 int path_index_offset = num_active_paths;
886 int max_tile_work_size = 0;
887 for (int i = 0; i < num_work_tiles; i++) {
888 KernelWorkTile &work_tile = work_tiles_.data()[i];
889 work_tile = work_tiles[i];
890
891 const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
892
893 work_tile.path_index_offset = path_index_offset;
894 work_tile.work_size = tile_work_size;
895
896 path_index_offset += tile_work_size;
897
898 max_tile_work_size = max(max_tile_work_size, tile_work_size);
899 }
900
901 queue_->copy_to_device(work_tiles_);
902
903 const device_ptr d_work_tiles = work_tiles_.device_pointer;
904 device_ptr d_render_buffer = buffers_->buffer.device_pointer;
905
906 /* Launch kernel. */
907 const DeviceKernelArguments args(
908 &d_work_tiles, &num_work_tiles, &d_render_buffer, &max_tile_work_size);
909
910 queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
911
912 max_active_main_path_index_ = path_index_offset + num_predicted_splits;
913}
914
916{
918
919 int num_paths = 0;
920 for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
921 DCHECK_GE(queue_counter->num_queued[i], 0)
922 << "Invalid number of queued states for kernel "
923 << device_kernel_as_string(static_cast<DeviceKernel>(i));
924
926 num_paths += queue_counter->num_queued[i];
927 }
928 }
929
930 return num_paths;
931}
932
934{
935 /* There are few aspects with the graphics interop when using multiple devices caused by the fact
936 * that the PathTraceDisplay has a single texture:
937 *
938 * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
939 * attempting to register OpenGL PBO which has been mapped. Which makes sense, because
940 * otherwise one would run into a conflict of where the source of truth is. */
941 if (has_multiple_works()) {
942 return false;
943 }
944
946 Device *device = queue_->device;
948 true);
949
950 if (interop_use_) {
951 LOG_INFO << "Using graphics interop GPU display update.";
952 }
953 else {
954 LOG_INFO << "Using naive GPU display update.";
955 }
956
958 }
959
960 return interop_use_;
961}
962
964 PassMode pass_mode,
965 const int num_samples)
966{
967 if (device_->have_error()) {
968 /* Don't attempt to update GPU display if the device has errors: the error state will make
969 * wrong decisions to happen about interop, causing more chained bugs. */
970 return;
971 }
972
973 if (!buffers_->buffer.device_pointer) {
974 LOG_WARNING << "Request for GPU display update without allocated render buffers.";
975 return;
976 }
977
978 if (should_use_graphics_interop(display)) {
979 if (copy_to_display_interop(display, pass_mode, num_samples)) {
980 return;
981 }
982
983 /* If error happens when trying to use graphics interop fallback to the native implementation
984 * and don't attempt to use interop for the further updates. */
985 interop_use_ = false;
986 }
987
988 copy_to_display_naive(display, pass_mode, num_samples);
989}
990
992 PassMode pass_mode,
993 const int num_samples)
994{
995 const int full_x = effective_buffer_params_.full_x;
996 const int full_y = effective_buffer_params_.full_y;
997 const int width = effective_buffer_params_.window_width;
998 const int height = effective_buffer_params_.window_height;
999 const int final_width = buffers_->params.window_width;
1000 const int final_height = buffers_->params.window_height;
1001
1002 const int texture_x = full_x - effective_big_tile_params_.full_x +
1004 const int texture_y = full_y - effective_big_tile_params_.full_y +
1006
1007 /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
1008 *
1009 * NOTE: allocation happens to the final resolution so that no re-allocation happens on every
1010 * change of the resolution divider. However, if the display becomes smaller, shrink the
1011 * allocated memory as well. */
1012 if (display_rgba_half_.data_width != final_width ||
1013 display_rgba_half_.data_height != final_height)
1014 {
1015 display_rgba_half_.alloc(final_width, final_height);
1016 /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
1017 * transferring zeroes to the device. */
1018 queue_->zero_to_device(display_rgba_half_);
1019 }
1020
1021 PassAccessor::Destination destination(film_->get_display_pass(), pass_mode);
1022 destination.d_pixels_half_rgba = display_rgba_half_.device_pointer;
1023
1024 get_render_tile_film_pixels(destination, pass_mode, num_samples);
1025
1026 queue_->copy_from_device(display_rgba_half_);
1027 queue_->synchronize();
1028
1029 display->copy_pixels_to_texture(display_rgba_half_.data(), texture_x, texture_y, width, height);
1030}
1031
1033 PassMode pass_mode,
1034 const int num_samples)
1035{
1037 device_graphics_interop_ = queue_->graphics_interop_create();
1038 }
1039
1040 GraphicsInteropBuffer &interop_buffer = display->graphics_interop_get_buffer();
1041 device_graphics_interop_->set_buffer(interop_buffer);
1042
1043 const device_ptr d_rgba_half = device_graphics_interop_->map();
1044 if (!d_rgba_half) {
1045 return false;
1046 }
1047
1048 PassAccessor::Destination destination = get_display_destination_template(display, pass_mode);
1049 destination.d_pixels_half_rgba = d_rgba_half;
1050
1051 get_render_tile_film_pixels(destination, pass_mode, num_samples);
1052
1053 device_graphics_interop_->unmap();
1054
1055 return true;
1056}
1057
1059{
1061 return;
1062 }
1063 display->graphics_interop_activate();
1064 device_graphics_interop_ = nullptr;
1065 display->graphics_interop_deactivate();
1066}
1067
1069 PassMode pass_mode,
1070 const int num_samples)
1071{
1072 const KernelFilm &kfilm = device_scene_->data.film;
1073
1074 const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
1075 if (pass_access_info.type == PASS_NONE) {
1076 return;
1077 }
1078
1079 const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples);
1080
1081 pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
1082}
1083
1085 bool reset)
1086{
1087 const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset);
1088
1089 if (num_active_pixels) {
1092 queue_->synchronize();
1093 }
1094
1095 return num_active_pixels;
1096}
1097
1099 bool reset)
1100{
1101 device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE);
1102 num_active_pixels.alloc(1);
1103
1104 queue_->zero_to_device(num_active_pixels);
1105
1107 if (!work_size) {
1108 return 0;
1109 }
1110
1111 const int reset_int = reset; /* No bool kernel arguments. */
1112
1113 const DeviceKernelArguments args(&buffers_->buffer.device_pointer,
1118 &threshold,
1119 &reset_int,
1122 &num_active_pixels.device_pointer);
1123
1125
1126 queue_->copy_from_device(num_active_pixels);
1127 queue_->synchronize();
1128
1129 return num_active_pixels.data()[0];
1130}
1131
1147
1163
1165{
1167 if (!work_size) {
1168 return;
1169 }
1170
1171 const DeviceKernelArguments args(&buffers_->buffer.device_pointer,
1172 &work_size,
1174 &effective_buffer_params_.stride);
1175
1177}
1178
1180{
1181 const DeviceKernelArguments args(&buffers_->buffer.device_pointer,
1187 &effective_buffer_params_.stride);
1188
1189 {
1191 DCHECK_GT(work_size, 0);
1193 }
1194
1195 {
1196 const int work_size = effective_buffer_params_.width;
1197 DCHECK_GT(work_size, 0);
1199 }
1200}
1201
1203{
1204 /* May not exist if cancelled before rendering started. */
1205 if (!buffers_->buffer.device_pointer) {
1206 return false;
1207 }
1208
1209 queue_->copy_from_device(buffers_->buffer);
1210
1211 /* Synchronize so that the CPU-side buffer is available at the exit of this function. */
1212 return queue_->synchronize();
1213}
1214
1216{
1217 queue_->copy_to_device(buffers_->buffer);
1218
1219 /* NOTE: The direct device access to the buffers only happens within this path trace work. The
1220 * rest of communication happens via API calls which involves `copy_render_buffers_from_device()`
1221 * which will perform synchronization as needed. */
1222
1223 return true;
1224}
1225
1227{
1228 queue_->zero_to_device(buffers_->buffer);
1229
1230 return true;
1231}
1232
1234{
1235 return device_scene_->data.integrator.has_shadow_catcher;
1236}
1237
1239{
1240 if (max_active_main_path_index_ == 0) {
1241 return 0;
1242 }
1243
1244 if (!has_shadow_catcher()) {
1245 return 0;
1246 }
1247
1248 queue_->zero_to_device(num_queued_paths_);
1249
1251 device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
1252
1253 const DeviceKernelArguments args(&work_size, &d_num_queued_paths);
1254
1256 queue_->copy_from_device(num_queued_paths_);
1257 queue_->synchronize();
1258
1259 return num_queued_paths_.data()[0];
1260}
1261
1268
1278
1286
1292
1298
unsigned int uint
#define GPU_PARALLEL_SORT_BLOCK_SIZE
Definition block_sizes.h:18
unsigned long long int uint64_t
void reset()
clear internal cached data and reset random seed
virtual bool should_use_graphics_interop(const GraphicsInteropDevice &, const bool=false)
Definition film.h:29
bool get_render_tile_pixels(const RenderBuffers *render_buffers, const Destination &destination) const
void copy_pixels_to_texture(const half4 *rgba_pixels, const int texture_x, const int texture_y, const int pixels_width, const int pixels_height)
GraphicsInteropBuffer & graphics_interop_get_buffer()
GraphicsInteropDevice graphics_interop_get_device()
bool kernel_is_shadow_path(DeviceKernel kernel)
bool copy_render_buffers_from_device() override
void compact_paths(const int num_active_paths, const int max_active_path_index, DeviceKernel terminated_paths_kernel, DeviceKernel compact_paths_kernel, DeviceKernel compact_kernel)
IntegratorStateGPU integrator_state_gpu_
device_vector< int > integrator_shader_sort_counter_
bool copy_to_display_interop(PathTraceDisplay *display, PassMode pass_mode, const int num_samples)
device_vector< int > num_queued_paths_
void compute_sorted_queued_paths(DeviceKernel queued_kernel, const int num_paths_limit)
void destroy_gpu_resources(PathTraceDisplay *display) override
void alloc_work_memory() override
device_vector< int > integrator_next_main_path_index_
unique_ptr< DeviceQueue > queue_
PathTraceWorkGPU(Device *device, Film *film, DeviceScene *device_scene, const bool *cancel_requested_flag)
bool zero_render_buffers() override
bool kernel_uses_sorting(DeviceKernel kernel)
void init_execution() override
device_vector< int > integrator_shader_sort_prefix_sum_
device_vector< KernelWorkTile > work_tiles_
void copy_to_display_naive(PathTraceDisplay *display, PassMode pass_mode, const int num_samples)
void cryptomatte_postproces() override
void render_samples(RenderStatistics &statistics, const int start_sample, const int samples_num, const int sample_offset) override
device_vector< IntegratorQueueCounter > integrator_queue_counter_
void denoise_volume_guiding_buffers() override
bool enqueue_work_tiles(bool &finished)
device_vector< int > queued_paths_
void compact_main_paths(const int num_active_paths)
bool kernel_creates_ao_paths(DeviceKernel kernel)
unique_ptr< DeviceGraphicsInterop > device_graphics_interop_
void copy_to_display(PathTraceDisplay *display, PassMode pass_mode, const int num_samples) override
device_vector< int > integrator_next_shadow_path_index_
void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
DeviceKernel get_most_queued_kernel() const
device_vector< int > integrator_shader_raytrace_sort_counter_
bool kernel_creates_shadow_paths(DeviceKernel kernel)
device_vector< int > integrator_shader_sort_partition_key_offsets_
void get_render_tile_film_pixels(const PassAccessor::Destination &destination, PassMode pass_mode, int num_samples)
bool should_use_graphics_interop(PathTraceDisplay *display)
vector< unique_ptr< device_memory > > integrator_state_soa_
device_vector< int > integrator_shader_mnee_sort_counter_
WorkTileScheduler work_tile_scheduler_
int adaptive_sampling_convergence_check_count_active(const float threshold, bool reset)
bool copy_render_buffers_to_device() override
int adaptive_sampling_converge_filter_count_active(const float threshold, bool reset) override
int kernel_max_active_main_path_index(DeviceKernel kernel)
device_vector< half4 > display_rgba_half_
PassAccessor::Destination get_display_destination_template(const PathTraceDisplay *display, const PassMode mode) const
unique_ptr< RenderBuffers > buffers_
PassAccessor::PassAccessInfo get_display_pass_access_info(PassMode pass_mode) const
BufferParams effective_big_tile_params_
bool has_multiple_works() const
BufferParams effective_buffer_params_
DeviceScene * device_scene_
PathTraceWork(Device *device, Film *film, DeviceScene *device_scene, const bool *cancel_requested_flag)
bool is_cancel_requested() const
T * alloc(const size_t width, const size_t height=0)
nullptr float
@ MEM_READ_WRITE
#define KERNEL_FEATURE_AO
#define KERNEL_FEATURE_NODE_RAYTRACE
#define KERNEL_FEATURE_MNEE
#define CCL_NAMESPACE_END
const char * device_kernel_as_string(DeviceKernel kernel)
#define assert(assertion)
ccl_gpu_kernel_postfix const ccl_global int ccl_global float const int work_size
ccl_gpu_kernel_postfix ccl_global KernelWorkTile const int ccl_global float const int max_tile_work_size
@ BVH_LAYOUT_OPTIX
@ PASS_NONE
@ DEVICE_KERNEL_INTEGRATOR_NUM
DeviceKernel
@ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK
@ DEVICE_KERNEL_INTEGRATOR_RESET
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT
@ DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE
@ DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS
@ DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE
@ DEVICE_KERNEL_VOLUME_GUIDING_FILTER_X
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK
@ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE
@ DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE
@ DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_NUM
@ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA
@ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y
@ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME_RAY_MARCHING
@ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X
@ DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE
@ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST
@ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND
@ DEVICE_KERNEL_VOLUME_GUIDING_FILTER_Y
@ DEVICE_KERNEL_PREFIX_SUM
#define DCHECK_GE(a, b)
Definition log.h:142
#define DCHECK_LE(a, b)
Definition log.h:147
#define DCHECK_GT(a, b)
Definition log.h:145
#define LOG_IS_ON(level)
Definition log.h:113
#define LOG_FATAL
Definition log.h:99
@ LOG_LEVEL_TRACE
Definition log.h:27
#define LOG_WARNING
Definition log.h:103
#define LOG_INFO
Definition log.h:106
#define LOG_TRACE
Definition log.h:108
PassMode
Definition pass.h:20
static CCL_NAMESPACE_BEGIN size_t estimate_single_state_size(const uint kernel_features)
#define min(a, b)
Definition sort.cc:36
string string_human_readable_size(size_t size)
Definition string.cpp:257
int num_queued[DEVICE_KERNEL_INTEGRATOR_NUM]
Definition state.h:103
i
Definition text_draw.cc:230
max
Definition text_draw.cc:251
uint64_t device_ptr
Definition types_base.h:44
ccl_device_inline size_t divide_up(const size_t x, const size_t y)
Definition types_base.h:52