Blender V4.3
path_trace_work_gpu.cpp
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#include "integrator/path_trace_work_gpu.h"
#include "integrator/path_trace_display.h"

#include "device/device.h"

#include "integrator/pass_accessor_gpu.h"
#include "scene/scene.h"
#include "session/buffers.h"
#include "util/log.h"
#include "util/string.h"
#include "util/tbb.h"
#include "util/time.h"

#include "kernel/types.h"

CCL_NAMESPACE_BEGIN

static size_t estimate_single_state_size(const uint kernel_features)
{
  size_t state_size = 0;

#define KERNEL_STRUCT_BEGIN(name) \
  for (int array_index = 0;; array_index++) {

#ifdef __INTEGRATOR_GPU_PACKED_STATE__
#  define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
    state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
#  define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature)
#  define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
    KERNEL_STRUCT_BEGIN(parent_struct) \
    KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
#else
#  define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
    state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
#  define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
#  define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
#endif

#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
#define KERNEL_STRUCT_END(name) \
  (void)array_index; \
  break; \
  }
#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
  if (array_index >= gpu_array_size - 1) { \
    break; \
  } \
  }
/* TODO(sergey): Look into a better estimation for fields which depend on scene features. Maybe
 * the maximum state calculation should happen in `alloc_work_memory()`, so that we can react to
 * an updated scene state here.
 * Until then use a common value. Currently this size is only used for logging, but it is weak to
 * rely on it. */
#define KERNEL_STRUCT_VOLUME_STACK_SIZE 4

#include "kernel/integrator/state_template.h"

#include "kernel/integrator/shadow_state_template.h"

#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_BEGIN_PACKED
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_MEMBER_PACKED
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE

  return state_size;
}

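/* Illustration (not part of the original file): the function above does not list the state
 * members explicitly. It defines the KERNEL_STRUCT_* macros and then includes the state template
 * headers, which expand to one KERNEL_STRUCT_MEMBER(...) invocation per member. For a
 * hypothetical member declared in the template as
 *
 *   KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
 *
 * the definition above expands to
 *
 *   state_size += (kernel_features & (KERNEL_FEATURE_PATH_TRACING)) ? sizeof(uint32_t) : 0;
 *
 * so the for/break scaffolding from KERNEL_STRUCT_BEGIN/END simply accumulates the per-path byte
 * size of every member whose feature flag is enabled. */
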
PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
                                   Film *film,
                                   DeviceScene *device_scene,
                                   bool *cancel_requested_flag)
    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
      queue_(device->gpu_queue_create()),
      integrator_state_soa_kernel_features_(0),
      integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
      integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
      integrator_shader_raytrace_sort_counter_(
          device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
      integrator_shader_mnee_sort_counter_(
          device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
      integrator_shader_sort_prefix_sum_(
          device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
      integrator_shader_sort_partition_key_offsets_(
          device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
      integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
      integrator_next_shadow_path_index_(
          device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
      queued_paths_(device, "queued_paths", MEM_READ_WRITE),
      num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
      work_tiles_(device, "work_tiles", MEM_READ_WRITE),
      display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
      max_num_paths_(0),
      min_num_active_main_paths_(0),
      max_active_main_path_index_(0)
{
  memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
}

void PathTraceWorkGPU::alloc_integrator_soa()
{
  /* IntegratorState allocated as structure of arrays. */

  /* Check if we already allocated memory for the required features. */
  const int requested_volume_stack_size = device_scene_->data.volume_stack_size;
  const uint kernel_features = device_scene_->data.kernel_features;
  if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features &&
      integrator_state_soa_volume_stack_size_ >= requested_volume_stack_size)
  {
    return;
  }
  integrator_state_soa_kernel_features_ = kernel_features;
  integrator_state_soa_volume_stack_size_ = max(integrator_state_soa_volume_stack_size_,
                                                requested_volume_stack_size);

  /* Determine the number of path states. Deferring this for as long as possible allows the
   * back-end to make better decisions about memory availability. */
  if (max_num_paths_ == 0) {
    size_t single_state_size = estimate_single_state_size(kernel_features);

    max_num_paths_ = queue_->num_concurrent_states(single_state_size);
    min_num_active_main_paths_ = queue_->num_concurrent_busy_states(single_state_size);

    /* Limit the number of active paths to half of the overall state. This is due to the logic in
     * the path compaction which relies on the fact that regeneration does not happen sooner than
     * half of the states are available again. */
    min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2);
  }

  /* Allocate a device-only memory buffer for each struct member, and then
   * write the pointers into a struct that resides in constant memory.
   *
   * TODO: store float3 in separate XYZ arrays. */
#define KERNEL_STRUCT_BEGIN(name) \
  for (int array_index = 0;; array_index++) {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
  if ((kernel_features & (feature)) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
    string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
                                    shadow ? "shadow_" : ""); \
    device_only_memory<type> *array = new device_only_memory<type>(device_, name_str.c_str()); \
    array->alloc_to_device(max_num_paths_); \
    integrator_state_soa_.emplace_back(array); \
    memcpy(&integrator_state_gpu_.parent_struct.name, \
           &array->device_pointer, \
           sizeof(array->device_pointer)); \
  }
#ifdef __INTEGRATOR_GPU_PACKED_STATE__
#  define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) \
    if ((kernel_features & (feature))) { \
      string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
                                      shadow ? "shadow_" : ""); \
      VLOG_DEBUG << "Skipping " << name_str \
                 << " -- data is packed inside integrator_state_" #parent_struct "_packed"; \
    }
#  define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
    KERNEL_STRUCT_BEGIN(parent_struct) \
    KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
#else
#  define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
#  define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
#endif

#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
  if ((kernel_features & (feature)) && \
      (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) \
  { \
    string name_str = string_printf( \
        "%sintegrator_state_" #name "_%d", shadow ? "shadow_" : "", array_index); \
    device_only_memory<type> *array = new device_only_memory<type>(device_, name_str.c_str()); \
    array->alloc_to_device(max_num_paths_); \
    integrator_state_soa_.emplace_back(array); \
    memcpy(&integrator_state_gpu_.parent_struct[array_index].name, \
           &array->device_pointer, \
           sizeof(array->device_pointer)); \
  }
#define KERNEL_STRUCT_END(name) \
  (void)array_index; \
  break; \
  }
#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
  if (array_index >= gpu_array_size - 1) { \
    break; \
  } \
  }
#define KERNEL_STRUCT_VOLUME_STACK_SIZE (integrator_state_soa_volume_stack_size_)

  bool shadow = false;
#include "kernel/integrator/state_template.h"
  shadow = true;
#include "kernel/integrator/shadow_state_template.h"

#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_BEGIN_PACKED
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_MEMBER_PACKED
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE

  if (VLOG_IS_ON(3)) {
    size_t total_soa_size = 0;
    for (auto &&soa_memory : integrator_state_soa_) {
      total_soa_size += soa_memory->memory_size();
    }

    VLOG_DEVICE_STATS << "GPU SoA state size: " << string_human_readable_size(total_soa_size);
  }
}

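/* Illustration (not part of the original file): with the definitions above, each
 * KERNEL_STRUCT_MEMBER(...) occurrence in the included state templates expands to roughly
 *
 *   if ((kernel_features & (feature)) && (integrator_state_gpu_.path.flag == nullptr)) {
 *     device_only_memory<uint32_t> *array =
 *         new device_only_memory<uint32_t>(device_, "integrator_state_path_flag");
 *     array->alloc_to_device(max_num_paths_);
 *     integrator_state_soa_.emplace_back(array);
 *     memcpy(&integrator_state_gpu_.path.flag, &array->device_pointer,
 *            sizeof(array->device_pointer));
 *   }
 *
 * (the member names here are hypothetical). Each member thus becomes its own per-path device
 * array, and integrator_state_gpu_ only stores the device pointers, which init_execution()
 * later copies into constant memory. */
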
void PathTraceWorkGPU::alloc_integrator_queue()
{
  if (integrator_queue_counter_.size() == 0) {
    integrator_queue_counter_.alloc(1);
    integrator_queue_counter_.zero_to_device();
    integrator_queue_counter_.copy_from_device();
    integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *)
                                              integrator_queue_counter_.device_pointer;
  }

  /* Allocate data for active path index arrays. */
  if (num_queued_paths_.size() == 0) {
    num_queued_paths_.alloc(1);
    num_queued_paths_.zero_to_device();
  }

  if (queued_paths_.size() == 0) {
    queued_paths_.alloc(max_num_paths_);
    /* TODO: this could be skipped if we had a function to just allocate on device. */
    queued_paths_.zero_to_device();
  }
}

void PathTraceWorkGPU::alloc_integrator_sorting()
{
  /* Compute sort partitions, to balance between memory locality and coherence.
   * Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
   * more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
   */
  num_sort_partitions_ = 1;
  if (device_scene_->data.max_shaders < 300) {
    const int num_elements = queue_->num_sort_partition_elements();
    if (num_elements) {
      num_sort_partitions_ = max(max_num_paths_ / num_elements, 1);
    }
  }

  integrator_state_gpu_.sort_partition_divisor = (uint)divide_up(max_num_paths_,
                                                                 num_sort_partitions_);

  if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
    /* Allocate array for partitioned shader sorting using local atomics. */
    const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
    if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
      integrator_shader_sort_partition_key_offsets_.alloc(num_offsets);
      integrator_shader_sort_partition_key_offsets_.zero_to_device();
    }
    integrator_state_gpu_.sort_partition_key_offsets =
        (int *)integrator_shader_sort_partition_key_offsets_.device_pointer;
  }
  else {
    /* Allocate arrays for shader sorting. */
    const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
    if (integrator_shader_sort_counter_.size() < sort_buckets) {
      integrator_shader_sort_counter_.alloc(sort_buckets);
      integrator_shader_sort_counter_.zero_to_device();
      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
          (int *)integrator_shader_sort_counter_.device_pointer;

      integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
      integrator_shader_sort_prefix_sum_.zero_to_device();
    }

    if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
      if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
        integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
        integrator_shader_raytrace_sort_counter_.zero_to_device();
        integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
            (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
      }
    }

    if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
      if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
        integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
        integrator_shader_mnee_sort_counter_.zero_to_device();
        integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
            (int *)integrator_shader_mnee_sort_counter_.device_pointer;
      }
    }
  }
}

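/* Illustration (not part of the original file), with made-up numbers: if max_num_paths_ is
 * 1'048'576 and the queue reports num_sort_partition_elements() == 65'536, then
 * num_sort_partitions_ = max(1'048'576 / 65'536, 1) = 16 and sort_partition_divisor becomes
 * divide_up(1'048'576, 16) = 65'536. The kernel side can then derive a path's partition as
 * (path_index / sort_partition_divisor), so sorting by shader only reorders paths within a
 * 65'536-state window, trading some coherence for memory locality. */
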
void PathTraceWorkGPU::init_execution()
{
  queue_->init_execution();

  /* Copy to device side struct in constant memory. */
  device_->const_copy_to(
      "integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
}

void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
                                      int start_sample,
                                      int samples_num,
                                      int sample_offset)
{
  /* Limit the number of states for the tile and rely on a greedy scheduling of tiles. This allows
   * adding more work (because tiles are smaller, so there is a higher chance that more paths will
   * become busy after adding new tiles). This is especially important for the shadow catcher
   * which schedules work in halves of the available number of paths. */
  work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);

  work_tile_scheduler_.set_accelerated_rt(
      (device_->get_bvh_layout_mask(device_scene_->data.kernel_features) & BVH_LAYOUT_OPTIX) != 0);
  work_tile_scheduler_.reset(effective_buffer_params_,
                             start_sample,
                             samples_num,
                             sample_offset,
                             device_scene_->data.integrator.scrambling_distance);

  enqueue_reset();

  int num_iterations = 0;
  uint64_t num_busy_accum = 0;

  /* TODO: set a hard limit in case of undetected kernel failures? */
  while (true) {
    /* Enqueue work from the scheduler, on start or when there are not enough
     * paths to keep the device occupied. */
    bool finished;
    if (enqueue_work_tiles(finished)) {
      /* Copy stats from the device. */
      queue_->copy_from_device(integrator_queue_counter_);

      if (!queue_->synchronize()) {
        break; /* Stop on error. */
      }
    }

    if (is_cancel_requested()) {
      break;
    }

    /* Stop if no more work remaining. */
    if (finished) {
      break;
    }

    /* Enqueue one of the path iteration kernels. */
    if (enqueue_path_iteration()) {
      /* Copy stats from the device. */
      queue_->copy_from_device(integrator_queue_counter_);

      if (!queue_->synchronize()) {
        break; /* Stop on error. */
      }
    }

    if (is_cancel_requested()) {
      break;
    }

    num_busy_accum += num_active_main_paths_paths();
    ++num_iterations;
  }

  if (num_iterations) {
    statistics.occupancy = float(num_busy_accum) / num_iterations / max_num_paths_;
  }
  else {
    statistics.occupancy = 0.0f;
  }
}

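/* Illustration (not part of the original file): occupancy is the average fraction of path states
 * that were busy per iteration of the wavefront loop. With made-up numbers, if max_num_paths_ is
 * 1000 and three iterations saw 900, 600 and 300 busy main paths, then
 * occupancy = (900 + 600 + 300) / 3 / 1000 = 0.6. */
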
DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
{
  const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();

  int max_num_queued = 0;
  DeviceKernel kernel = DEVICE_KERNEL_NUM;

  for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
    if (queue_counter->num_queued[i] > max_num_queued) {
      kernel = (DeviceKernel)i;
      max_num_queued = queue_counter->num_queued[i];
    }
  }

  return kernel;
}

void PathTraceWorkGPU::enqueue_reset()
{
  DeviceKernelArguments args(&max_num_paths_);

  queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
  queue_->zero_to_device(integrator_queue_counter_);
  if (integrator_shader_sort_counter_.size() != 0) {
    queue_->zero_to_device(integrator_shader_sort_counter_);
  }
  if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE &&
      integrator_shader_raytrace_sort_counter_.size() != 0)
  {
    queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
  }
  if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE &&
      integrator_shader_mnee_sort_counter_.size() != 0)
  {
    queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
  }

  /* Tile enqueueing needs to know the number of active paths, which is based on this counter.
   * Zero the counter on the host side because `zero_to_device()` does not do it. */
  if (integrator_queue_counter_.host_pointer) {
    memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
  }
}

bool PathTraceWorkGPU::enqueue_path_iteration()
{
  /* Find kernel to execute, with max number of queued paths. */
  const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();

  int num_active_paths = 0;
  for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
    num_active_paths += queue_counter->num_queued[i];
  }

  if (num_active_paths == 0) {
    return false;
  }

  /* Find kernel to execute, with max number of queued paths. */
  const DeviceKernel kernel = get_most_queued_kernel();
  if (kernel == DEVICE_KERNEL_NUM) {
    return false;
  }

  /* For kernels that add shadow paths, check if there is enough space available.
   * If not, schedule shadow kernels first to clear out the shadow paths. */
  int num_paths_limit = INT_MAX;

  if (kernel_creates_shadow_paths(kernel)) {
    queue_->copy_from_device(integrator_next_shadow_path_index_);

    const int available_shadow_paths = max_num_paths_ -
                                       integrator_next_shadow_path_index_.data()[0];
    if (available_shadow_paths < queue_counter->num_queued[kernel]) {
      if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
        enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
        return true;
      }
      else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
        enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
        return true;
      }
    }
    else if (kernel_creates_ao_paths(kernel)) {
      /* AO kernel creates two shadow paths, so limit number of states to schedule. */
      num_paths_limit = available_shadow_paths / 2;
    }
  }

  /* Schedule kernel with maximum number of queued items. */
  enqueue_path_iteration(kernel, num_paths_limit);

  /* Update next shadow path index for kernels that can add shadow paths. */
  if (kernel_creates_shadow_paths(kernel)) {
    queue_->copy_from_device(integrator_next_shadow_path_index_);
  }

  return true;
}

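/* Illustration (not part of the original file), with made-up numbers: suppose max_num_paths_ is
 * 1000 and the most queued kernel has 200 queued paths. If the next shadow path index is already
 * 950, available_shadow_paths = 50 < 200, so a queued shadow kernel (intersect or shade shadow)
 * is flushed first to free shadow states. If instead the index is 500 and the kernel creates AO
 * paths, available_shadow_paths = 500 and num_paths_limit = 500 / 2 = 250, because each AO
 * evaluation may spawn two shadow paths. */
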
void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit)
{
  device_ptr d_path_index = 0;

  /* Create array of path indices for which this kernel is queued to be executed. */
  int work_size = kernel_max_active_main_path_index(kernel);

  const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
  int num_queued = queue_counter->num_queued[kernel];

  if (kernel_uses_sorting(kernel)) {
    /* Compute array of active paths, sorted by shader. */
    work_size = num_queued;
    d_path_index = queued_paths_.device_pointer;

    compute_sorted_queued_paths(kernel, num_paths_limit);
  }
  else if (num_queued < work_size) {
    work_size = num_queued;
    d_path_index = queued_paths_.device_pointer;

    if (kernel_is_shadow_path(kernel)) {
      /* Compute array of active shadow paths for specific kernel. */
      compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
    }
    else {
      /* Compute array of active paths for specific kernel. */
      compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel);
    }
  }

  work_size = min(work_size, num_paths_limit);

  DCHECK_LE(work_size, max_num_paths_);

  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: {
      /* Closest ray intersection kernels with integrator state and render buffer. */
      DeviceKernelArguments args(&d_path_index, &buffers_->buffer.device_pointer, &work_size);

      queue_->enqueue(kernel, work_size, args);
      break;
    }

    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT: {
      /* Ray intersection kernels with integrator state. */
      DeviceKernelArguments args(&d_path_index, &work_size);

      queue_->enqueue(kernel, work_size, args);
      break;
    }
    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT: {
      /* Shading kernels with integrator state and render buffer. */
      DeviceKernelArguments args(&d_path_index, &buffers_->buffer.device_pointer, &work_size);

      queue_->enqueue(kernel, work_size, args);
      break;
    }

    default:
      LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel)
                 << " used for path iteration, should never happen.";
      break;
  }
}

void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
                                                   const int num_paths_limit)
{
  int d_queued_kernel = queued_kernel;

  /* Launch kernel to fill the active paths arrays. */
  if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
    const int work_size = kernel_max_active_main_path_index(queued_kernel);
    device_ptr d_queued_paths = queued_paths_.device_pointer;

    int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;

    DeviceKernelArguments args(
        &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);

    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, work_size, args);
    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, work_size, args);
    return;
  }

  device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
  device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
  assert(d_counter != 0 && d_prefix_sum != 0);

  /* Compute prefix sum of number of active paths with each shader. */
  {
    const int work_size = 1;
    int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;

    DeviceKernelArguments args(&d_counter, &d_prefix_sum, &sort_buckets);

    queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
  }

  queue_->zero_to_device(num_queued_paths_);

  /* Launch kernel to fill the active paths arrays. */
  {
    /* TODO: this could be smaller for terminated paths based on amount of work we want
     * to schedule, and also based on num_paths_limit.
     *
     * Also, when the number of paths is limited it may be better to prefer paths from the
     * end of the array since compaction would need to do less work. */
    const int work_size = kernel_max_active_main_path_index(queued_kernel);

    device_ptr d_queued_paths = queued_paths_.device_pointer;
    device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;

    DeviceKernelArguments args(&work_size,
                               &num_paths_limit,
                               &d_queued_paths,
                               &d_num_queued_paths,
                               &d_counter,
                               &d_prefix_sum,
                               &d_queued_kernel);

    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, work_size, args);
  }
}

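/* Illustration (not part of the original file): the non-partitioned path above is a counting
 * sort. sort_key_counter[kernel] holds, per shader index, how many queued paths use that shader;
 * DEVICE_KERNEL_PREFIX_SUM turns those counts into start offsets, and the SORTED_PATHS_ARRAY
 * kernel then scatters each queued path index into queued_paths_ at its shader's offset. With
 * made-up counts {shader0: 3, shader1: 1, shader2: 2}, the prefix sum yields offsets {0, 3, 4},
 * so paths using shader0 land in slots 0..2, shader1 in slot 3 and shader2 in slots 4..5, which
 * lets adjacent threads of the shading kernel execute the same shader. */
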
void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
{
  int d_queued_kernel = queued_kernel;

  /* Launch kernel to fill the active paths arrays. */
  const int work_size = kernel_max_active_main_path_index(queued_kernel);
  device_ptr d_queued_paths = queued_paths_.device_pointer;
  device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;

  DeviceKernelArguments args(&work_size, &d_queued_paths, &d_num_queued_paths, &d_queued_kernel);

  queue_->zero_to_device(num_queued_paths_);
  queue_->enqueue(kernel, work_size, args);
}

void PathTraceWorkGPU::compact_main_paths(const int num_active_paths)
{
  /* Early out if there is nothing that needs to be compacted. */
  if (num_active_paths == 0) {
    max_active_main_path_index_ = 0;
    return;
  }

  const int min_compact_paths = 32;
  if (max_active_main_path_index_ == num_active_paths ||
      max_active_main_path_index_ < min_compact_paths)
  {
    return;
  }

  /* Compact. */
  compact_paths(num_active_paths,
                max_active_main_path_index_,
                DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
                DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
                DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES);

  /* Adjust max active path index now that we know which part of the array is actually used. */
  max_active_main_path_index_ = num_active_paths;
}

void PathTraceWorkGPU::compact_shadow_paths()
{
  IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
  const int num_active_paths =
      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] +
      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW];

  /* Early out if there is nothing that needs to be compacted. */
  if (num_active_paths == 0) {
    if (integrator_next_shadow_path_index_.data()[0] != 0) {
      integrator_next_shadow_path_index_.data()[0] = 0;
      queue_->copy_to_device(integrator_next_shadow_path_index_);
    }
    return;
  }

  /* Compact if we can reduce the space used by half. Not always, since
   * compaction has a cost. */
  const float shadow_compact_ratio = 0.5f;
  const int min_compact_paths = 32;
  if (integrator_next_shadow_path_index_.data()[0] < num_active_paths * shadow_compact_ratio ||
      integrator_next_shadow_path_index_.data()[0] < min_compact_paths)
  {
    return;
  }

  /* Compact. */
  compact_paths(num_active_paths,
                integrator_next_shadow_path_index_.data()[0],
                DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
                DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY,
                DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES);

  /* Adjust max active path index now that we know which part of the array is actually used. */
  integrator_next_shadow_path_index_.data()[0] = num_active_paths;
  queue_->copy_to_device(integrator_next_shadow_path_index_);
}

void PathTraceWorkGPU::compact_paths(const int num_active_paths,
                                     const int max_active_path_index,
                                     DeviceKernel terminated_paths_kernel,
                                     DeviceKernel compact_paths_kernel,
                                     DeviceKernel compact_kernel)
{
  /* Compact fragmented path states into the start of the array, moving any paths
   * with index higher than the number of active paths into the gaps. */
  device_ptr d_compact_paths = queued_paths_.device_pointer;
  device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;

  /* Create array with terminated paths that we can write to. */
  {
    /* TODO: can the work size be reduced here? */
    int offset = num_active_paths;
    int work_size = num_active_paths;

    DeviceKernelArguments args(&work_size, &d_compact_paths, &d_num_queued_paths, &offset);

    queue_->zero_to_device(num_queued_paths_);
    queue_->enqueue(terminated_paths_kernel, work_size, args);
  }

  /* Create array of paths that we need to compact, where the path index is bigger
   * than the number of active paths. */
  {
    int work_size = max_active_path_index;

    DeviceKernelArguments args(
        &work_size, &d_compact_paths, &d_num_queued_paths, &num_active_paths);

    queue_->zero_to_device(num_queued_paths_);
    queue_->enqueue(compact_paths_kernel, work_size, args);
  }

  queue_->copy_from_device(num_queued_paths_);
  queue_->synchronize();

  int num_compact_paths = num_queued_paths_.data()[0];

  /* Move paths into gaps. */
  if (num_compact_paths > 0) {
    int work_size = num_compact_paths;
    int active_states_offset = 0;
    int terminated_states_offset = num_active_paths;

    DeviceKernelArguments args(
        &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size);

    queue_->enqueue(compact_kernel, work_size, args);
  }
}

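/* Illustration (not part of the original file), with made-up indices: say num_active_paths is 4
 * and max_active_path_index is 8, with live states at indices {0, 2, 5, 7}. The first kernel
 * collects the terminated slots below 4 ({1, 3}), the second collects the live slots at or above
 * 4 ({5, 7}), and the compact kernel then moves state 5 into slot 1 and state 7 into slot 3, so
 * all live states end up packed in slots 0..3 and the active index arrays only need to scan that
 * prefix afterwards. */
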
bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
{
  /* If there are existing paths, wait for them to go to the intersect closest kernel, which will
   * align the wavefront of the existing and newly added paths. */
  /* TODO: Check whether counting new intersection kernels here will have a positive effect on
   * the performance. */
  const DeviceKernel kernel = get_most_queued_kernel();
  if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) {
    return false;
  }

  int num_active_paths = num_active_main_paths_paths();

  /* Don't schedule more work if canceling. */
  if (is_cancel_requested()) {
    if (num_active_paths == 0) {
      finished = true;
    }
    return false;
  }

  finished = false;

  vector<KernelWorkTile> work_tiles;

  int max_num_camera_paths = max_num_paths_;
  int num_predicted_splits = 0;

  if (has_shadow_catcher()) {
    /* When there are shadow catchers in the scene, a bounce from them will split the state. So we
     * make sure there is enough space in the path states array to fit split states.
     *
     * Basically, when adding N new paths we ensure that there are 2*N available path states, so
     * that all the new paths can be split.
     *
     * Note that it is possible that some of the current states can still split, so we need to
     * make sure there is enough space for them as well. */

    /* Number of currently in-flight states which can still split. */
    const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();

    const int num_available_paths = max_num_paths_ - num_active_paths;
    const int num_new_paths = num_available_paths / 2;
    max_num_camera_paths = max(num_active_paths,
                               num_active_paths + num_new_paths - num_scheduled_possible_split);
    num_predicted_splits += num_scheduled_possible_split + num_new_paths;
  }

  /* Schedule when we're out of paths or there are too few paths to keep the
   * device occupied. */
  int num_paths = num_active_paths;
  if (num_paths == 0 || num_paths < min_num_active_main_paths_) {
    /* Get work tiles until the maximum number of paths is reached. */
    while (num_paths < max_num_camera_paths) {
      KernelWorkTile work_tile;
      if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
        work_tiles.push_back(work_tile);
        num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
      }
      else {
        break;
      }
    }

    /* If we couldn't get any more tiles, we're done. */
    if (work_tiles.size() == 0 && num_paths == 0) {
      finished = true;
      return false;
    }
  }

  /* Initialize paths from work tiles. */
  if (work_tiles.size() == 0) {
    return false;
  }

  /* Compact state array when the number of paths becomes small relative to the
   * known maximum path index, which makes computing active index arrays slow. */
  compact_main_paths(num_active_paths);

  if (has_shadow_catcher()) {
    integrator_next_main_path_index_.data()[0] = num_paths;
    queue_->copy_to_device(integrator_next_main_path_index_);
  }

  enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE :
                                                      DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA,
                     work_tiles.data(),
                     work_tiles.size(),
                     num_active_paths,
                     num_predicted_splits);

  return true;
}

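/* Illustration (not part of the original file), with made-up numbers: with max_num_paths_ = 1000,
 * 400 active paths and 100 in-flight states that may still split, num_available_paths = 600 and
 * num_new_paths = 300, so camera paths are only scheduled up to max(400, 400 + 300 - 100) = 600
 * states. That leaves 1000 - 600 = 400 free states, enough for the predicted 100 + 300 splits,
 * so every path can duplicate once for the shadow catcher without exhausting the state array. */
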
void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
                                          const KernelWorkTile work_tiles[],
                                          const int num_work_tiles,
                                          const int num_active_paths,
                                          const int num_predicted_splits)
{
  /* Copy work tiles to device. */
  if (work_tiles_.size() < num_work_tiles) {
    work_tiles_.alloc(num_work_tiles);
  }

  int path_index_offset = num_active_paths;
  int max_tile_work_size = 0;
  for (int i = 0; i < num_work_tiles; i++) {
    KernelWorkTile &work_tile = work_tiles_.data()[i];
    work_tile = work_tiles[i];

    const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;

    work_tile.path_index_offset = path_index_offset;
    work_tile.work_size = tile_work_size;

    path_index_offset += tile_work_size;

    max_tile_work_size = max(max_tile_work_size, tile_work_size);
  }

  queue_->copy_to_device(work_tiles_);

  device_ptr d_work_tiles = work_tiles_.device_pointer;
  device_ptr d_render_buffer = buffers_->buffer.device_pointer;

  /* Launch kernel. */
  DeviceKernelArguments args(
      &d_work_tiles, &num_work_tiles, &d_render_buffer, &max_tile_work_size);

  queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);

  max_active_main_path_index_ = path_index_offset + num_predicted_splits;
}

int PathTraceWorkGPU::num_active_main_paths_paths()
{
  IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();

  int num_paths = 0;
  for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
    DCHECK_GE(queue_counter->num_queued[i], 0)
        << "Invalid number of queued states for kernel "
        << device_kernel_as_string(static_cast<DeviceKernel>(i));

    if (!kernel_is_shadow_path((DeviceKernel)i)) {
      num_paths += queue_counter->num_queued[i];
    }
  }

  return num_paths;
}

bool PathTraceWorkGPU::should_use_graphics_interop()
{
  /* There are a few complications with graphics interop when using multiple devices, caused by
   * the fact that the PathTraceDisplay has a single texture:
   *
   * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
   * attempting to register an OpenGL PBO which has already been mapped. This makes sense,
   * because otherwise one would run into a conflict about where the source of truth is. */
  if (has_multiple_works()) {
    return false;
  }

  if (!interop_use_checked_) {
    Device *device = queue_->device;
    interop_use_ = device->should_use_graphics_interop();

    if (interop_use_) {
      VLOG_INFO << "Using graphics interop GPU display update.";
    }
    else {
      VLOG_INFO << "Using naive GPU display update.";
    }

    interop_use_checked_ = true;
  }

  return interop_use_;
}

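/* Note (not part of the original file): the interop decision is made once per work and cached in
 * interop_use_checked_ / interop_use_. copy_to_display() below treats it as a soft preference:
 * if the interop path fails at runtime it clears interop_use_ and falls back to the naive copy
 * through host memory for all further updates. */
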
void PathTraceWorkGPU::copy_to_display(PathTraceDisplay *display,
                                       PassMode pass_mode,
                                       int num_samples)
{
  if (device_->have_error()) {
    /* Don't attempt to update the GPU display if the device has errors: the error state will
     * lead to wrong decisions about interop, causing more chained bugs. */
    return;
  }

  if (!buffers_->buffer.device_pointer) {
    LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
    return;
  }

  if (should_use_graphics_interop()) {
    if (copy_to_display_interop(display, pass_mode, num_samples)) {
      return;
    }

    /* If an error happens when trying to use graphics interop, fall back to the native
     * implementation and don't attempt to use interop for further updates. */
    interop_use_ = false;
  }

  copy_to_display_naive(display, pass_mode, num_samples);
}

void PathTraceWorkGPU::copy_to_display_naive(PathTraceDisplay *display,
                                             PassMode pass_mode,
                                             int num_samples)
{
  const int full_x = effective_buffer_params_.full_x;
  const int full_y = effective_buffer_params_.full_y;
  const int width = effective_buffer_params_.window_width;
  const int height = effective_buffer_params_.window_height;
  const int final_width = buffers_->params.window_width;
  const int final_height = buffers_->params.window_height;

  const int texture_x = full_x - effective_big_tile_params_.full_x +
                        effective_buffer_params_.window_x - effective_big_tile_params_.window_x;
  const int texture_y = full_y - effective_big_tile_params_.full_y +
                        effective_buffer_params_.window_y - effective_big_tile_params_.window_y;

  /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
   *
   * NOTE: allocation happens to the final resolution so that no re-allocation happens on every
   * change of the resolution divider. However, if the display becomes smaller, shrink the
   * allocated memory as well. */
  if (display_rgba_half_.data_width != final_width ||
      display_rgba_half_.data_height != final_height)
  {
    display_rgba_half_.alloc(final_width, final_height);
    /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
     * transferring zeroes to the device. */
    queue_->zero_to_device(display_rgba_half_);
  }

  PassAccessor::Destination destination(film_->get_display_pass());
  destination.d_pixels_half_rgba = display_rgba_half_.device_pointer;

  get_render_tile_film_pixels(destination, pass_mode, num_samples);

  queue_->copy_from_device(display_rgba_half_);
  queue_->synchronize();

  display->copy_pixels_to_texture(display_rgba_half_.data(), texture_x, texture_y, width, height);
}

bool PathTraceWorkGPU::copy_to_display_interop(PathTraceDisplay *display,
                                               PassMode pass_mode,
                                               int num_samples)
{
  if (!device_graphics_interop_) {
    device_graphics_interop_ = queue_->graphics_interop_create();
  }

  const DisplayDriver::GraphicsInterop graphics_interop_dst = display->graphics_interop_get();
  device_graphics_interop_->set_display_interop(graphics_interop_dst);

  const device_ptr d_rgba_half = device_graphics_interop_->map();
  if (!d_rgba_half) {
    return false;
  }

  PassAccessor::Destination destination = get_display_destination_template(display);
  destination.d_pixels_half_rgba = d_rgba_half;

  get_render_tile_film_pixels(destination, pass_mode, num_samples);

  device_graphics_interop_->unmap();

  return true;
}

void PathTraceWorkGPU::destroy_gpu_resources(PathTraceDisplay *display)
{
  if (!device_graphics_interop_) {
    return;
  }
  display->graphics_interop_activate();
  device_graphics_interop_ = nullptr;
  display->graphics_interop_deactivate();
}

void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination,
                                                   PassMode pass_mode,
                                                   int num_samples)
{
  const KernelFilm &kfilm = device_scene_->data.film;

  const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
  if (pass_access_info.type == PASS_NONE) {
    return;
  }

  const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples);

  pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
}

int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
{
  const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset);

  if (num_active_pixels) {
    enqueue_adaptive_sampling_filter_x();
    enqueue_adaptive_sampling_filter_y();
    queue_->synchronize();
  }

  return num_active_pixels;
}

int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset)
{
  device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE);
  num_active_pixels.alloc(1);

  queue_->zero_to_device(num_active_pixels);

  const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
  if (!work_size) {
    return 0;
  }

  const int reset_int = reset; /* No bool kernel arguments. */

  DeviceKernelArguments args(&buffers_->buffer.device_pointer,
                             &effective_buffer_params_.full_x,
                             &effective_buffer_params_.full_y,
                             &effective_buffer_params_.width,
                             &effective_buffer_params_.height,
                             &threshold,
                             &reset_int,
                             &effective_buffer_params_.offset,
                             &effective_buffer_params_.stride,
                             &num_active_pixels.device_pointer);

  queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args);

  queue_->copy_from_device(num_active_pixels);
  queue_->synchronize();

  return num_active_pixels.data()[0];
}

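/* Note (not part of the original file): the convergence check runs one thread per pixel of the
 * effective buffer (work_size = width * height) and atomically accumulates the number of pixels
 * that still need more samples into num_active_pixels. Only when that count is non-zero does
 * adaptive_sampling_converge_filter_count_active() above enqueue the row/column filter kernels,
 * so fully converged tiles skip the filter passes entirely. */
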
bool PathTraceWorkGPU::copy_render_buffers_from_device()
{
  queue_->copy_from_device(buffers_->buffer);

  /* Synchronize so that the CPU-side buffer is available at the exit of this function. */
  return queue_->synchronize();
}

bool PathTraceWorkGPU::copy_render_buffers_to_device()
{
  queue_->copy_to_device(buffers_->buffer);

  /* NOTE: The direct device access to the buffers only happens within this path trace work. The
   * rest of the communication happens via API calls which involve
   * `copy_render_buffers_from_device()`, which will perform synchronization as needed. */

  return true;
}

bool PathTraceWorkGPU::zero_render_buffers()
{
  queue_->zero_to_device(buffers_->buffer);

  return true;
}

bool PathTraceWorkGPU::has_shadow_catcher() const
{
  return device_scene_->data.integrator.has_shadow_catcher;
}

int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
{
  if (max_active_main_path_index_ == 0) {
    return 0;
  }

  if (!has_shadow_catcher()) {
    return 0;
  }

  queue_->zero_to_device(num_queued_paths_);

  const int work_size = max_active_main_path_index_;
  device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;

  DeviceKernelArguments args(&work_size, &d_num_queued_paths);

  queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args);
  queue_->copy_from_device(num_queued_paths_);
  queue_->synchronize();

  return num_queued_paths_.data()[0];
}

CCL_NAMESPACE_END