Blender V4.5
path_trace_work_gpu.cpp
1/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#include "integrator/path_trace_work_gpu.h"
6
7#include "device/device.h"
8
11
12#include "scene/scene.h"
13#include "session/buffers.h"
14
15#include "util/log.h"
16#include "util/string.h"
17
18#include "kernel/types.h"
19
20CCL_NAMESPACE_BEGIN
21
22static size_t estimate_single_state_size(const uint kernel_features)
23{
24 size_t state_size = 0;
25
26#define KERNEL_STRUCT_BEGIN(name) \
27 for (int array_index = 0;; array_index++) {
28
29#ifdef __INTEGRATOR_GPU_PACKED_STATE__
30# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
31 state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
32# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature)
33# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
34 KERNEL_STRUCT_BEGIN(parent_struct) \
35 KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
36#else
37# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
38 state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
39# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
40# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
41#endif
42
43#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
44 state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
45#define KERNEL_STRUCT_END(name) \
46 (void)array_index; \
47 break; \
48 }
49#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
50 if (array_index >= gpu_array_size - 1) { \
51 break; \
52 } \
53 }
54/* TODO(sergey): Look into a better estimation for fields which depend on scene features. Maybe
55 * the maximum state calculation should happen in `alloc_work_memory()`, so that we can react to
56 * an updated scene state here.
57 * Until then, use a common value. Currently this size is only used for logging, but it is
58 * fragile to rely on it. */
59#define KERNEL_STRUCT_VOLUME_STACK_SIZE 4
60
61#include "kernel/integrator/state_template.h"
62
63#include "kernel/integrator/shadow_state_template.h"
64
65#undef KERNEL_STRUCT_BEGIN
66#undef KERNEL_STRUCT_BEGIN_PACKED
67#undef KERNEL_STRUCT_MEMBER
68#undef KERNEL_STRUCT_MEMBER_PACKED
69#undef KERNEL_STRUCT_ARRAY_MEMBER
70#undef KERNEL_STRUCT_END
71#undef KERNEL_STRUCT_END_ARRAY
72#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
73
74 return state_size;
75}
76
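The KERNEL_STRUCT_* macros above are an X-macro pattern: a single member list (pulled in from the state template headers) is expanded under different macro definitions to generate different code from one source of truth. Below is a minimal, self-contained sketch of the idea; the member list and all SKETCH_* names are hypothetical stand-ins, not the real integrator state.

#include <cstddef>

/* Hypothetical member list; the real one lives in the state template headers. */
#define SKETCH_STATE_MEMBERS \
  SKETCH_MEMBER(float, throughput) \
  SKETCH_MEMBER(int, bounce) \
  SKETCH_MEMBER(unsigned int, rng_hash)

/* Expansion 1: declare a struct with one field per member. */
#define SKETCH_MEMBER(type, name) type name;
struct SketchState {
  SKETCH_STATE_MEMBERS
};
#undef SKETCH_MEMBER

/* Expansion 2: sum the member sizes, mirroring what estimate_single_state_size() does. */
#define SKETCH_MEMBER(type, name) size += sizeof(type);
inline std::size_t sketch_state_size()
{
  std::size_t size = 0;
  SKETCH_STATE_MEMBERS
  return size;
}
#undef SKETCH_MEMBER
#undef SKETCH_STATE_MEMBERS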
77PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
78 Film *film,
79 DeviceScene *device_scene,
80 const bool *cancel_requested_flag)
81 : PathTraceWork(device, film, device_scene, cancel_requested_flag),
82 queue_(device->gpu_queue_create()),
84 integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
85 integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
87 device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
89 device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
91 device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
93 device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
94 integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
96 device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
97 queued_paths_(device, "queued_paths", MEM_READ_WRITE),
98 num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
99 work_tiles_(device, "work_tiles", MEM_READ_WRITE),
100 display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
104{
105 memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
106}
107
108void PathTraceWorkGPU::alloc_integrator_soa()
109{
110 /* IntegratorState is allocated as a structure of arrays. */
111
112 /* Check if we already allocated memory for the required features. */
113 const int requested_volume_stack_size = device_scene_->data.volume_stack_size;
114 const uint kernel_features = device_scene_->data.kernel_features;
115 if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features &&
116 integrator_state_soa_volume_stack_size_ >= requested_volume_stack_size)
117 {
118 return;
119 }
122 requested_volume_stack_size);
123
124 /* Determine the number of path states. Deferring this for as long as possible allows the
125 * back-end to make better decisions about memory availability. */
126 if (max_num_paths_ == 0) {
127 const size_t single_state_size = estimate_single_state_size(kernel_features);
128
129 max_num_paths_ = queue_->num_concurrent_states(single_state_size);
130 min_num_active_main_paths_ = queue_->num_concurrent_busy_states(single_state_size);
131
132 /* Limit the number of active paths to half of the overall state. This is due to the logic
133 * in path compaction, which relies on the fact that regeneration does not happen before
134 * half of the states are available again. */
136 }
137
138 /* Allocate a device-only memory buffer for each struct member, and then write the pointers
139 * into a struct that resides in constant memory.
140 *
141 * TODO: store float3 in separate XYZ arrays. */
142#define KERNEL_STRUCT_BEGIN(name) \
143 for (int array_index = 0;; array_index++) {
144#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
145 if ((kernel_features & (feature)) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
146 string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
147 shadow ? "shadow_" : ""); \
148 auto array = make_unique<device_only_memory<type>>(device_, name_str.c_str()); \
149 array->alloc_to_device(max_num_paths_); \
150 memcpy(&integrator_state_gpu_.parent_struct.name, \
151 &array->device_pointer, \
152 sizeof(array->device_pointer)); \
153 integrator_state_soa_.emplace_back(std::move(array)); \
154 }
155#ifdef __INTEGRATOR_GPU_PACKED_STATE__
156# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) \
157 if ((kernel_features & (feature))) { \
158 string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
159 shadow ? "shadow_" : ""); \
160 VLOG_DEBUG << "Skipping " << name_str \
161 << " -- data is packed inside integrator_state_" #parent_struct "_packed"; \
162 }
163# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
164 KERNEL_STRUCT_BEGIN(parent_struct) \
165 KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
166#else
167# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
168# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
169#endif
170
171#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
172 if ((kernel_features & (feature)) && \
173 (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) \
174 { \
175 string name_str = string_printf( \
176 "%sintegrator_state_" #name "_%d", shadow ? "shadow_" : "", array_index); \
177 auto array = make_unique<device_only_memory<type>>(device_, name_str.c_str()); \
178 array->alloc_to_device(max_num_paths_); \
179 memcpy(&integrator_state_gpu_.parent_struct[array_index].name, \
180 &array->device_pointer, \
181 sizeof(array->device_pointer)); \
182 integrator_state_soa_.emplace_back(std::move(array)); \
183 }
184#define KERNEL_STRUCT_END(name) \
185 (void)array_index; \
186 break; \
187 }
188#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
189 if (array_index >= gpu_array_size - 1) { \
190 break; \
191 } \
192 }
193#define KERNEL_STRUCT_VOLUME_STACK_SIZE (integrator_state_soa_volume_stack_size_)
194
195 bool shadow = false;
196#include "kernel/integrator/state_template.h"
197 shadow = true;
198#include "kernel/integrator/shadow_state_template.h"
199
200#undef KERNEL_STRUCT_BEGIN
201#undef KERNEL_STRUCT_BEGIN_PACKED
202#undef KERNEL_STRUCT_MEMBER
203#undef KERNEL_STRUCT_MEMBER_PACKED
204#undef KERNEL_STRUCT_ARRAY_MEMBER
205#undef KERNEL_STRUCT_END
206#undef KERNEL_STRUCT_END_ARRAY
207#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
208
209 if (VLOG_IS_ON(3)) {
210 size_t total_soa_size = 0;
211 for (auto &&soa_memory : integrator_state_soa_) {
212 total_soa_size += soa_memory->memory_size();
213 }
214
215 VLOG_DEVICE_STATS << "GPU SoA state size: " << string_human_readable_size(total_soa_size);
216 }
217}
218
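Each struct member above gets its own device-only allocation, and its raw device handle is copied bit-for-bit into the matching typed pointer field of integrator_state_gpu_, which init_execution() later uploads to constant memory. A hedged, host-only sketch of that pointer-storing step follows; SketchStateGPU and the other sketch_* names are hypothetical, and the device handle is modelled as a 64-bit integer.

#include <cstdint>
#include <cstring>

using sketch_device_ptr = std::uint64_t; /* opaque device allocation handle */

struct SketchStateGPU {
  float *throughput; /* kernels read this as a plain device pointer */
};

inline void sketch_store_device_pointer(SketchStateGPU &state, const sketch_device_ptr handle)
{
  static_assert(sizeof(state.throughput) == sizeof(handle),
                "pointer field must be as wide as the device handle");
  /* Same trick as KERNEL_STRUCT_MEMBER above: copy the 64-bit handle into the typed
   * pointer field without ever dereferencing it on the host. */
  std::memcpy(&state.throughput, &handle, sizeof(handle));
}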
219void PathTraceWorkGPU::alloc_integrator_queue()
220{
221 if (integrator_queue_counter_.size() == 0) {
223 integrator_queue_counter_.zero_to_device();
224 integrator_queue_counter_.copy_from_device();
226 integrator_queue_counter_.device_pointer;
227 }
228
229 /* Allocate data for active path index arrays. */
230 if (num_queued_paths_.size() == 0) {
231 num_queued_paths_.alloc(1);
232 num_queued_paths_.zero_to_device();
233 }
234
235 if (queued_paths_.size() == 0) {
237 /* TODO: this could be skipped if we had a function to just allocate on device. */
238 queued_paths_.zero_to_device();
239 }
240}
241
242void PathTraceWorkGPU::alloc_integrator_sorting()
243{
244 num_sort_partitions_ = queue_->num_sort_partitions(max_num_paths_,
245 device_scene_->data.max_shaders);
246
247 integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
249
250 if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
251 /* Allocate array for partitioned shader sorting using local atomics. */
252 const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
253 if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
256 }
257 integrator_state_gpu_.sort_partition_key_offsets =
259 }
260 else {
261 /* Allocate arrays for shader sorting. */
262 const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
263 if (integrator_shader_sort_counter_.size() < sort_buckets) {
264 integrator_shader_sort_counter_.alloc(sort_buckets);
265 integrator_shader_sort_counter_.zero_to_device();
267 (int *)integrator_shader_sort_counter_.device_pointer;
268
269 integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
270 integrator_shader_sort_prefix_sum_.zero_to_device();
271 }
272
273 if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
274 if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
278 (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
279 }
280 }
281
282 if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
283 if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
284 integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
287 (int *)integrator_shader_mnee_sort_counter_.device_pointer;
288 }
289 }
290 }
291}
292
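The sort partition divisor computed above maps a main path index to its partition by integer division, with divide_up() rounding up so that all max_num_paths_ states fall into num_sort_partitions_ buckets; paths with nearby indices then share a partition that can be sorted with local atomics. A small sketch of that mapping, assuming divide_up() is the usual round-up integer division; the sketch_* names are hypothetical.

#include <cstddef>

inline std::size_t sketch_divide_up(const std::size_t x, const std::size_t y)
{
  return (x + y - 1) / y; /* round-up integer division */
}

inline int sketch_partition_of_path(const int path_index,
                                    const int max_num_paths,
                                    const int num_sort_partitions)
{
  const int divisor = (int)sketch_divide_up(max_num_paths, num_sort_partitions);
  return path_index / divisor; /* nearby path indices share a partition */
}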
293void PathTraceWorkGPU::alloc_integrator_path_split()
294{
295 if (integrator_next_shadow_path_index_.size() == 0) {
297 integrator_next_shadow_path_index_.zero_to_device();
298
299 integrator_state_gpu_.next_shadow_path_index =
300 (int *)integrator_next_shadow_path_index_.device_pointer;
301 }
302
303 if (integrator_next_main_path_index_.size() == 0) {
306 integrator_next_main_path_index_.zero_to_device();
307
308 integrator_state_gpu_.next_main_path_index =
309 (int *)integrator_next_main_path_index_.device_pointer;
310 }
311}
312
320
321void PathTraceWorkGPU::init_execution()
322{
323 queue_->init_execution();
324
325 /* Copy to device side struct in constant memory. */
326 device_->const_copy_to(
327 "integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
328}
329
330void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
331 const int start_sample,
332 const int samples_num,
333 const int sample_offset)
334{
335 /* Limit the number of states for the tile and rely on greedy scheduling of tiles. This allows
336 * more work to be added later (because tiles are smaller, there is a higher chance that more
337 * paths will become busy after adding new tiles). This is especially important for the shadow
338 * catcher, which schedules work in halves of the available number of paths. */
339 work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
340 work_tile_scheduler_.set_accelerated_rt(
341 (device_->get_bvh_layout_mask(device_scene_->data.kernel_features) & BVH_LAYOUT_OPTIX) != 0);
343 start_sample,
344 samples_num,
345 sample_offset,
346 device_scene_->data.integrator.scrambling_distance);
347
349
350 int num_iterations = 0;
351 uint64_t num_busy_accum = 0;
352
353 /* TODO: set a hard limit in case of undetected kernel failures? */
354 while (true) {
355 /* Enqueue work from the scheduler, on start or when there are not enough
356 * paths to keep the device occupied. */
357 bool finished;
358 if (enqueue_work_tiles(finished)) {
359 /* Copy stats from the device. */
360 queue_->copy_from_device(integrator_queue_counter_);
361
362 if (!queue_->synchronize()) {
363 break; /* Stop on error. */
364 }
365 }
366
367 if (is_cancel_requested()) {
368 break;
369 }
370
371 /* Stop if no more work remaining. */
372 if (finished) {
373 break;
374 }
375
376 /* Enqueue one of the path iteration kernels. */
377 if (enqueue_path_iteration()) {
378 /* Copy stats from the device. */
379 queue_->copy_from_device(integrator_queue_counter_);
380
381 if (!queue_->synchronize()) {
382 break; /* Stop on error. */
383 }
384 }
385
386 if (is_cancel_requested()) {
387 break;
388 }
389
390 num_busy_accum += num_active_main_paths_paths();
391 ++num_iterations;
392 }
393
394 if (num_iterations) {
395 statistics.occupancy = float(num_busy_accum) / num_iterations / max_num_paths_;
396 }
397 else {
398 statistics.occupancy = 0.0f;
399 }
400}
401
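The occupancy reported above is the mean number of busy main paths per iteration divided by the path-state capacity. With purely hypothetical numbers: if max_num_paths_ is 1024 and ten iterations accumulate num_busy_accum = 5120, the average is 512 busy paths per iteration, so statistics.occupancy = 5120 / 10 / 1024 = 0.5.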
402DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
403{
404 const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
405
406 int max_num_queued = 0;
408
409 for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
410 if (queue_counter->num_queued[i] > max_num_queued) {
411 kernel = (DeviceKernel)i;
412 max_num_queued = queue_counter->num_queued[i];
413 }
414 }
415
416 return kernel;
417}
418
419void PathTraceWorkGPU::enqueue_reset()
420{
422
424 queue_->zero_to_device(integrator_queue_counter_);
425 if (integrator_shader_sort_counter_.size() != 0) {
427 }
428 if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE &&
430 {
432 }
433 if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE &&
435 {
437 }
438
439 /* Tile enqueueing needs to know the number of active paths, which is based on this counter.
440 * Zero the counter on the host side because `zero_to_device()` does not do it. */
441 if (integrator_queue_counter_.host_pointer) {
442 memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
443 }
444}
445
446bool PathTraceWorkGPU::enqueue_path_iteration()
447{
448 /* Find kernel to execute, with max number of queued paths. */
449 const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
450
451 int num_active_paths = 0;
452 for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
453 num_active_paths += queue_counter->num_queued[i];
454 }
455
456 if (num_active_paths == 0) {
457 return false;
458 }
459
460 /* Find kernel to execute, with max number of queued paths. */
461 const DeviceKernel kernel = get_most_queued_kernel();
462 if (kernel == DEVICE_KERNEL_NUM) {
463 return false;
464 }
465
466 /* For kernels that add shadow paths, check if there is enough space available.
467 * If not, schedule shadow kernels first to clear out the shadow paths. */
468 int num_paths_limit = INT_MAX;
469
470 if (kernel_creates_shadow_paths(kernel)) {
472
473 const int available_shadow_paths = max_num_paths_ -
475 if (available_shadow_paths < queue_counter->num_queued[kernel]) {
478 return true;
479 }
482 return true;
483 }
484 }
485 else if (kernel_creates_ao_paths(kernel)) {
486 /* AO kernel creates two shadow paths, so limit number of states to schedule. */
487 num_paths_limit = available_shadow_paths / 2;
488 }
489 }
490
491 /* Schedule kernel with maximum number of queued items. */
492 enqueue_path_iteration(kernel, num_paths_limit);
493
494 /* Update next shadow path index for kernels that can add shadow paths. */
495 if (kernel_creates_shadow_paths(kernel)) {
497 }
498
499 return true;
500}
501
502void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit)
503{
504 device_ptr d_path_index = 0;
505
506 /* Create array of path indices for which this kernel is queued to be executed. */
508
510 const int num_queued = queue_counter->num_queued[kernel];
511
512 if (kernel_uses_sorting(kernel)) {
513 /* Compute array of active paths, sorted by shader. */
514 work_size = num_queued;
515 d_path_index = queued_paths_.device_pointer;
516
517 compute_sorted_queued_paths(kernel, num_paths_limit);
518 }
519 else if (num_queued < work_size) {
520 work_size = num_queued;
521 d_path_index = queued_paths_.device_pointer;
522
523 if (kernel_is_shadow_path(kernel)) {
524 /* Compute array of active shadow paths for specific kernel. */
526 }
527 else {
528 /* Compute array of active paths for specific kernel. */
530 }
531 }
532
533 work_size = min(work_size, num_paths_limit);
534
536
537 switch (kernel) {
539 /* Closest ray intersection kernels with integrator state and render buffer. */
540 const DeviceKernelArguments args(
541 &d_path_index, &buffers_->buffer.device_pointer, &work_size);
542
543 queue_->enqueue(kernel, work_size, args);
544 break;
545 }
546
551 /* Ray intersection kernels with integrator state. */
552 const DeviceKernelArguments args(&d_path_index, &work_size);
553
554 queue_->enqueue(kernel, work_size, args);
555 break;
556 }
565 /* Shading kernels with integrator state and render buffer. */
566 const DeviceKernelArguments args(
567 &d_path_index, &buffers_->buffer.device_pointer, &work_size);
568
569 queue_->enqueue(kernel, work_size, args);
570 break;
571 }
572
573 default:
574 LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel)
575 << " used for path iteration, should never happen.";
576 break;
577 }
578}
579
580void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
581 const int num_paths_limit)
582{
583 int d_queued_kernel = queued_kernel;
584
585 /* Launch kernel to fill the active paths arrays. */
586 if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
587 const int work_size = kernel_max_active_main_path_index(queued_kernel);
588 device_ptr d_queued_paths = queued_paths_.device_pointer;
589
590 int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;
591
592 const DeviceKernelArguments args(
593 &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
594
597 return;
598 }
599
600 device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
601 device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
602 assert(d_counter != 0 && d_prefix_sum != 0);
603
604 /* Compute prefix sum of number of active paths with each shader. */
605 {
606 const int work_size = 1;
607 int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
608
609 const DeviceKernelArguments args(&d_counter, &d_prefix_sum, &sort_buckets);
610
612 }
613
614 queue_->zero_to_device(num_queued_paths_);
615
616 /* Launch kernel to fill the active paths arrays. */
617 {
618 /* TODO: this could be smaller for terminated paths based on amount of work we want
619 * to schedule, and also based on num_paths_limit.
620 *
621 * Also, when the number of paths is limited it may be better to prefer paths from the
622 * end of the array since compaction would need to do less work. */
623 const int work_size = kernel_max_active_main_path_index(queued_kernel);
624
625 device_ptr d_queued_paths = queued_paths_.device_pointer;
626 device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
627
629 &num_paths_limit,
630 &d_queued_paths,
631 &d_num_queued_paths,
632 &d_counter,
633 &d_prefix_sum,
634 &d_queued_kernel);
635
637 }
638}
639
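The sorted path array built above amounts to a counting sort keyed by shader: the per-shader counters become bucket start offsets via the prefix-sum kernel, and each queued path is then scattered to its shader's next slot so that shading kernels see coherent batches. A host-side sketch of the same idea, with hypothetical names and without the partitioning and num_paths_limit handling:

#include <vector>

inline std::vector<int> sketch_sort_paths_by_shader(const std::vector<int> &path_shader,
                                                    const int num_shaders)
{
  /* Count queued paths per shader (the role of the sort key counters). */
  std::vector<int> counter(num_shaders, 0);
  for (const int shader : path_shader) {
    counter[shader]++;
  }

  /* Exclusive prefix sum turns counts into the first output slot of each bucket. */
  std::vector<int> offset(num_shaders, 0);
  for (int i = 1; i < num_shaders; i++) {
    offset[i] = offset[i - 1] + counter[i - 1];
  }

  /* Scatter path indices into their shader's bucket. */
  std::vector<int> sorted_paths(path_shader.size());
  for (int path = 0; path < (int)path_shader.size(); path++) {
    sorted_paths[offset[path_shader[path]]++] = path;
  }
  return sorted_paths;
}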
640void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
641{
642 int d_queued_kernel = queued_kernel;
643
644 /* Launch kernel to fill the active paths arrays. */
645 const int work_size = kernel_max_active_main_path_index(queued_kernel);
646 device_ptr d_queued_paths = queued_paths_.device_pointer;
647 device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
648
649 const DeviceKernelArguments args(
650 &work_size, &d_queued_paths, &d_num_queued_paths, &d_queued_kernel);
651
652 queue_->zero_to_device(num_queued_paths_);
653 queue_->enqueue(kernel, work_size, args);
654}
655
656void PathTraceWorkGPU::compact_main_paths(const int num_active_paths)
657{
658 /* Early out if there is nothing that needs to be compacted. */
659 if (num_active_paths == 0) {
661 return;
662 }
663
664 const int min_compact_paths = 32;
665 if (max_active_main_path_index_ == num_active_paths ||
666 max_active_main_path_index_ < min_compact_paths)
667 {
668 return;
669 }
670
671 /* Compact. */
672 compact_paths(num_active_paths,
677
678 /* Adjust max active path index now we know which part of the array is actually used. */
679 max_active_main_path_index_ = num_active_paths;
680}
681
682void PathTraceWorkGPU::compact_shadow_paths()
683{
685 const int num_active_paths =
688
689 /* Early out if there is nothing that needs to be compacted. */
690 if (num_active_paths == 0) {
691 if (integrator_next_shadow_path_index_.data()[0] != 0) {
694 }
695 return;
696 }
697
698 /* Compact if we can reduce the space used by half. Not always since
699 * compaction has a cost. */
700 const float max_overhead_factor = 2.0f;
701 const int min_compact_paths = 32;
702 const int num_total_paths = integrator_next_shadow_path_index_.data()[0];
703 if (num_total_paths < num_active_paths * max_overhead_factor ||
704 num_total_paths < min_compact_paths)
705 {
706 return;
707 }
708
709 /* Compact. */
710 compact_paths(num_active_paths,
711 num_total_paths,
715
716 /* Adjust max active path index now we know which part of the array is actually used. */
717 integrator_next_shadow_path_index_.data()[0] = num_active_paths;
719}
720
721void PathTraceWorkGPU::compact_paths(const int num_active_paths,
722 const int max_active_path_index,
723 DeviceKernel terminated_paths_kernel,
724 DeviceKernel compact_paths_kernel,
725 DeviceKernel compact_kernel)
726{
727 /* Compact fragmented path states into the start of the array, moving any paths
728 * with index higher than the number of active paths into the gaps. */
729 device_ptr d_compact_paths = queued_paths_.device_pointer;
730 device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
731
732 /* Create array with terminated paths that we can write to. */
733 {
734 /* TODO: can the work size be reduced here? */
735 int offset = num_active_paths;
736 const int work_size = num_active_paths;
737
738 const DeviceKernelArguments args(&work_size, &d_compact_paths, &d_num_queued_paths, &offset);
739
740 queue_->zero_to_device(num_queued_paths_);
741 queue_->enqueue(terminated_paths_kernel, work_size, args);
742 }
743
744 /* Create array of paths that we need to compact, where the path index is bigger
745 * than the number of active paths. */
746 {
747 const int work_size = max_active_path_index;
748
749 const DeviceKernelArguments args(
750 &work_size, &d_compact_paths, &d_num_queued_paths, &num_active_paths);
751
752 queue_->zero_to_device(num_queued_paths_);
753 queue_->enqueue(compact_paths_kernel, work_size, args);
754 }
755
756 queue_->copy_from_device(num_queued_paths_);
757 queue_->synchronize();
758
759 const int num_compact_paths = num_queued_paths_.data()[0];
760
761 /* Move paths into gaps. */
762 if (num_compact_paths > 0) {
763 int work_size = num_compact_paths;
764 int active_states_offset = 0;
765 int terminated_states_offset = num_active_paths;
766
767 const DeviceKernelArguments args(
768 &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size);
769
770 queue_->enqueue(compact_kernel, work_size, args);
771 }
772}
773
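compact_paths() above pairs two index lists produced on the device: terminated slots inside the active prefix, and states that are still alive beyond it; each live state is then moved down into a hole so the array ends up densely packed at the front. A simplified host-side sketch of that pairing, using a hypothetical one-field state instead of the full integrator state:

#include <cstddef>
#include <vector>

struct SketchPathState {
  bool alive = false;
  int payload = 0; /* stands in for the full per-path integrator state */
};

inline void sketch_compact(std::vector<SketchPathState> &states, const int num_active_paths)
{
  std::vector<int> holes;  /* terminated slots below num_active_paths */
  std::vector<int> movers; /* live states at or above num_active_paths */
  for (int i = 0; i < num_active_paths; i++) {
    if (!states[i].alive) {
      holes.push_back(i);
    }
  }
  for (int i = num_active_paths; i < (int)states.size(); i++) {
    if (states[i].alive) {
      movers.push_back(i);
    }
  }
  /* Move each live state from the fragmented tail into a hole in the prefix. */
  for (std::size_t k = 0; k < movers.size() && k < holes.size(); k++) {
    states[holes[k]] = states[movers[k]];
    states[movers[k]].alive = false;
  }
}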
774bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
775{
776 /* If there are existing paths, wait for them to reach the intersect-closest kernel, which
777 * will align the wavefront of the existing and newly added paths. */
778 /* TODO: Check whether counting new intersection kernels here will have a positive effect on
779 * the performance. */
780 const DeviceKernel kernel = get_most_queued_kernel();
782 return false;
783 }
784
785 const int num_active_paths = num_active_main_paths_paths();
786
787 /* Don't schedule more work if canceling. */
788 if (is_cancel_requested()) {
789 if (num_active_paths == 0) {
790 finished = true;
791 }
792 return false;
793 }
794
795 finished = false;
796
797 vector<KernelWorkTile> work_tiles;
798
799 int max_num_camera_paths = max_num_paths_;
800 int num_predicted_splits = 0;
801
802 if (has_shadow_catcher()) {
803 /* When there are shadow catchers in the scene, a bounce from them will split the state. So we
804 * make sure there is enough space in the path states array to fit split states.
805 *
806 * Basically, when adding N new paths we ensure that there is 2*N available path states, so
807 * that all the new paths can be split.
808 *
809 * Note that it is possible that some of the current states can still split, so we need to make
810 * sure there is enough space for them as well. */
811
812 /* Number of currently in-flight states which can still split. */
813 const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
814
815 const int num_available_paths = max_num_paths_ - num_active_paths;
816 const int num_new_paths = num_available_paths / 2;
817 max_num_camera_paths = max(num_active_paths,
818 num_active_paths + num_new_paths - num_scheduled_possible_split);
819 num_predicted_splits += num_scheduled_possible_split + num_new_paths;
820 }
821
822 /* Schedule when we're out of paths or there are too few paths to keep the
823 * device occupied. */
824 int num_paths = num_active_paths;
825 if (num_paths == 0 || num_paths < min_num_active_main_paths_) {
826 /* Get work tiles until the maximum number of paths is reached. */
827 while (num_paths < max_num_camera_paths) {
828 KernelWorkTile work_tile;
829 if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
830 work_tiles.push_back(work_tile);
831 num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
832 }
833 else {
834 break;
835 }
836 }
837
838 /* If we couldn't get any more tiles, we're done. */
839 if (work_tiles.empty() && num_paths == 0) {
840 finished = true;
841 return false;
842 }
843 }
844
845 /* Initialize paths from work tiles. */
846 if (work_tiles.empty()) {
847 return false;
848 }
849
850 /* Compact the state array when the number of paths becomes small relative to the
851 * known maximum path index, which makes computing active index arrays slow. */
852 compact_main_paths(num_active_paths);
853
854 if (has_shadow_catcher()) {
855 integrator_next_main_path_index_.data()[0] = num_paths;
857 }
858
861 work_tiles.data(),
862 work_tiles.size(),
863 num_active_paths,
864 num_predicted_splits);
865
866 return true;
867}
868
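A worked example of the shadow-catcher budgeting above, with purely hypothetical numbers: with max_num_paths_ = 1024 and num_active_paths = 200 there are 824 available states, so num_new_paths = 412. If 50 in-flight paths may still split, max_num_camera_paths = max(200, 200 + 412 - 50) = 562 and num_predicted_splits = 50 + 412 = 462, which keeps roughly half of the state array free for the splits those paths may produce.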
869void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
870 const KernelWorkTile work_tiles[],
871 const int num_work_tiles,
872 const int num_active_paths,
873 const int num_predicted_splits)
874{
875 /* Copy work tiles to device. */
876 if (work_tiles_.size() < num_work_tiles) {
877 work_tiles_.alloc(num_work_tiles);
878 }
879
880 int path_index_offset = num_active_paths;
881 int max_tile_work_size = 0;
882 for (int i = 0; i < num_work_tiles; i++) {
883 KernelWorkTile &work_tile = work_tiles_.data()[i];
884 work_tile = work_tiles[i];
885
886 const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
887
888 work_tile.path_index_offset = path_index_offset;
889 work_tile.work_size = tile_work_size;
890
891 path_index_offset += tile_work_size;
892
893 max_tile_work_size = max(max_tile_work_size, tile_work_size);
894 }
895
896 queue_->copy_to_device(work_tiles_);
897
898 const device_ptr d_work_tiles = work_tiles_.device_pointer;
899 device_ptr d_render_buffer = buffers_->buffer.device_pointer;
900
901 /* Launch kernel. */
902 const DeviceKernelArguments args(
903 &d_work_tiles, &num_work_tiles, &d_render_buffer, &max_tile_work_size);
904
905 queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
906
907 max_active_main_path_index_ = path_index_offset + num_predicted_splits;
908}
909
910int PathTraceWorkGPU::num_active_main_paths_paths()
911{
913
914 int num_paths = 0;
915 for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
916 DCHECK_GE(queue_counter->num_queued[i], 0)
917 << "Invalid number of queued states for kernel "
918 << device_kernel_as_string(static_cast<DeviceKernel>(i));
919
921 num_paths += queue_counter->num_queued[i];
922 }
923 }
924
925 return num_paths;
926}
927
928bool PathTraceWorkGPU::should_use_graphics_interop(PathTraceDisplay *display)
929{
930 /* There are a few complications with graphics interop when using multiple devices, caused by
931 * the fact that the PathTraceDisplay has a single texture:
932 *
933 * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
934 * attempting to register an OpenGL PBO which has been mapped. Which makes sense, because
935 * otherwise one would run into a conflict over where the source of truth is. */
936 if (has_multiple_works()) {
937 return false;
938 }
939
941 Device *device = queue_->device;
943 true);
944
945 if (interop_use_) {
946 VLOG_INFO << "Using graphics interop GPU display update.";
947 }
948 else {
949 VLOG_INFO << "Using naive GPU display update.";
950 }
951
953 }
954
955 return interop_use_;
956}
957
958void PathTraceWorkGPU::copy_to_display(PathTraceDisplay *display,
959 PassMode pass_mode,
960 const int num_samples)
961{
962 if (device_->have_error()) {
963 /* Don't attempt to update the GPU display if the device has errors: the error state will
964 * lead to wrong decisions about interop, causing more chained bugs. */
965 return;
966 }
967
968 if (!buffers_->buffer.device_pointer) {
969 LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
970 return;
971 }
972
973 if (should_use_graphics_interop(display)) {
974 if (copy_to_display_interop(display, pass_mode, num_samples)) {
975 return;
976 }
977
978 /* If an error happens when trying to use graphics interop, fall back to the native
979 * implementation and don't attempt to use interop for further updates. */
980 interop_use_ = false;
981 }
982
983 copy_to_display_naive(display, pass_mode, num_samples);
984}
985
986void PathTraceWorkGPU::copy_to_display_naive(PathTraceDisplay *display,
987 PassMode pass_mode,
988 const int num_samples)
989{
990 const int full_x = effective_buffer_params_.full_x;
991 const int full_y = effective_buffer_params_.full_y;
992 const int width = effective_buffer_params_.window_width;
993 const int height = effective_buffer_params_.window_height;
994 const int final_width = buffers_->params.window_width;
995 const int final_height = buffers_->params.window_height;
996
997 const int texture_x = full_x - effective_big_tile_params_.full_x +
999 const int texture_y = full_y - effective_big_tile_params_.full_y +
1001
1002 /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
1003 *
1004 * NOTE: allocation happens at the final resolution so that no re-allocation happens on every
1005 * change of the resolution divider. However, if the display becomes smaller, shrink the
1006 * allocated memory as well. */
1007 if (display_rgba_half_.data_width != final_width ||
1008 display_rgba_half_.data_height != final_height)
1009 {
1010 display_rgba_half_.alloc(final_width, final_height);
1011 /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
1012 * transferring zeroes to the device. */
1013 queue_->zero_to_device(display_rgba_half_);
1014 }
1015
1016 PassAccessor::Destination destination(film_->get_display_pass());
1017 destination.d_pixels_half_rgba = display_rgba_half_.device_pointer;
1018
1019 get_render_tile_film_pixels(destination, pass_mode, num_samples);
1020
1021 queue_->copy_from_device(display_rgba_half_);
1022 queue_->synchronize();
1023
1024 display->copy_pixels_to_texture(display_rgba_half_.data(), texture_x, texture_y, width, height);
1025}
1026
1027bool PathTraceWorkGPU::copy_to_display_interop(PathTraceDisplay *display,
1028 PassMode pass_mode,
1029 const int num_samples)
1030{
1032 device_graphics_interop_ = queue_->graphics_interop_create();
1033 }
1034
1035 GraphicsInteropBuffer &interop_buffer = display->graphics_interop_get_buffer();
1036 device_graphics_interop_->set_buffer(interop_buffer);
1037
1038 const device_ptr d_rgba_half = device_graphics_interop_->map();
1039 if (!d_rgba_half) {
1040 return false;
1041 }
1042
1044 destination.d_pixels_half_rgba = d_rgba_half;
1045
1046 get_render_tile_film_pixels(destination, pass_mode, num_samples);
1047
1048 device_graphics_interop_->unmap();
1049
1050 return true;
1051}
1052
1053void PathTraceWorkGPU::destroy_gpu_resources(PathTraceDisplay *display)
1054{
1056 return;
1057 }
1058 display->graphics_interop_activate();
1059 device_graphics_interop_ = nullptr;
1060 display->graphics_interop_deactivate();
1061}
1062
1063void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination,
1064 PassMode pass_mode,
1065 const int num_samples)
1066{
1067 const KernelFilm &kfilm = device_scene_->data.film;
1068
1069 const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
1070 if (pass_access_info.type == PASS_NONE) {
1071 return;
1072 }
1073
1074 const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples);
1075
1076 pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
1077}
1078
1079int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(const float threshold,
1080 bool reset)
1081{
1082 const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset);
1083
1084 if (num_active_pixels) {
1087 queue_->synchronize();
1088 }
1089
1090 return num_active_pixels;
1091}
1092
1093int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(const float threshold,
1094 bool reset)
1095{
1096 device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE);
1097 num_active_pixels.alloc(1);
1098
1099 queue_->zero_to_device(num_active_pixels);
1100
1102 if (!work_size) {
1103 return 0;
1104 }
1105
1106 const int reset_int = reset; /* No bool kernel arguments. */
1107
1108 const DeviceKernelArguments args(&buffers_->buffer.device_pointer,
1113 &threshold,
1114 &reset_int,
1117 &num_active_pixels.device_pointer);
1118
1120
1121 queue_->copy_from_device(num_active_pixels);
1122 queue_->synchronize();
1123
1124 return num_active_pixels.data()[0];
1125}
1126
1142
1158
1159void PathTraceWorkGPU::cryptomatte_postproces()
1160{
1162 if (!work_size) {
1163 return;
1164 }
1165
1166 const DeviceKernelArguments args(&buffers_->buffer.device_pointer,
1167 &work_size,
1169 &effective_buffer_params_.stride);
1170
1172}
1173
1174bool PathTraceWorkGPU::copy_render_buffers_from_device()
1175{
1176 /* May not exist if cancelled before rendering started. */
1177 if (!buffers_->buffer.device_pointer) {
1178 return false;
1179 }
1180
1181 queue_->copy_from_device(buffers_->buffer);
1182
1183 /* Synchronize so that the CPU-side buffer is available at the exit of this function. */
1184 return queue_->synchronize();
1185}
1186
1187bool PathTraceWorkGPU::copy_render_buffers_to_device()
1188{
1189 queue_->copy_to_device(buffers_->buffer);
1190
1191 /* NOTE: Direct device access to the buffers only happens within this path trace work. The
1192 * rest of the communication happens via API calls which involve
1193 * `copy_render_buffers_from_device()`, which performs synchronization as needed. */
1194
1195 return true;
1196}
1197
1198bool PathTraceWorkGPU::zero_render_buffers()
1199{
1200 queue_->zero_to_device(buffers_->buffer);
1201
1202 return true;
1203}
1204
1205bool PathTraceWorkGPU::has_shadow_catcher() const
1206{
1207 return device_scene_->data.integrator.has_shadow_catcher;
1208}
1209
1210int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
1211{
1212 if (max_active_main_path_index_ == 0) {
1213 return 0;
1214 }
1215
1216 if (!has_shadow_catcher()) {
1217 return 0;
1218 }
1219
1220 queue_->zero_to_device(num_queued_paths_);
1221
1223 device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
1224
1225 const DeviceKernelArguments args(&work_size, &d_num_queued_paths);
1226
1228 queue_->copy_from_device(num_queued_paths_);
1229 queue_->synchronize();
1230
1231 return num_queued_paths_.data()[0];
1232}
1233
1240
1249
1257
1263
1269