static size_t estimate_single_state_size(const uint kernel_features)
{
  size_t state_size = 0;

#define KERNEL_STRUCT_BEGIN(name) \
  for (int array_index = 0;; array_index++) {

#ifdef __INTEGRATOR_GPU_PACKED_STATE__
#  define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
    state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
/* Packed members are counted once via the whole packed struct, not individually. */
#  define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature)
#  define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
    KERNEL_STRUCT_BEGIN(parent_struct) \
    KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
#else
#  define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
    state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
#  define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
#  define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
#endif

#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
#define KERNEL_STRUCT_END(name) \
  break; \
  }
#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
  if (array_index >= gpu_array_size - 1) { \
    break; \
  } \
  }
#define KERNEL_STRUCT_VOLUME_STACK_SIZE 4

/* ... the integrator state template headers are included here, expanding the
 * macros above over every state member ... */

#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_BEGIN_PACKED
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_MEMBER_PACKED
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
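/* The KERNEL_STRUCT_* macros form an X-macro pattern: the state template
 * headers list every integrator state member exactly once, and each consumer
 * redefines the macros before including them. Here they expand to size
 * accumulation; in alloc_integrator_soa() below, the same member list expands
 * to device allocations. A simplified sketch of the pattern (hypothetical
 * file and names, for illustration only):
 *
 *   // members.h -- single source of truth for the member list
 *   KERNEL_STRUCT_BEGIN(path)
 *   KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
 *   KERNEL_STRUCT_END(path)
 *
 *   // consumer: accumulate bytes per path state
 *   #define KERNEL_STRUCT_BEGIN(name)
 *   #define KERNEL_STRUCT_MEMBER(parent, type, name, feature) size += sizeof(type);
 *   #define KERNEL_STRUCT_END(name)
 *   #include "members.h"
 */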
PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
                                   Film *film,
                                   DeviceScene *device_scene,
                                   bool *cancel_requested_flag)
    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
      queue_(device->gpu_queue_create()),
      integrator_state_soa_kernel_features_(0),
      integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
      integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
      integrator_shader_raytrace_sort_counter_(
          device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
      integrator_shader_mnee_sort_counter_(
          device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
      integrator_shader_sort_prefix_sum_(
          device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
      integrator_shader_sort_partition_key_offsets_(
          device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
      integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
      integrator_next_shadow_path_index_(
          device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
      /* ... */
      display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
      min_num_active_main_paths_(0),
      max_active_main_path_index_(0)
/* ... in alloc_integrator_soa(): ... */
      requested_volume_stack_size);
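/* alloc_integrator_soa() allocates the integrator path state as a structure
 * of arrays: one device array per state member, each with max_num_paths_
 * elements, so neighboring GPU threads access neighboring elements of the
 * same member. The macros below are expanded over the same state templates
 * used by estimate_single_state_size() above. */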
#define KERNEL_STRUCT_BEGIN(name) \
  for (int array_index = 0;; array_index++) {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
  if ((kernel_features & (feature)) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
    string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
                                    shadow ? "shadow_" : ""); \
    device_only_memory<type> *array = new device_only_memory<type>(device_, name_str.c_str()); \
    array->alloc_to_device(max_num_paths_); \
    integrator_state_soa_.emplace_back(array); \
    memcpy(&integrator_state_gpu_.parent_struct.name, \
           &array->device_pointer, \
           sizeof(array->device_pointer)); \
  }
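/* Each expansion lazily allocates one device-only array per enabled state
 * member (the nullptr check skips members that already exist) and publishes
 * the array's address into integrator_state_gpu_, the struct of device
 * pointers that kernels index by path. memcpy is used rather than a typed
 * assignment because the struct member is a kernel-side pointer type while
 * device_pointer is an opaque handle of the same size. */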
#ifdef __INTEGRATOR_GPU_PACKED_STATE__
#  define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) \
    if ((kernel_features & (feature))) { \
      string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
                                      shadow ? "shadow_" : ""); \
      VLOG_DEBUG << "Skipping " << name_str \
                 << " -- data is packed inside integrator_state_" #parent_struct "_packed"; \
    }
#  define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
    KERNEL_STRUCT_BEGIN(parent_struct) \
    KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
#else
#  define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
#  define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
#endif
#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
  if ((kernel_features & (feature)) && \
      (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) \
  { \
    string name_str = string_printf( \
        "%sintegrator_state_" #name "_%d", shadow ? "shadow_" : "", array_index); \
    device_only_memory<type> *array = new device_only_memory<type>(device_, name_str.c_str()); \
    array->alloc_to_device(max_num_paths_); \
    integrator_state_soa_.emplace_back(array); \
    memcpy(&integrator_state_gpu_.parent_struct[array_index].name, \
           &array->device_pointer, \
           sizeof(array->device_pointer)); \
  }
#define KERNEL_STRUCT_END(name) \
  break; \
  }
#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
  if (array_index >= gpu_array_size - 1) { \
    break; \
  } \
  }
#define KERNEL_STRUCT_VOLUME_STACK_SIZE (integrator_state_soa_volume_stack_size_)
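/* In contrast to the fixed constant used by estimate_single_state_size(),
 * the volume stack size is a runtime value here, so the per-entry array
 * members are allocated to exactly the stack depth the scene requires. */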
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_BEGIN_PACKED
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_MEMBER_PACKED
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
size_t total_soa_size = 0;
for (auto &&soa_memory : integrator_state_soa_) {
  total_soa_size += soa_memory->memory_size();
}
const int num_elements = queue_->num_sort_partition_elements();
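/* Sort partitioning (cf. integrator_shader_sort_partition_key_offsets_)
 * appears to bound the scope of shader sorting: paths are grouped and sorted
 * within fixed-size partitions of the state array rather than globally,
 * trading perfect shader coherence for better locality of the path states
 * themselves. */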
int num_iterations = 0;
/* ... */
if (!queue_->synchronize()) {
/* ... */
if (!queue_->synchronize()) {
/* ... */
if (num_iterations) {
/* ... */
statistics.occupancy = 0.0f;
int max_num_queued = 0;
/* ... */
if (queue_counter->num_queued[i] > max_num_queued) {
  max_num_queued = queue_counter->num_queued[i];
int num_active_paths = 0;
/* ... */
num_active_paths += queue_counter->num_queued[i];
/* ... */
if (num_active_paths == 0) {
int num_paths_limit = INT_MAX;
/* ... */
if (available_shadow_paths < queue_counter->num_queued[kernel]) {
/* ... */
num_paths_limit = available_shadow_paths / 2;
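/* Throttle kernels that spawn shadow rays: when the remaining capacity of the
 * shadow path pool cannot cover every queued state, only a limited number of
 * main paths is scheduled this iteration. Halving the available count leaves
 * headroom for kernels that can create two shadow paths per state (cf.
 * kernel_creates_ao_paths()). */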
int num_queued = queue_counter->num_queued[kernel];
/* ... */
    << " used for path iteration, should never happen.";
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
                                                   const int num_paths_limit)
{
  int d_queued_kernel = queued_kernel;
  /* ... */
      &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
  /* ... */
  assert(d_counter != 0 && d_prefix_sum != 0);
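/* Building the sorted work list amounts to a bucket sort over shader sort
 * keys, spread across kernel launches: per-key counters are accumulated
 * (cf. DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS), DEVICE_KERNEL_PREFIX_SUM
 * converts the counts into start offsets, and a write pass (cf.
 * DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS / _SORTED_PATHS_ARRAY) scatters
 * path indices to their bucket offsets, yielding an index array grouped by
 * shader for coherent shading. A serial sketch of the idea (illustrative
 * pseudo-helper, not the actual kernels):
 *
 *   for (int i = 0; i < num_paths; i++) counter[key[i]]++;
 *   exclusive_prefix_sum(counter, offsets);  // offsets[k] = counter[0] + ... + counter[k - 1]
 *   for (int i = 0; i < num_paths; i++) sorted[offsets[key[i]]++] = i;
 */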
int d_queued_kernel = queued_kernel;
/* ... */
if (num_active_paths == 0) {
/* ... */
const int min_compact_paths = 32;
/* ... */
const int num_active_paths = /* ... */;
/* ... */
if (num_active_paths == 0) {
/* ... */
const float shadow_compact_ratio = 0.5f;
const int min_compact_paths = 32;
/* ... */
void PathTraceWorkGPU::compact_paths(const int num_active_paths,
                                     const int max_active_path_index,
                                     DeviceKernel terminated_paths_kernel,
                                     DeviceKernel compact_paths_kernel,
                                     DeviceKernel compact_kernel)
{
  /* ... */
  int offset = num_active_paths;
  /* ... */
      &work_size, &d_compact_paths, &d_num_queued_paths, &num_active_paths);
if (num_compact_paths > 0) {
  /* ... */
  int active_states_offset = 0;
  int terminated_states_offset = num_active_paths;
  /* ... */
      &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size);
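/* Compaction keeps active path states contiguous at the front of the state
 * array: active states above the num_active_paths watermark are paired with
 * terminated (free) slots below it, and the compact-states kernel (cf.
 * DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES) moves each such state down. The
 * max_active_main_path_index_ watermark can then be lowered, so later kernel
 * launches cover fewer states. */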
if (num_active_paths == 0) {
/* ... */
int num_predicted_splits = 0;
/* ... */
const int num_available_paths = max_num_paths_ - num_active_paths;
const int num_new_paths = num_available_paths / 2;
max_num_camera_paths = max(num_active_paths,
                           num_active_paths + num_new_paths - num_scheduled_possible_split);
num_predicted_splits += num_scheduled_possible_split + num_new_paths;
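/* Shadow-catcher scenes must budget for path splits: a camera path hitting a
 * shadow catcher can split into a catcher and a main copy (cf.
 * shadow_catcher_count_possible_splits()). Only half of the free slots are
 * handed to new camera rays, and the predicted splits, both already-scheduled
 * ones and one per newly added path, stay reserved in the pool. */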
int num_paths = num_active_paths;
/* ... */
while (num_paths < max_num_camera_paths) {
  /* ... */
  work_tiles.push_back(work_tile);
  num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
/* ... */
if (work_tiles.size() == 0 && num_paths == 0) {
/* ... */
if (work_tiles.size() == 0) {
/* ... */
                   num_predicted_splits);
                                       const int num_work_tiles,
                                       const int num_active_paths,
                                       const int num_predicted_splits)
{
  /* ... */
  int path_index_offset = num_active_paths;
  /* ... */
  for (int i = 0; i < num_work_tiles; i++) {
    /* ... */
    work_tile = work_tiles[i];
    /* ... */
    const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
    /* ... */
    path_index_offset += tile_work_size;
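/* Each work tile covers w * h pixels at num_samples samples. The tile's init
 * kernel (DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA or _INIT_FROM_BAKE)
 * writes one fresh path state per sample starting at path_index_offset,
 * which then advances by the tile's work size so that tiles occupy disjoint
 * ranges of the state array. */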
    << "Invalid number of queued states for kernel "
/* ... */
VLOG_INFO << "Using graphics interop GPU display update.";
/* ... */
VLOG_INFO << "Using naive GPU display update.";
/* ... */
if (!buffers_->buffer.device_pointer) {
  LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
const int final_width = buffers_->params.window_width;
const int final_height = buffers_->params.window_height;
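/* The display copy uses the window size rather than the full render buffer
 * size, presumably because the buffer can carry pixels outside the visible
 * window (e.g. overscan) that should not be shown. */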
destination.d_pixels_half_rgba = d_rgba_half;
/* ... */
display->graphics_interop_activate();
/* ... */
display->graphics_interop_deactivate();
/* ... */
if (num_active_pixels) {
/* ... */
return num_active_pixels;
num_active_pixels.alloc(1);
/* ... */
queue_->zero_to_device(num_active_pixels);
/* ... */
const int reset_int = reset; /* Pass bool as int for the kernel argument. */
/* ... */
queue_->copy_from_device(num_active_pixels);
/* ... */
return num_active_pixels.data()[0];
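/* The convergence check reduces to a single device-side int: the counter is
 * allocated once, zeroed on the device, incremented by the convergence-check
 * kernel (cf. DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK) for every
 * pixel that has not yet converged, then copied back and read on the host.
 * A result of zero means all pixels in the region converged. */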
return queue_->synchronize();