static size_t estimate_single_state_size(const uint kernel_features)
{
  size_t state_size = 0;
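
/* X-macro pattern: the KERNEL_STRUCT_* macros below are defined, the integrator
 * state template headers are included so the macros expand over every state
 * member, and the macros are undefined again afterwards. Each member whose
 * feature flag is requested adds sizeof(type) to the per-path state size. */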

#define KERNEL_STRUCT_BEGIN(name) \
  for (int array_index = 0;; array_index++) {

#ifdef __INTEGRATOR_GPU_PACKED_STATE__
#  define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
    state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
#  define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature)
#  define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
    KERNEL_STRUCT_BEGIN(parent_struct) \
    KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
#else
#  define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
    state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
#  define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
#  define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
#endif
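
/* With __INTEGRATOR_GPU_PACKED_STATE__, members marked as packed are carried
 * inside a single packed_##parent_struct blob, so only that blob contributes to
 * the size; without it, every member contributes its own size individually. */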

#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
#define KERNEL_STRUCT_END(name) \
  break; \
  }
#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
  if (array_index >= gpu_array_size - 1) { \
    break; \
  } \
  }
#define KERNEL_STRUCT_VOLUME_STACK_SIZE 4
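
/* The state template headers that list every member (state_template.h and
 * shadow_state_template.h under kernel/integrator/ in current Cycles) are
 * included at this point in the original file; they are elided here. */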

#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_BEGIN_PACKED
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_MEMBER_PACKED
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE

  return state_size;
}
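
/* A sketch of how the estimate is typically consumed (not the verbatim call
 * site): the device queue reports how many states of this size fit in memory,
 * e.g. max_num_paths_ = queue_->num_concurrent_states(estimate_single_state_size(kernel_features)); */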

PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
                                   Film *film,
                                   DeviceScene *device_scene,
                                   const bool *cancel_requested_flag)
    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
      queue_(device->gpu_queue_create()),
      /* ... */
      integrator_shader_raytrace_sort_counter_(
          device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
      /* ... */
      integrator_shader_sort_partition_key_offsets_(
          device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
      /* ... */

void PathTraceWorkGPU::alloc_integrator_soa()
{
  /* ... */
  const int requested_volume_stack_size = device_scene_->data.volume_stack_size;
  /* ... */
  integrator_state_soa_volume_stack_size_ = max(integrator_state_soa_volume_stack_size_,
                                                requested_volume_stack_size);
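
  /* The recorded volume stack size only ever grows: taking the maximum keeps
   * previously allocated SoA arrays valid when a later scene requests less. */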

#define KERNEL_STRUCT_BEGIN(name) \
  for (int array_index = 0;; array_index++) {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
  if ((kernel_features & (feature)) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
    string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
                                    shadow ? "shadow_" : ""); \
    auto array = make_unique<device_only_memory<type>>(device_, name_str.c_str()); \
    array->alloc_to_device(max_num_paths_); \
    memcpy(&integrator_state_gpu_.parent_struct.name, \
           &array->device_pointer, \
           sizeof(array->device_pointer)); \
    integrator_state_soa_.emplace_back(std::move(array)); \
  }
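
/* The raw device pointer of each freshly allocated array is memcpy'd into the
 * matching field of integrator_state_gpu_, which is uploaded to the device so
 * kernels can address each SoA member directly by pointer. */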

#ifdef __INTEGRATOR_GPU_PACKED_STATE__
#  define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) \
    if ((kernel_features & (feature))) { \
      string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
                                      shadow ? "shadow_" : ""); \
      LOG_TRACE << "Skipping " << name_str \
                << " -- data is packed inside integrator_state_" #parent_struct "_packed"; \
    }
#  define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
    KERNEL_STRUCT_BEGIN(parent_struct) \
    KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
#else
#  define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
#  define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
#endif

#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
  if ((kernel_features & (feature)) && \
      (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) \
  { \
    string name_str = string_printf( \
        "%sintegrator_state_" #name "_%d", shadow ? "shadow_" : "", array_index); \
    auto array = make_unique<device_only_memory<type>>(device_, name_str.c_str()); \
    array->alloc_to_device(max_num_paths_); \
    memcpy(&integrator_state_gpu_.parent_struct[array_index].name, \
           &array->device_pointer, \
           sizeof(array->device_pointer)); \
    integrator_state_soa_.emplace_back(std::move(array)); \
  }
#define KERNEL_STRUCT_END(name) \
  break; \
  }
#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
  if (array_index >= gpu_array_size - 1) { \
    break; \
  } \
  }
#define KERNEL_STRUCT_VOLUME_STACK_SIZE (integrator_state_soa_volume_stack_size_)
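
/* Note the contrast with estimate_single_state_size(), which assumes a fixed
 * stack of 4 entries: the real allocation sizes the volume stack from what the
 * scene requested. */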

#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_BEGIN_PACKED
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_MEMBER_PACKED
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE

  size_t total_soa_size = 0;
  for (auto &&soa_memory : integrator_state_soa_) {
    total_soa_size += soa_memory->memory_size();
  }

void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
                                      const int start_sample,
                                      const int samples_num,
                                      const int sample_offset)
{
  /* ... */
  int num_iterations = 0;

  /* ... */
  if (!queue_->synchronize()) {
    break; /* Stop on error. */
  }

  /* ... */
  if (!queue_->synchronize()) {
    break; /* Stop on error. */
  }

  /* ... */
  if (num_iterations) {

  int max_num_queued = 0;
  /* ... */
  for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
    if (queue_counter->num_queued[i] > max_num_queued) {

  int num_active_paths = 0;
  /* ... */
  if (num_active_paths == 0) {

  int num_paths_limit = INT_MAX;
  /* ... */
  if (available_shadow_paths < queue_counter->num_queued[kernel]) {
    /* ... */
    num_paths_limit = available_shadow_paths / 2;
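
    /* A kernel that spawns both a shadow and an AO ray can create two shadow
     * paths per main path, hence the schedulable count is halved to keep the
     * shadow state pool from overflowing. */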

  const int num_queued = queue_counter->num_queued[kernel];

      /* ... */
      LOG_FATAL << "Unhandled kernel " << device_kernel_as_string(kernel)
                << " used for path iteration, should never happen.";

void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
                                                   const int num_paths_limit)
{
  int d_queued_kernel = queued_kernel;
  /* ... */
  DeviceKernelArguments args(
      &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);

  assert(d_counter != 0 && d_prefix_sum != 0);

  int d_queued_kernel = queued_kernel;
  /* ... */
  DeviceKernelArguments args(
      &work_size, &d_queued_paths, &d_num_queued_paths, &d_queued_kernel);

  if (num_active_paths == 0) {
    /* ... */
  }
  /* ... */
  const int min_compact_paths = 32;

  const int num_active_paths = /* ... */;
  /* ... */
  if (num_active_paths == 0) {

  const float max_overhead_factor = 2.0f;
  const int min_compact_paths = 32;
  if (num_total_paths < num_active_paths * max_overhead_factor ||
      num_total_paths < min_compact_paths)
  {
    return;
  }
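
  /* Compaction only pays off when at least half of the occupied state slots
   * would be freed; tiny path counts are not worth the extra kernel launches. */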

void PathTraceWorkGPU::compact_paths(const int num_active_paths,
                                     const int max_active_path_index,
                                     DeviceKernel terminated_paths_kernel,
                                     DeviceKernel compact_paths_kernel,
                                     DeviceKernel compact_kernel)
{
  /* ... */
  int offset = num_active_paths;
  /* ... */
  const int work_size = max_active_path_index;
  /* ... */
  DeviceKernelArguments args(
      &work_size, &d_compact_paths, &d_num_queued_paths, &num_active_paths);

  if (num_compact_paths > 0) {
    /* ... */
    int active_states_offset = 0;
    int terminated_states_offset = num_active_paths;
    /* ... */
    DeviceKernelArguments args(
        &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size);

  if (num_active_paths == 0) {
    /* ... */
  }

  int num_predicted_splits = 0;
  /* ... */
  const int num_available_paths = max_num_paths_ - num_active_paths;
  const int num_new_paths = num_available_paths / 2;
  max_num_camera_paths = max(num_active_paths,
                             num_active_paths + num_new_paths - num_scheduled_possible_split);
  num_predicted_splits += num_scheduled_possible_split + num_new_paths;
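
  /* Only half of the free state slots are refilled with new camera paths; the
   * rest is kept as headroom for the shadow-catcher splits predicted by
   * num_scheduled_possible_split. */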

  int num_paths = num_active_paths;
  /* ... */
  while (num_paths < max_num_camera_paths) {
    /* ... */
    work_tiles.push_back(work_tile);
    num_paths += work_tile.w * work_tile.h * work_tile.num_samples;

  if (work_tiles.empty() && num_paths == 0) {
    /* ... */
  }
  if (work_tiles.empty()) {
    /* ... */
  }
  enqueue_work_tiles(/* ... */ num_predicted_splits);

void PathTraceWorkGPU::enqueue_work_tiles(/* ... */
                                          const int num_work_tiles,
                                          const int num_active_paths,
                                          const int num_predicted_splits)
{
  /* ... */
  int path_index_offset = num_active_paths;
  /* ... */
  for (int i = 0; i < num_work_tiles; i++) {
    work_tile = work_tiles[i];
    /* ... */
    const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
    /* ... */
    path_index_offset += tile_work_size;
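
  /* Each tile occupies a contiguous range of path indices: advancing
   * path_index_offset by w * h * num_samples tells the init kernel where the
   * paths of the next tile begin. */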

      /* ... */
      << "Invalid number of queued states for kernel "
      /* ... */

    LOG_INFO << "Using graphics interop GPU display update.";
    /* ... */
    LOG_INFO << "Using naive GPU display update.";

void PathTraceWorkGPU::copy_to_display(PathTraceDisplay *display,
                                       PassMode pass_mode,
                                       const int num_samples)
{
  /* ... */
  if (!buffers_->buffer.device_pointer) {
    LOG_WARNING << "Request for GPU display update without allocated render buffers.";

void PathTraceWorkGPU::copy_to_display_naive(PathTraceDisplay *display,
                                             PassMode pass_mode,
                                             const int num_samples)
{
  /* ... */
  const int final_width = buffers_->params.window_width;
  const int final_height = buffers_->params.window_height;

bool PathTraceWorkGPU::copy_to_display_interop(/* ... */ const int num_samples)
{
  /* ... */
}

void PathTraceWorkGPU::get_render_tile_film_pixels(/* ... */ const int num_samples)
{
  /* ... */

  if (num_active_pixels) {
    /* ... */
  }

  return num_active_pixels;

  num_active_pixels.alloc(1);
  /* ... */
  queue_->zero_to_device(num_active_pixels);
  /* ... */
  const int reset_int = reset;
  /* ... */
  queue_->copy_from_device(num_active_pixels);
  /* ... */
  return num_active_pixels.data()[0];
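
  /* The convergence check round-trips a single value: a one-element buffer is
   * zeroed on the device, written by the check kernel, then copied back and
   * read on the host via data()[0]. */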

  if (!buffers_->buffer.device_pointer) {
    /* ... */
  }
  /* ... */
  return queue_->synchronize();