24 size_t state_size = 0;
26#define KERNEL_STRUCT_BEGIN(name) \
27 for (int array_index = 0;; array_index++) {
29#ifdef __INTEGRATOR_GPU_PACKED_STATE__
30# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
31 state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
32# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature)
33# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
34 KERNEL_STRUCT_BEGIN(parent_struct) \
35 KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
37# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
38 state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
39# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
40# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
43#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
44 state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
45#define KERNEL_STRUCT_END(name) \
49#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
50 if (array_index >= gpu_array_size - 1) { \
59#define KERNEL_STRUCT_VOLUME_STACK_SIZE 4
65#undef KERNEL_STRUCT_BEGIN
66#undef KERNEL_STRUCT_BEGIN_PACKED
67#undef KERNEL_STRUCT_MEMBER
68#undef KERNEL_STRUCT_MEMBER_PACKED
69#undef KERNEL_STRUCT_ARRAY_MEMBER
70#undef KERNEL_STRUCT_END
71#undef KERNEL_STRUCT_END_ARRAY
72#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
80 const bool *cancel_requested_flag)
81 :
PathTraceWork(device, film, device_scene, cancel_requested_flag),
82 queue_(device->gpu_queue_create()),
87 device,
"integrator_shader_raytrace_sort_counter",
MEM_READ_WRITE),
93 device,
"integrator_shader_sort_partition_key_offsets",
MEM_READ_WRITE),
113 const int requested_volume_stack_size =
device_scene_->data.volume_stack_size;
122 requested_volume_stack_size);
142#define KERNEL_STRUCT_BEGIN(name) \
143 for (int array_index = 0;; array_index++) {
144#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
145 if ((kernel_features & (feature)) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
146 string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
147 shadow ? "shadow_" : ""); \
148 auto array = make_unique<device_only_memory<type>>(device_, name_str.c_str()); \
149 array->alloc_to_device(max_num_paths_); \
150 memcpy(&integrator_state_gpu_.parent_struct.name, \
151 &array->device_pointer, \
152 sizeof(array->device_pointer)); \
153 integrator_state_soa_.emplace_back(std::move(array)); \
155#ifdef __INTEGRATOR_GPU_PACKED_STATE__
156# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) \
157 if ((kernel_features & (feature))) { \
158 string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
159 shadow ? "shadow_" : ""); \
160 VLOG_DEBUG << "Skipping " << name_str \
161 << " -- data is packed inside integrator_state_" #parent_struct "_packed"; \
163# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
164 KERNEL_STRUCT_BEGIN(parent_struct) \
165 KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
167# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
168# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
171#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
172 if ((kernel_features & (feature)) && \
173 (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) \
175 string name_str = string_printf( \
176 "%sintegrator_state_" #name "_%d", shadow ? "shadow_" : "", array_index); \
177 auto array = make_unique<device_only_memory<type>>(device_, name_str.c_str()); \
178 array->alloc_to_device(max_num_paths_); \
179 memcpy(&integrator_state_gpu_.parent_struct[array_index].name, \
180 &array->device_pointer, \
181 sizeof(array->device_pointer)); \
182 integrator_state_soa_.emplace_back(std::move(array)); \
184#define KERNEL_STRUCT_END(name) \
188#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
189 if (array_index >= gpu_array_size - 1) { \
193#define KERNEL_STRUCT_VOLUME_STACK_SIZE (integrator_state_soa_volume_stack_size_)
200#undef KERNEL_STRUCT_BEGIN
201#undef KERNEL_STRUCT_BEGIN_PACKED
202#undef KERNEL_STRUCT_MEMBER
203#undef KERNEL_STRUCT_MEMBER_PACKED
204#undef KERNEL_STRUCT_ARRAY_MEMBER
205#undef KERNEL_STRUCT_END
206#undef KERNEL_STRUCT_END_ARRAY
207#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
210 size_t total_soa_size = 0;
212 total_soa_size += soa_memory->memory_size();
331 const int start_sample,
332 const int samples_num,
333 const int sample_offset)
350 int num_iterations = 0;
362 if (!
queue_->synchronize()) {
381 if (!
queue_->synchronize()) {
394 if (num_iterations) {
406 int max_num_queued = 0;
410 if (queue_counter->
num_queued[
i] > max_num_queued) {
451 int num_active_paths = 0;
456 if (num_active_paths == 0) {
468 int num_paths_limit = INT_MAX;
475 if (available_shadow_paths < queue_counter->num_queued[kernel]) {
487 num_paths_limit = available_shadow_paths / 2;
510 const int num_queued = queue_counter->
num_queued[kernel];
575 <<
" used for path iteration, should never happen.";
581 const int num_paths_limit)
583 int d_queued_kernel = queued_kernel;
593 &
work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
602 assert(d_counter != 0 && d_prefix_sum != 0);
642 int d_queued_kernel = queued_kernel;
650 &
work_size, &d_queued_paths, &d_num_queued_paths, &d_queued_kernel);
659 if (num_active_paths == 0) {
664 const int min_compact_paths = 32;
685 const int num_active_paths =
690 if (num_active_paths == 0) {
700 const float max_overhead_factor = 2.0f;
701 const int min_compact_paths = 32;
703 if (num_total_paths < num_active_paths * max_overhead_factor ||
704 num_total_paths < min_compact_paths)
722 const int max_active_path_index,
735 int offset = num_active_paths;
747 const int work_size = max_active_path_index;
750 &
work_size, &d_compact_paths, &d_num_queued_paths, &num_active_paths);
762 if (num_compact_paths > 0) {
764 int active_states_offset = 0;
765 int terminated_states_offset = num_active_paths;
768 &d_compact_paths, &active_states_offset, &terminated_states_offset, &
work_size);
789 if (num_active_paths == 0) {
800 int num_predicted_splits = 0;
815 const int num_available_paths =
max_num_paths_ - num_active_paths;
816 const int num_new_paths = num_available_paths / 2;
817 max_num_camera_paths =
max(num_active_paths,
818 num_active_paths + num_new_paths - num_scheduled_possible_split);
819 num_predicted_splits += num_scheduled_possible_split + num_new_paths;
824 int num_paths = num_active_paths;
827 while (num_paths < max_num_camera_paths) {
830 work_tiles.push_back(work_tile);
831 num_paths += work_tile.
w * work_tile.
h * work_tile.
num_samples;
839 if (work_tiles.empty() && num_paths == 0) {
846 if (work_tiles.empty()) {
864 num_predicted_splits);
871 const int num_work_tiles,
872 const int num_active_paths,
873 const int num_predicted_splits)
880 int path_index_offset = num_active_paths;
882 for (
int i = 0;
i < num_work_tiles;
i++) {
884 work_tile = work_tiles[
i];
886 const int tile_work_size = work_tile.
w * work_tile.
h * work_tile.
num_samples;
891 path_index_offset += tile_work_size;
917 <<
"Invalid number of queued states for kernel "
946 VLOG_INFO <<
"Using graphics interop GPU display update.";
949 VLOG_INFO <<
"Using naive GPU display update.";
960 const int num_samples)
968 if (!
buffers_->buffer.device_pointer) {
969 LOG(WARNING) <<
"Request for GPU display update without allocated render buffers.";
988 const int num_samples)
994 const int final_width =
buffers_->params.window_width;
995 const int final_height =
buffers_->params.window_height;
1029 const int num_samples)
1065 const int num_samples)
1084 if (num_active_pixels) {
1090 return num_active_pixels;
1097 num_active_pixels.
alloc(1);
1099 queue_->zero_to_device(num_active_pixels);
1106 const int reset_int =
reset;
1121 queue_->copy_from_device(num_active_pixels);
1124 return num_active_pixels.
data()[0];
1177 if (!
buffers_->buffer.device_pointer) {
1184 return queue_->synchronize();
unsigned long long int uint64_t
void reset()
clear internal cached data and reset random seed
virtual bool should_use_graphics_interop(const GraphicsInteropDevice &, const bool=false)
device_ptr d_pixels_half_rgba
bool get_render_tile_pixels(const RenderBuffers *render_buffers, const Destination &destination) const
void graphics_interop_activate()
void copy_pixels_to_texture(const half4 *rgba_pixels, const int texture_x, const int texture_y, const int pixels_width, const int pixels_height)
GraphicsInteropBuffer & graphics_interop_get_buffer()
void graphics_interop_deactivate()
GraphicsInteropDevice graphics_interop_get_device()
bool kernel_is_shadow_path(DeviceKernel kernel)
bool copy_render_buffers_from_device() override
void compact_paths(const int num_active_paths, const int max_active_path_index, DeviceKernel terminated_paths_kernel, DeviceKernel compact_paths_kernel, DeviceKernel compact_kernel)
void compact_shadow_paths()
IntegratorStateGPU integrator_state_gpu_
device_vector< int > integrator_shader_sort_counter_
int integrator_state_soa_volume_stack_size_
void alloc_integrator_sorting()
uint integrator_state_soa_kernel_features_
bool copy_to_display_interop(PathTraceDisplay *display, PassMode pass_mode, const int num_samples)
bool interop_use_checked_
void enqueue_adaptive_sampling_filter_y()
void enqueue_adaptive_sampling_filter_x()
void alloc_integrator_soa()
device_vector< int > num_queued_paths_
void compute_sorted_queued_paths(DeviceKernel queued_kernel, const int num_paths_limit)
int min_num_active_main_paths_
void destroy_gpu_resources(PathTraceDisplay *display) override
void alloc_work_memory() override
device_vector< int > integrator_next_main_path_index_
unique_ptr< DeviceQueue > queue_
PathTraceWorkGPU(Device *device, Film *film, DeviceScene *device_scene, const bool *cancel_requested_flag)
bool zero_render_buffers() override
bool kernel_uses_sorting(DeviceKernel kernel)
void init_execution() override
device_vector< int > integrator_shader_sort_prefix_sum_
device_vector< KernelWorkTile > work_tiles_
void copy_to_display_naive(PathTraceDisplay *display, PassMode pass_mode, const int num_samples)
void cryptomatte_postproces() override
void alloc_integrator_path_split()
void render_samples(RenderStatistics &statistics, const int start_sample, const int samples_num, const int sample_offset) override
device_vector< IntegratorQueueCounter > integrator_queue_counter_
bool enqueue_work_tiles(bool &finished)
device_vector< int > queued_paths_
void compact_main_paths(const int num_active_paths)
bool has_shadow_catcher() const
bool kernel_creates_ao_paths(DeviceKernel kernel)
bool enqueue_path_iteration()
int num_active_main_paths_paths()
unique_ptr< DeviceGraphicsInterop > device_graphics_interop_
void copy_to_display(PathTraceDisplay *display, PassMode pass_mode, const int num_samples) override
device_vector< int > integrator_next_shadow_path_index_
void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
DeviceKernel get_most_queued_kernel() const
device_vector< int > integrator_shader_raytrace_sort_counter_
bool kernel_creates_shadow_paths(DeviceKernel kernel)
device_vector< int > integrator_shader_sort_partition_key_offsets_
void alloc_integrator_queue()
void get_render_tile_film_pixels(const PassAccessor::Destination &destination, PassMode pass_mode, int num_samples)
bool should_use_graphics_interop(PathTraceDisplay *display)
vector< unique_ptr< device_memory > > integrator_state_soa_
device_vector< int > integrator_shader_mnee_sort_counter_
WorkTileScheduler work_tile_scheduler_
int adaptive_sampling_convergence_check_count_active(const float threshold, bool reset)
bool copy_render_buffers_to_device() override
int max_active_main_path_index_
int adaptive_sampling_converge_filter_count_active(const float threshold, bool reset) override
int kernel_max_active_main_path_index(DeviceKernel kernel)
int shadow_catcher_count_possible_splits()
device_vector< half4 > display_rgba_half_
unique_ptr< RenderBuffers > buffers_
PassAccessor::PassAccessInfo get_display_pass_access_info(PassMode pass_mode) const
BufferParams effective_big_tile_params_
PassAccessor::Destination get_display_destination_template(const PathTraceDisplay *display) const
bool has_multiple_works() const
BufferParams effective_buffer_params_
DeviceScene * device_scene_
PathTraceWork(Device *device, Film *film, DeviceScene *device_scene, const bool *cancel_requested_flag)
bool is_cancel_requested() const
device_ptr device_pointer
T * alloc(const size_t width, const size_t height=0, const size_t depth=0)
#define KERNEL_FEATURE_AO
#define KERNEL_FEATURE_NODE_RAYTRACE
#define KERNEL_FEATURE_MNEE
#define CCL_NAMESPACE_END
const char * device_kernel_as_string(DeviceKernel kernel)
#define assert(assertion)
ccl_gpu_kernel_postfix const ccl_global int ccl_global float const int work_size
ccl_gpu_kernel_postfix ccl_global KernelWorkTile const int ccl_global float const int max_tile_work_size
@ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK
@ DEVICE_KERNEL_INTEGRATOR_RESET
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT
@ DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE
@ DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS
@ DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK
@ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE
@ DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE
@ DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA
@ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y
@ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X
@ DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE
@ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST
@ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND
@ DEVICE_KERNEL_PREFIX_SUM
@ DEVICE_KERNEL_INTEGRATOR_NUM
#define VLOG_IS_ON(severity)
#define VLOG_DEVICE_STATS
static CCL_NAMESPACE_BEGIN size_t estimate_single_state_size(const uint kernel_features)
string string_human_readable_size(size_t size)
int num_queued[DEVICE_KERNEL_INTEGRATOR_NUM]
ccl_device_inline size_t divide_up(const size_t x, const size_t y)