#ifdef WITH_CUDA

#  include "device/cuda/queue.h"

#  include "device/cuda/device_impl.h"
#  include "device/cuda/graphics_interop.h"
#  include "device/cuda/kernel.h"

CCL_NAMESPACE_BEGIN

/* CUDADeviceQueue */

CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
    : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
{
  const CUDAContextScope scope(cuda_device_);
  cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
}
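/* Note: the stream is created with CU_STREAM_NON_BLOCKING, so queue work does
 * not implicitly synchronize with the legacy default stream. cuStreamDestroy()
 * below returns immediately even if work is still queued; the driver releases
 * the stream's resources once that work completes. */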
CUDADeviceQueue::~CUDADeviceQueue()
{
  const CUDAContextScope scope(cuda_device_);
  cuStreamDestroy(cuda_stream_);
}
int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
{
  const int max_num_threads = cuda_device_->get_num_multiprocessors() *
                              cuda_device_->get_max_num_threads_per_multiprocessor();

  /* Keep many more states resident than there are hardware threads, so the
   * scheduler always has work to switch to. */
  int num_states = max(max_num_threads, 65536) * 16;

  const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
  if (factor_str) {
    const float factor = (float)atof(factor_str);
    if (factor != 0.0f) {
      num_states = max((int)(num_states * factor), 1024);
    }
    else {
      LOG_TRACE << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
    }
  }

  LOG_TRACE << "GPU queue concurrent states: " << num_states << ", using up to "
            << string_human_readable_size(num_states * state_size);

  return num_states;
}
int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
{
  const int max_num_threads = cuda_device_->get_num_multiprocessors() *
                              cuda_device_->get_max_num_threads_per_multiprocessor();

  if (max_num_threads == 0) {
    /* Fallback when the device reports no thread capacity. */
    return 65536;
  }

  return 4 * max_num_threads;
}
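/* Keeping roughly 4x the hardware thread count busy gives the scheduler enough
 * in-flight states to refill the GPU while other states wait on memory. */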
void CUDADeviceQueue::init_execution()
{
  /* Synchronize all textures and memory copies before executing task. */
  CUDAContextScope scope(cuda_device_);
  cuda_device_->load_texture_info();
  cuda_device_assert(cuda_device_, cuCtxSynchronize());

  debug_init_execution();
}
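/* Presumably called once before the first enqueue() of a render task, so that
 * texture info and pending uploads are in place before any kernel runs. */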
bool CUDADeviceQueue::enqueue(DeviceKernel kernel,
                              const int work_size,
                              const DeviceKernelArguments &args)
{
  if (cuda_device_->have_error()) {
    return false;
  }

  debug_enqueue_begin(kernel, work_size);

  const CUDAContextScope scope(cuda_device_);

  /* Update texture info in case memory moved, and synchronize so the kernel
   * does not read stale data. */
  if (cuda_device_->load_texture_info()) {
    cuda_device_assert(cuda_device_, cuCtxSynchronize());
    if (cuda_device_->have_error()) {
      return false;
    }
  }

  const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);

  /* Compute kernel launch parameters. */
  const int num_threads_per_block = cuda_kernel.num_threads_per_block;
  const int num_blocks = divide_up(work_size, num_threads_per_block);

  int shared_mem_bytes = 0;

  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
      /* Active-index kernels use one int of shared memory per thread plus one
       * extra; see kernel/device/gpu/parallel_active_index.h. */
      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
      break;
    default:
      break;
  }

  /* Launch kernel. */
  assert_success(cuLaunchKernel(cuda_kernel.function,
                                num_blocks,
                                1,
                                1,
                                num_threads_per_block,
                                1,
                                1,
                                shared_mem_bytes,
                                cuda_stream_,
                                const_cast<void **>(args.values),
                                nullptr),
                 "enqueue");

  debug_enqueue_end();

  return !(cuda_device_->have_error());
}
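/* enqueue() is asynchronous: cuLaunchKernel() only queues the kernel on
 * cuda_stream_, so a true return value means "launched", not "finished".
 * Launch failures are recorded via assert_success() and surface through
 * have_error() here or in synchronize(). */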
bool CUDADeviceQueue::synchronize()
{
  if (cuda_device_->have_error()) {
    return false;
  }

  const CUDAContextScope scope(cuda_device_);
  assert_success(cuStreamSynchronize(cuda_stream_), "synchronize");

  debug_synchronize();

  return !(cuda_device_->have_error());
}
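/* This is the one blocking call in the queue: it waits for every kernel and
 * async copy previously issued on cuda_stream_, then reports any errors that
 * surfaced while they ran. */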
void CUDADeviceQueue::zero_to_device(device_memory &mem)
{
  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    cuda_device_->mem_alloc(mem);
  }

  /* Zero memory on device, asynchronously on this queue's stream. */
  const CUDAContextScope scope(cuda_device_);
  assert_success(
      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_),
      "zero_to_device");
}

void CUDADeviceQueue::copy_to_device(device_memory &mem)
{
  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    cuda_device_->mem_alloc(mem);
  }

  /* Copy memory to device. */
  const CUDAContextScope scope(cuda_device_);
  assert_success(
      cuMemcpyHtoDAsync(
          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_),
      "copy_to_device");
}

void CUDADeviceQueue::copy_from_device(device_memory &mem)
{
  if (mem.memory_size() == 0) {
    return;
  }

  /* Copy memory from device. */
  const CUDAContextScope scope(cuda_device_);
  assert_success(
      cuMemcpyDtoHAsync(
          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_),
      "copy_from_device");
}
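/* All three memory operations use the Async driver-API variants on
 * cuda_stream_, so they only order against other work on this queue; the host
 * buffer behind mem.host_pointer must stay valid until synchronize(). */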
void CUDADeviceQueue::assert_success(CUresult result, const char *operation)
{
  if (result != CUDA_SUCCESS) {
    const char *name = cuewErrorString(result);
    cuda_device_->set_error(string_printf(
        "%s in CUDA queue %s (%s)", name, operation, debug_active_kernels().c_str()));
  }
}
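/* Errors are recorded on the device rather than thrown, which is why callers
 * poll have_error() instead of wrapping queue calls in try/catch. */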
unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
{
  return make_unique<CUDADeviceGraphicsInterop>(this);
}

CCL_NAMESPACE_END

#endif /* WITH_CUDA */