/* cuda/queue.cpp — Cycles CUDA device queue implementation (Blender).
 * NOTE(review): this header replaces documentation-site navigation text
 * ("Blender V5.0 / Go to the documentation of this file") left over from
 * extraction; it was not part of the original source. */
1/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#ifdef WITH_CUDA
6
# include "device/cuda/queue.h"

# include "device/cuda/device_impl.h"
# include "device/cuda/graphics_interop.h"
# include "device/cuda/kernel.h"
14
15/* CUDADeviceQueue */
16
/* Create a queue for `device`, backed by its own CUDA stream.
 * CU_STREAM_NON_BLOCKING makes the stream independent of the legacy
 * default stream, so work on different queues can overlap. */
CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
    : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
{
  /* Driver API calls require the device's context to be current. */
  const CUDAContextScope scope(cuda_device_);
  cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
}
23
/* Destroy the queue's stream. The result is deliberately not checked:
 * reporting errors during teardown would serve no caller. */
CUDADeviceQueue::~CUDADeviceQueue()
{
  const CUDAContextScope scope(cuda_device_);
  cuStreamDestroy(cuda_stream_);
}
29
30int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
31{
32 const int max_num_threads = cuda_device_->get_num_multiprocessors() *
33 cuda_device_->get_max_num_threads_per_multiprocessor();
34 int num_states = max(max_num_threads, 65536) * 16;
35
36 const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
37 if (factor_str) {
38 const float factor = (float)atof(factor_str);
39 if (factor != 0.0f) {
40 num_states = max((int)(num_states * factor), 1024);
41 }
42 else {
43 LOG_TRACE << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
44 }
45 }
46
47 LOG_TRACE << "GPU queue concurrent states: " << num_states << ", using up to "
49
50 return num_states;
51}
52
53int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
54{
55 const int max_num_threads = cuda_device_->get_num_multiprocessors() *
56 cuda_device_->get_max_num_threads_per_multiprocessor();
57
58 if (max_num_threads == 0) {
59 return 65536;
60 }
61
62 return 4 * max_num_threads;
63}
64
/* Prepare the device for executing a task on this queue.
 * Uploads texture info and fully synchronizes the context so that all
 * pending texture updates and memory copies are visible to kernels. */
void CUDADeviceQueue::init_execution()
{
  /* Synchronize all textures and memory copies before executing task. */
  CUDAContextScope scope(cuda_device_);
  cuda_device_->load_texture_info();
  cuda_device_assert(cuda_device_, cuCtxSynchronize());

  debug_init_execution();
}
74
75bool CUDADeviceQueue::enqueue(DeviceKernel kernel,
76 const int work_size,
77 const DeviceKernelArguments &args)
78{
79 if (cuda_device_->have_error()) {
80 return false;
81 }
82
83 debug_enqueue_begin(kernel, work_size);
84
85 const CUDAContextScope scope(cuda_device_);
86
87 /* Update texture info in case integrator memory alloc caused texture to move to host. */
88 if (cuda_device_->load_texture_info()) {
89 cuda_device_assert(cuda_device_, cuCtxSynchronize());
90 if (cuda_device_->have_error()) {
91 return false;
92 }
93 }
94
95 /* Compute kernel launch parameters. */
96 const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);
97 const int num_threads_per_block = cuda_kernel.num_threads_per_block;
98 const int num_blocks = divide_up(work_size, num_threads_per_block);
99
100 int shared_mem_bytes = 0;
101
102 switch (kernel) {
111 /* See parall_active_index.h for why this amount of shared memory is needed. */
112 shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
113 break;
114
115 default:
116 break;
117 }
118
119 /* Launch kernel. */
120 assert_success(cuLaunchKernel(cuda_kernel.function,
121 num_blocks,
122 1,
123 1,
124 num_threads_per_block,
125 1,
126 1,
127 shared_mem_bytes,
128 cuda_stream_,
129 const_cast<void **>(args.values),
130 nullptr),
131 "enqueue");
132
133 debug_enqueue_end();
134
135 return !(cuda_device_->have_error());
136}
137
138bool CUDADeviceQueue::synchronize()
139{
140 if (cuda_device_->have_error()) {
141 return false;
142 }
143
144 const CUDAContextScope scope(cuda_device_);
145 assert_success(cuStreamSynchronize(cuda_stream_), "synchronize");
146
147 debug_synchronize();
148
149 return !(cuda_device_->have_error());
150}
151
152void CUDADeviceQueue::zero_to_device(device_memory &mem)
153{
154 assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
155
156 if (mem.memory_size() == 0) {
157 return;
158 }
159
160 /* Allocate on demand. */
161 if (mem.device_pointer == 0) {
162 cuda_device_->mem_alloc(mem);
163 }
164
165 /* Zero memory on device. */
166 assert(mem.device_pointer != 0);
167
168 const CUDAContextScope scope(cuda_device_);
169 assert_success(
170 cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_),
171 "zero_to_device");
172}
173
174void CUDADeviceQueue::copy_to_device(device_memory &mem)
175{
176 assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
177
178 if (mem.memory_size() == 0) {
179 return;
180 }
181
182 /* Allocate on demand. */
183 if (mem.device_pointer == 0) {
184 cuda_device_->mem_alloc(mem);
185 }
186
187 assert(mem.device_pointer != 0);
188 assert(mem.host_pointer != nullptr);
189
190 /* Copy memory to device. */
191 const CUDAContextScope scope(cuda_device_);
192 assert_success(
193 cuMemcpyHtoDAsync(
194 (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_),
195 "copy_to_device");
196}
197
198void CUDADeviceQueue::copy_from_device(device_memory &mem)
199{
200 assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
201
202 if (mem.memory_size() == 0) {
203 return;
204 }
205
206 assert(mem.device_pointer != 0);
207 assert(mem.host_pointer != nullptr);
208
209 /* Copy memory from device. */
210 const CUDAContextScope scope(cuda_device_);
211 assert_success(
212 cuMemcpyDtoHAsync(
213 mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_),
214 "copy_from_device");
215}
216
217void CUDADeviceQueue::assert_success(CUresult result, const char *operation)
218{
219 if (result != CUDA_SUCCESS) {
220 const char *name = cuewErrorString(result);
221 cuda_device_->set_error(string_printf(
222 "%s in CUDA queue %s (%s)", name, operation, debug_active_kernels().c_str()));
223 }
224}
225
/* Create a CUDA graphics interop object bound to this queue.
 * NOTE(review): presumably used to share buffers with the display
 * without a host round-trip — see CUDADeviceGraphicsInterop. */
unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
{
  return make_unique<CUDADeviceGraphicsInterop>(this);
}
230
232
233#endif /* WITH_CUDA */
/* NOTE(review): the lines below are documentation-generator cross-reference
 * residue (symbol index entries), not part of the original translation unit.
 * Commented out to keep the file syntactically coherent; safe to delete.
 *
 * nullptr float
 * @ MEM_TEXTURE
 * #define CCL_NAMESPACE_END
 * #define assert(assertion)
 * const int num_states
 * ccl_gpu_kernel_postfix const ccl_global int ccl_global float const int work_size
 * DeviceKernel
 * @ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY
 * @ DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY
 * @ DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY
 * @ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY
 * @ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY
 * @ DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY
 * @ DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY
 * @ DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY
 * #define LOG_TRACE
 * Definition log.h:108
 * const char * name
 * string string_human_readable_size(size_t size)
 * Definition string.cpp:257
 * CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
 * Definition string.cpp:23
 * void * values[MAX_ARGS]
 * max
 * Definition text_draw.cc:251
 * ccl_device_inline size_t divide_up(const size_t x, const size_t y)
 * Definition types_base.h:52
 */