Blender V4.5
cuda/device_impl.cpp
1/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#ifdef WITH_CUDA
6
7# include <climits>
8# include <cstdio>
9# include <cstdlib>
10# include <cstring>
11
12# include "device/cuda/device_impl.h"
13
14# include "util/debug.h"
15# include "util/log.h"
16# include "util/md5.h"
17# include "util/path.h"
18# include "util/string.h"
19# include "util/system.h"
20# include "util/time.h"
21# include "util/types.h"
22
23# ifdef _WIN32
24# include "util/windows.h"
25# endif
26
28
30
31CCL_NAMESPACE_BEGIN
32
33class CUDADevice;
34
35bool CUDADevice::have_precompiled_kernels()
36{
37 string cubins_path = path_get("lib");
38 return path_exists(cubins_path);
39}
40
41BVHLayoutMask CUDADevice::get_bvh_layout_mask(uint /*kernel_features*/) const
42{
43 return BVH_LAYOUT_BVH2;
44}
45
46void CUDADevice::set_error(const string &error)
47{
48 Device::set_error(error);
49
50 if (first_error) {
51 fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
52 fprintf(stderr,
53 "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
54 first_error = false;
55 }
56}
57
58CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
59 : GPUDevice(info, stats, profiler, headless)
60{
61 /* Verify that base class types can be used with specific backend types */
62 static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
63 static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
64
65 first_error = true;
66
67 cuDevId = info.num;
68 cuDevice = 0;
69 cuContext = nullptr;
70
71 cuModule = nullptr;
72
73 need_texture_info = false;
74
75 pitch_alignment = 0;
76
77 /* Initialize CUDA. */
78 CUresult result = cuInit(0);
79 if (result != CUDA_SUCCESS) {
80 set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
81 return;
82 }
83
84 /* Setup device and context. */
85 result = cuDeviceGet(&cuDevice, cuDevId);
86 if (result != CUDA_SUCCESS) {
87 set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
88 cuewErrorString(result)));
89 return;
90 }
91
92 /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
93 * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
94 * so we can predict which memory to map to host. */
95 int value;
96 cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
97 can_map_host = value != 0;
98
99 cuda_assert(cuDeviceGetAttribute(
100 &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
101
102 if (can_map_host) {
103 init_host_memory();
104 }
105
106 int active = 0;
107 unsigned int ctx_flags = 0;
108 cuda_assert(cuDevicePrimaryCtxGetState(cuDevice, &ctx_flags, &active));
109
110 /* Configure primary context only once. */
111 if (active == 0) {
112 ctx_flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
113 result = cuDevicePrimaryCtxSetFlags(cuDevice, ctx_flags);
114 if (result != CUDA_SUCCESS && result != CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE) {
115 set_error(string_printf("Failed to configure CUDA context (%s)", cuewErrorString(result)));
116 return;
117 }
118 }
119
120 /* Create context. */
121 result = cuDevicePrimaryCtxRetain(&cuContext, cuDevice);
122
123 if (result != CUDA_SUCCESS) {
124 set_error(string_printf("Failed to retain CUDA context (%s)", cuewErrorString(result)));
125 return;
126 }
127
128 int major, minor;
129 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
130 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
131 cuDevArchitecture = major * 100 + minor * 10;
132}
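The constructor follows the CUDA driver API's primary-context pattern: query the context state, change the flags only while the context is still inactive, then retain it (and release it again in the destructor). Below is a minimal standalone sketch of that pattern, illustrative only and not part of this file; the device ordinal 0 and the bare-bones error handling are assumptions of the sketch.

#include <cuda.h>

#include <cstdio>

int main()
{
  if (cuInit(0) != CUDA_SUCCESS) {
    fprintf(stderr, "cuInit failed\n");
    return 1;
  }

  CUdevice device = 0;
  if (cuDeviceGet(&device, 0) != CUDA_SUCCESS) {
    return 1;
  }

  unsigned int flags = 0;
  int is_active = 0;
  cuDevicePrimaryCtxGetState(device, &flags, &is_active);
  if (is_active == 0) {
    /* Flags can only be changed while the primary context is inactive. */
    cuDevicePrimaryCtxSetFlags(device, flags | CU_CTX_LMEM_RESIZE_TO_MAX);
  }

  CUcontext context = nullptr;
  if (cuDevicePrimaryCtxRetain(&context, device) != CUDA_SUCCESS) {
    return 1;
  }

  /* ... use the context ... */

  cuDevicePrimaryCtxRelease(device);
  return 0;
}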
133
134CUDADevice::~CUDADevice()
135{
136 texture_info.free();
137 if (cuModule) {
138 cuda_assert(cuModuleUnload(cuModule));
139 }
140 cuda_assert(cuDevicePrimaryCtxRelease(cuDevice));
141}
142
143bool CUDADevice::support_device(const uint /*kernel_features*/)
144{
145 int major, minor;
146 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
147 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
148
149 /* We only support sm_30 and above */
150 if (major < 3) {
151 set_error(string_printf(
152 "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
153 return false;
154 }
155
156 return true;
157}
158
159bool CUDADevice::check_peer_access(Device *peer_device)
160{
161 if (peer_device == this) {
162 return false;
163 }
164 if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
165 return false;
166 }
167
168 CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
169
170 int can_access = 0;
171 cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
172 if (can_access == 0) {
173 return false;
174 }
175
176 // Ensure array access over the link is possible as well (for 3D textures)
177 cuda_assert(cuDeviceGetP2PAttribute(&can_access,
178 CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
179 cuDevice,
180 peer_device_cuda->cuDevice));
181 if (can_access == 0) {
182 return false;
183 }
184
185 // Enable peer access in both directions
186 {
187 const CUDAContextScope scope(this);
188 CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
189 if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
190 set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
191 cuewErrorString(result)));
192 return false;
193 }
194 }
195 {
196 const CUDAContextScope scope(peer_device_cuda);
197 CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
198 if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
199 set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
200 cuewErrorString(result)));
201 return false;
202 }
203 }
204
205 return true;
206}
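Peer access in the driver API is directional, which is why check_peer_access() enables it from both contexts. A condensed standalone sketch of the same idea (illustrative only; the function name and the explicit context push/pop are assumptions of the sketch):

#include <cuda.h>

static bool enable_bidirectional_peer_access(CUdevice device_a,
                                             CUdevice device_b,
                                             CUcontext context_a,
                                             CUcontext context_b)
{
  int can_access = 0;
  if (cuDeviceCanAccessPeer(&can_access, device_a, device_b) != CUDA_SUCCESS || can_access == 0) {
    return false;
  }

  const auto accepted = [](const CUresult result) {
    return result == CUDA_SUCCESS || result == CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED;
  };

  CUcontext popped = nullptr;

  /* Give context_a access to memory allocated in context_b ... */
  cuCtxPushCurrent(context_a);
  const CUresult a_to_b = cuCtxEnablePeerAccess(context_b, 0);
  cuCtxPopCurrent(&popped);

  /* ... and the other direction. */
  cuCtxPushCurrent(context_b);
  const CUresult b_to_a = cuCtxEnablePeerAccess(context_a, 0);
  cuCtxPopCurrent(&popped);

  return accepted(a_to_b) && accepted(b_to_a);
}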
207
208bool CUDADevice::use_adaptive_compilation()
209{
210 return DebugFlags().cuda.adaptive_compile;
211}
212
213/* Common NVCC flags which stay the same regardless of shading model or
214 * kernel sources md5, and only depend on compiler or compilation settings.
215 */
216string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
217{
218 const int machine = system_cpu_bits();
219 const string source_path = path_get("source");
220 const string include_path = source_path;
221 string cflags = string_printf(
222 "-m%d "
223 "--ptxas-options=\"-v\" "
224 "--use_fast_math "
225 "-DNVCC "
226 "-I\"%s\"",
227 machine,
228 include_path.c_str());
229 if (use_adaptive_compilation()) {
230 cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
231 }
232 const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
233 if (extra_cflags) {
234 cflags += string(" ") + string(extra_cflags);
235 }
236
237# ifdef WITH_NANOVDB
238 cflags += " -DWITH_NANOVDB";
239# endif
240
241# ifdef WITH_CYCLES_DEBUG
242 cflags += " -DWITH_CYCLES_DEBUG";
243# endif
244
245 return cflags;
246}
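For illustration only (the path is hypothetical): on a 64-bit host with the Cycles sources under /opt/blender/cycles, the flags assembled above come out roughly as

  -m64 --ptxas-options="-v" --use_fast_math -DNVCC -I"/opt/blender/cycles"

with -D__KERNEL_FEATURES__=<mask> appended for adaptive compilation, anything set in the CYCLES_CUDA_EXTRA_CFLAGS environment variable, and -DWITH_NANOVDB / -DWITH_CYCLES_DEBUG when those build options are enabled.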
247
248string CUDADevice::compile_kernel(const string &common_cflags,
249 const char *name,
250 const char *base,
251 bool force_ptx)
252{
253 /* Compute kernel name. */
254 int major, minor;
255 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
256 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
257
258 /* Attempt to use kernel provided with Blender. */
259 if (!use_adaptive_compilation()) {
260 if (!force_ptx) {
261 const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin.zst", name, major, minor));
262 VLOG_INFO << "Testing for pre-compiled kernel " << cubin << ".";
263 if (path_exists(cubin)) {
264 VLOG_INFO << "Using precompiled kernel.";
265 return cubin;
266 }
267 }
268
269 /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
270 int ptx_major = major, ptx_minor = minor;
271 while (ptx_major >= 3) {
272 const string ptx = path_get(
273 string_printf("lib/%s_compute_%d%d.ptx.zst", name, ptx_major, ptx_minor));
274 VLOG_INFO << "Testing for pre-compiled kernel " << ptx << ".";
275 if (path_exists(ptx)) {
276 VLOG_INFO << "Using precompiled kernel.";
277 return ptx;
278 }
279
280 if (ptx_minor > 0) {
281 ptx_minor--;
282 }
283 else {
284 ptx_major--;
285 ptx_minor = 9;
286 }
287 }
288 }
289
290 /* Try to use locally compiled kernel. */
291 string source_path = path_get("source");
292 const string source_md5 = path_files_md5_hash(source_path);
293
294 /* We include cflags in the md5, so that changing the CUDA toolkit or other
295 * compiler command line arguments makes sure the cubin gets re-built.
296 */
297 const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
298
299 const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
300 const char *const kernel_arch = force_ptx ? "compute" : "sm";
301 const string cubin_file = string_printf(
302 "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
303 const string cubin = path_cache_get(path_join("kernels", cubin_file));
304 VLOG_INFO << "Testing for locally compiled kernel " << cubin << ".";
305 if (path_exists(cubin)) {
306 VLOG_INFO << "Using locally compiled kernel.";
307 return cubin;
308 }
309
310# ifdef _WIN32
311 if (!use_adaptive_compilation() && have_precompiled_kernels()) {
312 if (major < 3) {
313 set_error(
314 string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
315 "Your GPU is not supported.",
316 major,
317 minor));
318 }
319 else {
320 set_error(
321 string_printf("CUDA binary kernel for this graphics card compute "
322 "capability (%d.%d) not found.",
323 major,
324 minor));
325 }
326 return string();
327 }
328# endif
329
330 /* Compile. */
331 const char *const nvcc = cuewCompilerPath();
332 if (nvcc == nullptr) {
333 set_error(
334 "CUDA nvcc compiler not found. "
335 "Install CUDA toolkit in default location.");
336 return string();
337 }
338
339 const int nvcc_cuda_version = cuewCompilerVersion();
340 VLOG_INFO << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
341 if (nvcc_cuda_version < 101) {
342 printf(
343 "Unsupported CUDA version %d.%d detected, "
344 "you need CUDA 10.1 or newer.\n",
345 nvcc_cuda_version / 10,
346 nvcc_cuda_version % 10);
347 return string();
348 }
349 if (!(nvcc_cuda_version >= 102 && nvcc_cuda_version < 130)) {
350 printf(
351 "CUDA version %d.%d detected, build may succeed but only "
352 "CUDA 10.1 to 12 are officially supported.\n",
353 nvcc_cuda_version / 10,
354 nvcc_cuda_version % 10);
355 }
356
357 double starttime = time_dt();
358
359 path_create_directories(cubin);
360
361 source_path = path_join(path_join(source_path, "kernel"),
362 path_join("device", path_join(base, string_printf("%s.cu", name))));
363
364 string command = string_printf(
365 "\"%s\" "
366 "-arch=%s_%d%d "
367 "--%s \"%s\" "
368 "-o \"%s\" "
369 "%s",
370 nvcc,
371 kernel_arch,
372 major,
373 minor,
374 kernel_ext,
375 source_path.c_str(),
376 cubin.c_str(),
377 common_cflags.c_str());
378
379 printf("Compiling %sCUDA kernel ...\n%s\n",
380 (use_adaptive_compilation()) ? "adaptive " : "",
381 command.c_str());
382
383# ifdef _WIN32
384 command = "call " + command;
385# endif
386 if (system(command.c_str()) != 0) {
387 set_error(
388 "Failed to execute compilation command, "
389 "see console for details.");
390 return string();
391 }
392
393 /* Verify if compilation succeeded */
394 if (!path_exists(cubin)) {
395 set_error(
396 "CUDA kernel compilation failed, "
397 "see console for details.");
398 return string();
399 }
400
401 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
402
403 return cubin;
404}
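As an illustrative example of the command assembled above (the paths and the md5 placeholder are hypothetical), building the main kernel as a cubin for an sm_86 device would invoke roughly

  "/usr/local/cuda/bin/nvcc" -arch=sm_86 --cubin "/opt/blender/cycles/kernel/device/cuda/kernel.cu" -o "/home/user/.cache/cycles/kernels/cycles_kernel_sm_86_<md5>.cubin" <common cflags>

With force_ptx the architecture becomes compute_86 and the output switches to --ptx, which the driver can then JIT-compile for newer GPUs.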
405
406bool CUDADevice::load_kernels(const uint kernel_features)
407{
408 /* TODO(sergey): Support kernels re-load for CUDA devices adaptive compile.
409 *
410 * Currently re-loading kernel will invalidate memory pointers,
411 * causing problems in cuCtxSynchronize.
412 */
413 if (cuModule) {
414 if (use_adaptive_compilation()) {
415 VLOG_INFO
416 << "Skipping CUDA kernel reload for adaptive compilation, not currently supported.";
417 }
418 return true;
419 }
420
421 /* check if cuda init succeeded */
422 if (cuContext == nullptr) {
423 return false;
424 }
425
426 /* check if GPU is supported */
427 if (!support_device(kernel_features)) {
428 return false;
429 }
430
431 /* get kernel */
432 const char *kernel_name = "kernel";
433 string cflags = compile_kernel_get_common_cflags(kernel_features);
434 string cubin = compile_kernel(cflags, kernel_name);
435 if (cubin.empty()) {
436 return false;
437 }
438
439 /* open module */
440 CUDAContextScope scope(this);
441
442 string cubin_data;
443 CUresult result;
444
445 if (path_read_compressed_text(cubin, cubin_data)) {
446 result = cuModuleLoadData(&cuModule, cubin_data.c_str());
447 }
448 else {
449 result = CUDA_ERROR_FILE_NOT_FOUND;
450 }
451
452 if (result != CUDA_SUCCESS) {
453 set_error(string_printf(
454 "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
455 }
456
457 if (result == CUDA_SUCCESS) {
458 kernels.load(this);
459 reserve_local_memory(kernel_features);
460 }
461
462 return (result == CUDA_SUCCESS);
463}
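A minimal standalone sketch of the module-loading step performed above with cuModuleLoadData (illustrative only; the file name "kernel.cubin" and the entry-point name "my_kernel" are assumptions, and the image is read uncompressed instead of going through path_read_compressed_text):

#include <cuda.h>

#include <fstream>
#include <iterator>
#include <string>

static bool load_cubin_module(CUcontext context, CUmodule *module, CUfunction *function)
{
  std::ifstream file("kernel.cubin", std::ios::binary);
  const std::string image((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
  if (image.empty()) {
    return false;
  }

  CUcontext popped = nullptr;
  cuCtxPushCurrent(context);

  /* cuModuleLoadData accepts a cubin, fatbin or PTX image held in memory. */
  const bool ok = cuModuleLoadData(module, image.data()) == CUDA_SUCCESS &&
                  cuModuleGetFunction(function, *module, "my_kernel") == CUDA_SUCCESS;

  cuCtxPopCurrent(&popped);
  return ok;
}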
464
465void CUDADevice::reserve_local_memory(const uint kernel_features)
466{
467 /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
468 * needed for kernel launches, so that we can reliably figure out when
469 * to allocate scene data in mapped host memory. */
470 size_t total = 0, free_before = 0, free_after = 0;
471
472 {
473 CUDAContextScope scope(this);
474 cuMemGetInfo(&free_before, &total);
475 }
476
477 {
478 /* Use the biggest kernel for estimation. */
479 const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
480 DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
481 (kernel_features & KERNEL_FEATURE_MNEE) ?
482 DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
483 DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
484
485 /* Launch the kernel; using just 1 block appears sufficient to reserve memory for all
486 * multiprocessors. It would still be good to do this in parallel for the multi-GPU case
487 * to make it faster. */
488 CUDADeviceQueue queue(this);
489
490 device_ptr d_path_index = 0;
491 device_ptr d_render_buffer = 0;
492 int d_work_size = 0;
493 DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
494
495 queue.init_execution();
496 queue.enqueue(test_kernel, 1, args);
497 queue.synchronize();
498 }
499
500 {
501 CUDAContextScope scope(this);
502 cuMemGetInfo(&free_after, &total);
503 }
504
505 VLOG_INFO << "Local memory reserved " << string_human_readable_number(free_before - free_after)
506 << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
507
508# if 0
509 /* For testing mapped host memory, fill up device memory. */
510 const size_t keep_mb = 1024;
511
512 while (free_after > keep_mb * 1024 * 1024LL) {
513 CUdeviceptr tmp;
514 cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
515 cuMemGetInfo(&free_after, &total);
516 }
517# endif
518}
519
520void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
521{
522 CUDAContextScope scope(this);
523
524 cuMemGetInfo(&free, &total);
525}
526
527bool CUDADevice::alloc_device(void *&device_pointer, const size_t size)
528{
529 CUDAContextScope scope(this);
530
531 CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
532 return mem_alloc_result == CUDA_SUCCESS;
533}
534
535void CUDADevice::free_device(void *device_pointer)
536{
537 CUDAContextScope scope(this);
538
539 cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
540}
541
542bool CUDADevice::shared_alloc(void *&shared_pointer, const size_t size)
543{
544 CUDAContextScope scope(this);
545
546 CUresult mem_alloc_result = cuMemHostAlloc(
547 &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
548 return mem_alloc_result == CUDA_SUCCESS;
549}
550
551void CUDADevice::shared_free(void *shared_pointer)
552{
553 CUDAContextScope scope(this);
554
555 cuMemFreeHost(shared_pointer);
556}
557
558void *CUDADevice::shared_to_device_pointer(const void *shared_pointer)
559{
560 CUDAContextScope scope(this);
561 void *device_pointer = nullptr;
562 cuda_assert(
563 cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, (void *)shared_pointer, 0));
564 return device_pointer;
565}
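shared_alloc(), shared_free() and shared_to_device_pointer() implement the mapped host ("zero-copy") memory path. A self-contained sketch of the same driver API pattern (illustrative only; it assumes a CUDA context is already current on the calling thread):

#include <cuda.h>

#include <cstddef>

static bool alloc_mapped_host(const size_t size, void **host_pointer, CUdeviceptr *device_pointer)
{
  /* Pinned, device-mapped, write-combined host allocation. */
  if (cuMemHostAlloc(host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED) !=
      CUDA_SUCCESS)
  {
    return false;
  }

  /* The device-visible address may differ from the host address; kernels must use this one. */
  if (cuMemHostGetDevicePointer(device_pointer, *host_pointer, 0) != CUDA_SUCCESS) {
    cuMemFreeHost(*host_pointer);
    *host_pointer = nullptr;
    return false;
  }
  return true;
}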
566
567void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, const size_t size)
568{
569 const CUDAContextScope scope(this);
570
571 cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
572}
573
574void CUDADevice::mem_alloc(device_memory &mem)
575{
576 if (mem.type == MEM_TEXTURE) {
577 assert(!"mem_alloc not supported for textures.");
578 }
579 else if (mem.type == MEM_GLOBAL) {
580 assert(!"mem_alloc not supported for global memory.");
581 }
582 else {
583 generic_alloc(mem);
584 }
585}
586
587void CUDADevice::mem_copy_to(device_memory &mem)
588{
589 if (mem.type == MEM_GLOBAL) {
590 global_copy_to(mem);
591 }
592 else if (mem.type == MEM_TEXTURE) {
593 tex_copy_to((device_texture &)mem);
594 }
595 else {
596 if (!mem.device_pointer) {
597 generic_alloc(mem);
598 generic_copy_to(mem);
599 }
600 else if (mem.is_resident(this)) {
601 generic_copy_to(mem);
602 }
603 }
604}
605
606void CUDADevice::mem_move_to_host(device_memory &mem)
607{
608 if (mem.type == MEM_GLOBAL) {
609 global_free(mem);
610 global_alloc(mem);
611 }
612 else if (mem.type == MEM_TEXTURE) {
613 tex_free((device_texture &)mem);
614 tex_alloc((device_texture &)mem);
615 }
616 else {
617 assert(!"mem_move_to_host only supported for texture and global memory");
618 }
619}
620
621void CUDADevice::mem_copy_from(
622 device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem)
623{
624 if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
625 assert(!"mem_copy_from not supported for textures.");
626 }
627 else if (mem.host_pointer) {
628 const size_t size = elem * w * h;
629 const size_t offset = elem * y * w;
630
631 if (mem.device_pointer) {
632 const CUDAContextScope scope(this);
633 cuda_assert(cuMemcpyDtoH(
634 (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
635 }
636 else {
637 memset((char *)mem.host_pointer + offset, 0, size);
638 }
639 }
640}
641
642void CUDADevice::mem_zero(device_memory &mem)
643{
644 if (!mem.device_pointer) {
645 mem_alloc(mem);
646 }
647 if (!mem.device_pointer) {
648 return;
649 }
650
651 if (!(mem.is_shared(this) && mem.host_pointer == mem.shared_pointer)) {
652 const CUDAContextScope scope(this);
653 cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
654 }
655 else if (mem.host_pointer) {
656 memset(mem.host_pointer, 0, mem.memory_size());
657 }
658}
659
660void CUDADevice::mem_free(device_memory &mem)
661{
662 if (mem.type == MEM_GLOBAL) {
663 global_free(mem);
664 }
665 else if (mem.type == MEM_TEXTURE) {
666 tex_free((device_texture &)mem);
667 }
668 else {
669 generic_free(mem);
670 }
671}
672
673device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, const size_t offset, size_t /*size*/)
674{
675 return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
676}
677
678void CUDADevice::const_copy_to(const char *name, void *host, const size_t size)
679{
680 CUDAContextScope scope(this);
681 CUdeviceptr mem;
682 size_t bytes;
683
684 cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, "kernel_params"));
685 assert(bytes == sizeof(KernelParamsCUDA));
686
687 /* Update data storage pointers in launch parameters. */
688# define KERNEL_DATA_ARRAY(data_type, data_name) \
689 if (strcmp(name, #data_name) == 0) { \
690 cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, data_name), host, size)); \
691 return; \
692 }
693 KERNEL_DATA_ARRAY(KernelData, data)
694 KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
695# include "kernel/data_arrays.h"
696# undef KERNEL_DATA_ARRAY
697}
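To make the macro above concrete: for a hypothetical data array named example_array, a single expansion of KERNEL_DATA_ARRAY is equivalent to

  if (strcmp(name, "example_array") == 0) {
    cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, example_array), host, size));
    return;
  }

so the include of kernel/data_arrays.h generates one such branch per array, copying the new value into the matching field of the kernel_params global.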
698
699void CUDADevice::global_alloc(device_memory &mem)
700{
701 if (mem.is_resident(this)) {
702 generic_alloc(mem);
703 generic_copy_to(mem);
704 }
705
706 const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
707}
708
709void CUDADevice::global_copy_to(device_memory &mem)
710{
711 if (!mem.device_pointer) {
712 generic_alloc(mem);
713 generic_copy_to(mem);
714 }
715 else if (mem.is_resident(this)) {
716 generic_copy_to(mem);
717 }
718
719 const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
720}
721
722void CUDADevice::global_free(device_memory &mem)
723{
724 if (mem.is_resident(this) && mem.device_pointer) {
725 generic_free(mem);
726 }
727}
728
729static size_t tex_src_pitch(const device_texture &mem)
730{
731 return mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
732}
733
734static CUDA_MEMCPY2D tex_2d_copy_param(const device_texture &mem, const int pitch_alignment)
735{
736 /* 2D texture using pitch aligned linear memory. */
737 const size_t src_pitch = tex_src_pitch(mem);
738 const size_t dst_pitch = align_up(src_pitch, pitch_alignment);
739
740 CUDA_MEMCPY2D param;
741 memset(&param, 0, sizeof(param));
742 param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
743 param.dstDevice = mem.device_pointer;
744 param.dstPitch = dst_pitch;
745 param.srcMemoryType = CU_MEMORYTYPE_HOST;
746 param.srcHost = mem.host_pointer;
747 param.srcPitch = src_pitch;
748 param.WidthInBytes = param.srcPitch;
749 param.Height = mem.data_height;
750
751 return param;
752}
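A worked example of the pitch computation above, assuming the device reports a texture pitch alignment of 32 bytes (a typical value for CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT): a 1000-pixel wide, 4-channel byte texture has src_pitch = 1000 * 4 * 1 = 4000 bytes, and align_up(4000, 32) = 4000, so the rows need no padding. With a hypothetical alignment of 512 bytes the destination pitch would instead round up to 4096, padding every row by 96 bytes while srcPitch stays at 4000.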
753
754static CUDA_MEMCPY3D tex_3d_copy_param(const device_texture &mem)
755{
756 const size_t src_pitch = tex_src_pitch(mem);
757
758 CUDA_MEMCPY3D param;
759 memset(&param, 0, sizeof(param));
760 param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
761 param.dstArray = (CUarray)mem.device_pointer;
762 param.srcMemoryType = CU_MEMORYTYPE_HOST;
763 param.srcHost = mem.host_pointer;
764 param.srcPitch = src_pitch;
765 param.WidthInBytes = param.srcPitch;
766 param.Height = mem.data_height;
767 param.Depth = mem.data_depth;
768
769 return param;
770}
771
772void CUDADevice::tex_alloc(device_texture &mem)
773{
774 CUDAContextScope scope(this);
775
776 CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
777 switch (mem.info.extension) {
778 case EXTENSION_REPEAT:
779 address_mode = CU_TR_ADDRESS_MODE_WRAP;
780 break;
781 case EXTENSION_EXTEND:
782 address_mode = CU_TR_ADDRESS_MODE_CLAMP;
783 break;
784 case EXTENSION_CLIP:
785 address_mode = CU_TR_ADDRESS_MODE_BORDER;
786 break;
787 case EXTENSION_MIRROR:
788 address_mode = CU_TR_ADDRESS_MODE_MIRROR;
789 break;
790 default:
791 assert(0);
792 break;
793 }
794
795 CUfilter_mode filter_mode;
796 if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
797 filter_mode = CU_TR_FILTER_MODE_POINT;
798 }
799 else {
800 filter_mode = CU_TR_FILTER_MODE_LINEAR;
801 }
802
803 /* Image Texture Storage */
804 /* Cycles expects to read all texture data as normalized float values in
805 * kernel/device/gpu/image.h. But storing all data as floats would be very inefficient due to the
806 * huge size of float textures. So in the code below, we define different texture types including
807 * integer types, with the aim of using CUDA's default promotion behavior of integer data to
808 * floating point data in the range [0, 1], as noted in the CUDA documentation on
809 * cuTexObjectCreate API Call.
810 * Note that 32-bit integers are not supported by this promotion behavior and cannot be used
811 * with Cycles's current implementation in kernel/device/gpu/image.h.
812 */
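 /* Illustrative note (not part of the original comment): with
  * CU_AD_FORMAT_UNSIGNED_INT8 and the CU_TRSF_READ_AS_INTEGER flag left unset,
  * a stored byte value of 255 is read in the kernel as 1.0f and a value of 128
  * as 128/255 ~= 0.502f; CU_AD_FORMAT_UNSIGNED_INT16 is normalized the same
  * way with a divisor of 65535. */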
813 CUarray_format_enum format;
814 switch (mem.data_type) {
815 case TYPE_UCHAR:
816 format = CU_AD_FORMAT_UNSIGNED_INT8;
817 break;
818 case TYPE_UINT16:
819 format = CU_AD_FORMAT_UNSIGNED_INT16;
820 break;
821 case TYPE_FLOAT:
822 format = CU_AD_FORMAT_FLOAT;
823 break;
824 case TYPE_HALF:
825 format = CU_AD_FORMAT_HALF;
826 break;
827 default:
828 assert(0);
829 return;
830 }
831
832 Mem *cmem = nullptr;
833 CUarray array_3d = nullptr;
834
835 if (!mem.is_resident(this)) {
836 thread_scoped_lock lock(device_mem_map_mutex);
837 cmem = &device_mem_map[&mem];
838 cmem->texobject = 0;
839
840 if (mem.data_depth > 1) {
841 array_3d = (CUarray)mem.device_pointer;
842 cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
843 }
844 }
845 else if (mem.data_depth > 1) {
846 /* 3D texture using array, there is no API for linear memory. */
847 CUDA_ARRAY3D_DESCRIPTOR desc;
848
849 desc.Width = mem.data_width;
850 desc.Height = mem.data_height;
851 desc.Depth = mem.data_depth;
852 desc.Format = format;
853 desc.NumChannels = mem.data_elements;
854 desc.Flags = 0;
855
856 VLOG_WORK << "Array 3D allocate: " << mem.name << ", "
857 << string_human_readable_number(mem.memory_size()) << " bytes. ("
858 << string_human_readable_size(mem.memory_size()) << ")";
859
860 cuda_assert(cuArray3DCreate(&array_3d, &desc));
861
862 if (!array_3d) {
863 return;
864 }
865
866 mem.device_pointer = (device_ptr)array_3d;
867 mem.device_size = mem.memory_size();
868 stats.mem_alloc(mem.memory_size());
869
870 const CUDA_MEMCPY3D param = tex_3d_copy_param(mem);
871 cuda_assert(cuMemcpy3D(&param));
872
873 thread_scoped_lock lock(device_mem_map_mutex);
874 cmem = &device_mem_map[&mem];
875 cmem->texobject = 0;
876 cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
877 }
878 else if (mem.data_height > 0) {
879 /* 2D texture, using pitch aligned linear memory. */
880 const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
881 const size_t dst_size = dst_pitch * mem.data_height;
882
883 cmem = generic_alloc(mem, dst_size - mem.memory_size());
884 if (!cmem) {
885 return;
886 }
887
888 const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
889 cuda_assert(cuMemcpy2DUnaligned(&param));
890 }
891 else {
892 /* 1D texture, using linear memory. */
893 cmem = generic_alloc(mem);
894 if (!cmem) {
895 return;
896 }
897
898 cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, mem.memory_size()));
899 }
900
901 /* Set Mapping and tag that we need to (re-)upload to device */
902 TextureInfo tex_info = mem.info;
903
908 {
909 CUDA_RESOURCE_DESC resDesc;
910 memset(&resDesc, 0, sizeof(resDesc));
911
912 if (array_3d) {
913 resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
914 resDesc.res.array.hArray = array_3d;
915 resDesc.flags = 0;
916 }
917 else if (mem.data_height > 0) {
918 const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
919
920 resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
921 resDesc.res.pitch2D.devPtr = mem.device_pointer;
922 resDesc.res.pitch2D.format = format;
923 resDesc.res.pitch2D.numChannels = mem.data_elements;
924 resDesc.res.pitch2D.height = mem.data_height;
925 resDesc.res.pitch2D.width = mem.data_width;
926 resDesc.res.pitch2D.pitchInBytes = dst_pitch;
927 }
928 else {
929 resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
930 resDesc.res.linear.devPtr = mem.device_pointer;
931 resDesc.res.linear.format = format;
932 resDesc.res.linear.numChannels = mem.data_elements;
933 resDesc.res.linear.sizeInBytes = mem.device_size;
934 }
935
936 CUDA_TEXTURE_DESC texDesc;
937 memset(&texDesc, 0, sizeof(texDesc));
938 texDesc.addressMode[0] = address_mode;
939 texDesc.addressMode[1] = address_mode;
940 texDesc.addressMode[2] = address_mode;
941 texDesc.filterMode = filter_mode;
942 /* CUDA's flag CU_TRSF_READ_AS_INTEGER is intentionally not used; this is
943 * significant, see the explanation above about how Blender treats textures. */
944 texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
945
946 thread_scoped_lock lock(device_mem_map_mutex);
947 cmem = &device_mem_map[&mem];
948
949 cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, nullptr));
950
951 tex_info.data = (uint64_t)cmem->texobject;
952 }
953 else {
954 tex_info.data = (uint64_t)mem.device_pointer;
955 }
956
957 {
958 /* Update texture info. */
959 thread_scoped_lock lock(texture_info_mutex);
960 const uint slot = mem.slot;
961 if (slot >= texture_info.size()) {
962 /* Allocate some slots in advance, to reduce amount of re-allocations. */
963 texture_info.resize(slot + 128);
964 }
965 texture_info[slot] = tex_info;
966 need_texture_info = true;
967 }
968}
969
970void CUDADevice::tex_copy_to(device_texture &mem)
971{
972 if (!mem.device_pointer) {
973 /* Not yet allocated on device. */
974 tex_alloc(mem);
975 }
976 else if (!mem.is_resident(this)) {
977 /* Peering with another device, may still need to create texture info and object. */
978 bool texture_allocated = false;
979 {
980 thread_scoped_lock lock(texture_info_mutex);
981 texture_allocated = mem.slot < texture_info.size() && texture_info[mem.slot].data != 0;
982 }
983 if (!texture_allocated) {
984 tex_alloc(mem);
985 }
986 }
987 else {
988 /* Resident and fully allocated, only copy. */
989 if (mem.data_depth > 0) {
990 CUDAContextScope scope(this);
991 const CUDA_MEMCPY3D param = tex_3d_copy_param(mem);
992 cuda_assert(cuMemcpy3D(&param));
993 }
994 else if (mem.data_height > 0) {
995 CUDAContextScope scope(this);
996 const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
997 cuda_assert(cuMemcpy2DUnaligned(&param));
998 }
999 else {
1000 generic_copy_to(mem);
1001 }
1002 }
1003}
1004
1005void CUDADevice::tex_free(device_texture &mem)
1006{
1007 CUDAContextScope scope(this);
1008 thread_scoped_lock lock(device_mem_map_mutex);
1009
1010 /* Check if the memory was allocated for this device. */
1011 auto it = device_mem_map.find(&mem);
1012 if (it == device_mem_map.end()) {
1013 return;
1014 }
1015
1016 const Mem &cmem = it->second;
1017
1018 /* Always clear texture info and texture object, regardless of residency. */
1019 {
1020 thread_scoped_lock lock(texture_info_mutex);
1021 texture_info[mem.slot] = TextureInfo();
1022 }
1023
1024 if (cmem.texobject) {
1025 /* Free bindless texture. */
1026 cuTexObjectDestroy(cmem.texobject);
1027 }
1028
1029 if (!mem.is_resident(this)) {
1030 /* Do not free memory here, since it was allocated on a different device. */
1031 device_mem_map.erase(device_mem_map.find(&mem));
1032 }
1033 else if (cmem.array) {
1034 /* Free array. */
1035 cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
1036 stats.mem_free(mem.device_size);
1037 mem.device_pointer = 0;
1038 mem.device_size = 0;
1039
1040 device_mem_map.erase(device_mem_map.find(&mem));
1041 }
1042 else {
1043 lock.unlock();
1044 generic_free(mem);
1045 }
1046}
1047
1048unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
1049{
1050 return make_unique<CUDADeviceQueue>(this);
1051}
1052
1053bool CUDADevice::should_use_graphics_interop(const GraphicsInteropDevice &interop_device,
1054 const bool log)
1055{
1056 if (headless) {
1057 /* Avoid any call which might involve interaction with a graphics backend when we know that
1058 * we don't have an active graphics context. This avoids a crash on certain platforms when
1059 * calling cuGLGetDevices(). */
1060 return false;
1061 }
1062
1063 CUDAContextScope scope(this);
1064
1065 switch (interop_device.type) {
1066 case GraphicsInteropDevice::OPENGL: {
1067 /* Check whether this device is part of the OpenGL context.
1068 *
1069 * Using a CUDA device for graphics interoperability which is not part of the OpenGL
1070 * context is possible, but from empirical measurements it can be considerably slower
1071 * than a naive pixel copy. */
1072 int num_all_devices = 0;
1073 cuda_assert(cuDeviceGetCount(&num_all_devices));
1074
1075 if (num_all_devices == 0) {
1076 return false;
1077 }
1078
1079 vector<CUdevice> gl_devices(num_all_devices);
1080 uint num_gl_devices = 0;
1081 cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
1082
1083 bool found = false;
1084 for (uint i = 0; i < num_gl_devices; ++i) {
1085 if (gl_devices[i] == cuDevice) {
1086 found = true;
1087 break;
1088 }
1089 }
1090
1091 if (log) {
1092 if (found) {
1093 VLOG_INFO << "Graphics interop: found matching OpenGL device for CUDA";
1094 }
1095 else {
1096 VLOG_INFO << "Graphics interop: no matching OpenGL device for CUDA";
1097 }
1098 }
1099
1100 return found;
1101 }
1102 case ccl::GraphicsInteropDevice::VULKAN: {
1103 /* Only do interop with matching device UUID. */
1104 CUuuid uuid = {};
1105 cuDeviceGetUuid(&uuid, cuDevice);
1106 const bool found = (sizeof(uuid.bytes) == interop_device.uuid.size() &&
1107 memcmp(uuid.bytes, interop_device.uuid.data(), sizeof(uuid.bytes)) == 0);
1108
1109 if (log) {
1110 if (found) {
1111 VLOG_INFO << "Graphics interop: found matching Vulkan device for CUDA";
1112 }
1113 else {
1114 VLOG_INFO << "Graphics interop: no matching Vulkan device for CUDA";
1115 }
1116
1117 VLOG_INFO << "Graphics Interop: CUDA UUID "
1118 << string_hex(reinterpret_cast<uint8_t *>(uuid.bytes), sizeof(uuid.bytes))
1119 << ", Vulkan UUID "
1120 << string_hex(interop_device.uuid.data(), interop_device.uuid.size());
1121 }
1122
1123 return found;
1124 }
1125 case GraphicsInteropDevice::METAL:
1126 case GraphicsInteropDevice::NONE: {
1127 return false;
1128 }
1129 }
1130
1131 return false;
1132}
1133
1134int CUDADevice::get_num_multiprocessors()
1135{
1136 return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
1137}
1138
1139int CUDADevice::get_max_num_threads_per_multiprocessor()
1140{
1141 return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
1142}
1143
1144bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
1145{
1146 CUDAContextScope scope(this);
1147
1148 return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
1149}
1150
1151int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, const int default_value)
1152{
1153 int value = 0;
1154 if (!get_device_attribute(attribute, &value)) {
1155 return default_value;
1156 }
1157 return value;
1158}
1159
1160CCL_NAMESPACE_END
1161
1162#endif