Blender V5.0
cuda/device_impl.cpp
1/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#ifdef WITH_CUDA
6
7# include <cstdio>
8# include <cstdlib>
9# include <cstring>
10# include <iomanip>
11
12# include "device/cuda/device_impl.h"
13
14# include "util/debug.h"
15# include "util/log.h"
16# include "util/md5.h"
17# include "util/path.h"
18# include "util/string.h"
19# include "util/system.h"
20# include "util/texture.h"
21# include "util/time.h"
22# include "util/types.h"
23
24# ifdef _WIN32
25# include "util/windows.h"
26# endif
27
29
31
32CCL_NAMESPACE_BEGIN
33
34class CUDADevice;
35
36bool CUDADevice::have_precompiled_kernels()
37{
38 string cubins_path = path_get("lib");
39 return path_exists(cubins_path);
40}
41
42BVHLayoutMask CUDADevice::get_bvh_layout_mask(uint /*kernel_features*/) const
43{
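  /* Unlike the OptiX backend, the CUDA backend only traverses Cycles' own BVH2
   * layout; it has no access to hardware ray-tracing acceleration structures. */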
44 return BVH_LAYOUT_BVH2;
45}
46
47void CUDADevice::set_error(const string &error)
48{
49 Device::set_error(error);
50
51 if (first_error) {
52 LOG_ERROR << "Refer to the Cycles GPU rendering documentation for possible solutions:\n"
53 "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n";
54 first_error = false;
55 }
56}
57
58CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
59 : GPUDevice(info, stats, profiler, headless)
60{
61 /* Verify that base class types can be used with specific backend types */
62 static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
63 static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
64
65 first_error = true;
66
67 cuDevId = info.num;
68 cuDevice = 0;
69 cuContext = nullptr;
70
71 cuModule = nullptr;
72
73 need_texture_info = false;
74
75 pitch_alignment = 0;
76
77 /* Initialize CUDA. */
78 CUresult result = cuInit(0);
79 if (result != CUDA_SUCCESS) {
80 set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
81 return;
82 }
83
84 /* Setup device and context. */
85 result = cuDeviceGet(&cuDevice, cuDevId);
86 if (result != CUDA_SUCCESS) {
87 set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
88 cuewErrorString(result)));
89 return;
90 }
91
92 /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
93 * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
94 * so we can predict which memory to map to host. */
95 int value;
96 cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
97 can_map_host = value != 0;
98
99 cuda_assert(cuDeviceGetAttribute(
100 &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
101
102 if (can_map_host) {
103 init_host_memory();
104 }
105
106 int active = 0;
107 unsigned int ctx_flags = 0;
108 cuda_assert(cuDevicePrimaryCtxGetState(cuDevice, &ctx_flags, &active));
109
110 /* Configure primary context only once. */
111 if (active == 0) {
112 ctx_flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
113 result = cuDevicePrimaryCtxSetFlags(cuDevice, ctx_flags);
114 if (result != CUDA_SUCCESS && result != CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE) {
115 set_error(string_printf("Failed to configure CUDA context (%s)", cuewErrorString(result)));
116 return;
117 }
118 }
119
120 /* Create context. */
121 result = cuDevicePrimaryCtxRetain(&cuContext, cuDevice);
122
123 if (result != CUDA_SUCCESS) {
124 set_error(string_printf("Failed to retain CUDA context (%s)", cuewErrorString(result)));
125 return;
126 }
127
128 int major, minor;
129 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
130 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
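  /* Pack the compute capability into a single integer, e.g. a device with
   * compute capability 8.6 is stored as 8 * 100 + 6 * 10 = 860. */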
131 cuDevArchitecture = major * 100 + minor * 10;
132}
133
134CUDADevice::~CUDADevice()
135{
136 texture_info.free();
137 if (cuModule) {
138 cuda_assert(cuModuleUnload(cuModule));
139 }
140 cuda_assert(cuDevicePrimaryCtxRelease(cuDevice));
141}
142
143bool CUDADevice::support_device(const uint /*kernel_features*/)
144{
145 int major, minor;
146 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
147 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
148
149 /* We only support sm_50 and above */
150 if (major < 5) {
151 set_error(string_printf(
152 "CUDA backend requires compute capability 5.0 or up, but found %d.%d.", major, minor));
153 return false;
154 }
155
156 return true;
157}
158
159bool CUDADevice::check_peer_access(Device *peer_device)
160{
161 if (peer_device == this) {
162 return false;
163 }
164 if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
165 return false;
166 }
167
168 CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
169
170 int can_access = 0;
171 cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
172 if (can_access == 0) {
173 return false;
174 }
175
176 // Ensure array access over the link is possible as well (for 3D textures)
177 cuda_assert(cuDeviceGetP2PAttribute(&can_access,
178 CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
179 cuDevice,
180 peer_device_cuda->cuDevice));
181 if (can_access == 0) {
182 return false;
183 }
184
185 // Enable peer access in both directions
186 {
187 const CUDAContextScope scope(this);
188 CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
189 if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
190 set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
191 cuewErrorString(result)));
192 return false;
193 }
194 }
195 {
196 const CUDAContextScope scope(peer_device_cuda);
197 CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
198 if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
199 set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
200 cuewErrorString(result)));
201 return false;
202 }
203 }
204
205 return true;
206}
207
208bool CUDADevice::use_adaptive_compilation()
209{
210 return DebugFlags().cuda.adaptive_compile;
211}
212
213/* Common NVCC flags which stay the same regardless of shading model or
214 * kernel sources MD5, and depend only on the compiler and compilation settings.
215 */
216string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
217{
218 const int machine = system_cpu_bits();
219 const string source_path = path_get("source");
220 const string include_path = source_path;
221 string cflags = string_printf(
222 "-m%d "
223 "--ptxas-options=\"-v\" "
224 "--use_fast_math "
225 "-DNVCC "
226 "-I\"%s\"",
227 machine,
228 include_path.c_str());
229 if (use_adaptive_compilation()) {
230 cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
231 }
232 const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
233 if (extra_cflags) {
234 cflags += string(" ") + string(extra_cflags);
235 }
236
237# ifdef WITH_NANOVDB
238 cflags += " -DWITH_NANOVDB";
239# endif
240
241# ifdef WITH_CYCLES_DEBUG
242 cflags += " -DWITH_CYCLES_DEBUG";
243# endif
244
245 return cflags;
246}
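/* For illustration only (the path below is a placeholder): on a 64-bit host with
 * the Cycles source at /path/to/source, the common flags resemble
 *   -m64 --ptxas-options="-v" --use_fast_math -DNVCC -I"/path/to/source"
 * with " -D__KERNEL_FEATURES__=<features>" appended for adaptive compilation and
 * any CYCLES_CUDA_EXTRA_CFLAGS from the environment added at the end. */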
247
248string CUDADevice::compile_kernel(const string &common_cflags,
249 const char *name,
250 const char *base,
251 bool force_ptx)
252{
253 /* Compute kernel name. */
254 int major, minor;
255 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
256 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
257
258 /* Attempt to use kernel provided with Blender. */
259 if (!use_adaptive_compilation()) {
260 if (!force_ptx) {
261 const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin.zst", name, major, minor));
262 LOG_INFO << "Testing for pre-compiled kernel " << cubin << ".";
263 if (path_exists(cubin)) {
264 LOG_INFO << "Using precompiled kernel.";
265 return cubin;
266 }
267 }
268
269 /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
270 int ptx_major = major, ptx_minor = minor;
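    /* E.g. on an sm_86 device this probes compute_86, compute_85, ..., compute_80,
     * then compute_79 and so on, down to compute_50, using the first PTX found. */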
271 while (ptx_major >= 5) {
272 const string ptx = path_get(
273 string_printf("lib/%s_compute_%d%d.ptx.zst", name, ptx_major, ptx_minor));
274 LOG_INFO << "Testing for pre-compiled kernel " << ptx << ".";
275 if (path_exists(ptx)) {
276 LOG_INFO << "Using precompiled kernel.";
277 return ptx;
278 }
279
280 if (ptx_minor > 0) {
281 ptx_minor--;
282 }
283 else {
284 ptx_major--;
285 ptx_minor = 9;
286 }
287 }
288 }
289
290 /* Try to use locally compiled kernel. */
291 string source_path = path_get("source");
292 const string source_md5 = path_files_md5_hash(source_path);
293
294 /* We include the cflags in the MD5, so that changing the CUDA toolkit or other
295 * compiler command line arguments ensures the cubin gets rebuilt.
296 */
297 const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
298
299 const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
300 const char *const kernel_arch = force_ptx ? "compute" : "sm";
301 const string cubin_file = string_printf(
302 "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
303 const string cubin = path_cache_get(path_join("kernels", cubin_file));
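  /* E.g. for the "kernel" entry point on an sm_86 device this resolves to
   * kernels/cycles_kernel_sm_86_<md5>.cubin inside the user cache directory
   * (or a compute_*.ptx name when force_ptx is set). */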
304 LOG_INFO << "Testing for locally compiled kernel " << cubin << ".";
305 if (path_exists(cubin)) {
306 LOG_INFO << "Using locally compiled kernel.";
307 return cubin;
308 }
309
310# ifdef _WIN32
311 if (!use_adaptive_compilation() && have_precompiled_kernels()) {
312 if (major < 5) {
313 set_error(
314 string_printf("CUDA backend requires compute capability 5.0 or up, but found %d.%d. "
315 "Your GPU is not supported.",
316 major,
317 minor));
318 }
319 else {
320 set_error(
321 string_printf("CUDA binary kernel for this graphics card compute "
322 "capability (%d.%d) not found.",
323 major,
324 minor));
325 }
326 return string();
327 }
328# endif
329
330 /* Compile. */
331 const char *const nvcc = cuewCompilerPath();
332 if (nvcc == nullptr) {
333 set_error(
334 "CUDA nvcc compiler not found. "
335 "Install CUDA toolkit in default location.");
336 return string();
337 }
338
339 const int nvcc_cuda_version = cuewCompilerVersion();
340 LOG_INFO << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
341 if (nvcc_cuda_version < 101) {
342 LOG_ERROR << "Unsupported CUDA version " << nvcc_cuda_version / 10 << "."
343 << nvcc_cuda_version % 10 << ", you need CUDA 10.1 or newer";
344 return string();
345 }
346 if (!(nvcc_cuda_version >= 102 && nvcc_cuda_version < 130)) {
347 LOG_ERROR << "CUDA version " << nvcc_cuda_version / 10 << "." << nvcc_cuda_version % 10
348 << "CUDA 10.1 to 12 are officially supported.";
349 }
350
351 double starttime = time_dt();
352
353 path_create_directories(cubin);
354
355 source_path = path_join(path_join(source_path, "kernel"),
356 path_join("device", path_join(base, string_printf("%s.cu", name))));
357
358 string command = string_printf(
359 "\"%s\" "
360 "-arch=%s_%d%d "
361 "--%s \"%s\" "
362 "-o \"%s\" "
363 "%s",
364 nvcc,
365 kernel_arch,
366 major,
367 minor,
368 kernel_ext,
369 source_path.c_str(),
370 cubin.c_str(),
371 common_cflags.c_str());
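  /* For illustration only, with placeholder paths, the assembled command resembles:
   *   "/usr/local/cuda/bin/nvcc" -arch=sm_86 --cubin "<source>/kernel/device/<base>/kernel.cu"
   *     -o "<cache>/kernels/cycles_kernel_sm_86_<md5>.cubin" <common cflags>
   * before the Windows-only "call " prefix is prepended below. */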
372
373 LOG_INFO_IMPORTANT << "Compiling " << ((use_adaptive_compilation()) ? "adaptive " : "")
374 << "CUDA kernel ...";
375 LOG_INFO_IMPORTANT << command;
376
377# ifdef _WIN32
378 command = "call " + command;
379# endif
380 if (system(command.c_str()) != 0) {
381 set_error(
382 "Failed to execute compilation command, "
383 "see console for details.");
384 return string();
385 }
386
387 /* Verify if compilation succeeded */
388 if (!path_exists(cubin)) {
389 set_error(
390 "CUDA kernel compilation failed, "
391 "see console for details.");
392 return string();
393 }
394
395 LOG_INFO_IMPORTANT << "Kernel compilation finished in " << std::fixed << std::setprecision(2)
396 << time_dt() - starttime << "s";
397
398 return cubin;
399}
400
401bool CUDADevice::load_kernels(const uint kernel_features)
402{
403 /* TODO(sergey): Support kernel re-load for CUDA devices with adaptive compile.
404 *
405 * Currently re-loading the kernel will invalidate memory pointers,
406 * causing problems in cuCtxSynchronize.
407 */
408 if (cuModule) {
409 if (use_adaptive_compilation()) {
410 LOG_INFO << "Skipping CUDA kernel reload for adaptive compilation, not currently supported.";
411 }
412 return true;
413 }
414
415 /* check if cuda init succeeded */
416 if (cuContext == nullptr) {
417 return false;
418 }
419
420 /* check if GPU is supported */
421 if (!support_device(kernel_features)) {
422 return false;
423 }
424
425 /* get kernel */
426 const char *kernel_name = "kernel";
427 string cflags = compile_kernel_get_common_cflags(kernel_features);
428 string cubin = compile_kernel(cflags, kernel_name);
429 if (cubin.empty()) {
430 return false;
431 }
432
433 /* open module */
434 CUDAContextScope scope(this);
435
436 string cubin_data;
437 CUresult result;
438
439 if (path_read_compressed_text(cubin, cubin_data)) {
440 result = cuModuleLoadData(&cuModule, cubin_data.c_str());
441 }
442 else {
443 result = CUDA_ERROR_FILE_NOT_FOUND;
444 }
445
446 if (result != CUDA_SUCCESS) {
447 set_error(string_printf(
448 "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
449 }
450
451 if (result == CUDA_SUCCESS) {
452 kernels.load(this);
453 reserve_local_memory(kernel_features);
454 }
455
456 return (result == CUDA_SUCCESS);
457}
458
459void CUDADevice::reserve_local_memory(const uint kernel_features)
460{
461 /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
462 * needed for kernel launches, so that we can reliably figure out when
463 * to allocate scene data in mapped host memory. */
464 size_t total = 0, free_before = 0, free_after = 0;
465
466 {
467 CUDAContextScope scope(this);
468 cuMemGetInfo(&free_before, &total);
469 }
470
471 {
472 /* Use the biggest kernel for estimation. */
473 const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
474 DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
475 (kernel_features & KERNEL_FEATURE_MNEE) ?
476 DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
477 DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
478
479 /* Launch the kernel; using just 1 block appears sufficient to reserve memory for all
480 * multiprocessors. It would still be good to do this in parallel for the multi-GPU case
481 * to make it faster. */
482 CUDADeviceQueue queue(this);
483
484 device_ptr d_path_index = 0;
485 device_ptr d_render_buffer = 0;
486 int d_work_size = 0;
487 DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
488
489 queue.init_execution();
490 queue.enqueue(test_kernel, 1, args);
491 queue.synchronize();
492 }
493
494 {
495 CUDAContextScope scope(this);
496 cuMemGetInfo(&free_after, &total);
497 }
498
499 LOG_INFO << "Local memory reserved " << string_human_readable_number(free_before - free_after)
500 << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
501
502# if 0
503 /* For testing mapped host memory, fill up device memory. */
504 const size_t keep_mb = 1024;
505
506 while (free_after > keep_mb * 1024 * 1024LL) {
507 CUdeviceptr tmp;
508 cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
509 cuMemGetInfo(&free_after, &total);
510 }
511# endif
512}
513
514void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
515{
516 CUDAContextScope scope(this);
517
518 cuMemGetInfo(&free, &total);
519}
520
521bool CUDADevice::alloc_device(void *&device_pointer, const size_t size)
522{
523 CUDAContextScope scope(this);
524
525 CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
526 return mem_alloc_result == CUDA_SUCCESS;
527}
528
529void CUDADevice::free_device(void *device_pointer)
530{
531 CUDAContextScope scope(this);
532
533 cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
534}
535
536bool CUDADevice::shared_alloc(void *&shared_pointer, const size_t size)
537{
538 CUDAContextScope scope(this);
539
540 CUresult mem_alloc_result = cuMemHostAlloc(
541 &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
542 return mem_alloc_result == CUDA_SUCCESS;
543}
544
545void CUDADevice::shared_free(void *shared_pointer)
546{
547 CUDAContextScope scope(this);
548
549 cuMemFreeHost(shared_pointer);
550}
551
552void *CUDADevice::shared_to_device_pointer(const void *shared_pointer)
553{
554 CUDAContextScope scope(this);
555 void *device_pointer = nullptr;
556 cuda_assert(
557 cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, (void *)shared_pointer, 0));
558 return device_pointer;
559}
560
561void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, const size_t size)
562{
563 const CUDAContextScope scope(this);
564
565 cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
566}
567
568void CUDADevice::mem_alloc(device_memory &mem)
569{
570 if (mem.type == MEM_TEXTURE) {
571 assert(!"mem_alloc not supported for textures.");
572 }
573 else if (mem.type == MEM_GLOBAL) {
574 assert(!"mem_alloc not supported for global memory.");
575 }
576 else {
577 generic_alloc(mem);
578 }
579}
580
581void CUDADevice::mem_copy_to(device_memory &mem)
582{
583 if (mem.type == MEM_GLOBAL) {
584 global_copy_to(mem);
585 }
586 else if (mem.type == MEM_TEXTURE) {
587 tex_copy_to((device_texture &)mem);
588 }
589 else {
590 if (!mem.device_pointer) {
591 generic_alloc(mem);
592 generic_copy_to(mem);
593 }
594 else if (mem.is_resident(this)) {
595 generic_copy_to(mem);
596 }
597 }
598}
599
600void CUDADevice::mem_move_to_host(device_memory &mem)
601{
602 if (mem.type == MEM_GLOBAL) {
603 global_free(mem);
604 global_alloc(mem);
605 }
606 else if (mem.type == MEM_TEXTURE) {
607 tex_free((device_texture &)mem);
608 tex_alloc((device_texture &)mem);
609 }
610 else {
611 assert(!"mem_move_to_host only supported for texture and global memory");
612 }
613}
614
615void CUDADevice::mem_copy_from(
616 device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem)
617{
618 if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
619 assert(!"mem_copy_from not supported for textures.");
620 }
621 else if (mem.host_pointer) {
622 const size_t size = elem * w * h;
623 const size_t offset = elem * y * w;
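    /* E.g. with elem = 4, w = 1920, h = 1 and y = 100 this copies the single
     * 7680-byte row that starts at byte offset 768000 of the linear buffer. */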
624
625 if (mem.device_pointer) {
626 const CUDAContextScope scope(this);
627 cuda_assert(cuMemcpyDtoH(
628 (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
629 }
630 else {
631 memset((char *)mem.host_pointer + offset, 0, size);
632 }
633 }
634}
635
636void CUDADevice::mem_zero(device_memory &mem)
637{
638 if (!mem.device_pointer) {
639 mem_alloc(mem);
640 }
641 if (!mem.device_pointer) {
642 return;
643 }
644
645 if (!(mem.is_shared(this) && mem.host_pointer == mem.shared_pointer)) {
646 const CUDAContextScope scope(this);
647 cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
648 }
649 else if (mem.host_pointer) {
650 memset(mem.host_pointer, 0, mem.memory_size());
651 }
652}
653
654void CUDADevice::mem_free(device_memory &mem)
655{
656 if (mem.type == MEM_GLOBAL) {
657 global_free(mem);
658 }
659 else if (mem.type == MEM_TEXTURE) {
660 tex_free((device_texture &)mem);
661 }
662 else {
663 generic_free(mem);
664 }
665}
666
667device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, const size_t offset, size_t /*size*/)
668{
669 return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
670}
671
672void CUDADevice::const_copy_to(const char *name, void *host, const size_t size)
673{
674 CUDAContextScope scope(this);
675 CUdeviceptr mem;
676 size_t bytes;
677
678 cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, "kernel_params"));
679 assert(bytes == sizeof(KernelParamsCUDA));
680
681 /* Update data storage pointers in launch parameters. */
682# define KERNEL_DATA_ARRAY(data_type, data_name) \
683 if (strcmp(name, #data_name) == 0) { \
684 cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, data_name), host, size)); \
685 return; \
686 }
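  /* For example, const_copy_to("data", &host_data, sizeof(host_data)) ends up
   * writing into the kernel_params.data member on the device; host_data here is
   * just an illustrative caller-side variable. */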
687 KERNEL_DATA_ARRAY(KernelData, data)
688 KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
689# include "kernel/data_arrays.h"
690# undef KERNEL_DATA_ARRAY
691}
692
693void CUDADevice::global_alloc(device_memory &mem)
694{
695 if (mem.is_resident(this)) {
696 generic_alloc(mem);
697 generic_copy_to(mem);
698 }
699
700 const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
701}
702
703void CUDADevice::global_copy_to(device_memory &mem)
704{
705 if (!mem.device_pointer) {
706 generic_alloc(mem);
707 generic_copy_to(mem);
708 }
709 else if (mem.is_resident(this)) {
710 generic_copy_to(mem);
711 }
712
713 const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
714}
715
716void CUDADevice::global_free(device_memory &mem)
717{
718 if (mem.is_resident(this) && mem.device_pointer) {
719 generic_free(mem);
720 }
721}
722
723static size_t tex_src_pitch(const device_texture &mem)
724{
725 return mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
726}
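/* E.g. a 1024 pixel wide RGBA half-float texture has a source pitch of
 * 1024 * 2 * 4 = 8192 bytes; the destination pitch is this value rounded up to
 * the device's texture pitch alignment via align_up(). */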
727
728static CUDA_MEMCPY2D tex_2d_copy_param(const device_texture &mem, const int pitch_alignment)
729{
730 /* 2D texture using pitch aligned linear memory. */
731 const size_t src_pitch = tex_src_pitch(mem);
732 const size_t dst_pitch = align_up(src_pitch, pitch_alignment);
733
734 CUDA_MEMCPY2D param;
735 memset(&param, 0, sizeof(param));
736 param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
737 param.dstDevice = mem.device_pointer;
738 param.dstPitch = dst_pitch;
739 param.srcMemoryType = CU_MEMORYTYPE_HOST;
740 param.srcHost = mem.host_pointer;
741 param.srcPitch = src_pitch;
742 param.WidthInBytes = param.srcPitch;
743 param.Height = mem.data_height;
744
745 return param;
746}
747
748void CUDADevice::tex_alloc(device_texture &mem)
749{
750 CUDAContextScope scope(this);
751
752 CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
753 switch (mem.info.extension) {
754 case EXTENSION_REPEAT:
755 address_mode = CU_TR_ADDRESS_MODE_WRAP;
756 break;
757 case EXTENSION_EXTEND:
758 address_mode = CU_TR_ADDRESS_MODE_CLAMP;
759 break;
760 case EXTENSION_CLIP:
761 address_mode = CU_TR_ADDRESS_MODE_BORDER;
762 break;
763 case EXTENSION_MIRROR:
764 address_mode = CU_TR_ADDRESS_MODE_MIRROR;
765 break;
766 default:
767 assert(0);
768 break;
769 }
770
771 CUfilter_mode filter_mode;
772 if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
773 filter_mode = CU_TR_FILTER_MODE_POINT;
774 }
775 else {
776 filter_mode = CU_TR_FILTER_MODE_LINEAR;
777 }
778
779 /* Image Texture Storage */
780 /* Cycles expects to read all texture data as normalized float values in
781 * kernel/device/gpu/image.h. But storing all data as floats would be very inefficient due to the
782 * huge size of float textures. So in the code below, we define different texture types including
783 * integer types, with the aim of using CUDA's default promotion behavior of integer data to
784 * floating point data in the range [0, 1], as noted in the CUDA documentation for the
785 * cuTexObjectCreate API call.
786 * Note that 32-bit integers are not supported by this promotion behavior and cannot be used
787 * with Cycles's current implementation in kernel/device/gpu/image.h.
788 */
789 CUarray_format_enum format;
790 switch (mem.data_type) {
791 case TYPE_UCHAR:
792 format = CU_AD_FORMAT_UNSIGNED_INT8;
793 break;
794 case TYPE_UINT16:
795 format = CU_AD_FORMAT_UNSIGNED_INT16;
796 break;
797 case TYPE_FLOAT:
798 format = CU_AD_FORMAT_FLOAT;
799 break;
800 case TYPE_HALF:
801 format = CU_AD_FORMAT_HALF;
802 break;
803 default:
804 assert(0);
805 return;
806 }
807
808 Mem *cmem = nullptr;
809
810 if (!mem.is_resident(this)) {
811 thread_scoped_lock lock(device_mem_map_mutex);
812 cmem = &device_mem_map[&mem];
813 cmem->texobject = 0;
814 }
815 else if (mem.data_height > 0) {
816 /* 2D texture, using pitch aligned linear memory. */
817 const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
818 const size_t dst_size = dst_pitch * mem.data_height;
819
820 cmem = generic_alloc(mem, dst_size - mem.memory_size());
821 if (!cmem) {
822 return;
823 }
824
825 const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
826 cuda_assert(cuMemcpy2DUnaligned(&param));
827 }
828 else {
829 /* 1D texture, using linear memory. */
830 cmem = generic_alloc(mem);
831 if (!cmem) {
832 return;
833 }
834
835 cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, mem.memory_size()));
836 }
837
838 /* Set Mapping and tag that we need to (re-)upload to device */
839 TextureInfo tex_info = mem.info;
840
841 if (!is_nanovdb_type(mem.info.data_type)) {
842 CUDA_RESOURCE_DESC resDesc;
843 memset(&resDesc, 0, sizeof(resDesc));
844
845 if (mem.data_height > 0) {
846 const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
847
848 resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
849 resDesc.res.pitch2D.devPtr = mem.device_pointer;
850 resDesc.res.pitch2D.format = format;
851 resDesc.res.pitch2D.numChannels = mem.data_elements;
852 resDesc.res.pitch2D.height = mem.data_height;
853 resDesc.res.pitch2D.width = mem.data_width;
854 resDesc.res.pitch2D.pitchInBytes = dst_pitch;
855 }
856 else {
857 resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
858 resDesc.res.linear.devPtr = mem.device_pointer;
859 resDesc.res.linear.format = format;
860 resDesc.res.linear.numChannels = mem.data_elements;
861 resDesc.res.linear.sizeInBytes = mem.device_size;
862 }
863
864 CUDA_TEXTURE_DESC texDesc;
865 memset(&texDesc, 0, sizeof(texDesc));
866 texDesc.addressMode[0] = address_mode;
867 texDesc.addressMode[1] = address_mode;
868 texDesc.addressMode[2] = address_mode;
869 texDesc.filterMode = filter_mode;
870 /* CUDA's CU_TRSF_READ_AS_INTEGER flag is intentionally not used; this is
871 * significant, see the explanation above about how Blender treats textures. */
872 texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
873
874 thread_scoped_lock lock(device_mem_map_mutex);
875 cmem = &device_mem_map[&mem];
876
877 cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, nullptr));
878
879 tex_info.data = (uint64_t)cmem->texobject;
880 }
881 else {
882 tex_info.data = (uint64_t)mem.device_pointer;
883 }
884
885 {
886 /* Update texture info. */
887 thread_scoped_lock lock(texture_info_mutex);
888 const uint slot = mem.slot;
889 if (slot >= texture_info.size()) {
890 /* Allocate some slots in advance, to reduce the number of re-allocations. */
891 texture_info.resize(slot + 128);
892 }
893 texture_info[slot] = tex_info;
894 need_texture_info = true;
895 }
896}
897
898void CUDADevice::tex_copy_to(device_texture &mem)
899{
900 if (!mem.device_pointer) {
901 /* Not yet allocated on device. */
902 tex_alloc(mem);
903 }
904 else if (!mem.is_resident(this)) {
905 /* Peering with another device, may still need to create texture info and object. */
906 bool texture_allocated = false;
907 {
908 thread_scoped_lock lock(texture_info_mutex);
909 texture_allocated = mem.slot < texture_info.size() && texture_info[mem.slot].data != 0;
910 }
911 if (!texture_allocated) {
912 tex_alloc(mem);
913 }
914 }
915 else {
916 /* Resident and fully allocated, only copy. */
917 if (mem.data_height > 0) {
918 CUDAContextScope scope(this);
919 const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
920 cuda_assert(cuMemcpy2DUnaligned(&param));
921 }
922 else {
923 generic_copy_to(mem);
924 }
925 }
926}
927
928void CUDADevice::tex_free(device_texture &mem)
929{
930 CUDAContextScope scope(this);
931 thread_scoped_lock lock(device_mem_map_mutex);
932
933 /* Check if the memory was allocated for this device. */
934 auto it = device_mem_map.find(&mem);
935 if (it == device_mem_map.end()) {
936 return;
937 }
938
939 const Mem &cmem = it->second;
940
941 /* Always clear texture info and texture object, regardless of residency. */
942 {
943 thread_scoped_lock lock(texture_info_mutex);
944 texture_info[mem.slot] = TextureInfo();
945 }
946
947 if (cmem.texobject) {
948 /* Free bindless texture. */
949 cuTexObjectDestroy(cmem.texobject);
950 }
951
952 if (!mem.is_resident(this)) {
953 /* Do not free memory here, since it was allocated on a different device. */
954 device_mem_map.erase(device_mem_map.find(&mem));
955 }
956 else if (cmem.array) {
957 /* Free array. */
958 cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
959 stats.mem_free(mem.device_size);
960 mem.device_pointer = 0;
961 mem.device_size = 0;
962
963 device_mem_map.erase(device_mem_map.find(&mem));
964 }
965 else {
966 lock.unlock();
967 generic_free(mem);
968 }
969}
970
971unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
972{
973 return make_unique<CUDADeviceQueue>(this);
974}
975
976bool CUDADevice::should_use_graphics_interop(const GraphicsInteropDevice &interop_device,
977 const bool log)
978{
979 if (headless) {
980 /* Avoid any call which might involve interaction with a graphics backend when we know that
981 * we don't have an active graphics context. This avoids a crash on certain platforms when
982 * calling cuGLGetDevices(). */
983 return false;
984 }
985
986 CUDAContextScope scope(this);
987
988 switch (interop_device.type) {
989 case GraphicsInteropDevice::OPENGL: {
990 /* Check whether this device is part of OpenGL context.
991 *
992 * Using a CUDA device that is not part of the OpenGL context for graphics interoperability
993 * is possible, but empirical measurements show it can be considerably slower than a naive
994 * pixel copy. */
995 int num_all_devices = 0;
996 cuda_assert(cuDeviceGetCount(&num_all_devices));
997
998 if (num_all_devices == 0) {
999 return false;
1000 }
1001
1002 vector<CUdevice> gl_devices(num_all_devices);
1003 uint num_gl_devices = 0;
1004 cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
1005
1006 bool found = false;
1007 for (uint i = 0; i < num_gl_devices; ++i) {
1008 if (gl_devices[i] == cuDevice) {
1009 found = true;
1010 break;
1011 }
1012 }
1013
1014 if (log) {
1015 if (found) {
1016 LOG_INFO << "Graphics interop: found matching OpenGL device for CUDA";
1017 }
1018 else {
1019 LOG_INFO << "Graphics interop: no matching OpenGL device for CUDA";
1020 }
1021 }
1022
1023 return found;
1024 }
1025 case ccl::GraphicsInteropDevice::VULKAN: {
1026 /* Only do interop with matching device UUID. */
1027 CUuuid uuid = {};
1028 cuDeviceGetUuid(&uuid, cuDevice);
1029 const bool found = (sizeof(uuid.bytes) == interop_device.uuid.size() &&
1030 memcmp(uuid.bytes, interop_device.uuid.data(), sizeof(uuid.bytes)) == 0);
1031
1032 if (log) {
1033 if (found) {
1034 LOG_INFO << "Graphics interop: found matching Vulkan device for CUDA";
1035 }
1036 else {
1037 LOG_INFO << "Graphics interop: no matching Vulkan device for CUDA";
1038 }
1039
1040 LOG_INFO << "Graphics Interop: CUDA UUID "
1041 << string_hex(reinterpret_cast<uint8_t *>(uuid.bytes), sizeof(uuid.bytes))
1042 << ", Vulkan UUID "
1043 << string_hex(interop_device.uuid.data(), interop_device.uuid.size());
1044 }
1045
1046 return found;
1047 }
1049 case GraphicsInteropDevice::NONE: {
1050 return false;
1051 }
1052 }
1053
1054 return false;
1055}
1056
1057int CUDADevice::get_num_multiprocessors()
1058{
1059 return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
1060}
1061
1062int CUDADevice::get_max_num_threads_per_multiprocessor()
1063{
1064 return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
1065}
1066
1067bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
1068{
1069 CUDAContextScope scope(this);
1070
1071 return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
1072}
1073
1074int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, const int default_value)
1075{
1076 int value = 0;
1077 if (!get_device_attribute(attribute, &value)) {
1078 return default_value;
1079 }
1080 return value;
1081}
1082
1083CCL_NAMESPACE_END
1084
1085#endif