bool CUDADevice::have_precompiled_kernels()
{
  string cubins_path = path_get("lib");
  return path_exists(cubins_path);
}
void CUDADevice::set_error(const string &error)
{
  Device::set_error(error);

  LOG_ERROR << "Refer to the Cycles GPU rendering documentation for possible solutions:\n"
               "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n";
}
CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
    : GPUDevice(info, stats, profiler, headless)
{
  /* Verify that the generic GPU device types can hold the CUDA handles. */
  static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
  static_assert(sizeof(arrayMemObject) == sizeof(CUarray));

  need_texture_info = false;
  /* Initialize CUDA. */
  CUresult result = cuInit(0);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
    return;
  }
  /* Setup device and context. */
  result = cuDeviceGet(&cuDevice, cuDevId);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
                            cuewErrorString(result)));
    return;
  }
  int value;
  cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
  can_map_host = value != 0;

  cuda_assert(cuDeviceGetAttribute(
      &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
  /* CU_CTX_LMEM_RESIZE_TO_MAX lets local memory get reserved once up front,
   * see reserve_local_memory() below. Flags must be set before the primary
   * context is retained. */
  unsigned int ctx_flags = 0;
  int active = 0;
  cuda_assert(cuDevicePrimaryCtxGetState(cuDevice, &ctx_flags, &active));

  ctx_flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
  result = cuDevicePrimaryCtxSetFlags(cuDevice, ctx_flags);
  if (result != CUDA_SUCCESS && result != CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE) {
    set_error(string_printf("Failed to configure CUDA context (%s)", cuewErrorString(result)));
    return;
  }
  /* Create context. */
  result = cuDevicePrimaryCtxRetain(&cuContext, cuDevice);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to retain CUDA context (%s)", cuewErrorString(result)));
    return;
  }

  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
  cuDevArchitecture = major * 100 + minor * 10;
}
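/* A minimal standalone sketch (not part of Cycles) of the primary-context
 * initialization sequence used above. It assumes linking against libcuda
 * directly instead of loading it through cuew, which is why the cuew error
 * helpers are absent. */
#if 0
#  include <cuda.h>

int main()
{
  if (cuInit(0) != CUDA_SUCCESS) {
    return 1;
  }

  CUdevice device = 0;
  if (cuDeviceGet(&device, 0) != CUDA_SUCCESS) {
    return 1;
  }

  /* Flags must be set before the primary context is created/retained. */
  cuDevicePrimaryCtxSetFlags(device, CU_CTX_LMEM_RESIZE_TO_MAX);

  CUcontext context = nullptr;
  if (cuDevicePrimaryCtxRetain(&context, device) != CUDA_SUCCESS) {
    return 1;
  }

  /* ... use the context ... */

  cuDevicePrimaryCtxRelease(device);
  return 0;
}
#endif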
CUDADevice::~CUDADevice()
{
  texture_info.free();

  cuda_assert(cuModuleUnload(cuModule));
  cuda_assert(cuDevicePrimaryCtxRelease(cuDevice));
}
bool CUDADevice::support_device(const uint /*kernel_features*/)
{
  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);

  /* We only support compute capability 5.0 (Maxwell) and up. */
  if (major < 5) {
    set_error(string_printf(
        "CUDA backend requires compute capability 5.0 or up, but found %d.%d.", major, minor));
    return false;
  }

  return true;
}
bool CUDADevice::check_peer_access(Device *peer_device)
{
  if (peer_device == this) {
    return false;
  }

  CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);

  int can_access = 0;
  cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
  if (can_access == 0) {
    return false;
  }

  /* Check that CUDA arrays are also accessible over the peer link. */
  cuda_assert(cuDeviceGetP2PAttribute(&can_access,
                                      CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
                                      cuDevice,
                                      peer_device_cuda->cuDevice));
  if (can_access == 0) {
    return false;
  }
  {
    const CUDAContextScope scope(this);
    CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
      set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
                              cuewErrorString(result)));
      return false;
    }
  }
  {
    const CUDAContextScope scope(peer_device_cuda);
    CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
      set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
                              cuewErrorString(result)));
      return false;
    }
  }

  return true;
}
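/* Note: peer access is unidirectional in the CUDA driver API, which is why it
 * is enabled twice above, once from each device's context. Linear allocations
 * then become addressable from the peer, while CUDA arrays additionally
 * require CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED, checked
 * earlier. */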
bool CUDADevice::use_adaptive_compilation()
{
  return DebugFlags().cuda.adaptive_compile;
}
string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
{
  const string source_path = path_get("source");
  const string include_path = source_path;

  string cflags = string_printf(
      "--ptxas-options=\"-v\" "
      "-I\"%s\"",
      include_path.c_str());

  if (use_adaptive_compilation()) {
    cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
  }

  const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
  if (extra_cflags) {
    cflags += string(" ") + string(extra_cflags);
  }

#ifdef WITH_NANOVDB
  cflags += " -DWITH_NANOVDB";
#endif

#ifdef WITH_CYCLES_DEBUG
  cflags += " -DWITH_CYCLES_DEBUG";
#endif

  return cflags;
}
string CUDADevice::compile_kernel(const string &common_cflags,
                                  const char *name,
                                  const char *base,
                                  const bool force_ptx)
{
  /* Compute kernel name. */
  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);

  /* Attempt to use a kernel shipped with the Blender install. */
  if (!use_adaptive_compilation()) {
    if (!force_ptx) {
      const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
      LOG_INFO << "Testing for pre-compiled kernel " << cubin << ".";
      if (path_exists(cubin)) {
        LOG_INFO << "Using precompiled kernel.";
        return cubin;
      }
    }

    /* The driver JIT-compiles PTX built for older architectures, so search for
     * the closest matching PTX, counting down from this device's capability. */
    int ptx_major = major, ptx_minor = minor;
    while (ptx_major >= 5) {
      const string ptx = path_get(
          string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
      LOG_INFO << "Testing for pre-compiled kernel " << ptx << ".";
      if (path_exists(ptx)) {
        LOG_INFO << "Using precompiled kernel.";
        return ptx;
      }

      if (ptx_minor > 0) {
        ptx_minor--;
      }
      else {
        ptx_major--;
        ptx_minor = 9;
      }
    }
  }

  /* Try to use a locally compiled kernel. */
  string source_path = path_get("source");
  const string source_md5 = path_files_md5_hash(source_path);

  /* Include cflags in the md5 so that changing the CUDA toolkit or compiler
   * arguments forces the cubin to be rebuilt. */
  const string kernel_md5 = util_md5_string(source_md5 + common_cflags);

  const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
  const char *const kernel_arch = force_ptx ? "compute" : "sm";
  const string cubin_file = string_printf(
      "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
  const string cubin = path_cache_get(path_join("kernels", cubin_file));
  LOG_INFO << "Testing for locally compiled kernel " << cubin << ".";
  if (path_exists(cubin)) {
    LOG_INFO << "Using locally compiled kernel.";
    return cubin;
  }
  /* When runtime compilation is not available, report why no kernel exists. */
  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
    if (major < 5) {
      set_error(
          string_printf("CUDA backend requires compute capability 5.0 or up, but found %d.%d. "
                        "Your GPU is not supported.",
                        major,
                        minor));
    }
    else {
      set_error(
          string_printf("CUDA binary kernel for this graphics card compute "
                        "capability (%d.%d) not found.",
                        major,
                        minor));
    }
    return string();
  }
  /* Compile. */
  const char *const nvcc = cuewCompilerPath();
  if (nvcc == nullptr) {
    set_error(
        "CUDA nvcc compiler not found. "
        "Install CUDA toolkit in default location.");
    return string();
  }

  const int nvcc_cuda_version = cuewCompilerVersion();
  LOG_INFO << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
  if (nvcc_cuda_version < 101) {
    LOG_ERROR << "Unsupported CUDA version " << nvcc_cuda_version / 10 << "."
              << nvcc_cuda_version % 10 << ", you need CUDA 10.1 or newer";
    return string();
  }
  if (!(nvcc_cuda_version >= 102 && nvcc_cuda_version < 130)) {
    LOG_ERROR << "CUDA version " << nvcc_cuda_version / 10 << "." << nvcc_cuda_version % 10
              << " detected, build may succeed but only CUDA 10.1 to 12 are officially supported.";
  }
  const double starttime = time_dt();

  path_create_directories(cubin);

  source_path = path_join(path_join(source_path, "kernel"),
                          path_join("device", path_join(base, string_printf("%s.cu", name))));

  string command = string_printf(
      "\"%s\" -arch=%s_%d%d --%s \"%s\" -o \"%s\" %s",
      nvcc, kernel_arch, major, minor, kernel_ext,
      source_path.c_str(), cubin.c_str(), common_cflags.c_str());

  LOG_INFO_IMPORTANT << "Compiling " << ((use_adaptive_compilation()) ? "adaptive " : "")
                     << "CUDA kernel ...";

#ifdef _WIN32
  command = "call " + command;
#endif

  if (system(command.c_str()) != 0) {
    set_error(
        "Failed to execute compilation command, "
        "see console for details.");
    return string();
  }

  /* Verify if compilation succeeded. */
  if (!path_exists(cubin)) {
    set_error(
        "CUDA kernel compilation failed, "
        "see console for details.");
    return string();
  }

  LOG_INFO_IMPORTANT << "Kernel compilation finished in " << std::fixed << std::setprecision(2)
                     << time_dt() - starttime << "s";

  return cubin;
}
bool CUDADevice::load_kernels(const uint kernel_features)
{
  /* Skip reloading if the module is already loaded. */
  if (cuModule) {
    if (use_adaptive_compilation()) {
      LOG_INFO << "Skipping CUDA kernel reload for adaptive compilation, not currently supported.";
    }
    return true;
  }

  /* Check if CUDA init succeeded. */
  if (cuContext == nullptr) {
    return false;
  }

  /* Check if the device is supported at all. */
  if (!support_device(kernel_features)) {
    return false;
  }

  /* Get kernel. */
  const char *kernel_name = "kernel";
  string cflags = compile_kernel_get_common_cflags(kernel_features);
  string cubin = compile_kernel(cflags, kernel_name);
  if (cubin.empty()) {
    return false;
  }

  /* Load kernel module. */
  CUDAContextScope scope(this);

  string cubin_data;
  CUresult result;
  if (path_read_compressed_text(cubin, cubin_data)) {
    result = cuModuleLoadData(&cuModule, cubin_data.c_str());
  }
  else {
    result = CUDA_ERROR_FILE_NOT_FOUND;
  }

  if (result != CUDA_SUCCESS) {
    set_error(string_printf(
        "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
  }

  if (result == CUDA_SUCCESS) {
    reserve_local_memory(kernel_features);
  }

  return (result == CUDA_SUCCESS);
}
void CUDADevice::reserve_local_memory(const uint kernel_features)
{
  /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, launching the biggest kernel once
   * reserves local memory for all later launches. */
  size_t total = 0, free_before = 0, free_after = 0;

  {
    CUDAContextScope scope(this);
    cuMemGetInfo(&free_before, &total);
  }

  {
    /* Use the biggest kernel for the estimate; one block is enough to reserve
     * local memory on all multiprocessors. */
    const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
                                     (kernel_features & KERNEL_FEATURE_MNEE) ?
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
    CUDADeviceQueue queue(this);

    device_ptr d_path_index = 0, d_render_buffer = 0;
    int d_work_size = 0;
    DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);

    queue.init_execution();
    queue.enqueue(test_kernel, 1, args);
    queue.synchronize();
  }

  {
    CUDAContextScope scope(this);
    cuMemGetInfo(&free_after, &total);
  }

#if 0
  /* For testing mapped host memory, fill up the remaining device memory. */
  const size_t keep_mb = 1024;

  while (free_after > keep_mb * 1024 * 1024LL) {
    CUdeviceptr tmp;
    cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
    cuMemGetInfo(&free_after, &total);
  }
#endif
}
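/* The free_before/free_after delta measured above is the local memory the
 * driver carved out for the reservation launch; the remaining free memory is
 * what later decides when scene data has to spill into mapped host memory. */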
void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
{
  CUDAContextScope scope(this);

  cuMemGetInfo(&free, &total);
}
bool CUDADevice::alloc_device(void *&device_pointer, const size_t size)
{
  CUDAContextScope scope(this);

  CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
  return mem_alloc_result == CUDA_SUCCESS;
}
void CUDADevice::free_device(void *device_pointer)
{
  CUDAContextScope scope(this);

  cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
}
bool CUDADevice::shared_alloc(void *&shared_pointer, const size_t size)
{
  CUDAContextScope scope(this);

  CUresult mem_alloc_result = cuMemHostAlloc(
      &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
  return mem_alloc_result == CUDA_SUCCESS;
}
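/* CU_MEMHOSTALLOC_DEVICEMAP makes this host allocation directly addressable
 * from the GPU (resolved via shared_to_device_pointer below), and
 * CU_MEMHOSTALLOC_WRITECOMBINED trades slow, uncached CPU reads for faster
 * transfers across the bus, which suits memory the CPU mostly writes. */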
void CUDADevice::shared_free(void *shared_pointer)
{
  CUDAContextScope scope(this);

  cuMemFreeHost(shared_pointer);
}
void *CUDADevice::shared_to_device_pointer(const void *shared_pointer)
{
  CUDAContextScope scope(this);
  void *device_pointer = nullptr;
  cuda_assert(
      cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, (void *)shared_pointer, 0));
  return device_pointer;
}
void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, const size_t size)
{
  const CUDAContextScope scope(this);

  cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
}
void CUDADevice::mem_alloc(device_memory &mem)
{
  if (mem.type == MEM_TEXTURE) {
    assert(!"mem_alloc not supported for textures.");
  }
  else if (mem.type == MEM_GLOBAL) {
    assert(!"mem_alloc not supported for global memory.");
  }
  else {
    generic_alloc(mem);
  }
}

void CUDADevice::mem_copy_to(device_memory &mem)
{
  /* ... */
  if (!mem.device_pointer) {
    generic_alloc(mem);
    generic_copy_to(mem);
  }
  else {
    generic_copy_to(mem);
  }
}

void CUDADevice::mem_move_to_host(device_memory &mem)
{
  /* ... */
  assert(!"mem_move_to_host only supported for texture and global memory");
}
void CUDADevice::mem_copy_from(
    device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem)
{
  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
    assert(!"mem_copy_from not supported for textures.");
  }
  else if (mem.host_pointer) {
    const size_t size = elem * w * h;
    const size_t offset = elem * y * w;

    if (mem.device_pointer) {
      const CUDAContextScope scope(this);
      cuda_assert(cuMemcpyDtoH(
          (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
    }
  }
}
void CUDADevice::mem_zero(device_memory &mem)
{
  /* ... */
  const CUDAContextScope scope(this);
  /* ... */
}
void CUDADevice::const_copy_to(const char *name, void *host, const size_t size)
{
  CUDAContextScope scope(this);
  CUdeviceptr mem;
  size_t bytes;

  cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, "kernel_params"));

#define KERNEL_DATA_ARRAY(data_type, data_name) \
  if (strcmp(name, #data_name) == 0) { \
    cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, data_name), host, size)); \
    return; \
  }
  KERNEL_DATA_ARRAY(KernelData, data)
  KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
#include "kernel/data_arrays.h"
#undef KERNEL_DATA_ARRAY
}
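/* The block above is an "X-macro": kernel/data_arrays.h expands
 * KERNEL_DATA_ARRAY once per data array in KernelParamsCUDA, so the strcmp
 * chain stays in sync with the struct automatically. A minimal sketch of the
 * pattern with hypothetical names (not the actual Cycles headers): */
#if 0
/* data_arrays.h contains one entry per array: */
KERNEL_DATA_ARRAY(float, vertices)
KERNEL_DATA_ARRAY(int, triangles)

/* A consumer defines the macro, includes the list, then undefines it. */
#  define KERNEL_DATA_ARRAY(type, name) const type *name;
struct Params {
#  include "data_arrays.h"
};
#  undef KERNEL_DATA_ARRAY
#endif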
void CUDADevice::global_alloc(device_memory &mem)
{
  if (mem.is_resident(this)) {
    generic_alloc(mem);
    generic_copy_to(mem);
  }
  /* ... */
}

void CUDADevice::global_copy_to(device_memory &mem)
{
  /* ... */
  generic_copy_to(mem);
  /* ... */
  generic_copy_to(mem);
}
static CUDA_MEMCPY2D tex_2d_copy_param(const device_texture &mem, const int pitch_alignment)
{
  /* 2D texture using pitch-aligned linear memory. */
  const size_t src_pitch = tex_src_pitch(mem);
  const size_t dst_pitch = align_up(src_pitch, pitch_alignment);

  CUDA_MEMCPY2D param;
  memset(&param, 0, sizeof(param));
  param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
  param.dstDevice = mem.device_pointer;
  param.dstPitch = dst_pitch;
  param.srcMemoryType = CU_MEMORYTYPE_HOST;
  param.srcHost = mem.host_pointer;
  param.srcPitch = src_pitch;
  param.WidthInBytes = param.srcPitch;
  param.Height = mem.data_height;

  return param;
}
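/* Example of the padding: with pitch_alignment = 32, a 1000-byte source row
 * gets a 1024-byte destination pitch (align_up rounds up to the next
 * multiple), so every row starts on an address the texture hardware can
 * sample efficiently. */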
void CUDADevice::tex_alloc(device_texture &mem)
{
  CUDAContextScope scope(this);

  /* Pick the addressing mode from the image extension type. */
  CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
  switch (mem.info.extension) {
    case EXTENSION_REPEAT:
      address_mode = CU_TR_ADDRESS_MODE_WRAP;
      break;
    case EXTENSION_EXTEND:
      address_mode = CU_TR_ADDRESS_MODE_CLAMP;
      break;
    case EXTENSION_CLIP:
      address_mode = CU_TR_ADDRESS_MODE_BORDER;
      break;
    case EXTENSION_MIRROR:
      address_mode = CU_TR_ADDRESS_MODE_MIRROR;
      break;
    default:
      assert(0);
      break;
  }

  CUfilter_mode filter_mode;
  if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
    filter_mode = CU_TR_FILTER_MODE_POINT;
  }
  else {
    filter_mode = CU_TR_FILTER_MODE_LINEAR;
  }

  /* Image texture storage format. */
  CUarray_format_enum format;
  switch (mem.data_type) {
    case TYPE_UCHAR:
      format = CU_AD_FORMAT_UNSIGNED_INT8;
      break;
    case TYPE_UINT16:
      format = CU_AD_FORMAT_UNSIGNED_INT16;
      break;
    case TYPE_FLOAT:
      format = CU_AD_FORMAT_FLOAT;
      break;
    case TYPE_HALF:
      format = CU_AD_FORMAT_HALF;
      break;
    default:
      assert(0);
      return;
  }
  Mem *cmem = nullptr;

  if (!mem.is_resident(this)) {
    /* Peer memory: only record the mapping, the data lives on another device. */
    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];
    /* ... */
  }
  else if (mem.data_height > 0) {
    /* 2D texture, using pitch-aligned linear memory. */
    const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
    const size_t dst_size = dst_pitch * mem.data_height;

    cmem = generic_alloc(mem, dst_size - mem.memory_size());
    if (!cmem) {
      return;
    }

    const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
    cuda_assert(cuMemcpy2DUnaligned(&param));
  }
  else {
    /* 1D texture, using linear memory. */
    cmem = generic_alloc(mem);
    if (!cmem) {
      return;
    }
    /* ... */
  }
  /* Resource descriptor for the texture object. */
  CUDA_RESOURCE_DESC resDesc;
  memset(&resDesc, 0, sizeof(resDesc));

  if (mem.data_height > 0) {
    const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);

    resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
    resDesc.res.pitch2D.devPtr = mem.device_pointer;
    resDesc.res.pitch2D.format = format;
    resDesc.res.pitch2D.numChannels = mem.data_elements;
    resDesc.res.pitch2D.width = mem.data_width;
    resDesc.res.pitch2D.height = mem.data_height;
    resDesc.res.pitch2D.pitchInBytes = dst_pitch;
  }
  else {
    resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
    resDesc.res.linear.devPtr = mem.device_pointer;
    resDesc.res.linear.format = format;
    resDesc.res.linear.numChannels = mem.data_elements;
    resDesc.res.linear.sizeInBytes = mem.device_size;
  }
  /* Texture descriptor and bindless texture object. */
  CUDA_TEXTURE_DESC texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.addressMode[0] = address_mode;
  texDesc.addressMode[1] = address_mode;
  texDesc.addressMode[2] = address_mode;
  texDesc.filterMode = filter_mode;
  texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

  thread_scoped_lock lock(device_mem_map_mutex);
  cmem = &device_mem_map[&mem];

  cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, nullptr));
  TextureInfo tex_info = mem.info;
  /* ... */

  const uint slot = mem.slot;
  if (slot >= texture_info.size()) {
    /* Allocate some slots in advance, to reduce the number of re-allocations. */
    texture_info.resize(slot + 128);
  }

  texture_info[slot] = tex_info;
  need_texture_info = true;
}
void CUDADevice::tex_copy_to(device_texture &mem)
{
  /* Allocate the texture on first copy. */
  bool texture_allocated = false;
  {
    thread_scoped_lock lock(device_mem_map_mutex);
    texture_allocated = mem.slot < texture_info.size() && texture_info[mem.slot].data != 0;
  }

  if (!texture_allocated) {
    tex_alloc(mem);
  }
  else if (mem.data_height > 0) {
    /* Re-upload 2D texture rows with pitch alignment. */
    CUDAContextScope scope(this);
    const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
    cuda_assert(cuMemcpy2DUnaligned(&param));
  }
  else {
    generic_copy_to(mem);
  }
}
void CUDADevice::tex_free(device_texture &mem)
{
  CUDAContextScope scope(this);
  thread_scoped_lock lock(device_mem_map_mutex);

  auto it = device_mem_map.find(&mem);
  if (it == device_mem_map.end()) {
    return;
  }

  const Mem &cmem = it->second;

  if (cmem.texobject) {
    /* Free bindless texture. */
    cuTexObjectDestroy(cmem.texobject);
  }

  if (!mem.is_resident(this)) {
    /* The memory lives on a peer device; only drop the mapping here. */
    device_mem_map.erase(device_mem_map.find(&mem));
  }
  else if (cmem.array) {
    /* Free array. */
    cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
    device_mem_map.erase(device_mem_map.find(&mem));
  }
  else {
    lock.unlock();
    generic_free(mem);
  }
}
unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
{
  return make_unique<CUDADeviceQueue>(this);
}
bool CUDADevice::should_use_graphics_interop(const GraphicsInteropDevice &interop_device)
{
  CUDAContextScope scope(this);

  switch (interop_device.type) {
    case ccl::GraphicsInteropDevice::OPENGL: {
      /* Check whether this CUDA device is among the devices backing the
       * current OpenGL context. */
      int num_all_devices = 0;
      cuda_assert(cuDeviceGetCount(&num_all_devices));

      if (num_all_devices == 0) {
        return false;
      }

      vector<CUdevice> gl_devices(num_all_devices);
      uint num_gl_devices = 0;
      cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);

      bool found = false;
      for (uint i = 0; i < num_gl_devices; ++i) {
        if (gl_devices[i] == cuDevice) {
          found = true;
          break;
        }
      }

      if (found) {
        LOG_INFO << "Graphics interop: found matching OpenGL device for CUDA";
      }
      else {
        LOG_INFO << "Graphics interop: no matching OpenGL device for CUDA";
      }
      return found;
    }
    case ccl::GraphicsInteropDevice::VULKAN: {
      /* Match Vulkan and CUDA devices by their UUID. */
      CUuuid uuid = {};
      cuDeviceGetUuid(&uuid, cuDevice);

      const bool found = (sizeof(uuid.bytes) == interop_device.uuid.size() &&
                          memcmp(uuid.bytes, interop_device.uuid.data(), sizeof(uuid.bytes)) == 0);

      if (found) {
        LOG_INFO << "Graphics interop: found matching Vulkan device for CUDA";
      }
      else {
        LOG_INFO << "Graphics interop: no matching Vulkan device for CUDA";
        LOG_INFO << "Graphics Interop: CUDA UUID "
                 << string_hex((const uint8_t *)uuid.bytes, sizeof(uuid.bytes));
      }
      return found;
    }

    default:
      return false;
  }
}
int CUDADevice::get_num_multiprocessors()
{
  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
}
int CUDADevice::get_max_num_threads_per_multiprocessor()
{
  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
}
bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
{
  CUDAContextScope scope(this);

  return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
}
int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, const int default_value)
{
  int value = 0;
  if (!get_device_attribute(attribute, &value)) {
    return default_value;
  }
  return value;
}