bool CUDADevice::have_precompiled_kernels()
{
  string cubins_path = path_get("lib");
  return path_exists(cubins_path);
}

void CUDADevice::set_error(const string &error)
{
  Device::set_error(error);

  if (first_error) {
    fprintf(stderr,
            "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
    fprintf(stderr,
            "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
    first_error = false;
  }
}

CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
    : GPUDevice(info, stats, profiler, headless)
{
  /* Verify that base class types can be used with specific backend types. */
  static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
  static_assert(sizeof(arrayMemObject) == sizeof(CUarray));

  need_texture_info = false;

  /* Initialize CUDA. */
  CUresult result = cuInit(0);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
    return;
  }

  /* Setup device and context. */
  result = cuDeviceGet(&cuDevice, cuDevId);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
                            cuewErrorString(result)));
    return;
  }
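
  /* Query device attributes used later: whether host memory can be mapped into
   * the device address space, and the pitch alignment required for 2D textures. */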
  int value;
  cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
  can_map_host = value != 0;

  cuda_assert(cuDeviceGetAttribute(
      &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));

  /* Configure the primary context: request maximum local memory resizing so
   * that local memory reserved for kernels persists across launches. */
  unsigned int ctx_flags = 0;
  int active = 0;
  cuda_assert(cuDevicePrimaryCtxGetState(cuDevice, &ctx_flags, &active));

  ctx_flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
  result = cuDevicePrimaryCtxSetFlags(cuDevice, ctx_flags);
  if (result != CUDA_SUCCESS && result != CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE) {
    set_error(string_printf("Failed to configure CUDA context (%s)", cuewErrorString(result)));
    return;
  }

  /* Retain the primary context so it stays alive for the device lifetime. */
  result = cuDevicePrimaryCtxRetain(&cuContext, cuDevice);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to retain CUDA context (%s)", cuewErrorString(result)));
    return;
  }

  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
  cuDevArchitecture = major * 100 + minor * 10;
}

CUDADevice::~CUDADevice()
{
  if (cuModule) {
    cuda_assert(cuModuleUnload(cuModule));
  }
  cuda_assert(cuDevicePrimaryCtxRelease(cuDevice));
}

bool CUDADevice::support_device(const uint /*kernel_features*/)
{
  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);

  /* We only support compute capability 3.0 and up. */
  if (major < 3) {
    set_error(string_printf(
        "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
    return false;
  }

  return true;
}

bool CUDADevice::check_peer_access(Device *peer_device)
{
  if (peer_device == this) {
    return false;
  }
  if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
    return false;
  }

  CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);

  /* Consider peer access possible only when it works in both directions. */
  int can_access = 0;
  cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
  if (can_access == 0) {
    return false;
  }

  cuda_assert(cuDeviceGetP2PAttribute(&can_access,
                                      CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
                                      cuDevice,
                                      peer_device_cuda->cuDevice));
  if (can_access == 0) {
    return false;
  }

  /* Enable peer access in both directions. */
  {
    const CUDAContextScope scope(this);
    CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
      set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
                              cuewErrorString(result)));
      return false;
    }
  }
  {
    const CUDAContextScope scope(peer_device_cuda);
    CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
      set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
                              cuewErrorString(result)));
      return false;
    }
  }

  return true;
}

bool CUDADevice::use_adaptive_compilation()
{
  return DebugFlags().cuda.adaptive_compile;
}

string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
{
  const string source_path = path_get("source");
  const string include_path = source_path;
  string cflags = string_printf(
      "--ptxas-options=\"-v\" "
      "-I\"%s\"",
      include_path.c_str());
  if (use_adaptive_compilation()) {
    cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
  }
  const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
  if (extra_cflags) {
    cflags += string(" ") + string(extra_cflags);
  }

# ifdef WITH_NANOVDB
  cflags += " -DWITH_NANOVDB";
# endif

# ifdef WITH_CYCLES_DEBUG
  cflags += " -DWITH_CYCLES_DEBUG";
# endif

  return cflags;
}

string CUDADevice::compile_kernel(const string &common_cflags,
                                  const char *name,
                                  const char *base,
                                  bool force_ptx)
{
  /* Compute kernel name. */
  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);

  /* Attempt to use a kernel shipped with the build. */
  if (!use_adaptive_compilation()) {
    if (!force_ptx) {
      const string cubin = path_get(
          string_printf("lib/%s_sm_%d%d.cubin.zst", name, major, minor));
      VLOG_INFO << "Testing for pre-compiled kernel " << cubin << ".";
      if (path_exists(cubin)) {
        VLOG_INFO << "Using precompiled kernel.";
        return cubin;
      }
    }

    /* The driver can JIT-compile PTX built for older generations, so find the closest one. */
    int ptx_major = major, ptx_minor = minor;
    while (ptx_major >= 3) {
      const string ptx = path_get(
          string_printf("lib/%s_compute_%d%d.ptx.zst", name, ptx_major, ptx_minor));
      VLOG_INFO << "Testing for pre-compiled kernel " << ptx << ".";
      if (path_exists(ptx)) {
        VLOG_INFO << "Using precompiled kernel.";
        return ptx;
      }

      if (ptx_minor > 0) {
        ptx_minor--;
      }
      else {
        ptx_major--;
        ptx_minor = 9;
      }
    }
  }

  /* Try to use a locally compiled kernel. */
  string source_path = path_get("source");
  const string source_md5 = path_files_md5_hash(source_path);

  /* We include cflags into md5 so that changing the CUDA toolkit or other
   * compiler command line arguments makes sure the cubin gets re-built. */
  const string kernel_md5 = util_md5_string(source_md5 + common_cflags);

  const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
  const char *const kernel_arch = force_ptx ? "compute" : "sm";
  const string cubin_file = string_printf(
      "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
  const string cubin = path_cache_get(path_join("kernels", cubin_file));
  VLOG_INFO << "Testing for locally compiled kernel " << cubin << ".";
  if (path_exists(cubin)) {
    VLOG_INFO << "Using locally compiled kernel.";
    return cubin;
  }

# ifdef _WIN32
  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
    if (major < 3) {
      set_error(string_printf(
          "CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
          "Your GPU is not supported.",
          major,
          minor));
    }
    else {
      set_error(string_printf(
          "CUDA binary kernel for this graphics card compute "
          "capability (%d.%d) not found.",
          major,
          minor));
    }
    return string();
  }
# endif

  /* Compile. */
  const char *const nvcc = cuewCompilerPath();
  if (nvcc == NULL) {
    set_error(
        "CUDA nvcc compiler not found. "
        "Install CUDA toolkit in default location.");
    return string();
  }

  const int nvcc_cuda_version = cuewCompilerVersion();
  VLOG_INFO << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
  if (nvcc_cuda_version < 101) {
    printf(
        "Unsupported CUDA version %d.%d detected, "
        "you need CUDA 10.1 or newer.\n",
        nvcc_cuda_version / 10,
        nvcc_cuda_version % 10);
    return string();
  }
  else if (!(nvcc_cuda_version >= 102 && nvcc_cuda_version < 130)) {
    printf(
        "CUDA version %d.%d detected, build may succeed but only "
        "CUDA 10.1 to 12 are officially supported.\n",
        nvcc_cuda_version / 10,
        nvcc_cuda_version % 10);
  }

  /* Build the nvcc command line and compile. */
  double starttime = time_dt();

  path_create_directories(cubin);

  source_path = path_join(path_join(source_path, "kernel"),
                          path_join("device", path_join(base, string_printf("%s.cu", name))));

  string command = string_printf("\"%s\" -arch=%s_%d%d --%s \"%s\" -o \"%s\" %s",
                                 nvcc, kernel_arch, major, minor, kernel_ext,
                                 source_path.c_str(), cubin.c_str(), common_cflags.c_str());

  printf("Compiling %sCUDA kernel ...\n%s\n",
         (use_adaptive_compilation()) ? "adaptive " : "",
         command.c_str());

# ifdef _WIN32
  command = "call " + command;
# endif
  if (system(command.c_str()) != 0) {
    set_error(
        "Failed to execute compilation command, "
        "see console for details.");
    return string();
  }

  /* Verify if compilation succeeded. */
  if (!path_exists(cubin)) {
    set_error(
        "CUDA kernel compilation failed, "
        "see console for details.");
    return string();
  }

  printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

  return cubin;
}

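/* The locally compiled kernel is cached under a name combining the architecture
 * and a hash of the kernel sources plus compiler flags, e.g. (illustrative):
 *   cycles_kernel_sm_86_<md5>.cubin
 * so kernels built with different toolkits or flags never collide. */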
bool CUDADevice::load_kernels(const uint kernel_features)
{
  /* NOTE: Re-loading kernels will invalidate memory pointers. */
  if (cuModule) {
    if (use_adaptive_compilation()) {
      VLOG_INFO
          << "Skipping CUDA kernel reload for adaptive compilation, not currently supported.";
    }
    return true;
  }

  /* Check if CUDA initialization succeeded. */
  if (cuContext == 0) {
    return false;
  }

  /* Check if the GPU is supported. */
  if (!support_device(kernel_features)) {
    return false;
  }

  /* Get kernel. */
  const char *kernel_name = "kernel";
  string cflags = compile_kernel_get_common_cflags(kernel_features);
  string cubin = compile_kernel(cflags, kernel_name);
  if (cubin.empty()) {
    return false;
  }

  /* Open module. */
  CUDAContextScope scope(this);

  string cubin_data;
  CUresult result;
  if (path_read_compressed_text(cubin, cubin_data)) {
    result = cuModuleLoadData(&cuModule, cubin_data.c_str());
  }
  else {
    result = CUDA_ERROR_FILE_NOT_FOUND;
  }

  if (result != CUDA_SUCCESS) {
    set_error(string_printf(
        "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
  }

  if (result == CUDA_SUCCESS) {
    reserve_local_memory(kernel_features);
  }

  return (result == CUDA_SUCCESS);
}

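/* With CU_CTX_LMEM_RESIZE_TO_MAX set, local memory allocated for a kernel
 * launch persists, so launching the largest kernel once up front makes the
 * remaining free memory a reliable budget for scene data. */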
void CUDADevice::reserve_local_memory(const uint kernel_features)
{
  /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory needed
   * for kernel launches, so that we can reliably figure out when to allocate
   * scene data in mapped host memory. */
  size_t total = 0, free_before = 0, free_after = 0;

  {
    CUDAContextScope scope(this);
    cuMemGetInfo(&free_before, &total);
  }

  {
    /* Use the biggest kernel for estimation. */
    const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
                                     (kernel_features & KERNEL_FEATURE_MNEE) ?
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;

    /* Launching just one block appears sufficient to reserve memory for all
     * multiprocessors. */
    CUDADeviceQueue queue(this);

    device_ptr d_path_index = 0;
    device_ptr d_render_buffer = 0;
    int d_work_size = 0;
    DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);

    queue.init_execution();
    queue.enqueue(test_kernel, 1, args);
    queue.synchronize();
  }

  {
    CUDAContextScope scope(this);
    cuMemGetInfo(&free_after, &total);
  }

  VLOG_INFO << "Local memory reserved " << string_human_readable_number(free_before - free_after)
            << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";

# if 0
  /* For testing mapped host memory, fill up device memory. */
  const size_t keep_mb = 1024;

  while (free_after > keep_mb * 1024 * 1024LL) {
    CUdeviceptr tmp;
    cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
    cuMemGetInfo(&free_after, &total);
  }
# endif
}

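/* Thin wrappers around the CUDA driver API memory calls. Each enters the device
 * context first, since the driver API is context-based. */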
void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
{
  CUDAContextScope scope(this);

  cuMemGetInfo(&free, &total);
}

bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
{
  CUDAContextScope scope(this);

  CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
  return mem_alloc_result == CUDA_SUCCESS;
}

void CUDADevice::free_device(void *device_pointer)
{
  CUDAContextScope scope(this);

  cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
}

bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
{
  CUDAContextScope scope(this);

  CUresult mem_alloc_result = cuMemHostAlloc(
      &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
  return mem_alloc_result == CUDA_SUCCESS;
}

void CUDADevice::free_host(void *shared_pointer)
{
  CUDAContextScope scope(this);

  cuMemFreeHost(shared_pointer);
}

void CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
  CUDAContextScope scope(this);

  cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
}

void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
  const CUDAContextScope scope(this);

  cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
}

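/* Memory-type dispatch: textures and global arrays have dedicated paths below,
 * everything else goes through the generic buffer helpers in the base class. */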
void CUDADevice::mem_alloc(device_memory &mem)
{
  if (mem.type == MEM_TEXTURE) {
    assert(!"mem_alloc not supported for textures.");
  }
  else if (mem.type == MEM_GLOBAL) {
    assert(!"mem_alloc not supported for global memory.");
  }
  else {
    generic_alloc(mem);
  }
}

void CUDADevice::mem_copy_to(device_memory &mem)
{
  if (mem.type == MEM_GLOBAL) {
    global_free(mem);
    global_alloc(mem);
  }
  else if (mem.type == MEM_TEXTURE) {
    tex_free((device_texture &)mem);
    tex_alloc((device_texture &)mem);
  }
  else {
    if (!mem.device_pointer) {
      generic_alloc(mem);
    }
    generic_copy_to(mem);
  }
}

void CUDADevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
{
  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
    assert(!"mem_copy_from not supported for textures.");
  }
  else if (mem.host_pointer) {
    const size_t size = elem * w * h;
    const size_t offset = elem * y * w;

    if (mem.device_pointer) {
      const CUDAContextScope scope(this);
      cuda_assert(cuMemcpyDtoH(
          (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
    }
    else {
      memset((char *)mem.host_pointer + offset, 0, size);
    }
  }
}

void CUDADevice::mem_zero(device_memory &mem)
{
  if (!mem.device_pointer) {
    mem_alloc(mem);
  }
  if (!mem.device_pointer) {
    return;
  }

  /* Zero on the device unless the buffer is mapped host memory. */
  thread_scoped_lock lock(device_mem_map_mutex);
  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
    const CUDAContextScope scope(this);
    cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
  }
  else if (mem.host_pointer) {
    memset(mem.host_pointer, 0, mem.memory_size());
  }
}

void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
{
  CUDAContextScope scope(this);
  CUdeviceptr mem;
  size_t bytes;

  cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, "kernel_params"));
  assert(bytes == sizeof(KernelParamsCUDA));

  /* Update data storage pointers in launch parameters. */
# define KERNEL_DATA_ARRAY(data_type, data_name) \
  if (strcmp(name, #data_name) == 0) { \
    cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, data_name), host, size)); \
    return; \
  }
  KERNEL_DATA_ARRAY(KernelData, data)
  KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
# include "kernel/data_arrays.h"
# undef KERNEL_DATA_ARRAY
}

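/* For reference, the macro above expands once per entry in data_arrays.h; for a
 * hypothetical array "tri_verts" the generated code is roughly:
 *
 *   if (strcmp(name, "tri_verts") == 0) {
 *     cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, tri_verts), host, size));
 *     return;
 *   }
 */
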
void CUDADevice::global_alloc(device_memory &mem)
{
  if (mem.is_resident(this)) {
    generic_alloc(mem);
    generic_copy_to(mem);
  }

  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

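/* Texture allocation picks one of three storage layouts: a CUDA array for 3D
 * textures (there is no linear-memory API for those), pitch-aligned linear
 * memory for 2D textures, and plain linear memory for 1D data. */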
void CUDADevice::tex_alloc(device_texture &mem)
{
  CUDAContextScope scope(this);

  size_t dsize = datatype_size(mem.data_type);
  size_t size = mem.memory_size();

  /* Wrap behavior outside the unit interval, from the image settings. */
  CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
  switch (mem.info.extension) {
    case EXTENSION_REPEAT:
      address_mode = CU_TR_ADDRESS_MODE_WRAP;
      break;
    case EXTENSION_EXTEND:
      address_mode = CU_TR_ADDRESS_MODE_CLAMP;
      break;
    case EXTENSION_CLIP:
      address_mode = CU_TR_ADDRESS_MODE_BORDER;
      break;
    case EXTENSION_MIRROR:
      address_mode = CU_TR_ADDRESS_MODE_MIRROR;
      break;
    default:
      assert(0);
      break;
  }

  CUfilter_mode filter_mode;
  if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
    filter_mode = CU_TR_FILTER_MODE_POINT;
  }
  else {
    filter_mode = CU_TR_FILTER_MODE_LINEAR;
  }

  /* Image texture storage format. */
  CUarray_format_enum format;
  switch (mem.data_type) {
    case TYPE_UCHAR:
      format = CU_AD_FORMAT_UNSIGNED_INT8;
      break;
    case TYPE_UINT16:
      format = CU_AD_FORMAT_UNSIGNED_INT16;
      break;
    case TYPE_FLOAT:
      format = CU_AD_FORMAT_FLOAT;
      break;
    case TYPE_HALF:
      format = CU_AD_FORMAT_HALF;
      break;
    default:
      assert(0);
      return;
  }

  Mem *cmem = NULL;
  CUarray array_3d = NULL;
  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
  size_t dst_pitch = src_pitch;

  if (!mem.is_resident(this)) {
    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;

    if (mem.data_depth > 1) {
      array_3d = (CUarray)mem.device_pointer;
      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
    }
    else if (mem.data_height > 0) {
      dst_pitch = align_up(src_pitch, pitch_alignment);
    }
  }
  else if (mem.data_depth > 1) {
    /* 3D texture using array, there is no API for linear memory. */
    CUDA_ARRAY3D_DESCRIPTOR desc;

    desc.Width = mem.data_width;
    desc.Height = mem.data_height;
    desc.Depth = mem.data_depth;
    desc.Format = format;
    desc.NumChannels = mem.data_elements;
    desc.Flags = 0;

    cuda_assert(cuArray3DCreate(&array_3d, &desc));
    if (!array_3d) {
      return;
    }

    CUDA_MEMCPY3D param;
    memset(&param, 0, sizeof(param));
    param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    param.dstArray = array_3d;
    param.srcMemoryType = CU_MEMORYTYPE_HOST;
    param.srcHost = mem.host_pointer;
    param.srcPitch = src_pitch;
    param.WidthInBytes = param.srcPitch;
    param.Height = mem.data_height;
    param.Depth = mem.data_depth;

    cuda_assert(cuMemcpy3D(&param));

    mem.device_pointer = (device_ptr)array_3d;
    mem.device_size = size;
    stats.mem_alloc(size);

    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;
    cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
  }
  else if (mem.data_height > 0) {
    /* 2D texture, using pitch aligned linear memory. */
    dst_pitch = align_up(src_pitch, pitch_alignment);
    size_t dst_size = dst_pitch * mem.data_height;

    cmem = generic_alloc(mem, dst_size - mem.memory_size());
    if (!cmem) {
      return;
    }

    CUDA_MEMCPY2D param;
    memset(&param, 0, sizeof(param));
    param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    param.dstDevice = (CUdeviceptr)mem.device_pointer;
    param.dstPitch = dst_pitch;
    param.srcMemoryType = CU_MEMORYTYPE_HOST;
    param.srcHost = mem.host_pointer;
    param.srcPitch = src_pitch;
    param.WidthInBytes = param.srcPitch;
    param.Height = mem.data_height;

    cuda_assert(cuMemcpy2DUnaligned(&param));
  }
  else {
    /* 1D texture, using linear memory. */
    cmem = generic_alloc(mem);
    if (!cmem) {
      return;
    }

    cuda_assert(cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, size));
  }

  /* Resize texture info array if needed. */
  const uint slot = mem.slot;
  if (slot >= texture_info.size()) {
    /* Allocate some slots in advance, to reduce amount of re-allocations. */
    texture_info.resize(slot + 128);
  }

  /* Set mapping and tag that we need to (re-)upload to device. */
  texture_info[slot] = mem.info;
  need_texture_info = true;

  if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FPN &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FP16)
  {
    /* Bindless textures. */
    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc));

    if (array_3d) {
      resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
      resDesc.res.array.hArray = array_3d;
      resDesc.flags = 0;
    }
    else if (mem.data_height > 0) {
      resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
      resDesc.res.pitch2D.devPtr = (CUdeviceptr)mem.device_pointer;
      resDesc.res.pitch2D.format = format;
      resDesc.res.pitch2D.numChannels = mem.data_elements;
      resDesc.res.pitch2D.height = mem.data_height;
      resDesc.res.pitch2D.width = mem.data_width;
      resDesc.res.pitch2D.pitchInBytes = dst_pitch;
    }
    else {
      resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
      resDesc.res.linear.devPtr = (CUdeviceptr)mem.device_pointer;
      resDesc.res.linear.format = format;
      resDesc.res.linear.numChannels = mem.data_elements;
      resDesc.res.linear.sizeInBytes = mem.device_size;
    }

    CUDA_TEXTURE_DESC texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = address_mode;
    texDesc.addressMode[1] = address_mode;
    texDesc.addressMode[2] = address_mode;
    texDesc.filterMode = filter_mode;
    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];

    cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

    texture_info[slot].data = (uint64_t)cmem->texobject;
  }
  else {
    texture_info[slot].data = (uint64_t)mem.device_pointer;
  }
}

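/* Freeing a texture must release whichever resources tex_alloc created: the
 * bindless texture object, the optional CUDA array, or the generic buffer. */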
void CUDADevice::tex_free(device_texture &mem)
{
  if (mem.device_pointer) {
    CUDAContextScope scope(this);
    thread_scoped_lock lock(device_mem_map_mutex);
    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
    const Mem &cmem = device_mem_map[&mem];

    if (cmem.texobject) {
      /* Free bindless texture. */
      cuTexObjectDestroy(cmem.texobject);
    }

    if (!mem.is_resident(this)) {
      /* Do not free memory here, since it was allocated on a different device. */
      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else if (cmem.array) {
      /* Free array. */
      cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
      stats.mem_free(mem.device_size);
      mem.device_pointer = 0;
      mem.device_size = 0;

      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else {
      lock.unlock();
      generic_free(mem);
    }
  }
}

unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
{
  return make_unique<CUDADeviceQueue>(this);
}

bool CUDADevice::should_use_graphics_interop()
{
  /* Check whether this device is part of the OpenGL context.
   *
   * Using a CUDA device for graphics interoperability which is not part of the
   * OpenGL context is possible, but empirically it can be considerably slower
   * than a naive pixel copy. */

  CUDAContextScope scope(this);

  int num_all_devices = 0;
  cuda_assert(cuDeviceGetCount(&num_all_devices));

  if (num_all_devices == 0) {
    return false;
  }

  vector<CUdevice> gl_devices(num_all_devices);
  uint num_gl_devices = 0;
  cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);

  for (uint i = 0; i < num_gl_devices; ++i) {
    if (gl_devices[i] == cuDevice) {
      return true;
    }
  }

  return false;
}

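/* Device attribute helpers: thin wrappers over cuDeviceGetAttribute, with an
 * optional fallback value for when the query fails. */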
int CUDADevice::get_num_multiprocessors()
{
  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
}

int CUDADevice::get_max_num_threads_per_multiprocessor()
{
  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
}

bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
{
  CUDAContextScope scope(this);

  return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
}

int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value)
{
  int value = 0;
  if (!get_device_attribute(attribute, &value)) {
    return default_value;
  }
  return value;
}