17# include <TargetConditionals.h>
18# include <crt_externs.h>
25std::map<int, MetalDevice *> MetalDevice::active_device_ids;
// Looks up a MetalDevice instance by integer ID in the `active_device_ids` map.
// NOTE(review): the remaining parameter(s) and the return statements are not visible in this
// excerpt; callers in this file pass a second `lock` argument and compare the result to nullptr.
29MetalDevice *MetalDevice::get_device_by_ID(
int ID,
32 auto it = active_device_ids.find(
ID);
// A hit means the ID is still registered in the map.
33 if (it != active_device_ids.end()) {
// Returns true exactly when get_device_by_ID() yields nullptr for `ID`, i.e. the ID is no
// longer resolvable to a device instance.
// NOTE(review): the declaration of `lock` is not visible in this excerpt.
39bool MetalDevice::is_device_cancelled(
int ID)
42 return get_device_by_ID(
ID,
lock) ==
nullptr;
// Reports a device error described by `error`.
// NOTE(review): how `error` itself is stored or forwarded is not visible in this excerpt.
50void MetalDevice::set_error(
const string &
error)
// Function-local static mutex: one lock shared by every call to this function, held for the
// rest of the scope by the lock_guard below.
52 static std::mutex s_error_mutex;
53 std::lock_guard<std::mutex>
lock(s_error_mutex);
// Point the user at the GPU rendering documentation on stderr.
58 fprintf(stderr,
"\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
60 "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
// Constructor (signature not visible in this excerpt).
// Forwards info/stats/profiler/headless to the Device base and constructs `texture_info` as a
// MEM_GLOBAL allocation named "texture_info". The visible body then registers the instance,
// selects the MTLDevice, reads environment overrides and creates the argument encoders,
// binding buffers and command queues.
66 :
Device(info, stats, profiler, headless), texture_info(this,
"texture_info",
MEM_GLOBAL)
// Hand out a fresh ID from a function-local static counter (starting at 1) and register this
// instance under it.
// NOTE(review): any locking around the counter/map is not visible in this excerpt.
73 static int existing_devices_counter = 1;
74 device_id = existing_devices_counter++;
75 active_device_ids[device_id] =
this;
// Pick the MTLDevice by index from the usable-device list; the index is only range-checked
// by the assert.
81 auto usable_devices = MetalInfo::get_usable_devices();
82 assert(mtlDevId < usable_devices.size());
83 mtlDevice = usable_devices[mtlDevId];
84 metal_printf(
"Creating new Cycles Metal device: %s\n", info.
description.c_str());
// Managed storage by default; shared storage when the device reports unified memory.
88 default_storage_mode = MTLResourceStorageModeManaged;
92 if ([mtlDevice hasUnifiedMemory]) {
93 default_storage_mode = MTLResourceStorageModeShared;
96 max_threads_per_threadgroup = 512;
// CYCLES_METALRT: any value that atoi() parses as non-zero enables use_metalrt.
99 if (
auto metalrt = getenv(
"CYCLES_METALRT")) {
100 use_metalrt = (atoi(metalrt) != 0);
// CYCLES_DEBUG_METAL_CAPTURE_KERNEL: presence alone enables capture.
103 if (getenv(
"CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
104 capture_enabled =
true;
// NOTE(review): the conditions selecting between these three levels are not visible here.
110 kernel_specialization_level = PSO_GENERIC;
114 kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
117 kernel_specialization_level = PSO_SPECIALIZED_SHADE;
// CYCLES_METAL_SPECIALIZATION_LEVEL overrides the level with the raw atoi() result.
// NOTE(review): only the value passed to kernel_type_as_string() below is clamped to
// PSO_NUM - 1; no clamp of the stored level is visible in this excerpt.
121 if (
auto envstr = getenv(
"CYCLES_METAL_SPECIALIZATION_LEVEL")) {
122 kernel_specialization_level = (MetalPipelineType)atoi(envstr);
124 metal_printf(
"kernel_specialization_level = %s\n",
125 kernel_type_as_string(
126 (MetalPipelineType)
min((
int)kernel_specialization_level, (
int)PSO_NUM - 1)));
// Single-argument encoders: read-only pointer (kernel params), read-only texture and
// read-only pointer (buffer).
128 MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc]
init];
129 arg_desc_params.dataType = MTLDataTypePointer;
130 arg_desc_params.access = MTLArgumentAccessReadOnly;
132 mtlBufferKernelParamsEncoder = [mtlDevice
133 newArgumentEncoderWithArguments:@[ arg_desc_params ]];
135 MTLArgumentDescriptor *arg_desc_texture = [[MTLArgumentDescriptor alloc]
init];
136 arg_desc_texture.dataType = MTLDataTypeTexture;
137 arg_desc_texture.access = MTLArgumentAccessReadOnly;
138 mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
// NOTE(review): unlike arg_desc_params and arg_desc_texture, no `[arg_desc_buffer release]`
// is visible in this excerpt; confirm against the full file whether this descriptor leaks.
139 MTLArgumentDescriptor *arg_desc_buffer = [[MTLArgumentDescriptor alloc]
init];
140 arg_desc_buffer.dataType = MTLDataTypePointer;
141 arg_desc_buffer.access = MTLArgumentAccessReadOnly;
142 mtlBufferArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_buffer ]];
// Three 8192-byte binding buffers; their allocated sizes are reported to `stats`.
144 buffer_bindings_1d = [mtlDevice newBufferWithLength:8192
options:default_storage_mode];
145 texture_bindings_2d = [mtlDevice newBufferWithLength:8192
options:default_storage_mode];
146 texture_bindings_3d = [mtlDevice newBufferWithLength:8192
options:default_storage_mode];
147 stats.
mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
148 texture_bindings_3d.allocatedSize);
// Two independent command queues are created from the same device.
154 mtlComputeCommandQueue = [mtlDevice newCommandQueue];
157 mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
// Instance acceleration structure encoder, guarded by a macOS 12 availability check.
160 if (@available(macos 12.0, *)) {
162 MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc]
init];
163 arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
164 arg_desc_as.access = MTLArgumentAccessReadOnly;
165 mtlASArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_as ]];
166 [arg_desc_as release];
// Ancillary argument layout: one descriptor template is mutated (index bumped) and copied
// per slot. Each `copy` is an owned object that is released in the loop near the end of
// this block.
// NOTE(review): the declaration of `index` is not visible in this excerpt.
172 NSMutableArray *ancillary_desc = [[NSMutableArray alloc]
init];
175 MTLArgumentDescriptor *arg_desc_tex = [[MTLArgumentDescriptor alloc]
init];
176 arg_desc_tex.dataType = MTLDataTypePointer;
177 arg_desc_tex.access = MTLArgumentAccessReadOnly;
// Three consecutive read-only pointer slots.
179 arg_desc_tex.index = index++;
180 [ancillary_desc addObject:[arg_desc_tex
copy]];
181 arg_desc_tex.index = index++;
182 [ancillary_desc addObject:[arg_desc_tex
copy]];
183 arg_desc_tex.index = index++;
184 [ancillary_desc addObject:[arg_desc_tex
copy]];
186 [arg_desc_tex release];
188 if (@available(macos 12.0, *)) {
190 MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc]
init];
191 arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
192 arg_desc_as.access = MTLArgumentAccessReadOnly;
194 MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc]
init];
195 arg_desc_ptrs.dataType = MTLDataTypePointer;
196 arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
198 MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc]
init];
199 arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
200 arg_desc_ift.access = MTLArgumentAccessReadOnly;
// Slot order: one acceleration structure, eight intersection function tables, then one
// pointer slot.
202 arg_desc_as.index = index++;
203 [ancillary_desc addObject:[arg_desc_as
copy]];
206 arg_desc_ift.index = index++;
207 [ancillary_desc addObject:[arg_desc_ift
copy]];
208 arg_desc_ift.index = index++;
209 [ancillary_desc addObject:[arg_desc_ift
copy]];
210 arg_desc_ift.index = index++;
211 [ancillary_desc addObject:[arg_desc_ift
copy]];
212 arg_desc_ift.index = index++;
213 [ancillary_desc addObject:[arg_desc_ift
copy]];
214 arg_desc_ift.index = index++;
215 [ancillary_desc addObject:[arg_desc_ift
copy]];
216 arg_desc_ift.index = index++;
217 [ancillary_desc addObject:[arg_desc_ift
copy]];
218 arg_desc_ift.index = index++;
219 [ancillary_desc addObject:[arg_desc_ift
copy]];
220 arg_desc_ift.index = index++;
221 [ancillary_desc addObject:[arg_desc_ift
copy]];
223 arg_desc_ptrs.index = index++;
224 [ancillary_desc addObject:[arg_desc_ptrs
copy]];
226 [arg_desc_ift release];
227 [arg_desc_as release];
228 [arg_desc_ptrs release];
232 mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
// Separate single-argument encoder used for BLAS entries.
237 MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc]
init];
238 arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
239 arg_desc_blas.access = MTLArgumentAccessReadOnly;
240 mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
241 [arg_desc_blas release];
// Balance the `copy` calls above: release each element explicitly, then the array itself.
244 for (
int i = 0; i < ancillary_desc.count; i++) {
245 [ancillary_desc[i] release];
247 [ancillary_desc release];
249 [arg_desc_params release];
250 [arg_desc_texture release];
// Destructor: releases per-slot textures, flushes the delayed-free list, frees the binding
// buffers (with matching stats accounting) and releases the argument encoders and queues.
254MetalDevice::~MetalDevice()
// Release every slot that is_texture() classifies as a texture and clear the slot.
263 int num_resources = texture_info.size();
264 for (
int res = 0; res < num_resources; res++) {
265 if (is_texture(texture_info[res])) {
266 [texture_slot_map[res] release];
267 texture_slot_map[res] = nil;
272 flush_delayed_free_list();
// texture_bindings_2d is used as the presence check for all three binding buffers.
274 if (texture_bindings_2d) {
275 stats.
mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
276 texture_bindings_3d.allocatedSize);
277 [buffer_bindings_1d release];
278 [texture_bindings_2d release];
279 [texture_bindings_3d release];
// NOTE(review): mtlBlasArgEncoder is not among the releases visible here; confirm it is
// released elsewhere.
281 [mtlTextureArgEncoder release];
282 [mtlBufferKernelParamsEncoder release];
283 [mtlBufferArgEncoder release];
284 [mtlASArgEncoder release];
285 [mtlAncillaryArgEncoder release];
286 [mtlComputeCommandQueue release];
287 [mtlGeneralCommandQueue release];
293bool MetalDevice::support_device(
const uint )
298bool MetalDevice::check_peer_access(
Device * )
305bool MetalDevice::use_adaptive_compilation()
310bool MetalDevice::use_local_atomic_sort()
const
// Builds the block of `#define` lines for `pso_type` / `kernel_features`, patches and
// prefixes `*source` with it, and feeds the defines into an MD5.
// NOTE(review): the trailing parameter(s), the null check on `source` and the return statement
// are not visible in this excerpt; callers in this file store the returned string as the
// defines MD5 and may omit the source argument.
315string MetalDevice::preprocess_source(MetalPipelineType pso_type,
316 const uint kernel_features,
319 string global_defines;
320 if (use_adaptive_compilation()) {
321 global_defines +=
"#define __KERNEL_FEATURES__ " +
to_string(kernel_features) +
"\n";
324 if (use_local_atomic_sort()) {
325 global_defines +=
"#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
// NOTE(review): the conditions guarding the two MetalRT defines are not visible here.
329 global_defines +=
"#define __METALRT__\n";
331 global_defines +=
"#define __METALRT_MOTION__\n";
335# ifdef WITH_CYCLES_DEBUG
336 global_defines +=
"#define WITH_CYCLES_DEBUG\n";
339 global_defines +=
"#define __KERNEL_METAL_APPLE__\n";
340 if (@available(macos 14.0, *)) {
342 global_defines +=
"#define __METAL_GLOBAL_BUILTINS__\n";
// NanoVDB is compiled in for the generic pipeline, or once `using_nanovdb` is set, and only
// when the debug flag allows it.
347 if ((pso_type == PSO_GENERIC || using_nanovdb) &&
DebugFlags().metal.use_nanovdb) {
348 global_defines +=
"#define WITH_NANOVDB\n";
// Expose the running macOS major version to the kernel source.
352 NSProcessInfo *processInfo = [NSProcessInfo processInfo];
353 NSOperatingSystemVersion macos_ver = [processInfo operatingSystemVersion];
354 global_defines +=
"#define __KERNEL_METAL_MACOS__ " +
to_string(macos_ver.majorVersion) +
"\n";
357 global_defines +=
"#define __KERNEL_METAL_TARGET_CPU_ARM64__\n";
// Specialized pipelines only: rewrite `kernel_data.<parent>.` accesses in the source to
// `kernel_data_<parent>_`, then map members flagged DONT_SPECIALIZE back to
// `kernel_data.<parent>.__unused_<name>`. The whole pass is timed.
// NOTE(review): the include that expands these macros is not visible in this excerpt.
364 if (pso_type != PSO_GENERIC) {
366 const double starttime =
time_dt();
368# define KERNEL_STRUCT_BEGIN(name, parent) \
369 string_replace_same_length(*source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
371 bool next_member_is_specialized =
true;
373# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;
375# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
376 if (!next_member_is_specialized) { \
378 *source, "kernel_data_" #parent "_" #name, "kernel_data." #parent ".__unused_" #name); \
379 next_member_is_specialized = true; \
384# undef KERNEL_STRUCT_MEMBER
385# undef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE
386# undef KERNEL_STRUCT_BEGIN
388 metal_printf(
"KernelData patching took %.1f ms\n", (
time_dt() - starttime) * 1000.0);
396 global_defines +=
"#define __KERNEL_USE_DATA_CONSTANTS__\n";
400 metal_printf(
"================\n%s================\n",
401 global_defines.c_str());
// The defines are prepended to the kernel source.
405 *source = global_defines + *source;
// The hash covers only the defines string at this point.
409 md5.
append(global_defines);
// Regenerates the kernel source for `pso_type`: starts from an include of kernel.metal, runs
// preprocess_source() over it, and records the returned string in global_defines_md5[pso_type].
// NOTE(review): lines between the assignment and the preprocess call are not visible here.
413void MetalDevice::make_source(MetalPipelineType pso_type,
const uint kernel_features)
// Reference into the per-pipeline source slot, so the assignments below update the member.
415 string &source = this->source[pso_type];
416 source =
"\n#include \"kernel/device/metal/kernel.metal\"\n";
423 global_defines_md5[pso_type] = preprocess_source(pso_type, kernel_features, &source);
// Requests loading of the generic kernels for the accumulated feature set.
// NOTE(review): the return statements are not visible in this excerpt.
426bool MetalDevice::load_kernels(
const uint _kernel_features)
// Features accumulate across calls (bitwise OR), they are never cleared here.
429 kernel_features |= _kernel_features;
432 if (!support_device(kernel_features))
442 refresh_source_and_kernels_md5(PSO_GENERIC);
443 if (MetalDeviceKernels::should_load_kernels(
this, PSO_GENERIC)) {
// The block captures the integer device ID rather than `this`; compile_and_load() resolves
// the ID back to an instance itself.
446 int this_device_id = this->device_id;
447 auto compile_kernels_fn = ^() {
448 compile_and_load(this_device_id, PSO_GENERIC);
// Compilation is dispatched asynchronously on the default-priority global queue.
451 dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
// Regenerates the source for `pso_type` when its defines hash changed, then recomputes
// kernels_md5[pso_type] from the specialized constant values plus the source text.
458void MetalDevice::refresh_source_and_kernels_md5(MetalPipelineType pso_type)
// Called without a source argument: only the defines hash is wanted here.
460 string defines_md5 = preprocess_source(pso_type, kernel_features);
463 if (global_defines_md5[pso_type] != defines_md5) {
464 make_source(pso_type, kernel_features);
// For specialized pipelines, serialize every specialized member as "<parent>.<name>=<value>"
// so that a change in launch_params.data changes the resulting hash.
// NOTE(review): the include that expands these macros is not visible in this excerpt.
467 string constant_values;
468 if (pso_type != PSO_GENERIC) {
469 bool next_member_is_specialized =
true;
471# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;
475# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
476 if (next_member_is_specialized) { \
477 constant_values += string(#parent "." #name "=") + \
478 to_string(_type(launch_params.data.parent.name)) + "\n"; \
481 next_member_is_specialized = true; \
486# undef KERNEL_STRUCT_MEMBER
487# undef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE
490 metal_printf(
"================\n%s================\n",
491 constant_values.c_str());
// Hash input order: constant values first, then the full source.
496 md5.
append(constant_values);
497 md5.
append(source[pso_type]);
501 kernels_md5[pso_type] = md5.
get_hex();
// Compiles the Metal library for `pso_type` on behalf of the device identified by
// `device_id`, then hands the result to MetalDeviceKernels::load().
// The device is addressed by ID and re-resolved through get_device_by_ID() both before and
// after compilation, so the instance pointer is not held across the compile.
// NOTE(review): the scopes in which `lock` is held, and the declarations of `source`, `error`
// and `starttime`, are not visible in this excerpt.
504void MetalDevice::compile_and_load(
int device_id, MetalPipelineType pso_type)
510 id<MTLDevice> mtlDevice;
// First lookup: bail out if the device is gone or the kernels were already requested.
518 MetalDevice *instance = get_device_by_ID(device_id,
lock);
520 metal_printf(
"Ignoring %s compilation request - device no longer exists\n",
521 kernel_type_as_string(pso_type));
525 if (!MetalDeviceKernels::should_load_kernels(instance, pso_type)) {
// NOTE(review): "Ignoreing" in the message below is a typo for "Ignoring"; it is a runtime
// string and is left unchanged here.
528 metal_printf(
"Ignoreing %s compilation request - kernels already requested\n",
529 kernel_type_as_string(pso_type));
// Copy what compilation needs out of the instance.
533 mtlDevice = instance->mtlDevice;
534 source = instance->source[pso_type];
540 MTLCompileOptions *
options = [[MTLCompileOptions alloc]
init];
// Language version is raised step by step: each later availability check that passes
// overwrites the earlier assignment. The 13.0 and 14.0 steps additionally require the SDK
// macro to be defined at build time.
543 if (@available(macos 12.0, *)) {
544 options.languageVersion = MTLLanguageVersion2_4;
546# if defined(MAC_OS_VERSION_13_0)
547 if (@available(macos 13.0, *)) {
548 options.languageVersion = MTLLanguageVersion3_0;
551# if defined(MAC_OS_VERSION_14_0)
552 if (@available(macos 14.0, *)) {
553 options.languageVersion = MTLLanguageVersion3_1;
// NOTE(review): the effect of these two environment variables is not visible here.
557 if (getenv(
"CYCLES_METAL_PROFILING") || getenv(
"CYCLES_METAL_DEBUG")) {
565 id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
569 metal_printf(
"Front-end compilation finished in %.1f seconds (%s)\n",
571 kernel_type_as_string(pso_type));
// Profiling runs and benchmark warmup wait for all outstanding kernel work.
575 bool blocking_pso_build = (getenv(
"CYCLES_METAL_PROFILING") ||
576 MetalDeviceKernels::is_benchmark_warmup());
577 if (blocking_pso_build) {
578 MetalDeviceKernels::wait_for_all();
// Second lookup: results are published only if the device is still resolvable.
586 if (MetalDevice *instance = get_device_by_ID(device_id,
lock)) {
590 << [[
error localizedDescription] UTF8String];
593 instance->mtlLibrary[pso_type] = mtlLibrary;
596 MetalDeviceKernels::load(instance, pso_type);
// Compile failure is reported through the instance's set_error().
599 NSString *err = [
error localizedDescription];
600 instance->set_error(
string_printf(
"Failed to compile library:\n%s", [err UTF8String]));
605 if (starttime && blocking_pso_build) {
606 MetalDeviceKernels::wait_for_all();
608 metal_printf(
"Back-end compilation finished in %.1f seconds (%s)\n",
610 kernel_type_as_string(pso_type));
617 return (
tex.depth > 0 ||
tex.height > 0);
// When `need_texture_info` is set: clears the flag, uploads `texture_info`, and re-encodes
// every slot into the 2D and 3D texture binding buffers.
// NOTE(review): the declaration and update of `offset` are not visible in this excerpt.
620void MetalDevice::load_texture_info()
622 if (need_texture_info) {
624 need_texture_info =
false;
625 texture_info.copy_to_device();
627 int num_textures = texture_info.size();
629 for (
int tex = 0;
tex < num_textures;
tex++) {
// Populated texture slot: the texture goes into the binding buffer matching its type, and
// nil is written at the same offset in the other buffer.
631 if (is_texture(texture_info[
tex]) && texture_slot_map[
tex]) {
632 id<MTLTexture> metal_texture = texture_slot_map[
tex];
633 MTLTextureType type = metal_texture.textureType;
634 [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
635 [mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
636 [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
637 [mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
// Otherwise both buffers get nil at this offset.
640 [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
641 [mtlTextureArgEncoder setTexture:nil atIndex:0];
642 [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
643 [mtlTextureArgEncoder setTexture:nil atIndex:0];
// With managed storage, the CPU-side writes must be flagged via didModifyRange; the range
// covers one pointer-sized entry per texture slot.
646 if (default_storage_mode == MTLResourceStorageModeManaged) {
647 [texture_bindings_2d didModifyRange:NSMakeRange(0, num_textures *
sizeof(
void *))];
648 [texture_bindings_3d didModifyRange:NSMakeRange(0, num_textures *
sizeof(
void *))];
659 auto it = metal_mem_map.find(&mem);
660 if (it != metal_mem_map.end()) {
661 MetalMem *mmem = it->second.get();
664 if (mmem->pointer_index >= 0) {
666 pointers[mmem->pointer_index] = 0;
668 metal_mem_map.erase(it);
// Returns true when the memory tracked in `stats.mem_used` exceeds the device's recommended
// maximum working set size minus `safety_margin`.
672bool MetalDevice::max_working_set_exceeded(
size_t safety_margin)
const
// NOTE(review): this is an unsigned subtraction; if safety_margin were larger than the
// recommended size, the result would wrap to a huge value and the check would never trigger.
676 size_t available = [mtlDevice recommendedMaxWorkingSetSize] - safety_margin;
677 return (stats.
mem_used > available);
// Allocates an MTLBuffer for `mem`, registers a MetalMem record for it in `metal_mem_map`
// and reports "out of GPU memory" through set_error().
// NOTE(review): the computation of `size`, the condition selecting private storage, the
// failure checks and the return statement are not visible in this excerpt.
680MetalDevice::MetalMem *MetalDevice::generic_alloc(
device_memory &mem)
687 id<MTLBuffer> metal_buffer = nil;
// Start from the device-wide default storage mode; one visible branch switches to private.
688 MTLResourceOptions
options = default_storage_mode;
692 options = MTLResourceStorageModePrivate;
695 metal_buffer = [mtlDevice newBufferWithLength:size
options:
options];
698 set_error(
"System is out of GPU memory");
// Label the buffer with the allocation name.
712 metal_buffer.label = [NSString stringWithFormat:
@"%s", mem.
name];
// The map is guarded by a recursive mutex; the assert documents that `mem` must not already
// have an entry.
714 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
716 assert(metal_mem_map.count(&mem) == 0);
// Ownership of the new record passes to the map's unique_ptr; `mmem` stays a raw alias.
717 MetalMem *mmem =
new MetalMem;
718 metal_mem_map[&mem] = std::unique_ptr<MetalMem>(mmem);
721 mmem->mtlBuffer = metal_buffer;
// Only non-private buffers get a host pointer from `contents`; private ones get nullptr.
724 if (
options != MTLResourceStorageModePrivate) {
725 mmem->hostPtr = [metal_buffer contents];
728 mmem->hostPtr =
nullptr;
// use_UMA is set according to whether the buffer ended up in shared storage.
735 if (metal_buffer.storageMode == MTLResourceStorageModeShared) {
746 mmem->use_UMA =
true;
749 mmem->use_UMA =
false;
// Post-allocation check against the recommended working set.
752 if (max_working_set_exceeded()) {
753 set_error(
"System is out of GPU memory");
767 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
769 MetalMem &mmem = *metal_mem_map.at(&mem);
771 if (mmem.mtlBuffer.storageMode == MTLStorageModeManaged) {
772 [mmem.mtlBuffer didModifyRange:NSMakeRange(0, mem.
memory_size())];
780 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
781 MetalMem &mmem = *metal_mem_map.at(&mem);
782 size_t size = mmem.size;
787 bool free_mtlBuffer =
false;
794 free_mtlBuffer =
true;
799 free_mtlBuffer =
true;
802 if (free_mtlBuffer) {
807 mmem.use_UMA =
false;
813 delayed_free_list.push_back(mmem.mtlBuffer);
814 mmem.mtlBuffer = nil;
817 erase_allocation(mem);
824 assert(!
"mem_alloc not supported for textures.");
848 generic_copy_to(mem);
// Reads device memory back for `mem`, optionally restricted to a sub-range described by
// y / w / h / elem.
// NOTE(review): the guarding conditions and the actual memcpy/memset into host memory are not
// visible in this excerpt.
852void MetalDevice::mem_copy_from(
device_memory &mem,
size_t y,
size_t w,
size_t h,
size_t elem)
// NOTE(review): w and h are size_t (unsigned), so `w >= 0 && h >= 0` is always true and
// `subcopy` can never be false as written; the mem.memory_size() / 0 fallbacks below are
// therefore unreachable. Confirm the intended "copy everything" sentinel.
857 bool subcopy = (
w >= 0 && h >= 0);
858 const size_t size = subcopy ? (elem *
w * h) : mem.memory_size();
859 const size_t offset = subcopy ? (elem * y *
w) : 0;
862 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
// .at() throws if `mem` has no entry in the map.
863 MetalMem &mmem = *metal_mem_map.at(&mem);
// Managed buffers: encode a blit synchronizeResource on the general queue and block until
// the command buffer has completed.
865 if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
867 id<MTLCommandBuffer> cmdBuffer = [mtlGeneralCommandQueue commandBuffer];
868 id<MTLBlitCommandEncoder> blitEncoder = [cmdBuffer blitCommandEncoder];
869 [blitEncoder synchronizeResource:mmem.mtlBuffer];
870 [blitEncoder endEncoding];
872 [cmdBuffer waitUntilCompleted];
896 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
897 MetalMem &mmem = *metal_mem_map.at(&mem);
898 memset(mmem.hostPtr, 0, size);
899 if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
900 [mmem.mtlBuffer didModifyRange:NSMakeRange(0, size)];
// Removes this device's ID from `active_device_ids`, so later lookups of that ID in the map
// no longer find this instance.
// NOTE(review): any locking around the erase is not visible in this excerpt.
926void MetalDevice::cancel()
932 active_device_ids.erase(device_id);
// Reports readiness and writes a human-readable progress message into `status`.
// NOTE(review): the return statements and the comparison of `num_loaded` against the total
// kernel count are not visible in this excerpt.
937bool MetalDevice::is_ready(
string &status)
const
// A pending error message is checked first.
939 if (!error_msg.empty()) {
944 int num_loaded = MetalDeviceKernels::get_loaded_kernel_count(
this, PSO_GENERIC);
946 status =
string_printf(
"%d / %d render kernels loaded (may take a few minutes the first time)",
// Outstanding specialization requests take priority in the status text; otherwise the
// message reflects the configured specialization level.
952 if (
int num_requests = MetalDeviceKernels::num_incomplete_specialization_requests()) {
953 status =
string_printf(
"%d kernels to optimize", num_requests);
955 else if (kernel_specialization_level == PSO_SPECIALIZED_INTERSECT) {
956 status =
"Using optimized intersection kernels";
958 else if (kernel_specialization_level == PSO_SPECIALIZED_SHADE) {
959 status =
"Using optimized kernels";
962 metal_printf(
"MetalDevice::is_ready(...) --> true\n");
// Kicks off compilation of specialized kernels up to the configured specialization level,
// either in the background or synchronously.
966void MetalDevice::optimize_for_scene(
Scene *scene)
968 MetalPipelineType specialization_level = kernel_specialization_level;
// When `scene->params.background` is false, the level is capped at PSO_SPECIALIZED_INTERSECT.
970 if (!scene->params.background) {
972 specialization_level = (MetalPipelineType)
min(specialization_level, PSO_SPECIALIZED_INTERSECT);
// The block captures the device ID (not `this`) and compiles every level from 1 up to and
// including the chosen level.
977 int this_device_id = this->device_id;
978 auto specialize_kernels_fn = ^() {
979 for (
int level = 1; level <=
int(specialization_level); level++) {
980 compile_and_load(this_device_id, MetalPipelineType(level));
// Background by default; CYCLES_METAL_PROFILING or benchmark warmup force synchronous work.
985 bool specialize_in_background =
true;
988 if (getenv(
"CYCLES_METAL_PROFILING") !=
nullptr) {
989 specialize_in_background =
false;
993 if (MetalDeviceKernels::is_benchmark_warmup()) {
994 specialize_in_background =
false;
// Background requests are dispatched only when no specialization requests are still
// incomplete; otherwise the request is dropped with a log message.
997 if (specialize_in_background) {
998 if (MetalDeviceKernels::num_incomplete_specialization_requests() == 0) {
999 dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
1000 specialize_kernels_fn);
1003 metal_printf(
"\"optimize_for_scene\" request already in flight - dropping request\n");
1007 specialize_kernels_fn();
// Copies named constant data from `host` into `launch_params`, dispatching on `name`.
// NOTE(review): the copy performed for "data" and the arguments of the "integrator_state"
// call are not visible in this excerpt.
1011void MetalDevice::const_copy_to(
const char *name,
void *host,
size_t size)
// "data": after the update, the source and kernel hashes of every specialized level are
// refreshed.
1013 if (strcmp(name,
"data") == 0) {
1018 for (
int level = 1; level <=
int(kernel_specialization_level); level++) {
1019 refresh_source_and_kernels_md5(MetalPipelineType(level));
// Helper: copy `data_size` bytes into launch_params at `offset`, then treat `data` as an
// array of MetalMem pointers and record for each one its index among the launch-parameter
// pointers (offset and pointers_size are both measured in device_ptr units).
// NOTE(review): any null check on mmem[i] is not visible in this excerpt.
1024 auto update_launch_pointers =
1025 [&](
size_t offset,
void *
data,
size_t data_size,
size_t pointers_size) {
1026 memcpy((
uint8_t *)&launch_params + offset, data, data_size);
1028 MetalMem **mmem = (MetalMem **)data;
1029 int pointer_count = pointers_size /
sizeof(
device_ptr);
1030 int pointer_index = offset /
sizeof(
device_ptr);
1031 for (
int i = 0; i < pointer_count; i++) {
1033 mmem[i]->pointer_index = pointer_index + i;
1039 if (strcmp(name,
"integrator_state") == 0) {
1042 update_launch_pointers(
// Each kernel data array name maps to the KernelParamsMetal field of the same name.
1045# define KERNEL_DATA_ARRAY(data_type, tex_name) \
1046 else if (strcmp(name, #tex_name) == 0) { \
1047 update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
1050# undef KERNEL_DATA_ARRAY
1057 generic_copy_to(mem);
1072 MetalDevice::MetalMem *mmem = generic_alloc(mem);
1073 generic_copy_to(mem);
1077 if (slot >= texture_info.size()) {
1080 texture_info.resize(
round_up(slot + 1, 128));
1081 texture_slot_map.resize(
round_up(slot + 1, 128));
1084 texture_info[slot] = mem.
info;
1085 uint64_t offset = slot *
sizeof(
void *);
1086 [mtlBufferArgEncoder setArgumentBuffer:buffer_bindings_1d offset:offset];
1087 [mtlBufferArgEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
1089 texture_slot_map[slot] = nil;
1090 need_texture_info =
true;
1097 using_nanovdb =
true;
1110 "Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
1116 MTLStorageMode storage_mode = MTLStorageModeManaged;
1117 if ([mtlDevice hasUnifiedMemory]) {
1118 storage_mode = MTLStorageModeShared;
1122 string bind_name = mem.
name;
1136 MTLPixelFormat formats[] = {MTLPixelFormatR8Unorm,
1137 MTLPixelFormatRG8Unorm,
1138 MTLPixelFormatInvalid,
1139 MTLPixelFormatRGBA8Unorm};
1143 MTLPixelFormat formats[] = {MTLPixelFormatR16Unorm,
1144 MTLPixelFormatRG16Unorm,
1145 MTLPixelFormatInvalid,
1146 MTLPixelFormatRGBA16Unorm};
1150 MTLPixelFormat formats[] = {MTLPixelFormatR32Uint,
1151 MTLPixelFormatRG32Uint,
1152 MTLPixelFormatInvalid,
1153 MTLPixelFormatRGBA32Uint};
1157 MTLPixelFormat formats[] = {MTLPixelFormatR32Sint,
1158 MTLPixelFormatRG32Sint,
1159 MTLPixelFormatInvalid,
1160 MTLPixelFormatRGBA32Sint};
1164 MTLPixelFormat formats[] = {MTLPixelFormatR32Float,
1165 MTLPixelFormatRG32Float,
1166 MTLPixelFormatInvalid,
1167 MTLPixelFormatRGBA32Float};
1171 MTLPixelFormat formats[] = {MTLPixelFormatR16Float,
1172 MTLPixelFormatRG16Float,
1173 MTLPixelFormatInvalid,
1174 MTLPixelFormatRGBA16Float};
1182 assert(
format != MTLPixelFormatInvalid);
1184 id<MTLTexture> mtlTexture = nil;
1189 MTLTextureDescriptor *desc;
1191 desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:
format
1196 desc.storageMode = storage_mode;
1197 desc.usage = MTLTextureUsageShaderRead;
1199 desc.textureType = MTLTextureType3D;
1206 mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
1208 set_error(
"System is out of GPU memory");
1212 const size_t imageBytes = src_pitch * mem.
data_height;
1213 for (
size_t d = 0; d < mem.
data_depth; d++) {
1214 const size_t offset = d * imageBytes;
1218 withBytes:(
uint8_t *)mem.host_pointer + offset
1219 bytesPerRow:src_pitch
1223 else if (mem.data_height > 0) {
1225 MTLTextureDescriptor *desc;
1227 desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:
format
1232 desc.storageMode = storage_mode;
1233 desc.usage = MTLTextureUsageShaderRead;
1239 mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
1241 set_error(
"System is out of GPU memory");
1247 withBytes:mem.host_pointer
1248 bytesPerRow:src_pitch];
1252 tex_alloc_as_buffer(mem);
1260 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
1261 MetalMem *mmem =
new MetalMem;
1262 metal_mem_map[&mem] = std::unique_ptr<MetalMem>(mmem);
1264 mmem->mtlTexture = mtlTexture;
1268 if (slot >= texture_info.size()) {
1271 texture_info.resize(slot + 128);
1272 texture_slot_map.resize(slot + 128);
1274 ssize_t min_buffer_length =
sizeof(
void *) * texture_info.size();
1275 if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
1276 if (texture_bindings_2d) {
1277 delayed_free_list.push_back(buffer_bindings_1d);
1278 delayed_free_list.push_back(texture_bindings_2d);
1279 delayed_free_list.push_back(texture_bindings_3d);
1281 stats.
mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
1282 texture_bindings_3d.allocatedSize);
1284 buffer_bindings_1d = [mtlDevice newBufferWithLength:min_buffer_length
1285 options:default_storage_mode];
1286 texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
1287 options:default_storage_mode];
1288 texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
1289 options:default_storage_mode];
1291 stats.
mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
1292 texture_bindings_3d.allocatedSize);
1297 id<MTLCommandBuffer> commandBuffer = [mtlGeneralCommandQueue commandBuffer];
1298 id<MTLBlitCommandEncoder> blitCommandEncoder = [commandBuffer blitCommandEncoder];
1299 [blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture];
1300 [blitCommandEncoder endEncoding];
1301 [commandBuffer commit];
1304 texture_slot_map[slot] = mtlTexture;
1305 texture_info[slot] = mem.
info;
1306 need_texture_info =
true;
1308 texture_info[slot].
data =
uint64_t(slot) | (sampler_index << 32);
1310 if (max_working_set_exceeded()) {
1311 set_error(
"System is out of GPU memory");
1323 if (metal_mem_map.count(&mem)) {
1324 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
1325 MetalMem &mmem = *metal_mem_map.at(&mem);
1327 assert(texture_slot_map[mem.
slot] == mmem.mtlTexture);
1328 if (texture_slot_map[mem.
slot] == mmem.mtlTexture)
1329 texture_slot_map[mem.
slot] = nil;
1331 if (mmem.mtlTexture) {
1333 delayed_free_list.push_back(mmem.mtlTexture);
1334 mmem.mtlTexture = nil;
1336 erase_allocation(mem);
// Creates a new MetalDeviceQueue bound to this device and returns it as an owning
// unique_ptr<DeviceQueue>.
1340unique_ptr<DeviceQueue> MetalDevice::gpu_queue_create()
1342 return make_unique<MetalDeviceQueue>(
this);
1345bool MetalDevice::should_use_graphics_interop()
1353 return ((MetalMem *)
ptr)->mtlBuffer;
// Walks `delayed_free_list` under the metal_mem_map mutex and then empties the list.
// NOTE(review): the per-entry action inside the loop is not visible in this excerpt.
1356void MetalDevice::flush_delayed_free_list()
1361 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
1362 for (
auto &it : delayed_free_list) {
1365 delayed_free_list.clear();
1376 BVHMetal *bvh_metal =
static_cast<BVHMetal *
>(bvh);
1377 bvh_metal->motion_blur = motion_blur;
1378 if (bvh_metal->build(progress, mtlDevice, mtlGeneralCommandQueue,
refit)) {
1381 update_bvh(bvh_metal);
1385 if (max_working_set_exceeded()) {
1386 set_error(
"System is out of GPU memory");
// Drops the device's BVH resources: iterates and clears `unique_blas_array`, and releases
// `blas_buffer` and `accel_struct`.
// NOTE(review): the per-BLAS action in the loop, the guards around the releases and any stats
// accounting are not visible in this excerpt.
1391void MetalDevice::free_bvh()
1393 for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
1396 unique_blas_array.clear();
1399 [blas_buffer release];
1404 [accel_struct release];
// Adopts the acceleration structures built by `bvh_metal` and encodes its BLAS array into a
// freshly allocated argument buffer.
// NOTE(review): the declarations of `count` and `i`, the loop header over the BLAS array and
// the per-BLAS action in the range-for are not visible in this excerpt.
1409void MetalDevice::update_bvh(BVHMetal *bvh_metal)
// Take over the top-level structure and the unique BLAS list; the retain makes this device a
// co-owner of accel_struct.
1417 accel_struct = bvh_metal->accel_struct;
1418 unique_blas_array = bvh_metal->unique_blas_array;
1420 [accel_struct retain];
1421 for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
// One encoder-sized record per BLAS entry; the allocation is reported to `stats`.
1427 uint64_t buffer_size = mtlBlasArgEncoder.encodedLength *
count;
1428 blas_buffer = [mtlDevice newBufferWithLength:buffer_size
options:default_storage_mode];
1429 stats.
mem_alloc(blas_buffer.allocatedSize);
// Only non-null BLAS entries are encoded, each at its own encodedLength-strided offset.
1432 if (bvh_metal->blas_array[i]) {
1433 [mtlBlasArgEncoder setArgumentBuffer:blas_buffer offset:i * mtlBlasArgEncoder.encodedLength];
1434 [mtlBlasArgEncoder setAccelerationStructure:bvh_metal->blas_array[i] atIndex:0];
// With managed storage, flag the whole buffer as CPU-modified.
1437 if (default_storage_mode == MTLResourceStorageModeManaged) {
1438 [blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
void refit(btStridingMeshInterface *triangles, const btVector3 &aabbMin, const btVector3 &aabbMax)
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
KernelOptimizationLevel kernel_optimization_level
bool use_hardware_raytracing
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit)
virtual void set_error(const string &error)
void append(const uint8_t *data, int size)
void mem_free(size_t size)
void mem_alloc(size_t size)
bool is_resident(Device *sub_device) const
device_ptr device_pointer
void * host_alloc(size_t size)
static constexpr size_t datatype_size(DataType datatype)
CCL_NAMESPACE_BEGIN struct Options options
DebugFlags & DebugFlags()
#define CCL_NAMESPACE_END
@ KERNEL_OPTIMIZATION_LEVEL_OFF
@ KERNEL_OPTIMIZATION_LEVEL_FULL
@ KERNEL_OPTIMIZATION_LEVEL_INTERSECT
draw_view push_constant(Type::INT, "radiance_src") .push_constant(Type capture_info_buf storage_buf(1, Qualifier::READ, "ObjectBounds", "bounds_buf[]") .push_constant(Type draw_view int
static const char * to_string(const Interpolation &interp)
#define KERNEL_FEATURE_OBJECT_MOTION
static void error(const char *str)
static void copy(bNodeTree *dest_ntree, bNode *dest_node, const bNode *src_node)
string path_cache_get(const string &sub)
string path_source_replace_includes(const string &source, const string &path)
string path_get(const string &sub)
bool path_write_text(const string &path, string &text)
unsigned __int64 uint64_t
string string_human_readable_size(size_t size)
string string_human_readable_number(size_t num)
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
std::unique_lock< std::mutex > thread_scoped_lock
CCL_NAMESPACE_BEGIN typedef std::mutex thread_mutex
CCL_NAMESPACE_BEGIN double time_dt()
@ IMAGE_DATA_TYPE_NANOVDB_FP16
@ IMAGE_DATA_TYPE_NANOVDB_FLOAT
@ IMAGE_DATA_TYPE_NANOVDB_FLOAT3
@ IMAGE_DATA_TYPE_NANOVDB_FPN
ccl_device_inline size_t round_up(size_t x, size_t multiple)