22# include <TargetConditionals.h>
23# include <crt_externs.h>
/* Registry of live MetalDevice instances keyed by their integer device ID.
 * Entries are inserted in the constructor (device_id = existing_devices_counter++)
 * and erased in MetalDevice::cancel(); get_device_by_ID() looks devices up here so
 * that background compilation blocks can detect that a device was destroyed.
 * NOTE(review): accesses appear to be guarded by a lock passed into
 * get_device_by_ID() — the lock's declaration is outside this excerpt, confirm. */
std::map<int, MetalDevice *> MetalDevice::active_device_ids;
34MetalDevice *MetalDevice::get_device_by_ID(
const int ID,
37 auto it = active_device_ids.find(
ID);
38 if (it != active_device_ids.end()) {
44bool MetalDevice::is_device_cancelled(
const int ID)
47 return get_device_by_ID(
ID,
lock) ==
nullptr;
55void MetalDevice::set_error(
const string &
error)
57 static std::mutex s_error_mutex;
58 std::lock_guard<std::mutex>
lock(s_error_mutex);
63 fprintf(stderr,
"\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
65 "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
78 static int existing_devices_counter = 1;
79 device_id = existing_devices_counter++;
80 active_device_ids[device_id] =
this;
86 auto usable_devices = MetalInfo::get_usable_devices();
87 assert(mtlDevId < usable_devices.size());
88 mtlDevice = usable_devices[mtlDevId];
89 metal_printf(
"Creating new Cycles Metal device: %s\n", info.
description.c_str());
93 if (@available(macOS 13.3, *)) {
94 [mtlDevice setShouldMaximizeConcurrentCompilation:YES];
97 max_threads_per_threadgroup = 512;
100 if (
auto *metalrt = getenv(
"CYCLES_METALRT")) {
101 use_metalrt = (atoi(metalrt) != 0);
104# if defined(MAC_OS_VERSION_15_0)
107 if (use_metalrt && [mtlDevice supportsFamily:MTLGPUFamilyApple9]) {
109 if (@available(macos 15.6, *)) {
115 if (getenv(
"CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
116 capture_enabled =
true;
122 if (
auto str = getenv(
"CYCLES_METAL_PROFILING")) {
123 if (atoi(
str) && [mtlDevice supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary])
125 NSArray<id<MTLCounterSet>> *counterSets = [mtlDevice counterSets];
127 NSError *
error = nil;
128 MTLCounterSampleBufferDescriptor *desc = [[MTLCounterSampleBufferDescriptor alloc]
init];
129 [desc setStorageMode:MTLStorageModeShared];
130 [desc setLabel:
@"CounterSampleBuffer"];
131 [desc setSampleCount:MAX_SAMPLE_BUFFER_LENGTH];
132 [desc setCounterSet:counterSets[0]];
133 mtlCounterSampleBuffer = [mtlDevice newCounterSampleBufferWithDescriptor:desc
135 [mtlCounterSampleBuffer retain];
142 kernel_specialization_level = PSO_GENERIC;
146 kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
149 kernel_specialization_level = PSO_SPECIALIZED_SHADE;
153 if (
auto *envstr = getenv(
"CYCLES_METAL_SPECIALIZATION_LEVEL")) {
154 kernel_specialization_level = (MetalPipelineType)atoi(envstr);
156 metal_printf(
"kernel_specialization_level = %s\n",
157 kernel_type_as_string(
158 (MetalPipelineType)
min((
int)kernel_specialization_level, (
int)PSO_NUM - 1)));
160 MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc]
init];
161 arg_desc_params.dataType = MTLDataTypePointer;
162 arg_desc_params.access = MTLArgumentAccessReadOnly;
164 mtlBufferKernelParamsEncoder = [mtlDevice
165 newArgumentEncoderWithArguments:@[ arg_desc_params ]];
167 MTLArgumentDescriptor *arg_desc_texture = [[MTLArgumentDescriptor alloc]
init];
168 arg_desc_texture.dataType = MTLDataTypeTexture;
169 arg_desc_texture.access = MTLArgumentAccessReadOnly;
170 mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
171 MTLArgumentDescriptor *arg_desc_buffer = [[MTLArgumentDescriptor alloc]
init];
172 arg_desc_buffer.dataType = MTLDataTypePointer;
173 arg_desc_buffer.access = MTLArgumentAccessReadOnly;
174 mtlBufferArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_buffer ]];
176 buffer_bindings_1d = [mtlDevice newBufferWithLength:8192
options:MTLResourceStorageModeShared];
177 texture_bindings_2d = [mtlDevice newBufferWithLength:8192
178 options:MTLResourceStorageModeShared];
179 texture_bindings_3d = [mtlDevice newBufferWithLength:8192
180 options:MTLResourceStorageModeShared];
181 stats.
mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
182 texture_bindings_3d.allocatedSize);
188 mtlComputeCommandQueue = [mtlDevice newCommandQueue];
191 mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
194 if (@available(macos 12.0, *)) {
196 MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc]
init];
197 arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
198 arg_desc_as.access = MTLArgumentAccessReadOnly;
199 mtlASArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_as ]];
200 [arg_desc_as release];
206 NSMutableArray *ancillary_desc = [[NSMutableArray alloc]
init];
209 MTLArgumentDescriptor *arg_desc_tex = [[MTLArgumentDescriptor alloc]
init];
210 arg_desc_tex.dataType = MTLDataTypePointer;
211 arg_desc_tex.access = MTLArgumentAccessReadOnly;
213 arg_desc_tex.index = index++;
214 [ancillary_desc addObject:[arg_desc_tex
copy]];
215 arg_desc_tex.index = index++;
216 [ancillary_desc addObject:[arg_desc_tex
copy]];
217 arg_desc_tex.index = index++;
218 [ancillary_desc addObject:[arg_desc_tex
copy]];
220 [arg_desc_tex release];
222 if (@available(macos 12.0, *)) {
224 MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc]
init];
225 arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
226 arg_desc_as.access = MTLArgumentAccessReadOnly;
228 MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc]
init];
229 arg_desc_ptrs.dataType = MTLDataTypePointer;
230 arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
232 MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc]
init];
233 arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
234 arg_desc_ift.access = MTLArgumentAccessReadOnly;
236 arg_desc_as.index = index++;
237 [ancillary_desc addObject:[arg_desc_as
copy]];
240 arg_desc_ift.index = index++;
241 [ancillary_desc addObject:[arg_desc_ift
copy]];
242 arg_desc_ift.index = index++;
243 [ancillary_desc addObject:[arg_desc_ift
copy]];
244 arg_desc_ift.index = index++;
245 [ancillary_desc addObject:[arg_desc_ift
copy]];
246 arg_desc_ift.index = index++;
247 [ancillary_desc addObject:[arg_desc_ift
copy]];
248 arg_desc_ift.index = index++;
249 [ancillary_desc addObject:[arg_desc_ift
copy]];
250 arg_desc_ift.index = index++;
251 [ancillary_desc addObject:[arg_desc_ift
copy]];
252 arg_desc_ift.index = index++;
253 [ancillary_desc addObject:[arg_desc_ift
copy]];
254 arg_desc_ift.index = index++;
255 [ancillary_desc addObject:[arg_desc_ift
copy]];
257 arg_desc_ptrs.index = index++;
258 [ancillary_desc addObject:[arg_desc_ptrs
copy]];
260 [arg_desc_ift release];
261 [arg_desc_as release];
262 [arg_desc_ptrs release];
266 mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
271 MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc]
init];
272 arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
273 arg_desc_blas.access = MTLArgumentAccessReadOnly;
274 mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
275 [arg_desc_blas release];
278 for (
int i = 0;
i < ancillary_desc.count;
i++) {
279 [ancillary_desc[
i] release];
281 [ancillary_desc release];
283 [arg_desc_params release];
284 [arg_desc_texture release];
288MetalDevice::~MetalDevice()
297 int num_resources = texture_info.size();
298 for (
int res = 0; res < num_resources; res++) {
299 if (is_texture(texture_info[res])) {
300 [texture_slot_map[res] release];
301 texture_slot_map[res] = nil;
306 flush_delayed_free_list();
308 if (texture_bindings_2d) {
309 stats.
mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
310 texture_bindings_3d.allocatedSize);
311 [buffer_bindings_1d release];
312 [texture_bindings_2d release];
313 [texture_bindings_3d release];
315 [mtlTextureArgEncoder release];
316 [mtlBufferKernelParamsEncoder release];
317 [mtlBufferArgEncoder release];
318 [mtlASArgEncoder release];
319 [mtlAncillaryArgEncoder release];
320 [mtlComputeCommandQueue release];
321 [mtlGeneralCommandQueue release];
322 if (mtlCounterSampleBuffer) {
323 [mtlCounterSampleBuffer release];
330bool MetalDevice::support_device(
const uint )
335bool MetalDevice::check_peer_access(
Device * )
342bool MetalDevice::use_adaptive_compilation()
347bool MetalDevice::use_local_atomic_sort()
const
352string MetalDevice::preprocess_source(MetalPipelineType pso_type,
353 const uint kernel_features,
356 string global_defines;
357 if (use_adaptive_compilation()) {
358 global_defines +=
"#define __KERNEL_FEATURES__ " +
to_string(kernel_features) +
"\n";
361 if (use_local_atomic_sort()) {
362 global_defines +=
"#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
366 global_defines +=
"#define __METALRT__\n";
368 global_defines +=
"#define __METALRT_MOTION__\n";
372# ifdef WITH_CYCLES_DEBUG
373 global_defines +=
"#define WITH_CYCLES_DEBUG\n";
376 global_defines +=
"#define __KERNEL_METAL_APPLE__\n";
377 if (@available(macos 14.0, *)) {
379 global_defines +=
"#define __METAL_GLOBAL_BUILTINS__\n";
384 if ((pso_type == PSO_GENERIC || using_nanovdb) &&
DebugFlags().metal.use_nanovdb) {
385 global_defines +=
"#define WITH_NANOVDB\n";
389 NSProcessInfo *processInfo = [NSProcessInfo processInfo];
390 NSOperatingSystemVersion macos_ver = [processInfo operatingSystemVersion];
391 global_defines +=
"#define __KERNEL_METAL_MACOS__ " +
to_string(macos_ver.majorVersion) +
"\n";
394 global_defines +=
"#define __KERNEL_METAL_TARGET_CPU_ARM64__\n";
401 if (pso_type != PSO_GENERIC) {
403 const double starttime =
time_dt();
405# define KERNEL_STRUCT_BEGIN(name, parent) \
406 string_replace_same_length(*source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
408 bool next_member_is_specialized =
true;
410# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;
412# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
413 if (!next_member_is_specialized) { \
415 *source, "kernel_data_" #parent "_" #name, "kernel_data." #parent ".__unused_" #name); \
416 next_member_is_specialized = true; \
421# undef KERNEL_STRUCT_MEMBER
422# undef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE
423# undef KERNEL_STRUCT_BEGIN
425 metal_printf(
"KernelData patching took %.1f ms\n", (
time_dt() - starttime) * 1000.0);
433 global_defines +=
"#define __KERNEL_USE_DATA_CONSTANTS__\n";
437 *source = global_defines + *source;
441 md5.
append(global_defines);
445void MetalDevice::make_source(MetalPipelineType pso_type,
const uint kernel_features)
447 string &source = this->source[pso_type];
448 source =
"\n#include \"kernel/device/metal/kernel.metal\"\n";
455 global_defines_md5[pso_type] = preprocess_source(pso_type, kernel_features, &source);
458bool MetalDevice::load_kernels(
const uint _kernel_features)
461 kernel_features |= _kernel_features;
464 if (!support_device(kernel_features)) {
475 refresh_source_and_kernels_md5(PSO_GENERIC);
476 if (MetalDeviceKernels::should_load_kernels(
this, PSO_GENERIC)) {
479 int this_device_id = this->device_id;
480 auto compile_kernels_fn = ^() {
481 compile_and_load(this_device_id, PSO_GENERIC);
484 dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
491void MetalDevice::refresh_source_and_kernels_md5(MetalPipelineType pso_type)
493 string defines_md5 = preprocess_source(pso_type, kernel_features);
496 if (global_defines_md5[pso_type] != defines_md5) {
497 make_source(pso_type, kernel_features);
500 string constant_values;
501 if (pso_type != PSO_GENERIC) {
502 bool next_member_is_specialized =
true;
504# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;
508# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
509 if (next_member_is_specialized) { \
510 constant_values += string(#parent "." #name "=") + \
511 to_string(_type(launch_params.data.parent.name)) + "\n"; \
514 next_member_is_specialized = true; \
519# undef KERNEL_STRUCT_MEMBER
520# undef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE
524 md5.
append(constant_values);
525 md5.
append(source[pso_type]);
529 kernels_md5[pso_type] = md5.
get_hex();
532void MetalDevice::compile_and_load(
const int device_id, MetalPipelineType pso_type)
538 id<MTLDevice> mtlDevice;
546 MetalDevice *instance = get_device_by_ID(device_id,
lock);
548 metal_printf(
"Ignoring %s compilation request - device no longer exists\n",
549 kernel_type_as_string(pso_type));
553 if (!MetalDeviceKernels::should_load_kernels(instance, pso_type)) {
556 metal_printf(
"Ignoreing %s compilation request - kernels already requested\n",
557 kernel_type_as_string(pso_type));
561 mtlDevice = instance->mtlDevice;
562 source = instance->source[pso_type];
568 MTLCompileOptions *
options = [[MTLCompileOptions alloc]
init];
571 if (@available(macos 12.0, *)) {
572 options.languageVersion = MTLLanguageVersion2_4;
574# if defined(MAC_OS_VERSION_13_0)
575 if (@available(macos 13.0, *)) {
576 options.languageVersion = MTLLanguageVersion3_0;
579# if defined(MAC_OS_VERSION_14_0)
580 if (@available(macos 14.0, *)) {
581 options.languageVersion = MTLLanguageVersion3_1;
585 if (getenv(
"CYCLES_METAL_PROFILING") || getenv(
"CYCLES_METAL_DEBUG")) {
592 NSError *
error =
nullptr;
593 id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
597 metal_printf(
"Front-end compilation finished in %.1f seconds (%s)\n",
599 kernel_type_as_string(pso_type));
603 bool blocking_pso_build = (getenv(
"CYCLES_METAL_PROFILING") ||
604 MetalDeviceKernels::is_benchmark_warmup());
605 if (blocking_pso_build) {
606 MetalDeviceKernels::wait_for_all();
614 if (MetalDevice *instance = get_device_by_ID(device_id,
lock)) {
618 << [[
error localizedDescription] UTF8String];
621 instance->mtlLibrary[pso_type] = mtlLibrary;
624 MetalDeviceKernels::load(instance, pso_type);
627 NSString *err = [
error localizedDescription];
628 instance->set_error(
string_printf(
"Failed to compile library:\n%s", [err UTF8String]));
633 if (starttime && blocking_pso_build) {
634 MetalDeviceKernels::wait_for_all();
636 metal_printf(
"Back-end compilation finished in %.1f seconds (%s)\n",
638 kernel_type_as_string(pso_type));
643bool MetalDevice::is_texture(
const TextureInfo &tex)
648void MetalDevice::load_texture_info()
650 if (need_texture_info) {
652 need_texture_info =
false;
653 texture_info.copy_to_device();
655 int num_textures = texture_info.size();
657 for (
int tex = 0; tex < num_textures; tex++) {
658 uint64_t offset = tex *
sizeof(
void *);
659 if (is_texture(texture_info[tex]) && texture_slot_map[tex]) {
660 id<MTLTexture> metal_texture = texture_slot_map[tex];
661 MTLTextureType type = metal_texture.textureType;
662 [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
663 [mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
664 [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
665 [mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
668 [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
669 [mtlTextureArgEncoder setTexture:nil atIndex:0];
670 [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
671 [mtlTextureArgEncoder setTexture:nil atIndex:0];
683 auto it = metal_mem_map.find(&mem);
684 if (it != metal_mem_map.end()) {
685 MetalMem *mmem = it->second.get();
688 if (mmem->pointer_index >= 0) {
690 pointers[mmem->pointer_index] = 0;
692 metal_mem_map.erase(it);
696bool MetalDevice::max_working_set_exceeded(
const size_t safety_margin)
const
700 size_t available = [mtlDevice recommendedMaxWorkingSetSize] - safety_margin;
701 return (stats.
mem_used > available);
704MetalDevice::MetalMem *MetalDevice::generic_alloc(
device_memory &mem)
711 id<MTLBuffer> metal_buffer = nil;
712 MTLResourceOptions
options = MTLResourceStorageModeShared;
716 options = MTLResourceStorageModePrivate;
722 set_error(
"System is out of GPU memory");
736 metal_buffer.label = [NSString stringWithFormat:
@"%s", mem.
name];
738 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
740 assert(metal_mem_map.count(&mem) == 0);
744 mmem->mtlBuffer = metal_buffer;
747 if (
options != MTLResourceStorageModePrivate) {
748 mmem->hostPtr = [metal_buffer contents];
751 mmem->hostPtr =
nullptr;
758 if (metal_buffer.storageMode == MTLStorageModeShared) {
770 MetalMem *mmem_ptr = mmem.get();
771 metal_mem_map[&mem] = std::move(mmem);
773 if (max_working_set_exceeded()) {
774 set_error(
"System is out of GPU memory");
797 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
798 MetalMem &mmem = *metal_mem_map.at(&mem);
799 size_t size = mmem.size;
801 bool free_mtlBuffer =
true;
807 free_mtlBuffer =
false;
811 if (free_mtlBuffer) {
816 assert(!
"Metal device should not copy memory back to host");
824 delayed_free_list.push_back(mmem.mtlBuffer);
825 mmem.mtlBuffer = nil;
828 erase_allocation(mem);
834 assert(!
"mem_alloc not supported for textures.");
855 generic_copy_to(mem);
860 generic_copy_to(mem);
866 generic_copy_to(mem);
874 assert(!
"Metal does not support mem_move_to_host");
877void MetalDevice::mem_copy_from(
device_memory &,
const size_t,
size_t,
const size_t,
size_t)
913void MetalDevice::cancel()
919 active_device_ids.erase(device_id);
924bool MetalDevice::is_ready(
string &status)
const
926 if (!error_msg.empty()) {
931 int num_loaded = MetalDeviceKernels::get_loaded_kernel_count(
this, PSO_GENERIC);
933 status =
string_printf(
"%d / %d render kernels loaded (may take a few minutes the first time)",
939 if (
int num_requests = MetalDeviceKernels::num_incomplete_specialization_requests()) {
940 status =
string_printf(
"%d kernels to optimize", num_requests);
942 else if (kernel_specialization_level == PSO_SPECIALIZED_INTERSECT) {
943 status =
"Using optimized intersection kernels";
945 else if (kernel_specialization_level == PSO_SPECIALIZED_SHADE) {
946 status =
"Using optimized kernels";
949 metal_printf(
"MetalDevice::is_ready(...) --> true\n");
953void MetalDevice::optimize_for_scene(
Scene *scene)
955 MetalPipelineType specialization_level = kernel_specialization_level;
959 specialization_level = (MetalPipelineType)
min(specialization_level, PSO_SPECIALIZED_INTERSECT);
964 int this_device_id = this->device_id;
965 auto specialize_kernels_fn = ^() {
966 for (
int level = 1; level <= int(specialization_level); level++) {
967 compile_and_load(this_device_id, MetalPipelineType(level));
972 bool specialize_in_background =
true;
975 if (getenv(
"CYCLES_METAL_PROFILING") !=
nullptr) {
976 specialize_in_background =
false;
980 if (MetalDeviceKernels::is_benchmark_warmup()) {
981 specialize_in_background =
false;
984 if (specialize_in_background) {
985 if (MetalDeviceKernels::num_incomplete_specialization_requests() == 0) {
986 dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
987 specialize_kernels_fn);
990 metal_printf(
"\"optimize_for_scene\" request already in flight - dropping request\n");
994 specialize_kernels_fn();
998void MetalDevice::const_copy_to(
const char *name,
void *host,
const size_t size)
1000 if (strcmp(name,
"data") == 0) {
1002 memcpy((uint8_t *)&launch_params.data, host,
sizeof(KernelData));
1005 for (
int level = 1; level <= int(kernel_specialization_level); level++) {
1006 refresh_source_and_kernels_md5(MetalPipelineType(level));
1011 auto update_launch_pointers =
1012 [&](
size_t offset,
void *
data,
const size_t data_size,
const size_t pointers_size) {
1013 memcpy((uint8_t *)&launch_params + offset,
data, data_size);
1015 MetalMem **mmem = (MetalMem **)
data;
1016 int pointer_count = pointers_size /
sizeof(
device_ptr);
1017 int pointer_index = offset /
sizeof(
device_ptr);
1018 for (
int i = 0;
i < pointer_count;
i++) {
1020 mmem[
i]->pointer_index = pointer_index +
i;
1026 if (strcmp(name,
"integrator_state") == 0) {
1029 update_launch_pointers(
1032# define KERNEL_DATA_ARRAY(data_type, tex_name) \
1033 else if (strcmp(name, #tex_name) == 0) { \
1034 update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
1037# undef KERNEL_DATA_ARRAY
1044 generic_copy_to(mem);
1059 MetalDevice::MetalMem *mmem = generic_alloc(mem);
1060 generic_copy_to(mem);
1064 if (slot >= texture_info.size()) {
1067 texture_info.resize(
round_up(slot + 1, 128));
1068 texture_slot_map.resize(
round_up(slot + 1, 128));
1071 texture_info[slot] = mem.
info;
1072 uint64_t offset = slot *
sizeof(
void *);
1073 [mtlBufferArgEncoder setArgumentBuffer:buffer_bindings_1d offset:offset];
1074 [mtlBufferArgEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
1075 texture_info[slot].data = *(
uint64_t *)((
uint64_t)buffer_bindings_1d.contents + offset);
1076 texture_slot_map[slot] = nil;
1077 need_texture_info =
true;
1084 using_nanovdb =
true;
1097 "Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
1117 MTLPixelFormat formats[] = {MTLPixelFormatR8Unorm,
1118 MTLPixelFormatRG8Unorm,
1119 MTLPixelFormatInvalid,
1120 MTLPixelFormatRGBA8Unorm};
1124 MTLPixelFormat formats[] = {MTLPixelFormatR16Unorm,
1125 MTLPixelFormatRG16Unorm,
1126 MTLPixelFormatInvalid,
1127 MTLPixelFormatRGBA16Unorm};
1131 MTLPixelFormat formats[] = {MTLPixelFormatR32Uint,
1132 MTLPixelFormatRG32Uint,
1133 MTLPixelFormatInvalid,
1134 MTLPixelFormatRGBA32Uint};
1138 MTLPixelFormat formats[] = {MTLPixelFormatR32Sint,
1139 MTLPixelFormatRG32Sint,
1140 MTLPixelFormatInvalid,
1141 MTLPixelFormatRGBA32Sint};
1145 MTLPixelFormat formats[] = {MTLPixelFormatR32Float,
1146 MTLPixelFormatRG32Float,
1147 MTLPixelFormatInvalid,
1148 MTLPixelFormatRGBA32Float};
1152 MTLPixelFormat formats[] = {MTLPixelFormatR16Float,
1153 MTLPixelFormatRG16Float,
1154 MTLPixelFormatInvalid,
1155 MTLPixelFormatRGBA16Float};
1165 id<MTLTexture> mtlTexture = nil;
1170 MTLTextureDescriptor *desc;
1172 desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:
format
1177 desc.storageMode = MTLStorageModeShared;
1178 desc.usage = MTLTextureUsageShaderRead;
1180 desc.textureType = MTLTextureType3D;
1187 mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
1189 set_error(
"System is out of GPU memory");
1193 const size_t imageBytes = src_pitch * mem.
data_height;
1194 for (
size_t d = 0; d < mem.
data_depth; d++) {
1195 const size_t offset = d * imageBytes;
1199 withBytes:(uint8_t *)mem.host_pointer + offset
1200 bytesPerRow:src_pitch
1204 else
if (mem.data_height > 0) {
1206 MTLTextureDescriptor *desc;
1208 desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:
format
1213 desc.storageMode = MTLStorageModeShared;
1214 desc.usage = MTLTextureUsageShaderRead;
1220 mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
1222 set_error(
"System is out of GPU memory");
1228 withBytes:mem.host_pointer
1229 bytesPerRow:src_pitch];
1233 tex_alloc_as_buffer(mem);
1241 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
1244 mmem->mtlTexture = mtlTexture;
1245 metal_mem_map[&mem] = std::move(mmem);
1249 if (slot >= texture_info.size()) {
1252 texture_info.resize(slot + 128);
1253 texture_slot_map.resize(slot + 128);
1255 ssize_t min_buffer_length =
sizeof(
void *) * texture_info.size();
1256 if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
1257 if (texture_bindings_2d) {
1258 delayed_free_list.push_back(buffer_bindings_1d);
1259 delayed_free_list.push_back(texture_bindings_2d);
1260 delayed_free_list.push_back(texture_bindings_3d);
1262 stats.
mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
1263 texture_bindings_3d.allocatedSize);
1265 buffer_bindings_1d = [mtlDevice newBufferWithLength:min_buffer_length
1266 options:MTLResourceStorageModeShared];
1267 texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
1268 options:MTLResourceStorageModeShared];
1269 texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
1270 options:MTLResourceStorageModeShared];
1272 stats.
mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
1273 texture_bindings_3d.allocatedSize);
1278 id<MTLCommandBuffer> commandBuffer = [mtlGeneralCommandQueue commandBuffer];
1279 id<MTLBlitCommandEncoder> blitCommandEncoder = [commandBuffer blitCommandEncoder];
1280 [blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture];
1281 [blitCommandEncoder endEncoding];
1282 [commandBuffer commit];
1285 texture_slot_map[slot] = mtlTexture;
1286 texture_info[slot] = mem.
info;
1287 need_texture_info =
true;
1289 texture_info[slot].
data =
uint64_t(slot) | (sampler_index << 32);
1291 if (max_working_set_exceeded()) {
1292 set_error(
"System is out of GPU memory");
1303 id<MTLTexture> mtlTexture;
1305 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
1306 mtlTexture = metal_mem_map.at(&mem)->mtlTexture;
1308 const size_t imageBytes = src_pitch * mem.
data_height;
1309 for (
size_t d = 0; d < mem.
data_depth; d++) {
1310 const size_t offset = d * imageBytes;
1314 withBytes:(uint8_t *)mem.host_pointer + offset
1315 bytesPerRow:src_pitch
1319 else
if (mem.data_height > 0) {
1320 id<MTLTexture> mtlTexture;
1322 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
1323 mtlTexture = metal_mem_map.at(&mem)->mtlTexture;
1327 withBytes:mem.host_pointer
1328 bytesPerRow:src_pitch];
1331 generic_copy_to(mem);
1343 if (metal_mem_map.count(&mem)) {
1344 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
1345 MetalMem &mmem = *metal_mem_map.at(&mem);
1347 assert(texture_slot_map[mem.
slot] == mmem.mtlTexture);
1348 if (texture_slot_map[mem.
slot] == mmem.mtlTexture) {
1349 texture_slot_map[mem.
slot] = nil;
1352 if (mmem.mtlTexture) {
1354 delayed_free_list.push_back(mmem.mtlTexture);
1355 mmem.mtlTexture = nil;
1357 erase_allocation(mem);
1363 return make_unique<MetalDeviceQueue>(
this);
1375 return ((MetalMem *)
ptr)->mtlBuffer;
1378void MetalDevice::flush_delayed_free_list()
1383 std::lock_guard<std::recursive_mutex>
lock(metal_mem_map_mutex);
1384 for (
auto &it : delayed_free_list) {
1387 delayed_free_list.clear();
1398 BVHMetal *bvh_metal =
static_cast<BVHMetal *
>(bvh);
1399 bvh_metal->motion_blur = motion_blur;
1400 bvh_metal->use_pcmi = use_pcmi;
1401 if (bvh_metal->build(
progress, mtlDevice, mtlGeneralCommandQueue,
refit)) {
1404 update_bvh(bvh_metal);
1408 if (max_working_set_exceeded()) {
1409 set_error(
"System is out of GPU memory");
1414void MetalDevice::free_bvh()
1416 for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
1419 unique_blas_array.clear();
1422 [blas_buffer release];
1427 [accel_struct release];
1432void MetalDevice::update_bvh(BVHMetal *bvh_metal)
1440 accel_struct = bvh_metal->accel_struct;
1441 unique_blas_array = bvh_metal->unique_blas_array;
1443 [accel_struct retain];
1444 for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
1450 uint64_t buffer_size = mtlBlasArgEncoder.encodedLength *
count;
1451 blas_buffer = [mtlDevice newBufferWithLength:buffer_size
options:MTLResourceStorageModeShared];
1452 stats.
mem_alloc(blas_buffer.allocatedSize);
1455 if (bvh_metal->blas_array[
i]) {
1456 [mtlBlasArgEncoder setArgumentBuffer:blas_buffer offset:
i * mtlBlasArgEncoder.encodedLength];
1457 [mtlBlasArgEncoder setAccelerationStructure:bvh_metal->blas_array[
i] atIndex:0];
BMesh const char void * data
unsigned long long int uint64_t
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
void refit(btStridingMeshInterface *triangles, const btVector3 &aabbMin, const btVector3 &aabbMax)
KernelOptimizationLevel kernel_optimization_level
bool use_hardware_raytracing
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit)
virtual void set_error(const string &error)
void append(const uint8_t *data, const int nbytes)
void mem_alloc(const size_t size)
void mem_free(const size_t size)
bool is_resident(Device *sub_device) const
void * host_alloc(const size_t size)
device_ptr device_pointer
static constexpr size_t datatype_size(DataType datatype)
CCL_NAMESPACE_BEGIN struct Options options
DebugFlags & DebugFlags()
#define KERNEL_FEATURE_OBJECT_MOTION
#define CCL_NAMESPACE_END
@ KERNEL_OPTIMIZATION_LEVEL_OFF
@ KERNEL_OPTIMIZATION_LEVEL_FULL
@ KERNEL_OPTIMIZATION_LEVEL_INTERSECT
static const char * to_string(const Interpolation &interp)
#define assert(assertion)
static void error(const char *str)
static void init(bNodeTree *, bNode *node)
static void copy(bNodeTree *dest_ntree, bNode *dest_node, const bNode *src_node)
string path_cache_get(const string &sub)
string path_source_replace_includes(const string &source, const string &path)
string path_get(const string &sub)
bool path_write_text(const string &path, string &text)
string string_human_readable_size(size_t size)
string string_human_readable_number(size_t num)
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
std::unique_lock< std::mutex > thread_scoped_lock
CCL_NAMESPACE_BEGIN double time_dt()
ccl_device_inline size_t round_up(const size_t x, const size_t multiple)
@ IMAGE_DATA_TYPE_NANOVDB_FP16
@ IMAGE_DATA_TYPE_NANOVDB_FLOAT
@ IMAGE_DATA_TYPE_NANOVDB_FLOAT3
@ IMAGE_DATA_TYPE_NANOVDB_FPN