Blender V4.3
device_impl.mm
1/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#ifdef WITH_METAL
6
7# include "device/metal/device_impl.h"
8# include "device/metal/device.h"
9
10# include "scene/scene.h"
11
12# include "util/debug.h"
13# include "util/md5.h"
14# include "util/path.h"
15# include "util/time.h"
16
17# include <TargetConditionals.h>
18# include <crt_externs.h>
19
20CCL_NAMESPACE_BEGIN
21
22class MetalDevice;
23
24thread_mutex MetalDevice::existing_devices_mutex;
25std::map<int, MetalDevice *> MetalDevice::active_device_ids;
26
27/* Thread-safe device access for async work. Calling code must pass an appropriately scoped lock
28 * to existing_devices_mutex to safeguard against destruction of the returned instance. */
29MetalDevice *MetalDevice::get_device_by_ID(int ID,
30 thread_scoped_lock & /*existing_devices_mutex_lock*/)
31{
32 auto it = active_device_ids.find(ID);
33 if (it != active_device_ids.end()) {
34 return it->second;
35 }
36 return nullptr;
37}
38
39bool MetalDevice::is_device_cancelled(int ID)
40{
41 thread_scoped_lock lock(existing_devices_mutex);
42 return get_device_by_ID(ID, lock) == nullptr;
43}
44
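/* Report the BVH layouts this device can traverse: the hardware MetalRT layout when ray tracing is enabled, otherwise Cycles' software BVH2. */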
45BVHLayoutMask MetalDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
46{
47 return use_metalrt ? BVH_LAYOUT_METAL : BVH_LAYOUT_BVH2;
48}
49
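/* Record a device error; the first error also prints a link to the Cycles GPU rendering documentation on stderr. */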
50void MetalDevice::set_error(const string &error)
51{
52 static std::mutex s_error_mutex;
53 std::lock_guard<std::mutex> lock(s_error_mutex);
54
55 Device::set_error(error);
56
57 if (!has_error) {
58 fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
59 fprintf(stderr,
60 "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
61 has_error = true;
62 }
63}
64
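/* Construction registers the device in active_device_ids, picks the MTLDevice and default
 * storage mode, applies the CYCLES_METALRT / CYCLES_METAL_SPECIALIZATION_LEVEL / capture
 * environment overrides, and builds the argument encoders, binding buffers and command queues
 * shared by every queue spawned from this device. */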
65MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
66 : Device(info, stats, profiler, headless), texture_info(this, "texture_info", MEM_GLOBAL)
67{
68 @autoreleasepool {
69 {
70 /* Assign an ID for this device which we can use to query whether async shader compilation
71 * requests are still relevant. */
72 thread_scoped_lock lock(existing_devices_mutex);
73 static int existing_devices_counter = 1;
74 device_id = existing_devices_counter++;
75 active_device_ids[device_id] = this;
76 }
77
78 mtlDevId = info.num;
79
80 /* select chosen device */
81 auto usable_devices = MetalInfo::get_usable_devices();
82 assert(mtlDevId < usable_devices.size());
83 mtlDevice = usable_devices[mtlDevId];
84 metal_printf("Creating new Cycles Metal device: %s\n", info.description.c_str());
85
86 /* determine default storage mode based on whether UMA is supported */
87
88 default_storage_mode = MTLResourceStorageModeManaged;
89
90 /* We only support Apple Silicon which hasUnifiedMemory support. But leave this check here
91 * just in case a future GPU comes out that doesn't. */
92 if ([mtlDevice hasUnifiedMemory]) {
93 default_storage_mode = MTLResourceStorageModeShared;
94 }
95
96 max_threads_per_threadgroup = 512;
97
98 use_metalrt = info.use_hardware_raytracing;
99 if (auto metalrt = getenv("CYCLES_METALRT")) {
100 use_metalrt = (atoi(metalrt) != 0);
101 }
102
103 if (getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
104 capture_enabled = true;
105 }
106
107 /* Set kernel_specialization_level based on user preferences. */
108 switch (info.kernel_optimization_level) {
109 case KERNEL_OPTIMIZATION_LEVEL_OFF:
110 kernel_specialization_level = PSO_GENERIC;
111 break;
112 default:
113 case KERNEL_OPTIMIZATION_LEVEL_INTERSECT:
114 kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
115 break;
116 case KERNEL_OPTIMIZATION_LEVEL_FULL:
117 kernel_specialization_level = PSO_SPECIALIZED_SHADE;
118 break;
119 }
120
121 if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
122 kernel_specialization_level = (MetalPipelineType)atoi(envstr);
123 }
124 metal_printf("kernel_specialization_level = %s\n",
125 kernel_type_as_string(
126 (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));
127
128 MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
129 arg_desc_params.dataType = MTLDataTypePointer;
130 arg_desc_params.access = MTLArgumentAccessReadOnly;
131 arg_desc_params.arrayLength = sizeof(KernelParamsMetal) / sizeof(device_ptr);
132 mtlBufferKernelParamsEncoder = [mtlDevice
133 newArgumentEncoderWithArguments:@[ arg_desc_params ]];
134
135 MTLArgumentDescriptor *arg_desc_texture = [[MTLArgumentDescriptor alloc] init];
136 arg_desc_texture.dataType = MTLDataTypeTexture;
137 arg_desc_texture.access = MTLArgumentAccessReadOnly;
138 mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
139 MTLArgumentDescriptor *arg_desc_buffer = [[MTLArgumentDescriptor alloc] init];
140 arg_desc_buffer.dataType = MTLDataTypePointer;
141 arg_desc_buffer.access = MTLArgumentAccessReadOnly;
142 mtlBufferArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_buffer ]];
143
144 buffer_bindings_1d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
145 texture_bindings_2d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
146 texture_bindings_3d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
147 stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
148 texture_bindings_3d.allocatedSize);
149
150 /* Command queue for path-tracing work on the GPU. In a situation where multiple
151 * MetalDeviceQueues are spawned from one MetalDevice, they share the same MTLCommandQueue.
152 * This is thread safe and just as performant as each having their own instance. It also
153 * adheres to best practices of maximizing the lifetime of each MTLCommandQueue. */
154 mtlComputeCommandQueue = [mtlDevice newCommandQueue];
155
156 /* Command queue for non-tracing work on the GPU. */
157 mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
158
159 /* Acceleration structure arg encoder, if needed */
160 if (@available(macos 12.0, *)) {
161 if (use_metalrt) {
162 MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
163 arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
164 arg_desc_as.access = MTLArgumentAccessReadOnly;
165 mtlASArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_as ]];
166 [arg_desc_as release];
167 }
168 }
169
170 /* Build the arg encoder for the ancillary bindings */
171 {
172 NSMutableArray *ancillary_desc = [[NSMutableArray alloc] init];
173
174 int index = 0;
175 MTLArgumentDescriptor *arg_desc_tex = [[MTLArgumentDescriptor alloc] init];
176 arg_desc_tex.dataType = MTLDataTypePointer;
177 arg_desc_tex.access = MTLArgumentAccessReadOnly;
178
179 arg_desc_tex.index = index++;
180 [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_buf_1d */
181 arg_desc_tex.index = index++;
182 [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
183 arg_desc_tex.index = index++;
184 [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_3d */
185
186 [arg_desc_tex release];
187
188 if (@available(macos 12.0, *)) {
189 if (use_metalrt) {
190 MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
191 arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
192 arg_desc_as.access = MTLArgumentAccessReadOnly;
193
194 MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
195 arg_desc_ptrs.dataType = MTLDataTypePointer;
196 arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
197
198 MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
199 arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
200 arg_desc_ift.access = MTLArgumentAccessReadOnly;
201
202 arg_desc_as.index = index++;
203 [ancillary_desc addObject:[arg_desc_as copy]]; /* accel_struct */
204
205 /* Intersection function tables */
206 arg_desc_ift.index = index++;
207 [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_default */
208 arg_desc_ift.index = index++;
209 [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
210 arg_desc_ift.index = index++;
211 [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow_all */
212 arg_desc_ift.index = index++;
213 [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_volume */
214 arg_desc_ift.index = index++;
215 [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
216 arg_desc_ift.index = index++;
217 [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_mblur */
218 arg_desc_ift.index = index++;
219 [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_single_hit */
220 arg_desc_ift.index = index++;
221 [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_single_hit_mblur */
222
223 arg_desc_ptrs.index = index++;
224 [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas_accel_structs */
225
226 [arg_desc_ift release];
227 [arg_desc_as release];
228 [arg_desc_ptrs release];
229 }
230 }
231
232 mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
233
234 /* Prepare the BLAS argument encoder. */
235
236 if (use_metalrt) {
237 MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
238 arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
239 arg_desc_blas.access = MTLArgumentAccessReadOnly;
240 mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
241 [arg_desc_blas release];
242 }
243
244 for (int i = 0; i < ancillary_desc.count; i++) {
245 [ancillary_desc[i] release];
246 }
247 [ancillary_desc release];
248 }
249 [arg_desc_params release];
250 [arg_desc_texture release];
251 }
252}
253
254MetalDevice::~MetalDevice()
255{
256 /* Cancel any async shader compilations that are in flight. */
257 cancel();
258
259 /* This lock safeguards against destruction during use (see other uses of
260 * existing_devices_mutex). */
261 thread_scoped_lock lock(existing_devices_mutex);
262
263 int num_resources = texture_info.size();
264 for (int res = 0; res < num_resources; res++) {
265 if (is_texture(texture_info[res])) {
266 [texture_slot_map[res] release];
267 texture_slot_map[res] = nil;
268 }
269 }
270
271 free_bvh();
272 flush_delayed_free_list();
273
274 if (texture_bindings_2d) {
275 stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
276 texture_bindings_3d.allocatedSize);
277 [buffer_bindings_1d release];
278 [texture_bindings_2d release];
279 [texture_bindings_3d release];
280 }
281 [mtlTextureArgEncoder release];
282 [mtlBufferKernelParamsEncoder release];
283 [mtlBufferArgEncoder release];
284 [mtlASArgEncoder release];
285 [mtlAncillaryArgEncoder release];
286 [mtlComputeCommandQueue release];
287 [mtlGeneralCommandQueue release];
288 [mtlDevice release];
289
290 texture_info.free();
291}
292
293bool MetalDevice::support_device(const uint /*kernel_features*/)
294{
295 return true;
296}
297
298bool MetalDevice::check_peer_access(Device * /*peer_device*/)
299{
300 assert(0);
301 /* does peer access make sense? */
302 return false;
303}
304
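/* Both toggles below are driven by the Metal debug flags (see util/debug.h). */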
305bool MetalDevice::use_adaptive_compilation()
306{
307 return DebugFlags().metal.adaptive_compile;
308}
309
310bool MetalDevice::use_local_atomic_sort() const
311{
312 return DebugFlags().metal.use_local_atomic_sort;
313}
314
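/* Build the block of #defines injected ahead of the kernel source for the given PSO type.
 * For specialized PSOs this also patches kernel_data accesses so they can resolve to Metal
 * function constants. Returns an MD5 of the defines so callers can detect when the source
 * needs to be regenerated. */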
315string MetalDevice::preprocess_source(MetalPipelineType pso_type,
316 const uint kernel_features,
317 string *source)
318{
319 string global_defines;
320 if (use_adaptive_compilation()) {
321 global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
322 }
323
324 if (use_local_atomic_sort()) {
325 global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
326 }
327
328 if (use_metalrt) {
329 global_defines += "#define __METALRT__\n";
330 if (motion_blur) {
331 global_defines += "#define __METALRT_MOTION__\n";
332 }
333 }
334
335# ifdef WITH_CYCLES_DEBUG
336 global_defines += "#define WITH_CYCLES_DEBUG\n";
337# endif
338
339 global_defines += "#define __KERNEL_METAL_APPLE__\n";
340 if (@available(macos 14.0, *)) {
341 /* Use Program Scope Global Built-ins, when available. */
342 global_defines += "#define __METAL_GLOBAL_BUILTINS__\n";
343 }
344# ifdef WITH_NANOVDB
345 /* Compiling in NanoVDB results in a marginal drop in render performance,
346 * so disable it for specialized PSOs when no textures are using it. */
347 if ((pso_type == PSO_GENERIC || using_nanovdb) && DebugFlags().metal.use_nanovdb) {
348 global_defines += "#define WITH_NANOVDB\n";
349 }
350# endif
351
352 NSProcessInfo *processInfo = [NSProcessInfo processInfo];
353 NSOperatingSystemVersion macos_ver = [processInfo operatingSystemVersion];
354 global_defines += "#define __KERNEL_METAL_MACOS__ " + to_string(macos_ver.majorVersion) + "\n";
355
356# if TARGET_CPU_ARM64
357 global_defines += "#define __KERNEL_METAL_TARGET_CPU_ARM64__\n";
358# endif
359
360 /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of
361 * the same character length. Build a string of all active constant values which is then hashed
362 * in order to identify the PSO.
363 */
364 if (pso_type != PSO_GENERIC) {
365 if (source) {
366 const double starttime = time_dt();
367
368# define KERNEL_STRUCT_BEGIN(name, parent) \
369 string_replace_same_length(*source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
370
371 bool next_member_is_specialized = true;
372
373# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;
374
375# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
376 if (!next_member_is_specialized) { \
377 string_replace( \
378 *source, "kernel_data_" #parent "_" #name, "kernel_data." #parent ".__unused_" #name); \
379 next_member_is_specialized = true; \
380 }
381
382# include "kernel/data_template.h"
383
384# undef KERNEL_STRUCT_MEMBER
385# undef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE
386# undef KERNEL_STRUCT_BEGIN
387
388 metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
389 }
390
391 /* Opt in to all of available specializations. This can be made more granular for the
392 * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
393 * but the overhead should be negligible as these are very quick to (re)build and aren't
394 * serialized to disk via MTLBinaryArchives.
395 */
396 global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
397 }
398
399# if 0
400 metal_printf("================\n%s================\n",
401 global_defines.c_str());
402# endif
403
404 if (source) {
405 *source = global_defines + *source;
406 }
407
408 MD5Hash md5;
409 md5.append(global_defines);
410 return md5.get_hex();
411}
412
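/* Regenerate the kernel source for this PSO type from kernel.metal, expanding #includes and prepending the preprocessor defines. */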
413void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
414{
415 string &source = this->source[pso_type];
416 source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
417 source = path_source_replace_includes(source, path_get("source"));
418
419 /* Perform any required specialization on the source.
420 * With Metal function constants we can generate a single variant of the kernel source which can
421 * be repeatedly respecialized.
422 */
423 global_defines_md5[pso_type] = preprocess_source(pso_type, kernel_features, &source);
424}
425
426bool MetalDevice::load_kernels(const uint _kernel_features)
427{
428 @autoreleasepool {
429 kernel_features |= _kernel_features;
430
431 /* check if GPU is supported */
432 if (!support_device(kernel_features))
433 return false;
434
435 /* Keep track of whether motion blur is enabled, so we can enable/disable motion in BVH
436 * builds. This is necessary since objects may be reported to have motion if the Vector pass
437 * is active, but may still need to be rendered without motion blur if that isn't active as
438 * well. */
439 motion_blur |= kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
440
441 /* Only request generic kernels if they aren't cached in memory. */
442 refresh_source_and_kernels_md5(PSO_GENERIC);
443 if (MetalDeviceKernels::should_load_kernels(this, PSO_GENERIC)) {
444 /* If needed, load them asynchronously in order to responsively message progress to the user.
445 */
446 int this_device_id = this->device_id;
447 auto compile_kernels_fn = ^() {
448 compile_and_load(this_device_id, PSO_GENERIC);
449 };
450
451 dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
452 compile_kernels_fn);
453 }
454 }
455 return true;
456}
457
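/* Refresh kernels_md5 for this PSO type from the specialization constant values, the (possibly regenerated) source, and the active MetalRT feature bits. */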
458void MetalDevice::refresh_source_and_kernels_md5(MetalPipelineType pso_type)
459{
460 string defines_md5 = preprocess_source(pso_type, kernel_features);
461
462 /* Rebuild the source string if the injected block of #defines has changed. */
463 if (global_defines_md5[pso_type] != defines_md5) {
464 make_source(pso_type, kernel_features);
465 }
466
467 string constant_values;
468 if (pso_type != PSO_GENERIC) {
469 bool next_member_is_specialized = true;
470
471# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;
472
473 /* Add specialization constants to md5 so that 'get_best_pipeline' is able to return a suitable
474 * match. */
475# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
476 if (next_member_is_specialized) { \
477 constant_values += string(#parent "." #name "=") + \
478 to_string(_type(launch_params.data.parent.name)) + "\n"; \
479 } \
480 else { \
481 next_member_is_specialized = true; \
482 }
483
484# include "kernel/data_template.h"
485
486# undef KERNEL_STRUCT_MEMBER
487# undef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE
488
489# if 0
490 metal_printf("================\n%s================\n",
491 constant_values.c_str());
492# endif
493 }
494
495 MD5Hash md5;
496 md5.append(constant_values);
497 md5.append(source[pso_type]);
498 if (use_metalrt) {
499 md5.append(string_printf("metalrt_features=%d", kernel_features & METALRT_FEATURE_MASK));
500 }
501 kernels_md5[pso_type] = md5.get_hex();
502}
503
504void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
505{
506 @autoreleasepool {
507 /* Thread-safe front-end compilation. Typically the MSL->AIR compilation can take a few
508 * seconds, so we avoid blocking device tear-down if the user cancels a render immediately. */
509
510 id<MTLDevice> mtlDevice;
511 string source;
512
513 /* Safely gather any state required for the MSL->AIR compilation. */
514 {
515 thread_scoped_lock lock(existing_devices_mutex);
516
517 /* Check whether the device still exists. */
518 MetalDevice *instance = get_device_by_ID(device_id, lock);
519 if (!instance) {
520 metal_printf("Ignoring %s compilation request - device no longer exists\n",
521 kernel_type_as_string(pso_type));
522 return;
523 }
524
525 if (!MetalDeviceKernels::should_load_kernels(instance, pso_type)) {
526 /* We already have a full set of matching pipelines which are cached or queued. Return
527 * early to avoid redundant MTLLibrary compilation. */
528 metal_printf("Ignoring %s compilation request - kernels already requested\n",
529 kernel_type_as_string(pso_type));
530 return;
531 }
532
533 mtlDevice = instance->mtlDevice;
534 source = instance->source[pso_type];
535 }
536
537 /* Perform the actual compilation using our cached context. The MetalDevice can safely destruct
538 * in this time. */
539
540 MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
541
542 options.fastMathEnabled = YES;
543 if (@available(macos 12.0, *)) {
544 options.languageVersion = MTLLanguageVersion2_4;
545 }
546# if defined(MAC_OS_VERSION_13_0)
547 if (@available(macos 13.0, *)) {
548 options.languageVersion = MTLLanguageVersion3_0;
549 }
550# endif
551# if defined(MAC_OS_VERSION_14_0)
552 if (@available(macos 14.0, *)) {
553 options.languageVersion = MTLLanguageVersion3_1;
554 }
555# endif
556
557 if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
558 path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
559 source);
560 }
561
562 double starttime = time_dt();
563
564 NSError *error = NULL;
565 id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
566 options:options
567 error:&error];
568
569 metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
570 time_dt() - starttime,
571 kernel_type_as_string(pso_type));
572
573 [options release];
574
575 bool blocking_pso_build = (getenv("CYCLES_METAL_PROFILING") ||
576 MetalDeviceKernels::is_benchmark_warmup());
577 if (blocking_pso_build) {
578 MetalDeviceKernels::wait_for_all();
579 starttime = 0.0;
580 }
581
582 /* Save the compiled MTLLibrary and trigger the AIR->PSO builds (if the MetalDevice still
583 * exists). */
584 {
585 thread_scoped_lock lock(existing_devices_mutex);
586 if (MetalDevice *instance = get_device_by_ID(device_id, lock)) {
587 if (mtlLibrary) {
588 if (error && [error localizedDescription]) {
589 VLOG_WARNING << "MSL compilation messages: "
590 << [[error localizedDescription] UTF8String];
591 }
592
593 instance->mtlLibrary[pso_type] = mtlLibrary;
594
595 starttime = time_dt();
596 MetalDeviceKernels::load(instance, pso_type);
597 }
598 else {
599 NSString *err = [error localizedDescription];
600 instance->set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
601 }
602 }
603 }
604
605 if (starttime && blocking_pso_build) {
606 MetalDeviceKernels::wait_for_all();
607
608 metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
609 time_dt() - starttime,
610 kernel_type_as_string(pso_type));
611 }
612 }
613}
614
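/* Only 2D/3D image data is backed by MTLTexture objects; 1D data is bound as a plain buffer. */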
615bool MetalDevice::is_texture(const TextureInfo &tex)
616{
617 return (tex.depth > 0 || tex.height > 0);
618}
619
620void MetalDevice::load_texture_info()
621{
622 if (need_texture_info) {
623 /* Unset flag before copying. */
624 need_texture_info = false;
625 texture_info.copy_to_device();
626
627 int num_textures = texture_info.size();
628
629 for (int tex = 0; tex < num_textures; tex++) {
630 uint64_t offset = tex * sizeof(void *);
631 if (is_texture(texture_info[tex]) && texture_slot_map[tex]) {
632 id<MTLTexture> metal_texture = texture_slot_map[tex];
633 MTLTextureType type = metal_texture.textureType;
634 [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
635 [mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
636 [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
637 [mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
638 }
639 else {
640 [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
641 [mtlTextureArgEncoder setTexture:nil atIndex:0];
642 [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
643 [mtlTextureArgEncoder setTexture:nil atIndex:0];
644 }
645 }
646 if (default_storage_mode == MTLResourceStorageModeManaged) {
647 [texture_bindings_2d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
648 [texture_bindings_3d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
649 }
650 }
651}
652
653void MetalDevice::erase_allocation(device_memory &mem)
654{
655 stats.mem_free(mem.device_size);
656 mem.device_pointer = 0;
657 mem.device_size = 0;
658
659 auto it = metal_mem_map.find(&mem);
660 if (it != metal_mem_map.end()) {
661 MetalMem *mmem = it->second.get();
662
663 /* blank out reference to MetalMem* in the launch params (fixes crash #94736) */
664 if (mmem->pointer_index >= 0) {
665 device_ptr *pointers = (device_ptr *)&launch_params;
666 pointers[mmem->pointer_index] = 0;
667 }
668 metal_mem_map.erase(it);
669 }
670}
671
672bool MetalDevice::max_working_set_exceeded(size_t safety_margin) const
673{
674 /* We're allowed to allocate beyond the safe working set size, but then if all resources are made
675 * resident we will get command buffer failures at render time. */
676 size_t available = [mtlDevice recommendedMaxWorkingSetSize] - safety_margin;
677 return (stats.mem_used > available);
678}
679
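/* Allocate an MTLBuffer for generic device memory and track it in metal_mem_map. For
 * shared-storage buffers the buffer contents replace the host allocation so the data is not
 * duplicated. */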
680MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
681{
682 @autoreleasepool {
683 size_t size = mem.memory_size();
684
685 mem.device_pointer = 0;
686
687 id<MTLBuffer> metal_buffer = nil;
688 MTLResourceOptions options = default_storage_mode;
689
690 if (size > 0) {
691 if (mem.type == MEM_DEVICE_ONLY && !capture_enabled) {
692 options = MTLResourceStorageModePrivate;
693 }
694
695 metal_buffer = [mtlDevice newBufferWithLength:size options:options];
696
697 if (!metal_buffer) {
698 set_error("System is out of GPU memory");
699 return nullptr;
700 }
701 }
702
703 if (mem.name) {
704 VLOG_WORK << "Buffer allocate: " << mem.name << ", "
705 << string_human_readable_number(mem.memory_size()) << " bytes. ("
706 << string_human_readable_size(mem.memory_size()) << ")";
707 }
708
709 mem.device_size = metal_buffer.allocatedSize;
710 stats.mem_alloc(mem.device_size);
711
712 metal_buffer.label = [NSString stringWithFormat:@"%s", mem.name];
713
714 std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
715
716 assert(metal_mem_map.count(&mem) == 0); /* assert against double-alloc */
717 MetalMem *mmem = new MetalMem;
718 metal_mem_map[&mem] = std::unique_ptr<MetalMem>(mmem);
719
720 mmem->mem = &mem;
721 mmem->mtlBuffer = metal_buffer;
722 mmem->offset = 0;
723 mmem->size = size;
724 if (options != MTLResourceStorageModePrivate) {
725 mmem->hostPtr = [metal_buffer contents];
726 }
727 else {
728 mmem->hostPtr = nullptr;
729 }
730
731 /* encode device_pointer as (MetalMem*) in order to handle resource relocation and device
732 * pointer recalculation */
733 mem.device_pointer = device_ptr(mmem);
734
735 if (metal_buffer.storageMode == MTLResourceStorageModeShared) {
736 /* Replace host pointer with our host allocation. */
737
738 if (mem.host_pointer && mem.host_pointer != mmem->hostPtr) {
739 memcpy(mmem->hostPtr, mem.host_pointer, size);
740
741 mem.host_free();
742 mem.host_pointer = mmem->hostPtr;
743 }
744 mem.shared_pointer = mmem->hostPtr;
745 mem.shared_counter++;
746 mmem->use_UMA = true;
747 }
748 else {
749 mmem->use_UMA = false;
750 }
751
752 if (max_working_set_exceeded()) {
753 set_error("System is out of GPU memory");
754 return nullptr;
755 }
756
757 return mmem;
758 }
759}
760
761void MetalDevice::generic_copy_to(device_memory &mem)
762{
763 if (!mem.host_pointer || !mem.device_pointer) {
764 return;
765 }
766
767 std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
768 if (!metal_mem_map.at(&mem)->use_UMA || mem.host_pointer != mem.shared_pointer) {
769 MetalMem &mmem = *metal_mem_map.at(&mem);
770 memcpy(mmem.hostPtr, mem.host_pointer, mem.memory_size());
771 if (mmem.mtlBuffer.storageMode == MTLStorageModeManaged) {
772 [mmem.mtlBuffer didModifyRange:NSMakeRange(0, mem.memory_size())];
773 }
774 }
775}
776
777void MetalDevice::generic_free(device_memory &mem)
778{
779 if (mem.device_pointer) {
780 std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
781 MetalMem &mmem = *metal_mem_map.at(&mem);
782 size_t size = mmem.size;
783
784 /* If mmem.use_UMA is true, reference counting is used
785 * to safely free memory. */
786
787 bool free_mtlBuffer = false;
788
789 if (mmem.use_UMA) {
790 assert(mem.shared_pointer);
791 if (mem.shared_pointer) {
792 assert(mem.shared_counter > 0);
793 if (--mem.shared_counter == 0) {
794 free_mtlBuffer = true;
795 }
796 }
797 }
798 else {
799 free_mtlBuffer = true;
800 }
801
802 if (free_mtlBuffer) {
803 if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) {
804 /* Safely move the device-side data back to the host before it is freed. */
805 mem.host_pointer = mem.host_alloc(size);
806 memcpy(mem.host_pointer, mem.shared_pointer, size);
807 mmem.use_UMA = false;
808 }
809
810 mem.shared_pointer = 0;
811
812 /* Free device memory. */
813 delayed_free_list.push_back(mmem.mtlBuffer);
814 mmem.mtlBuffer = nil;
815 }
816
817 erase_allocation(mem);
818 }
819}
820
821void MetalDevice::mem_alloc(device_memory &mem)
822{
823 if (mem.type == MEM_TEXTURE) {
824 assert(!"mem_alloc not supported for textures.");
825 }
826 else if (mem.type == MEM_GLOBAL) {
827 generic_alloc(mem);
828 }
829 else {
830 generic_alloc(mem);
831 }
832}
833
834void MetalDevice::mem_copy_to(device_memory &mem)
835{
836 if (mem.type == MEM_GLOBAL) {
837 global_free(mem);
838 global_alloc(mem);
839 }
840 else if (mem.type == MEM_TEXTURE) {
841 tex_free((device_texture &)mem);
842 tex_alloc((device_texture &)mem);
843 }
844 else {
845 if (!mem.device_pointer) {
846 generic_alloc(mem);
847 }
848 generic_copy_to(mem);
849 }
850}
851
852void MetalDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
853{
854 @autoreleasepool {
855 if (mem.host_pointer) {
856
857 bool subcopy = (w >= 0 && h >= 0);
858 const size_t size = subcopy ? (elem * w * h) : mem.memory_size();
859 const size_t offset = subcopy ? (elem * y * w) : 0;
860
861 if (mem.device_pointer) {
862 std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
863 MetalMem &mmem = *metal_mem_map.at(&mem);
864
865 if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
866
867 id<MTLCommandBuffer> cmdBuffer = [mtlGeneralCommandQueue commandBuffer];
868 id<MTLBlitCommandEncoder> blitEncoder = [cmdBuffer blitCommandEncoder];
869 [blitEncoder synchronizeResource:mmem.mtlBuffer];
870 [blitEncoder endEncoding];
871 [cmdBuffer commit];
872 [cmdBuffer waitUntilCompleted];
873 }
874
875 if (mem.host_pointer != mmem.hostPtr) {
876 memcpy((uchar *)mem.host_pointer + offset, (uchar *)mmem.hostPtr + offset, size);
877 }
878 }
879 else {
880 memset((char *)mem.host_pointer + offset, 0, size);
881 }
882 }
883 }
884}
885
886void MetalDevice::mem_zero(device_memory &mem)
887{
888 if (!mem.device_pointer) {
889 mem_alloc(mem);
890 }
891 if (!mem.device_pointer) {
892 return;
893 }
894
895 size_t size = mem.memory_size();
896 std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
897 MetalMem &mmem = *metal_mem_map.at(&mem);
898 memset(mmem.hostPtr, 0, size);
899 if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
900 [mmem.mtlBuffer didModifyRange:NSMakeRange(0, size)];
901 }
902}
903
904void MetalDevice::mem_free(device_memory &mem)
905{
906 if (mem.type == MEM_GLOBAL) {
907 global_free(mem);
908 }
909 else if (mem.type == MEM_TEXTURE) {
910 tex_free((device_texture &)mem);
911 }
912 else {
913 generic_free(mem);
914 }
915}
916
917device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory & /*mem*/,
918 size_t /*offset*/,
919 size_t /*size*/)
920{
921 /* METAL_WIP - revive if necessary */
922 assert(0);
923 return 0;
924}
925
926void MetalDevice::cancel()
927{
928 /* Remove this device's ID from the list of active devices. Any pending compilation requests
929 * originating from this session will be cancelled. */
930 thread_scoped_lock lock(existing_devices_mutex);
931 if (device_id) {
932 active_device_ids.erase(device_id);
933 device_id = 0;
934 }
935}
936
937bool MetalDevice::is_ready(string &status) const
938{
939 if (!error_msg.empty()) {
940 /* Avoid hanging if we had an error. */
941 return true;
942 }
943
944 int num_loaded = MetalDeviceKernels::get_loaded_kernel_count(this, PSO_GENERIC);
945 if (num_loaded < DEVICE_KERNEL_NUM) {
946 status = string_printf("%d / %d render kernels loaded (may take a few minutes the first time)",
947 num_loaded,
948 DEVICE_KERNEL_NUM);
949 return false;
950 }
951
952 if (int num_requests = MetalDeviceKernels::num_incomplete_specialization_requests()) {
953 status = string_printf("%d kernels to optimize", num_requests);
954 }
955 else if (kernel_specialization_level == PSO_SPECIALIZED_INTERSECT) {
956 status = "Using optimized intersection kernels";
957 }
958 else if (kernel_specialization_level == PSO_SPECIALIZED_SHADE) {
959 status = "Using optimized kernels";
960 }
961
962 metal_printf("MetalDevice::is_ready(...) --> true\n");
963 return true;
964}
965
966void MetalDevice::optimize_for_scene(Scene *scene)
967{
968 MetalPipelineType specialization_level = kernel_specialization_level;
969
970 if (!scene->params.background) {
971 /* In live viewport, don't specialize beyond intersection kernels for responsiveness. */
972 specialization_level = (MetalPipelineType)min(specialization_level, PSO_SPECIALIZED_INTERSECT);
973 }
974
975 /* For responsive rendering, specialize the kernels in the background, and only if there isn't an
976 * existing "optimize_for_scene" request in flight. */
977 int this_device_id = this->device_id;
978 auto specialize_kernels_fn = ^() {
979 for (int level = 1; level <= int(specialization_level); level++) {
980 compile_and_load(this_device_id, MetalPipelineType(level));
981 }
982 };
983
984 /* In normal use, we always compile the specialized kernels in the background. */
985 bool specialize_in_background = true;
986
987 /* Block if a per-kernel profiling is enabled (ensure steady rendering rate). */
988 if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
989 specialize_in_background = false;
990 }
991
992 /* Block during benchmark warm-up to ensure kernels are cached prior to the observed run. */
993 if (MetalDeviceKernels::is_benchmark_warmup()) {
994 specialize_in_background = false;
995 }
996
997 if (specialize_in_background) {
998 if (MetalDeviceKernels::num_incomplete_specialization_requests() == 0) {
999 dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
1000 specialize_kernels_fn);
1001 }
1002 else {
1003 metal_printf("\"optimize_for_scene\" request already in flight - dropping request\n");
1004 }
1005 }
1006 else {
1007 specialize_kernels_fn();
1008 }
1009}
1010
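/* Copy constant data into launch_params. Updating "data" also refreshes the specialized
 * kernel checksums; other names update the corresponding device pointers and remember their
 * slot so they can be patched if the resource is reallocated. */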
1011void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
1012{
1013 if (strcmp(name, "data") == 0) {
1014 assert(size == sizeof(KernelData));
1015 memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData));
1016
1017 /* Refresh the kernels_md5 checksums for specialized kernel sets. */
1018 for (int level = 1; level <= int(kernel_specialization_level); level++) {
1019 refresh_source_and_kernels_md5(MetalPipelineType(level));
1020 }
1021 return;
1022 }
1023
1024 auto update_launch_pointers =
1025 [&](size_t offset, void *data, size_t data_size, size_t pointers_size) {
1026 memcpy((uint8_t *)&launch_params + offset, data, data_size);
1027
1028 MetalMem **mmem = (MetalMem **)data;
1029 int pointer_count = pointers_size / sizeof(device_ptr);
1030 int pointer_index = offset / sizeof(device_ptr);
1031 for (int i = 0; i < pointer_count; i++) {
1032 if (mmem[i]) {
1033 mmem[i]->pointer_index = pointer_index + i;
1034 }
1035 }
1036 };
1037
1038 /* Update data storage pointers in launch parameters. */
1039 if (strcmp(name, "integrator_state") == 0) {
1040 /* IntegratorStateGPU is contiguous pointers */
1041 const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
1042 update_launch_pointers(
1043 offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
1044 }
1045# define KERNEL_DATA_ARRAY(data_type, tex_name) \
1046 else if (strcmp(name, #tex_name) == 0) { \
1047 update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
1048 }
1049# include "kernel/data_arrays.h"
1050# undef KERNEL_DATA_ARRAY
1051}
1052
1053void MetalDevice::global_alloc(device_memory &mem)
1054{
1055 if (mem.is_resident(this)) {
1056 generic_alloc(mem);
1057 generic_copy_to(mem);
1058 }
1059
1060 const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
1061}
1062
1063void MetalDevice::global_free(device_memory &mem)
1064{
1065 if (mem.is_resident(this) && mem.device_pointer) {
1066 generic_free(mem);
1067 }
1068}
1069
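/* 1D image data: allocate as a plain buffer and reference it from the 1D binding table instead of creating an MTLTexture. */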
1070void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
1071{
1072 MetalDevice::MetalMem *mmem = generic_alloc(mem);
1073 generic_copy_to(mem);
1074
1075 /* Resize once */
1076 const uint slot = mem.slot;
1077 if (slot >= texture_info.size()) {
1078 /* Allocate some slots in advance, to reduce amount
1079 * of re-allocations. */
1080 texture_info.resize(round_up(slot + 1, 128));
1081 texture_slot_map.resize(round_up(slot + 1, 128));
1082 }
1083
1084 texture_info[slot] = mem.info;
1085 uint64_t offset = slot * sizeof(void *);
1086 [mtlBufferArgEncoder setArgumentBuffer:buffer_bindings_1d offset:offset];
1087 [mtlBufferArgEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
1088 texture_info[slot].data = *(uint64_t *)((uint64_t)buffer_bindings_1d.contents + offset);
1089 texture_slot_map[slot] = nil;
1090 need_texture_info = true;
1091
1092 if (mem.info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
1093 mem.info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3 ||
1094 mem.info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN ||
1095 mem.info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16)
1096 {
1097 using_nanovdb = true;
1098 }
1099}
1100
1101void MetalDevice::tex_alloc(device_texture &mem)
1102{
1103 @autoreleasepool {
1104 /* Check that dimensions fit within maximum allowable size.
1105 * If 1D texture is allocated, use 1D buffer.
1106 * See: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf */
1107 if (mem.data_height > 0) {
1108 if (mem.data_width > 16384 || mem.data_height > 16384) {
1109 set_error(string_printf(
1110 "Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
1111 mem.data_width,
1112 mem.data_height));
1113 return;
1114 }
1115 }
1116 MTLStorageMode storage_mode = MTLStorageModeManaged;
1117 if ([mtlDevice hasUnifiedMemory]) {
1118 storage_mode = MTLStorageModeShared;
1119 }
1120
1121 /* General variables for both architectures */
1122 string bind_name = mem.name;
1123 size_t dsize = datatype_size(mem.data_type);
1124 size_t size = mem.memory_size();
1125
1126 /* sampler_index maps into the GPU's constant 'metal_samplers' array */
1127 uint64_t sampler_index = mem.info.extension;
1128 if (mem.info.interpolation != INTERPOLATION_CLOSEST) {
1129 sampler_index += 4;
1130 }
1131
1132 /* Image Texture Storage */
1133 MTLPixelFormat format;
1134 switch (mem.data_type) {
1135 case TYPE_UCHAR: {
1136 MTLPixelFormat formats[] = {MTLPixelFormatR8Unorm,
1137 MTLPixelFormatRG8Unorm,
1138 MTLPixelFormatInvalid,
1139 MTLPixelFormatRGBA8Unorm};
1140 format = formats[mem.data_elements - 1];
1141 } break;
1142 case TYPE_UINT16: {
1143 MTLPixelFormat formats[] = {MTLPixelFormatR16Unorm,
1144 MTLPixelFormatRG16Unorm,
1145 MTLPixelFormatInvalid,
1146 MTLPixelFormatRGBA16Unorm};
1147 format = formats[mem.data_elements - 1];
1148 } break;
1149 case TYPE_UINT: {
1150 MTLPixelFormat formats[] = {MTLPixelFormatR32Uint,
1151 MTLPixelFormatRG32Uint,
1152 MTLPixelFormatInvalid,
1153 MTLPixelFormatRGBA32Uint};
1154 format = formats[mem.data_elements - 1];
1155 } break;
1156 case TYPE_INT: {
1157 MTLPixelFormat formats[] = {MTLPixelFormatR32Sint,
1158 MTLPixelFormatRG32Sint,
1159 MTLPixelFormatInvalid,
1160 MTLPixelFormatRGBA32Sint};
1161 format = formats[mem.data_elements - 1];
1162 } break;
1163 case TYPE_FLOAT: {
1164 MTLPixelFormat formats[] = {MTLPixelFormatR32Float,
1165 MTLPixelFormatRG32Float,
1166 MTLPixelFormatInvalid,
1167 MTLPixelFormatRGBA32Float};
1168 format = formats[mem.data_elements - 1];
1169 } break;
1170 case TYPE_HALF: {
1171 MTLPixelFormat formats[] = {MTLPixelFormatR16Float,
1172 MTLPixelFormatRG16Float,
1173 MTLPixelFormatInvalid,
1174 MTLPixelFormatRGBA16Float};
1175 format = formats[mem.data_elements - 1];
1176 } break;
1177 default:
1178 assert(0);
1179 return;
1180 }
1181
1182 assert(format != MTLPixelFormatInvalid);
1183
1184 id<MTLTexture> mtlTexture = nil;
1185 size_t src_pitch = mem.data_width * dsize * mem.data_elements;
1186
1187 if (mem.data_depth > 1) {
1188 /* 3D texture using array */
1189 MTLTextureDescriptor *desc;
1190
1191 desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
1192 width:mem.data_width
1193 height:mem.data_height
1194 mipmapped:NO];
1195
1196 desc.storageMode = storage_mode;
1197 desc.usage = MTLTextureUsageShaderRead;
1198
1199 desc.textureType = MTLTextureType3D;
1200 desc.depth = mem.data_depth;
1201
1202 VLOG_WORK << "Texture 3D allocate: " << mem.name << ", "
1203 << string_human_readable_number(mem.memory_size()) << " bytes. ("
1204 << string_human_readable_size(mem.memory_size()) << ")";
1205
1206 mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
1207 if (!mtlTexture) {
1208 set_error("System is out of GPU memory");
1209 return;
1210 }
1211
1212 const size_t imageBytes = src_pitch * mem.data_height;
1213 for (size_t d = 0; d < mem.data_depth; d++) {
1214 const size_t offset = d * imageBytes;
1215 [mtlTexture replaceRegion:MTLRegionMake3D(0, 0, d, mem.data_width, mem.data_height, 1)
1216 mipmapLevel:0
1217 slice:0
1218 withBytes:(uint8_t *)mem.host_pointer + offset
1219 bytesPerRow:src_pitch
1220 bytesPerImage:0];
1221 }
1222 }
1223 else if (mem.data_height > 0) {
1224 /* 2D texture */
1225 MTLTextureDescriptor *desc;
1226
1227 desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
1228 width:mem.data_width
1229 height:mem.data_height
1230 mipmapped:NO];
1231
1232 desc.storageMode = storage_mode;
1233 desc.usage = MTLTextureUsageShaderRead;
1234
1235 VLOG_WORK << "Texture 2D allocate: " << mem.name << ", "
1236 << string_human_readable_number(mem.memory_size()) << " bytes. ("
1237 << string_human_readable_size(mem.memory_size()) << ")";
1238
1239 mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
1240 if (!mtlTexture) {
1241 set_error("System is out of GPU memory");
1242 return;
1243 }
1244
1245 [mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height)
1246 mipmapLevel:0
1247 withBytes:mem.host_pointer
1248 bytesPerRow:src_pitch];
1249 }
1250 else {
1251 /* 1D texture, using linear memory. */
1252 tex_alloc_as_buffer(mem);
1253 return;
1254 }
1255
1256 mem.device_pointer = (device_ptr)mtlTexture;
1257 mem.device_size = size;
1258 stats.mem_alloc(size);
1259
1260 std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
1261 MetalMem *mmem = new MetalMem;
1262 metal_mem_map[&mem] = std::unique_ptr<MetalMem>(mmem);
1263 mmem->mem = &mem;
1264 mmem->mtlTexture = mtlTexture;
1265
1266 /* Resize once */
1267 const uint slot = mem.slot;
1268 if (slot >= texture_info.size()) {
1269 /* Allocate some slots in advance, to reduce amount
1270 * of re-allocations. */
1271 texture_info.resize(slot + 128);
1272 texture_slot_map.resize(slot + 128);
1273
1274 ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
1275 if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
1276 if (texture_bindings_2d) {
1277 delayed_free_list.push_back(buffer_bindings_1d);
1278 delayed_free_list.push_back(texture_bindings_2d);
1279 delayed_free_list.push_back(texture_bindings_3d);
1280
1281 stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
1282 texture_bindings_3d.allocatedSize);
1283 }
1284 buffer_bindings_1d = [mtlDevice newBufferWithLength:min_buffer_length
1285 options:default_storage_mode];
1286 texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
1287 options:default_storage_mode];
1288 texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
1289 options:default_storage_mode];
1290
1291 stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
1292 texture_bindings_3d.allocatedSize);
1293 }
1294 }
1295
1296 /* Optimize the texture for GPU access. */
1297 id<MTLCommandBuffer> commandBuffer = [mtlGeneralCommandQueue commandBuffer];
1298 id<MTLBlitCommandEncoder> blitCommandEncoder = [commandBuffer blitCommandEncoder];
1299 [blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture];
1300 [blitCommandEncoder endEncoding];
1301 [commandBuffer commit];
1302
1303 /* Set Mapping and tag that we need to (re-)upload to device */
1304 texture_slot_map[slot] = mtlTexture;
1305 texture_info[slot] = mem.info;
1306 need_texture_info = true;
1307
1308 texture_info[slot].data = uint64_t(slot) | (sampler_index << 32);
1309
1310 if (max_working_set_exceeded()) {
1311 set_error("System is out of GPU memory");
1312 }
1313 }
1314}
1315
1316void MetalDevice::tex_free(device_texture &mem)
1317{
1318 if (mem.data_depth == 0 && mem.data_height == 0) {
1319 generic_free(mem);
1320 return;
1321 }
1322
1323 if (metal_mem_map.count(&mem)) {
1324 std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
1325 MetalMem &mmem = *metal_mem_map.at(&mem);
1326
1327 assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
1328 if (texture_slot_map[mem.slot] == mmem.mtlTexture)
1329 texture_slot_map[mem.slot] = nil;
1330
1331 if (mmem.mtlTexture) {
1332 /* Free bindless texture. */
1333 delayed_free_list.push_back(mmem.mtlTexture);
1334 mmem.mtlTexture = nil;
1335 }
1336 erase_allocation(mem);
1337 }
1338}
1339
1340unique_ptr<DeviceQueue> MetalDevice::gpu_queue_create()
1341{
1342 return make_unique<MetalDeviceQueue>(this);
1343}
1344
1345bool MetalDevice::should_use_graphics_interop()
1346{
1347 /* METAL_WIP - provide fast interop */
1348 return false;
1349}
1350
1351void *MetalDevice::get_native_buffer(device_ptr ptr)
1352{
1353 return ((MetalMem *)ptr)->mtlBuffer;
1354}
1355
1356void MetalDevice::flush_delayed_free_list()
1357{
1358 /* Free any Metal buffers that may have been freed by the host while a command
1359 * buffer was being generated. This function should be called after each
1360 * command buffer completes. */
1361 std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
1362 for (auto &it : delayed_free_list) {
1363 [it release];
1364 }
1365 delayed_free_list.clear();
1366}
1367
1368void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
1369{
1370 @autoreleasepool {
1371 if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
1372 Device::build_bvh(bvh, progress, refit);
1373 return;
1374 }
1375
1376 BVHMetal *bvh_metal = static_cast<BVHMetal *>(bvh);
1377 bvh_metal->motion_blur = motion_blur;
1378 if (bvh_metal->build(progress, mtlDevice, mtlGeneralCommandQueue, refit)) {
1379
1380 if (bvh->params.top_level) {
1381 update_bvh(bvh_metal);
1382 }
1383 }
1384
1385 if (max_working_set_exceeded()) {
1386 set_error("System is out of GPU memory");
1387 }
1388 }
1389}
1390
1391void MetalDevice::free_bvh()
1392{
1393 for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
1394 [blas release];
1395 }
1396 unique_blas_array.clear();
1397
1398 if (blas_buffer) {
1399 [blas_buffer release];
1400 blas_buffer = nil;
1401 }
1402
1403 if (accel_struct) {
1404 [accel_struct release];
1405 accel_struct = nil;
1406 }
1407}
1408
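/* Adopt the TLAS and BLAS resources of a freshly built top-level BVH, retaining them and
 * re-encoding the BLAS pointer table used by the intersection kernels. */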
1409void MetalDevice::update_bvh(BVHMetal *bvh_metal)
1410{
1411 free_bvh();
1412
1413 if (!bvh_metal) {
1414 return;
1415 }
1416
1417 accel_struct = bvh_metal->accel_struct;
1418 unique_blas_array = bvh_metal->unique_blas_array;
1419
1420 [accel_struct retain];
1421 for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
1422 [blas retain];
1423 }
1424
1425 // Allocate required buffers for BLAS array.
1426 uint64_t count = bvh_metal->blas_array.size();
1427 uint64_t buffer_size = mtlBlasArgEncoder.encodedLength * count;
1428 blas_buffer = [mtlDevice newBufferWithLength:buffer_size options:default_storage_mode];
1429 stats.mem_alloc(blas_buffer.allocatedSize);
1430
1431 for (uint64_t i = 0; i < count; ++i) {
1432 if (bvh_metal->blas_array[i]) {
1433 [mtlBlasArgEncoder setArgumentBuffer:blas_buffer offset:i * mtlBlasArgEncoder.encodedLength];
1434 [mtlBlasArgEncoder setAccelerationStructure:bvh_metal->blas_array[i] atIndex:0];
1435 }
1436 }
1437 if (default_storage_mode == MTLResourceStorageModeManaged) {
1438 [blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
1439 }
1440}
1441
1442CCL_NAMESPACE_END
1443
1444#endif