Blender V4.5
device_impl.mm
/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#ifdef WITH_METAL

#  include <map>
#  include <mutex>

#  include "device/metal/device.h"
#  include "device/metal/device_impl.h"

#  include "scene/scene.h"

#  include "util/debug.h"
#  include "util/md5.h"
#  include "util/path.h"
#  include "util/time.h"

#  include <TargetConditionals.h>
#  include <crt_externs.h>

CCL_NAMESPACE_BEGIN

class MetalDevice;

thread_mutex MetalDevice::existing_devices_mutex;
std::map<int, MetalDevice *> MetalDevice::active_device_ids;

/* Thread-safe device access for async work. Calling code must pass an appropriately scoped lock
 * to existing_devices_mutex to safeguard against destruction of the returned instance. */
MetalDevice *MetalDevice::get_device_by_ID(const int ID,
                                           thread_scoped_lock & /*existing_devices_mutex_lock*/)
{
  auto it = active_device_ids.find(ID);
  if (it != active_device_ids.end()) {
    return it->second;
  }
  return nullptr;
}

bool MetalDevice::is_device_cancelled(const int ID)
{
  thread_scoped_lock lock(existing_devices_mutex);
  return get_device_by_ID(ID, lock) == nullptr;
}
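
/* Note: async shader-compilation work relies on this ID scheme to detect cancellation. A worker
 * captures the integer device_id (never the MetalDevice pointer), then re-resolves it under
 * existing_devices_mutex before touching the instance. Illustrative sketch only (see
 * compile_and_load below for the real usage):
 *
 *   thread_scoped_lock lock(existing_devices_mutex);
 *   if (MetalDevice *instance = get_device_by_ID(device_id, lock)) {
 *     // Safe to use `instance` while the lock is held.
 *   }
 */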

BVHLayoutMask MetalDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
{
  return use_metalrt ? BVH_LAYOUT_METAL : BVH_LAYOUT_BVH2;
}

void MetalDevice::set_error(const string &error)
{
  static std::mutex s_error_mutex;
  std::lock_guard<std::mutex> lock(s_error_mutex);

  Device::set_error(error);

  if (!has_error) {
    fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
    fprintf(stderr,
            "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
    has_error = true;
  }
}

MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
    : Device(info, stats, profiler, headless), texture_info(this, "texture_info", MEM_GLOBAL)
{
  @autoreleasepool {
    {
      /* Assign an ID for this device which we can use to query whether async shader compilation
       * requests are still relevant. */
      thread_scoped_lock lock(existing_devices_mutex);
      static int existing_devices_counter = 1;
      device_id = existing_devices_counter++;
      active_device_ids[device_id] = this;
    }

    mtlDevId = info.num;

    /* Select the chosen device. */
    auto usable_devices = MetalInfo::get_usable_devices();
    assert(mtlDevId < usable_devices.size());
    mtlDevice = usable_devices[mtlDevId];
    metal_printf("Creating new Cycles Metal device: %s\n", info.description.c_str());

    /* Enable increased concurrent shader compiler limit.
     * This is also done by MTLContext::MTLContext, but only in GUI mode. */
    if (@available(macOS 13.3, *)) {
      [mtlDevice setShouldMaximizeConcurrentCompilation:YES];
    }

    max_threads_per_threadgroup = 512;

    use_metalrt = info.use_hardware_raytracing;
    if (auto *metalrt = getenv("CYCLES_METALRT")) {
      use_metalrt = (atoi(metalrt) != 0);
    }

#  if defined(MAC_OS_VERSION_15_0)
    /* Use "Ray tracing with per component motion interpolation" if available.
     * Requires Apple9 support (https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf). */
    if (use_metalrt && [mtlDevice supportsFamily:MTLGPUFamilyApple9]) {
      /* Concave motion paths weren't correctly bounded prior to macOS 15.6 (#136253). */
      if (@available(macos 15.6, *)) {
        use_pcmi = DebugFlags().metal.use_metalrt_pcmi;
      }
    }
#  endif

    if (getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
      capture_enabled = true;
    }

    /* Create a global counter sampling buffer when kernel profiling is enabled.
     * There's a limit to the number of concurrent counter sampling buffers per device, so we
     * create one that can be reused by successive device queues. */
    if (auto str = getenv("CYCLES_METAL_PROFILING")) {
      if (atoi(str) && [mtlDevice supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary])
      {
        NSArray<id<MTLCounterSet>> *counterSets = [mtlDevice counterSets];

        NSError *error = nil;
        MTLCounterSampleBufferDescriptor *desc = [[MTLCounterSampleBufferDescriptor alloc] init];
        [desc setStorageMode:MTLStorageModeShared];
        [desc setLabel:@"CounterSampleBuffer"];
        [desc setSampleCount:MAX_SAMPLE_BUFFER_LENGTH];
        [desc setCounterSet:counterSets[0]];
        mtlCounterSampleBuffer = [mtlDevice newCounterSampleBufferWithDescriptor:desc
                                                                           error:&error];
        [mtlCounterSampleBuffer retain];
      }
    }

    /* Set kernel_specialization_level based on user preferences. */
    switch (info.kernel_optimization_level) {
      case KERNEL_OPTIMIZATION_LEVEL_OFF:
        kernel_specialization_level = PSO_GENERIC;
        break;
      default:
      case KERNEL_OPTIMIZATION_LEVEL_INTERSECT:
        kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
        break;
      case KERNEL_OPTIMIZATION_LEVEL_FULL:
        kernel_specialization_level = PSO_SPECIALIZED_SHADE;
        break;
    }

    if (auto *envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
      kernel_specialization_level = (MetalPipelineType)atoi(envstr);
    }
    metal_printf("kernel_specialization_level = %s\n",
                 kernel_type_as_string(
                     (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));

    MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
    arg_desc_params.dataType = MTLDataTypePointer;
    arg_desc_params.access = MTLArgumentAccessReadOnly;
    arg_desc_params.arrayLength = sizeof(KernelParamsMetal) / sizeof(device_ptr);
    mtlBufferKernelParamsEncoder = [mtlDevice
        newArgumentEncoderWithArguments:@[ arg_desc_params ]];

    MTLArgumentDescriptor *arg_desc_texture = [[MTLArgumentDescriptor alloc] init];
    arg_desc_texture.dataType = MTLDataTypeTexture;
    arg_desc_texture.access = MTLArgumentAccessReadOnly;
    mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
    MTLArgumentDescriptor *arg_desc_buffer = [[MTLArgumentDescriptor alloc] init];
    arg_desc_buffer.dataType = MTLDataTypePointer;
    arg_desc_buffer.access = MTLArgumentAccessReadOnly;
    mtlBufferArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_buffer ]];

    buffer_bindings_1d = [mtlDevice newBufferWithLength:8192 options:MTLResourceStorageModeShared];
    texture_bindings_2d = [mtlDevice newBufferWithLength:8192
                                                 options:MTLResourceStorageModeShared];
    texture_bindings_3d = [mtlDevice newBufferWithLength:8192
                                                 options:MTLResourceStorageModeShared];
    stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
                    texture_bindings_3d.allocatedSize);

    /* Command queue for path-tracing work on the GPU. In a situation where multiple
     * MetalDeviceQueues are spawned from one MetalDevice, they share the same MTLCommandQueue.
     * This is thread safe and just as performant as each having their own instance. It also
     * adheres to best practices of maximizing the lifetime of each MTLCommandQueue. */
    mtlComputeCommandQueue = [mtlDevice newCommandQueue];

    /* Command queue for non-tracing work on the GPU. */
    mtlGeneralCommandQueue = [mtlDevice newCommandQueue];

    /* Acceleration structure arg encoder, if needed. */
    if (@available(macos 12.0, *)) {
      if (use_metalrt) {
        MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
        arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
        arg_desc_as.access = MTLArgumentAccessReadOnly;
        mtlASArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_as ]];
        [arg_desc_as release];
      }
    }

    /* Build the arg encoder for the ancillary bindings. */
    {
      NSMutableArray *ancillary_desc = [[NSMutableArray alloc] init];

      int index = 0;
      MTLArgumentDescriptor *arg_desc_tex = [[MTLArgumentDescriptor alloc] init];
      arg_desc_tex.dataType = MTLDataTypePointer;
      arg_desc_tex.access = MTLArgumentAccessReadOnly;

      arg_desc_tex.index = index++;
      [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_buf_1d */
      arg_desc_tex.index = index++;
      [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
      arg_desc_tex.index = index++;
      [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_3d */

      [arg_desc_tex release];

      if (@available(macos 12.0, *)) {
        if (use_metalrt) {
          MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
          arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
          arg_desc_as.access = MTLArgumentAccessReadOnly;

          MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
          arg_desc_ptrs.dataType = MTLDataTypePointer;
          arg_desc_ptrs.access = MTLArgumentAccessReadOnly;

          MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
          arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
          arg_desc_ift.access = MTLArgumentAccessReadOnly;

          arg_desc_as.index = index++;
          [ancillary_desc addObject:[arg_desc_as copy]]; /* accel_struct */

          /* Intersection function tables. */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_default */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow_all */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_volume */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_mblur */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_single_hit */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_single_hit_mblur */

          arg_desc_ptrs.index = index++;
          [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas_accel_structs */

          [arg_desc_ift release];
          [arg_desc_as release];
          [arg_desc_ptrs release];
        }
      }

      mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];

      /* Prepare the BLAS arg encoder. */
      if (use_metalrt) {
        MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
        arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
        arg_desc_blas.access = MTLArgumentAccessReadOnly;
        mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
        [arg_desc_blas release];
      }

      for (int i = 0; i < ancillary_desc.count; i++) {
        [ancillary_desc[i] release];
      }
      [ancillary_desc release];
    }
    [arg_desc_params release];
    [arg_desc_texture release];
  }
}

MetalDevice::~MetalDevice()
{
  /* Cancel any async shader compilations that are in flight. */
  cancel();

  /* This lock safeguards against destruction during use (see other uses of
   * existing_devices_mutex). */
  thread_scoped_lock lock(existing_devices_mutex);

  int num_resources = texture_info.size();
  for (int res = 0; res < num_resources; res++) {
    if (is_texture(texture_info[res])) {
      [texture_slot_map[res] release];
      texture_slot_map[res] = nil;
    }
  }

  free_bvh();
  flush_delayed_free_list();

  if (texture_bindings_2d) {
    stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
                   texture_bindings_3d.allocatedSize);
    [buffer_bindings_1d release];
    [texture_bindings_2d release];
    [texture_bindings_3d release];
  }
  [mtlTextureArgEncoder release];
  [mtlBufferKernelParamsEncoder release];
  [mtlBufferArgEncoder release];
  [mtlASArgEncoder release];
  [mtlAncillaryArgEncoder release];
  [mtlComputeCommandQueue release];
  [mtlGeneralCommandQueue release];
  if (mtlCounterSampleBuffer) {
    [mtlCounterSampleBuffer release];
  }
  [mtlDevice release];

  texture_info.free();
}

bool MetalDevice::support_device(const uint /*kernel_features*/)
{
  return true;
}

bool MetalDevice::check_peer_access(Device * /*peer_device*/)
{
  assert(0);
  /* Does peer access make sense for Metal? */
  return false;
}

bool MetalDevice::use_adaptive_compilation()
{
  return DebugFlags().metal.adaptive_compile;
}

bool MetalDevice::use_local_atomic_sort() const
{
  return DebugFlags().metal.use_local_atomic_sort;
}

string MetalDevice::preprocess_source(MetalPipelineType pso_type,
                                      const uint kernel_features,
                                      string *source)
{
  string global_defines;
  if (use_adaptive_compilation()) {
    global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
  }

  if (use_local_atomic_sort()) {
    global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
  }

  if (use_metalrt) {
    global_defines += "#define __METALRT__\n";
    if (motion_blur) {
      global_defines += "#define __METALRT_MOTION__\n";
    }
  }

#  ifdef WITH_CYCLES_DEBUG
  global_defines += "#define WITH_CYCLES_DEBUG\n";
#  endif

  global_defines += "#define __KERNEL_METAL_APPLE__\n";
  if (@available(macos 14.0, *)) {
    /* Use Program Scope Global Built-ins, when available. */
    global_defines += "#define __METAL_GLOBAL_BUILTINS__\n";
  }
#  ifdef WITH_NANOVDB
  /* Compiling in NanoVDB results in a marginal drop in render performance,
   * so disable it for specialized PSOs when no textures are using it. */
  if ((pso_type == PSO_GENERIC || using_nanovdb) && DebugFlags().metal.use_nanovdb) {
    global_defines += "#define WITH_NANOVDB\n";
  }
#  endif

  NSProcessInfo *processInfo = [NSProcessInfo processInfo];
  NSOperatingSystemVersion macos_ver = [processInfo operatingSystemVersion];
  global_defines += "#define __KERNEL_METAL_MACOS__ " + to_string(macos_ver.majorVersion) + "\n";

#  if TARGET_CPU_ARM64
  global_defines += "#define __KERNEL_METAL_TARGET_CPU_ARM64__\n";
#  endif

  /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of
   * the same character length. Build a string of all active constant values which is then hashed
   * in order to identify the PSO.
   */
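  /* For example (illustrative, not the macro expansion itself): the specialized rewrite turns
   *   kernel_data.film.exposure
   * into
   *   kernel_data_film_exposure
   * which has exactly the same character length, so every byte offset in the source string
   * (and thus line/column info in compiler diagnostics) stays unchanged. */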
  if (pso_type != PSO_GENERIC) {
    if (source) {
      const double starttime = time_dt();

#  define KERNEL_STRUCT_BEGIN(name, parent) \
    string_replace_same_length(*source, "kernel_data." #parent ".", "kernel_data_" #parent "_");

      bool next_member_is_specialized = true;

#  define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;

#  define KERNEL_STRUCT_MEMBER(parent, _type, name) \
    if (!next_member_is_specialized) { \
      string_replace( \
          *source, "kernel_data_" #parent "_" #name, "kernel_data." #parent ".__unused_" #name); \
      next_member_is_specialized = true; \
    }

#  include "kernel/data_template.h"

#  undef KERNEL_STRUCT_MEMBER
#  undef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE
#  undef KERNEL_STRUCT_BEGIN

      metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
    }

    /* Opt in to all of the available specializations. This can be made more granular for the
     * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
     * but the overhead should be negligible as these are very quick to (re)build and aren't
     * serialized to disk via MTLBinaryArchives.
     */
    global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
  }

  if (source) {
    *source = global_defines + *source;
  }

  MD5Hash md5;
  md5.append(global_defines);
  return md5.get_hex();
}
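
/* Note: the hex digest returned above covers only the injected #define block, not the full
 * kernel source. refresh_source_and_kernels_md5 compares it against global_defines_md5[pso_type]
 * to decide whether the cached source string must be regenerated. */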

void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
  string &source = this->source[pso_type];
  source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
  source = path_source_replace_includes(source, path_get("source"));

  /* Perform any required specialization on the source.
   * With Metal function constants we can generate a single variant of the kernel source which can
   * be repeatedly respecialized.
   */
  global_defines_md5[pso_type] = preprocess_source(pso_type, kernel_features, &source);
}

bool MetalDevice::load_kernels(const uint _kernel_features)
{
  @autoreleasepool {
    kernel_features |= _kernel_features;

    /* Check if the GPU is supported. */
    if (!support_device(kernel_features)) {
      return false;
    }

    /* Keep track of whether motion blur is enabled, so we can enable/disable motion in BVH
     * builds. This is necessary since objects may be reported to have motion if the Vector pass
     * is active, but may still need to be rendered without motion blur if that isn't active as
     * well. */
    motion_blur = motion_blur || (kernel_features & KERNEL_FEATURE_OBJECT_MOTION);

    /* Only request generic kernels if they aren't cached in memory. */
    refresh_source_and_kernels_md5(PSO_GENERIC);
    if (MetalDeviceKernels::should_load_kernels(this, PSO_GENERIC)) {
      /* If needed, load them asynchronously in order to responsively message progress to the
       * user. */
      int this_device_id = this->device_id;
      auto compile_kernels_fn = ^() {
        compile_and_load(this_device_id, PSO_GENERIC);
      };

      dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
                     compile_kernels_fn);
    }
  }
  return true;
}

void MetalDevice::refresh_source_and_kernels_md5(MetalPipelineType pso_type)
{
  string defines_md5 = preprocess_source(pso_type, kernel_features);

  /* Rebuild the source string if the injected block of #defines has changed. */
  if (global_defines_md5[pso_type] != defines_md5) {
    make_source(pso_type, kernel_features);
  }

  string constant_values;
  if (pso_type != PSO_GENERIC) {
    bool next_member_is_specialized = true;

#  define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;

    /* Add specialization constants to the md5 so that 'get_best_pipeline' is able to return a
     * suitable match. */
#  define KERNEL_STRUCT_MEMBER(parent, _type, name) \
    if (next_member_is_specialized) { \
      constant_values += string(#parent "." #name "=") + \
                         to_string(_type(launch_params.data.parent.name)) + "\n"; \
    } \
    else { \
      next_member_is_specialized = true; \
    }

#  include "kernel/data_template.h"

#  undef KERNEL_STRUCT_MEMBER
#  undef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE
  }

  MD5Hash md5;
  md5.append(constant_values);
  md5.append(source[pso_type]);
  if (use_metalrt) {
    md5.append(string_printf("metalrt_features=%d", kernel_features & METALRT_FEATURE_MASK));
  }
  kernels_md5[pso_type] = md5.get_hex();
}

void MetalDevice::compile_and_load(const int device_id, MetalPipelineType pso_type)
{
  @autoreleasepool {
    /* Thread-safe front-end compilation. Typically the MSL->AIR compilation can take a few
     * seconds, so we avoid blocking device tear-down if the user cancels a render immediately. */

    id<MTLDevice> mtlDevice;
    string source;

    /* Safely gather any state required for the MSL->AIR compilation. */
    {
      thread_scoped_lock lock(existing_devices_mutex);

      /* Check whether the device still exists. */
      MetalDevice *instance = get_device_by_ID(device_id, lock);
      if (!instance) {
        metal_printf("Ignoring %s compilation request - device no longer exists\n",
                     kernel_type_as_string(pso_type));
        return;
      }

      if (!MetalDeviceKernels::should_load_kernels(instance, pso_type)) {
        /* We already have a full set of matching pipelines which are cached or queued. Return
         * early to avoid redundant MTLLibrary compilation. */
        metal_printf("Ignoring %s compilation request - kernels already requested\n",
                     kernel_type_as_string(pso_type));
        return;
      }

      mtlDevice = instance->mtlDevice;
      source = instance->source[pso_type];
    }

    /* Perform the actual compilation using our cached context. The MetalDevice can safely
     * destruct in this time. */

    MTLCompileOptions *options = [[MTLCompileOptions alloc] init];

    options.fastMathEnabled = YES;
    if (@available(macos 12.0, *)) {
      options.languageVersion = MTLLanguageVersion2_4;
    }
#  if defined(MAC_OS_VERSION_13_0)
    if (@available(macos 13.0, *)) {
      options.languageVersion = MTLLanguageVersion3_0;
    }
#  endif
#  if defined(MAC_OS_VERSION_14_0)
    if (@available(macos 14.0, *)) {
      options.languageVersion = MTLLanguageVersion3_1;
    }
#  endif

    if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
      path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
                      source);
    }

    double starttime = time_dt();

    NSError *error = nullptr;
    id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
                                                        options:options
                                                          error:&error];

    metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
                 time_dt() - starttime,
                 kernel_type_as_string(pso_type));

    [options release];

    bool blocking_pso_build = (getenv("CYCLES_METAL_PROFILING") ||
                               MetalDeviceKernels::is_benchmark_warmup());
    if (blocking_pso_build) {
      MetalDeviceKernels::wait_for_all();
      starttime = 0.0;
    }
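
    /* Note: for a blocking build we first drain any in-flight PSO builds and zero `starttime`;
     * it is re-set below only once this library's AIR->PSO builds are actually triggered, so the
     * back-end timing printed at the end measures just this request. */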

    /* Save the compiled MTLLibrary and trigger the AIR->PSO builds (if the MetalDevice still
     * exists). */
    {
      thread_scoped_lock lock(existing_devices_mutex);
      if (MetalDevice *instance = get_device_by_ID(device_id, lock)) {
        if (mtlLibrary) {
          if (error && [error localizedDescription]) {
            VLOG_WARNING << "MSL compilation messages: "
                         << [[error localizedDescription] UTF8String];
          }

          instance->mtlLibrary[pso_type] = mtlLibrary;

          starttime = time_dt();
          MetalDeviceKernels::load(instance, pso_type);
        }
        else {
          NSString *err = [error localizedDescription];
          instance->set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
        }
      }
    }

    if (starttime && blocking_pso_build) {
      MetalDeviceKernels::wait_for_all();

      metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
                   time_dt() - starttime,
                   kernel_type_as_string(pso_type));
    }
  }
}

bool MetalDevice::is_texture(const TextureInfo &tex)
{
  return (tex.depth > 0 || tex.height > 0);
}

void MetalDevice::load_texture_info()
{
  if (need_texture_info) {
    /* Unset flag before copying. */
    need_texture_info = false;
    texture_info.copy_to_device();

    int num_textures = texture_info.size();

    for (int tex = 0; tex < num_textures; tex++) {
      uint64_t offset = tex * sizeof(void *);
      if (is_texture(texture_info[tex]) && texture_slot_map[tex]) {
        id<MTLTexture> metal_texture = texture_slot_map[tex];
        MTLTextureType type = metal_texture.textureType;
        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
        [mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
        [mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
      }
      else {
        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
        [mtlTextureArgEncoder setTexture:nil atIndex:0];
        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
        [mtlTextureArgEncoder setTexture:nil atIndex:0];
      }
    }
  }
}

void MetalDevice::erase_allocation(device_memory &mem)
{
  stats.mem_free(mem.device_size);
  mem.device_pointer = 0;
  mem.device_size = 0;

  auto it = metal_mem_map.find(&mem);
  if (it != metal_mem_map.end()) {
    MetalMem *mmem = it->second.get();

    /* Blank out reference to MetalMem* in the launch params (fixes crash #94736). */
    if (mmem->pointer_index >= 0) {
      device_ptr *pointers = (device_ptr *)&launch_params;
      pointers[mmem->pointer_index] = 0;
    }
    metal_mem_map.erase(it);
  }
}

bool MetalDevice::max_working_set_exceeded(const size_t safety_margin) const
{
  /* We're allowed to allocate beyond the safe working set size, but then if all resources are
   * made resident we will get command buffer failures at render time. */
  size_t available = [mtlDevice recommendedMaxWorkingSetSize] - safety_margin;
  return (stats.mem_used > available);
}

MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
{
  @autoreleasepool {
    size_t size = mem.memory_size();

    mem.device_pointer = 0;

    id<MTLBuffer> metal_buffer = nil;
    MTLResourceOptions options = MTLResourceStorageModeShared;

    if (size > 0) {
      if (mem.type == MEM_DEVICE_ONLY && !capture_enabled) {
        options = MTLResourceStorageModePrivate;
      }

      metal_buffer = [mtlDevice newBufferWithLength:size options:options];

      if (!metal_buffer) {
        set_error("System is out of GPU memory");
        return nullptr;
      }
    }

    if (mem.name) {
      VLOG_WORK << "Buffer allocate: " << mem.name << ", "
                << string_human_readable_number(mem.memory_size()) << " bytes. ("
                << string_human_readable_size(mem.memory_size()) << ")";
    }

    mem.device_size = metal_buffer.allocatedSize;
    stats.mem_alloc(mem.device_size);

    metal_buffer.label = [NSString stringWithFormat:@"%s", mem.name];

    std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);

    assert(metal_mem_map.count(&mem) == 0); /* assert against double-alloc */
    unique_ptr<MetalMem> mmem = make_unique<MetalMem>();

    mmem->mem = &mem;
    mmem->mtlBuffer = metal_buffer;
    mmem->offset = 0;
    mmem->size = size;
    if (options != MTLResourceStorageModePrivate) {
      mmem->hostPtr = [metal_buffer contents];
    }
    else {
      mmem->hostPtr = nullptr;
    }

    /* Encode device_pointer as (MetalMem*) in order to handle resource relocation and device
     * pointer recalculation. */
    mem.device_pointer = device_ptr(mmem.get());

    if (metal_buffer.storageMode == MTLStorageModeShared) {
      /* Replace host pointer with our host allocation. */
      if (mem.host_pointer && mem.host_pointer != mmem->hostPtr) {
        memcpy(mmem->hostPtr, mem.host_pointer, size);

        host_free(mem.type, mem.host_pointer, mem.memory_size());
        mem.host_pointer = mmem->hostPtr;
      }
      mem.shared_pointer = mmem->hostPtr;
      mem.shared_counter++;
    }
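
    /* Note: with shared-storage buffers this gives zero-copy behavior on Apple Silicon's
     * unified memory: the CPU-side host_pointer and the GPU-visible buffer alias the same
     * allocation ([metal_buffer contents]), which is why generic_copy_to() below is a no-op. */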

    MetalMem *mmem_ptr = mmem.get();
    metal_mem_map[&mem] = std::move(mmem);

    if (max_working_set_exceeded()) {
      set_error("System is out of GPU memory");
      return nullptr;
    }

    return mmem_ptr;
  }
}

void MetalDevice::generic_copy_to(device_memory &)
{
  /* No need to copy - Apple Silicon has Unified Memory Architecture. */
}

void MetalDevice::generic_free(device_memory &mem)
{
  if (!mem.device_pointer) {
    return;
  }

  /* The host pointer should already have been freed at this point. If not, we might
   * end up freeing shared memory and be unable to recover the original host memory. */
  assert(mem.host_pointer == nullptr);

  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
  MetalMem &mmem = *metal_mem_map.at(&mem);
  size_t size = mmem.size;

  bool free_mtlBuffer = true;

  /* If this is shared, reference counting is used to safely free the memory. */
  if (mem.shared_pointer) {
    assert(mem.shared_counter > 0);
    if (--mem.shared_counter > 0) {
      free_mtlBuffer = false;
    }
  }

  if (free_mtlBuffer) {
    if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) {
      /* Safely move the device-side data back to the host before it is freed.
       * We should actually never reach this code as it is inefficient, but
       * better than to crash if there is a bug. */
      assert(!"Metal device should not copy memory back to host");
      mem.host_pointer = mem.host_alloc(size);
      memcpy(mem.host_pointer, mem.shared_pointer, size);
    }

    mem.shared_pointer = nullptr;

    /* Free device memory. */
    delayed_free_list.push_back(mmem.mtlBuffer);
    mmem.mtlBuffer = nil;
  }

  erase_allocation(mem);
}

void MetalDevice::mem_alloc(device_memory &mem)
{
  if (mem.type == MEM_TEXTURE) {
    assert(!"mem_alloc not supported for textures.");
  }
  else if (mem.type == MEM_GLOBAL) {
    generic_alloc(mem);
  }
  else {
    generic_alloc(mem);
  }
}

void MetalDevice::mem_copy_to(device_memory &mem)
{
  if (!mem.device_pointer) {
    if (mem.type == MEM_GLOBAL) {
      global_alloc(mem);
    }
    else if (mem.type == MEM_TEXTURE) {
      tex_alloc((device_texture &)mem);
    }
    else {
      generic_alloc(mem);
      generic_copy_to(mem);
    }
  }
  else if (mem.is_resident(this)) {
    if (mem.type == MEM_GLOBAL) {
      generic_copy_to(mem);
    }
    else if (mem.type == MEM_TEXTURE) {
      tex_copy_to((device_texture &)mem);
    }
    else {
      generic_copy_to(mem);
    }
  }
}

void MetalDevice::mem_move_to_host(device_memory & /*mem*/)
{
  /* Metal implements its own mechanism for moving host memory. */
  assert(!"Metal does not support mem_move_to_host");
}

void MetalDevice::mem_copy_from(device_memory &, const size_t, size_t, const size_t, size_t)
{
  /* No need to copy - Apple Silicon has Unified Memory Architecture. */
}

void MetalDevice::mem_zero(device_memory &mem)
{
  if (!mem.device_pointer) {
    mem_alloc(mem);
  }

  memset(mem.shared_pointer, 0, mem.memory_size());
}

void MetalDevice::mem_free(device_memory &mem)
{
  if (mem.type == MEM_GLOBAL) {
    global_free(mem);
  }
  else if (mem.type == MEM_TEXTURE) {
    tex_free((device_texture &)mem);
  }
  else {
    generic_free(mem);
  }
}

device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory & /*mem*/,
                                          size_t /*offset*/,
                                          size_t /*size*/)
{
  /* METAL_WIP - revive if necessary. */
  assert(0);
  return 0;
}

void MetalDevice::cancel()
{
  /* Remove this device's ID from the list of active devices. Any pending compilation requests
   * originating from this session will be cancelled. */
  thread_scoped_lock lock(existing_devices_mutex);
  if (device_id) {
    active_device_ids.erase(device_id);
    device_id = 0;
  }
}

bool MetalDevice::is_ready(string &status) const
{
  if (!error_msg.empty()) {
    /* Avoid hanging if we had an error. */
    return true;
  }

  int num_loaded = MetalDeviceKernels::get_loaded_kernel_count(this, PSO_GENERIC);
  if (num_loaded < DEVICE_KERNEL_NUM) {
    status = string_printf("%d / %d render kernels loaded (may take a few minutes the first time)",
                           num_loaded,
                           DEVICE_KERNEL_NUM);
    return false;
  }

  if (int num_requests = MetalDeviceKernels::num_incomplete_specialization_requests()) {
    status = string_printf("%d kernels to optimize", num_requests);
  }
  else if (kernel_specialization_level == PSO_SPECIALIZED_INTERSECT) {
    status = "Using optimized intersection kernels";
  }
  else if (kernel_specialization_level == PSO_SPECIALIZED_SHADE) {
    status = "Using optimized kernels";
  }

  metal_printf("MetalDevice::is_ready(...) --> true\n");
  return true;
}

void MetalDevice::optimize_for_scene(Scene *scene)
{
  MetalPipelineType specialization_level = kernel_specialization_level;

  if (!scene->params.background) {
    /* In the live viewport, don't specialize beyond intersection kernels for responsiveness. */
    specialization_level = (MetalPipelineType)min(specialization_level, PSO_SPECIALIZED_INTERSECT);
  }

  /* For responsive rendering, specialize the kernels in the background, and only if there isn't
   * an existing "optimize_for_scene" request in flight. */
  int this_device_id = this->device_id;
  auto specialize_kernels_fn = ^() {
    for (int level = 1; level <= int(specialization_level); level++) {
      compile_and_load(this_device_id, MetalPipelineType(level));
    }
  };

  /* In normal use, we always compile the specialized kernels in the background. */
  bool specialize_in_background = true;

  /* Block if per-kernel profiling is enabled (to ensure a steady rendering rate). */
  if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
    specialize_in_background = false;
  }

  /* Block during benchmark warm-up to ensure kernels are cached prior to the observed run. */
  if (MetalDeviceKernels::is_benchmark_warmup()) {
    specialize_in_background = false;
  }

  if (specialize_in_background) {
    if (MetalDeviceKernels::num_incomplete_specialization_requests() == 0) {
      dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
                     specialize_kernels_fn);
    }
    else {
      metal_printf("\"optimize_for_scene\" request already in flight - dropping request\n");
    }
  }
  else {
    specialize_kernels_fn();
  }
}

void MetalDevice::const_copy_to(const char *name, void *host, const size_t size)
{
  if (strcmp(name, "data") == 0) {
    assert(size == sizeof(KernelData));
    memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData));

    /* Refresh the kernels_md5 checksums for specialized kernel sets. */
    for (int level = 1; level <= int(kernel_specialization_level); level++) {
      refresh_source_and_kernels_md5(MetalPipelineType(level));
    }
    return;
  }

  auto update_launch_pointers =
      [&](size_t offset, void *data, const size_t data_size, const size_t pointers_size) {
        memcpy((uint8_t *)&launch_params + offset, data, data_size);

        MetalMem **mmem = (MetalMem **)data;
        int pointer_count = pointers_size / sizeof(device_ptr);
        int pointer_index = offset / sizeof(device_ptr);
        for (int i = 0; i < pointer_count; i++) {
          if (mmem[i]) {
            mmem[i]->pointer_index = pointer_index + i;
          }
        }
      };
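
  /* Note: recording pointer_index here is what lets erase_allocation() later zero the exact
   * device_ptr slot inside launch_params when the underlying MetalMem is freed, preventing
   * stale-pointer crashes (see #94736 above). This relies on the updated launch_params region
   * being laid out as a contiguous array of device_ptr values. */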

  /* Update data storage pointers in launch parameters. */
  if (strcmp(name, "integrator_state") == 0) {
    /* IntegratorStateGPU is contiguous pointers. */
    const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
    update_launch_pointers(
        offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
  }
#  define KERNEL_DATA_ARRAY(data_type, tex_name) \
  else if (strcmp(name, #tex_name) == 0) { \
    update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
  }
#  include "kernel/data_arrays.h"
#  undef KERNEL_DATA_ARRAY
}

void MetalDevice::global_alloc(device_memory &mem)
{
  if (mem.is_resident(this)) {
    generic_alloc(mem);
    generic_copy_to(mem);
  }

  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

void MetalDevice::global_free(device_memory &mem)
{
  if (mem.is_resident(this) && mem.device_pointer) {
    generic_free(mem);
  }
}

void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
{
  MetalDevice::MetalMem *mmem = generic_alloc(mem);
  generic_copy_to(mem);

  /* Resize once. */
  const uint slot = mem.slot;
  if (slot >= texture_info.size()) {
    /* Allocate some slots in advance, to reduce the
     * amount of re-allocations. */
    texture_info.resize(round_up(slot + 1, 128));
    texture_slot_map.resize(round_up(slot + 1, 128));
  }

  texture_info[slot] = mem.info;
  uint64_t offset = slot * sizeof(void *);
  [mtlBufferArgEncoder setArgumentBuffer:buffer_bindings_1d offset:offset];
  [mtlBufferArgEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
  texture_info[slot].data = *(uint64_t *)((uint64_t)buffer_bindings_1d.contents + offset);
  texture_slot_map[slot] = nil;
  need_texture_info = true;

  if (mem.info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
      mem.info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3 ||
      mem.info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN ||
      mem.info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16)
  {
    using_nanovdb = true;
  }
}

void MetalDevice::tex_alloc(device_texture &mem)
{
  @autoreleasepool {
    /* Check that dimensions fit within the maximum allowable size.
     * If a 1D texture is allocated, use a 1D buffer.
     * See: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf */
    if (mem.data_height > 0) {
      if (mem.data_width > 16384 || mem.data_height > 16384) {
        set_error(string_printf(
            "Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
            mem.data_width,
            mem.data_height));
        return;
      }
    }

    /* General variables for both architectures. */
    size_t size = mem.memory_size();

    /* sampler_index maps into the GPU's constant 'metal_samplers' array. */
    uint64_t sampler_index = mem.info.extension;
    if (mem.info.interpolation != INTERPOLATION_CLOSEST) {
      sampler_index += 4;
    }

    /* Image Texture Storage */
    MTLPixelFormat format;
    switch (mem.data_type) {
      case TYPE_UCHAR: {
        MTLPixelFormat formats[] = {MTLPixelFormatR8Unorm,
                                    MTLPixelFormatRG8Unorm,
                                    MTLPixelFormatInvalid,
                                    MTLPixelFormatRGBA8Unorm};
        format = formats[mem.data_elements - 1];
      } break;
      case TYPE_UINT16: {
        MTLPixelFormat formats[] = {MTLPixelFormatR16Unorm,
                                    MTLPixelFormatRG16Unorm,
                                    MTLPixelFormatInvalid,
                                    MTLPixelFormatRGBA16Unorm};
        format = formats[mem.data_elements - 1];
      } break;
      case TYPE_UINT: {
        MTLPixelFormat formats[] = {MTLPixelFormatR32Uint,
                                    MTLPixelFormatRG32Uint,
                                    MTLPixelFormatInvalid,
                                    MTLPixelFormatRGBA32Uint};
        format = formats[mem.data_elements - 1];
      } break;
      case TYPE_INT: {
        MTLPixelFormat formats[] = {MTLPixelFormatR32Sint,
                                    MTLPixelFormatRG32Sint,
                                    MTLPixelFormatInvalid,
                                    MTLPixelFormatRGBA32Sint};
        format = formats[mem.data_elements - 1];
      } break;
      case TYPE_FLOAT: {
        MTLPixelFormat formats[] = {MTLPixelFormatR32Float,
                                    MTLPixelFormatRG32Float,
                                    MTLPixelFormatInvalid,
                                    MTLPixelFormatRGBA32Float};
        format = formats[mem.data_elements - 1];
      } break;
      case TYPE_HALF: {
        MTLPixelFormat formats[] = {MTLPixelFormatR16Float,
                                    MTLPixelFormatRG16Float,
                                    MTLPixelFormatInvalid,
                                    MTLPixelFormatRGBA16Float};
        format = formats[mem.data_elements - 1];
      } break;
      default:
        assert(0);
        return;
    }

    assert(format != MTLPixelFormatInvalid);

    id<MTLTexture> mtlTexture = nil;
    size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;

    if (mem.data_depth > 1) {
      /* 3D texture using array. */
      MTLTextureDescriptor *desc;

      desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
                                                                width:mem.data_width
                                                               height:mem.data_height
                                                            mipmapped:NO];

      desc.storageMode = MTLStorageModeShared;
      desc.usage = MTLTextureUsageShaderRead;

      desc.textureType = MTLTextureType3D;
      desc.depth = mem.data_depth;

      VLOG_WORK << "Texture 3D allocate: " << mem.name << ", "
                << string_human_readable_number(mem.memory_size()) << " bytes. ("
                << string_human_readable_size(mem.memory_size()) << ")";

      mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
      if (!mtlTexture) {
        set_error("System is out of GPU memory");
        return;
      }

      const size_t imageBytes = src_pitch * mem.data_height;
      for (size_t d = 0; d < mem.data_depth; d++) {
        const size_t offset = d * imageBytes;
        [mtlTexture replaceRegion:MTLRegionMake3D(0, 0, d, mem.data_width, mem.data_height, 1)
                      mipmapLevel:0
                            slice:0
                        withBytes:(uint8_t *)mem.host_pointer + offset
                      bytesPerRow:src_pitch
                    bytesPerImage:0];
      }
    }
    else if (mem.data_height > 0) {
      /* 2D texture. */
      MTLTextureDescriptor *desc;

      desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
                                                                width:mem.data_width
                                                               height:mem.data_height
                                                            mipmapped:NO];

      desc.storageMode = MTLStorageModeShared;
      desc.usage = MTLTextureUsageShaderRead;

      VLOG_WORK << "Texture 2D allocate: " << mem.name << ", "
                << string_human_readable_number(mem.memory_size()) << " bytes. ("
                << string_human_readable_size(mem.memory_size()) << ")";

      mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
      if (!mtlTexture) {
        set_error("System is out of GPU memory");
        return;
      }

      [mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height)
                    mipmapLevel:0
                      withBytes:mem.host_pointer
                    bytesPerRow:src_pitch];
    }
    else {
      /* 1D texture, using linear memory. */
      tex_alloc_as_buffer(mem);
      return;
    }

    mem.device_pointer = (device_ptr)mtlTexture;
    mem.device_size = size;
    stats.mem_alloc(size);

    std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
    unique_ptr<MetalMem> mmem = make_unique<MetalMem>();
    mmem->mem = &mem;
    mmem->mtlTexture = mtlTexture;
    metal_mem_map[&mem] = std::move(mmem);

    /* Resize once. */
    const uint slot = mem.slot;
    if (slot >= texture_info.size()) {
      /* Allocate some slots in advance, to reduce the
       * amount of re-allocations. */
      texture_info.resize(slot + 128);
      texture_slot_map.resize(slot + 128);

      ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
      if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
        if (texture_bindings_2d) {
          delayed_free_list.push_back(buffer_bindings_1d);
          delayed_free_list.push_back(texture_bindings_2d);
          delayed_free_list.push_back(texture_bindings_3d);

          stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
                         texture_bindings_3d.allocatedSize);
        }
        buffer_bindings_1d = [mtlDevice newBufferWithLength:min_buffer_length
                                                    options:MTLResourceStorageModeShared];
        texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
                                                     options:MTLResourceStorageModeShared];
        texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
                                                     options:MTLResourceStorageModeShared];

        stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
                        texture_bindings_3d.allocatedSize);
      }
    }

    /* Optimize the texture for GPU access. */
    id<MTLCommandBuffer> commandBuffer = [mtlGeneralCommandQueue commandBuffer];
    id<MTLBlitCommandEncoder> blitCommandEncoder = [commandBuffer blitCommandEncoder];
    [blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture];
    [blitCommandEncoder endEncoding];
    [commandBuffer commit];

    /* Set mapping and tag that we need to (re-)upload to the device. */
    texture_slot_map[slot] = mtlTexture;
    texture_info[slot] = mem.info;
    need_texture_info = true;

    texture_info[slot].data = uint64_t(slot) | (sampler_index << 32);
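
    /* Note: texture_info[slot].data packs two values into one 64-bit word: the low 32 bits
     * carry the texture slot (used to index the argument-buffer bindings), and the high 32 bits
     * carry sampler_index (extension mode, +4 when interpolating), which selects an entry from
     * the GPU's constant 'metal_samplers' array. */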

    if (max_working_set_exceeded()) {
      set_error("System is out of GPU memory");
    }
  }
}

void MetalDevice::tex_copy_to(device_texture &mem)
{
  if (mem.is_resident(this)) {
    const size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;

    if (mem.data_depth > 0) {
      id<MTLTexture> mtlTexture;
      {
        std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
        mtlTexture = metal_mem_map.at(&mem)->mtlTexture;
      }
      const size_t imageBytes = src_pitch * mem.data_height;
      for (size_t d = 0; d < mem.data_depth; d++) {
        const size_t offset = d * imageBytes;
        [mtlTexture replaceRegion:MTLRegionMake3D(0, 0, d, mem.data_width, mem.data_height, 1)
                      mipmapLevel:0
                            slice:0
                        withBytes:(uint8_t *)mem.host_pointer + offset
                      bytesPerRow:src_pitch
                    bytesPerImage:0];
      }
    }
    else if (mem.data_height > 0) {
      id<MTLTexture> mtlTexture;
      {
        std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
        mtlTexture = metal_mem_map.at(&mem)->mtlTexture;
      }
      [mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height)
                    mipmapLevel:0
                      withBytes:mem.host_pointer
                    bytesPerRow:src_pitch];
    }
    else {
      generic_copy_to(mem);
    }
  }
}

void MetalDevice::tex_free(device_texture &mem)
{
  if (mem.data_depth == 0 && mem.data_height == 0) {
    generic_free(mem);
    return;
  }

  if (metal_mem_map.count(&mem)) {
    std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
    MetalMem &mmem = *metal_mem_map.at(&mem);

    assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
    if (texture_slot_map[mem.slot] == mmem.mtlTexture) {
      texture_slot_map[mem.slot] = nil;
    }

    if (mmem.mtlTexture) {
      /* Free bindless texture. */
      delayed_free_list.push_back(mmem.mtlTexture);
      mmem.mtlTexture = nil;
    }
    erase_allocation(mem);
  }
}

unique_ptr<DeviceQueue> MetalDevice::gpu_queue_create()
{
  return make_unique<MetalDeviceQueue>(this);
}

bool MetalDevice::should_use_graphics_interop(const GraphicsInteropDevice &interop_device,
                                              const bool /*log*/)
{
  /* Always supported with unified memory. */
  return interop_device.type == GraphicsInteropDevice::METAL;
}

void *MetalDevice::get_native_buffer(device_ptr ptr)
{
  return ((MetalMem *)ptr)->mtlBuffer;
}

void MetalDevice::flush_delayed_free_list()
{
  /* Free any Metal buffers that may have been freed by the host while a command
   * buffer was being generated. This function should be called after each
   * completion of a command buffer. */
  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
  for (auto &it : delayed_free_list) {
    [it release];
  }
  delayed_free_list.clear();
}

void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
{
  @autoreleasepool {
    if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
      Device::build_bvh(bvh, progress, refit);
      return;
    }

    BVHMetal *bvh_metal = static_cast<BVHMetal *>(bvh);
    bvh_metal->motion_blur = motion_blur;
    bvh_metal->use_pcmi = use_pcmi;
    if (bvh_metal->build(progress, mtlDevice, mtlGeneralCommandQueue, refit)) {
      if (bvh->params.top_level) {
        update_bvh(bvh_metal);
      }
    }

    if (max_working_set_exceeded()) {
      set_error("System is out of GPU memory");
    }
  }
}

void MetalDevice::free_bvh()
{
  for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
    [blas release];
  }
  unique_blas_array.clear();

  if (blas_buffer) {
    [blas_buffer release];
    blas_buffer = nil;
  }

  if (accel_struct) {
    [accel_struct release];
    accel_struct = nil;
  }
}

void MetalDevice::update_bvh(BVHMetal *bvh_metal)
{
  free_bvh();

  if (!bvh_metal) {
    return;
  }

  accel_struct = bvh_metal->accel_struct;
  unique_blas_array = bvh_metal->unique_blas_array;

  [accel_struct retain];
  for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
    [blas retain];
  }

  /* Allocate required buffers for the BLAS array. */
  uint64_t count = bvh_metal->blas_array.size();
  uint64_t buffer_size = mtlBlasArgEncoder.encodedLength * count;
  blas_buffer = [mtlDevice newBufferWithLength:buffer_size options:MTLResourceStorageModeShared];
  stats.mem_alloc(blas_buffer.allocatedSize);

  for (uint64_t i = 0; i < count; ++i) {
    if (bvh_metal->blas_array[i]) {
      [mtlBlasArgEncoder setArgumentBuffer:blas_buffer offset:i * mtlBlasArgEncoder.encodedLength];
      [mtlBlasArgEncoder setAccelerationStructure:bvh_metal->blas_array[i] atIndex:0];
    }
  }
}
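
/* Note: the BLAS argument buffer above is written with a stride of
 * mtlBlasArgEncoder.encodedLength per entry, so a kernel can fetch the acceleration structure
 * for instance i by offsetting into blas_buffer with that same stride. */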

CCL_NAMESPACE_END

#endif