Blender V5.0
kernel.mm
/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#ifdef WITH_METAL

#  include <algorithm>
#  include <atomic>
#  include <chrono>
#  include <deque>
#  include <thread>
#  include <vector>

#  include "device/metal/kernel.h"

#  include "device/metal/device_impl.h"

#  include "util/debug.h"
#  include "util/md5.h"
#  include "util/path.h"
#  include "util/tbb.h"
#  include "util/time.h"
#  include "util/unique_ptr.h"

CCL_NAMESPACE_BEGIN

const char *kernel_type_as_string(MetalPipelineType pso_type)
{
  switch (pso_type) {
    case PSO_GENERIC:
      return "PSO_GENERIC";
    case PSO_SPECIALIZED_INTERSECT:
      return "PSO_SPECIALIZED_INTERSECT";
    case PSO_SPECIALIZED_SHADE:
      return "PSO_SPECIALIZED_SHADE";
    default:
      assert(0);
  }
  return "";
}

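/* Per-device shader cache. Compilation requests are queued and serviced by background compile
 * threads, and finished pipelines are retained per kernel and PSO type. */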
struct ShaderCache {
  ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
  {
    /* Initialize occupancy tuning LUT. */

    /* TODO: Look into tuning for DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT and
     * DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT. */

    switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
      default:
      case APPLE_M3:
        /* Peak occupancy is achieved through Dynamic Caching on M3 GPUs. */
        for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
          occupancy_tuning[i] = {64, 64};
        }
        break;
      case APPLE_M2_BIG:
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {384, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {640, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {1024, 64};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {704, 704};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {640, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {896, 768};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {32, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {768, 576};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {896, 768};
        break;
      case APPLE_M2:
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024};
        break;
      case APPLE_M1:
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832};
        break;
    }

    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
  }
  ~ShaderCache();

  /* Get the fastest available pipeline for the specified kernel. */
  MetalKernelPipeline *get_best_pipeline(DeviceKernel kernel, const MetalDevice *device);

  /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
   * device. */
  void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);

  bool should_load_kernel(DeviceKernel device_kernel,
                          const MetalDevice *device,
                          MetalPipelineType pso_type);

  void wait_for_all();

  friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice);

  void compile_thread_func();

  using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>;

  struct OccupancyTuningParameters {
    int threads_per_threadgroup = 0;
    int num_threads_per_block = 0;
  } occupancy_tuning[DEVICE_KERNEL_NUM];

  std::mutex cache_mutex;

  PipelineCollection pipelines[DEVICE_KERNEL_NUM];
  id<MTLDevice> mtlDevice;

  static bool running;
  std::condition_variable cond_var;
  std::deque<unique_ptr<MetalKernelPipeline>> request_queue;
  std::vector<std::thread> compile_threads;
  std::atomic_int incomplete_requests = 0;
  std::atomic_int incomplete_specialization_requests = 0;
};

bool ShaderCache::running = true;

const int MAX_POSSIBLE_GPUS_ON_SYSTEM = 8;
using DeviceShaderCache = std::pair<id<MTLDevice>, unique_ptr<ShaderCache>>;
int g_shaderCacheCount = 0;
DeviceShaderCache g_shaderCache[MAX_POSSIBLE_GPUS_ON_SYSTEM];

/* Next UID for associating a MetalDispatchPipeline with an originating MetalKernelPipeline. */
static std::atomic_int g_next_pipeline_id = 0;

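/* Return the ShaderCache associated with the given device, creating it on first use. Entries are
 * append-only, so the initial scan can run without taking the count mutex. */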
ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
{
  for (int i = 0; i < g_shaderCacheCount; i++) {
    if (g_shaderCache[i].first == mtlDevice) {
      return g_shaderCache[i].second.get();
    }
  }

  static thread_mutex g_shaderCacheCountMutex;
  g_shaderCacheCountMutex.lock();
  int index = g_shaderCacheCount++;
  g_shaderCacheCountMutex.unlock();

  assert(index < MAX_POSSIBLE_GPUS_ON_SYSTEM);
  g_shaderCache[index].first = mtlDevice;
  g_shaderCache[index].second = make_unique<ShaderCache>(mtlDevice);
  return g_shaderCache[index].second.get();
}

ShaderCache::~ShaderCache()
{
  running = false;
  cond_var.notify_all();

  metal_printf("Waiting for ShaderCache threads... (incomplete_requests = %d)",
               int(incomplete_requests));
  for (auto &thread : compile_threads) {
    thread.join();
  }
  metal_printf("ShaderCache shut down.");
}

void ShaderCache::wait_for_all()
{
  while (incomplete_requests > 0) {
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
}

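/* Worker loop for the compile threads: pop the next request off request_queue, compile it, and
 * publish the finished pipeline, purging the oldest cached variant of the same PSO type. */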
void ShaderCache::compile_thread_func()
{
  while (running) {

    /* wait for / acquire next request */
    unique_ptr<MetalKernelPipeline> pipeline;
    {
      thread_scoped_lock lock(cache_mutex);
      cond_var.wait(lock, [&] { return !running || !request_queue.empty(); });
      if (!running || request_queue.empty()) {
        continue;
      }

      pipeline = std::move(request_queue.front());
      request_queue.pop_front();
    }

    /* Service the request. */
    DeviceKernel device_kernel = pipeline->device_kernel;
    MetalPipelineType pso_type = pipeline->pso_type;

    if (MetalDevice::is_device_cancelled(pipeline->originating_device_id)) {
      /* The originating MetalDevice is no longer active, so this request is obsolete. */
      metal_printf("Cancelling compilation of %s (%s)",
                   device_kernel_as_string(device_kernel),
                   kernel_type_as_string(pso_type));
    }
    else {
      /* Do the actual compilation. */
      pipeline->compile();

      thread_scoped_lock lock(cache_mutex);
      auto &collection = pipelines[device_kernel];

      /* Cache up to 3 kernel variants with the same pso_type in memory, purging oldest first. */
      int max_entries_of_same_pso_type = 3;
      for (int i = (int)collection.size() - 1; i >= 0; i--) {
        if (collection[i]->pso_type == pso_type) {
          max_entries_of_same_pso_type -= 1;
          if (max_entries_of_same_pso_type == 0) {
            metal_printf("Purging oldest %s:%s kernel from ShaderCache",
                         kernel_type_as_string(pso_type),
                         device_kernel_as_string(device_kernel));
            collection.erase(collection.begin() + i);
            break;
          }
        }
      }
      collection.push_back(std::move(pipeline));
    }
    incomplete_requests--;
    if (pso_type != PSO_GENERIC) {
      incomplete_specialization_requests--;
    }
  }
}

bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
                                     const MetalDevice *device,
                                     MetalPipelineType pso_type)
{
  if (!running) {
    return false;
  }

  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
    /* Skip megakernel. */
    return false;
  }

  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
    if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) {
      /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */
      return false;
    }
  }

  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
    if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) {
      /* Skip shade_surface_mnee kernel if the scene doesn't require it. */
      return false;
    }
  }

  if (pso_type != PSO_GENERIC) {
    /* Only specialize kernels where it can make an impact. */
    if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
        device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL)
    {
      return false;
    }

    /* Only specialize shading / intersection kernels as requested. */
    bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
    bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
    if (is_shade_pso != is_shade_kernel) {
      return false;
    }
  }

  {
    /* check whether the kernel has already been requested / cached */
    thread_scoped_lock lock(cache_mutex);
    for (auto &pipeline : pipelines[device_kernel]) {
      if (pipeline->kernels_md5 == device->kernels_md5[pso_type]) {
        return false;
      }
    }
  }

  return true;
}

void ShaderCache::load_kernel(DeviceKernel device_kernel,
                              MetalDevice *device,
                              MetalPipelineType pso_type)
{
  {
    /* create compiler threads on first run */
    thread_scoped_lock lock(cache_mutex);
    if (compile_threads.empty()) {
      /* Limit to 2 MTLCompiler instances by default. In macOS >= 13.3 we can query the upper
       * limit. */
      int max_mtlcompiler_threads = 2;

#  if defined(MAC_OS_VERSION_13_3)
      if (@available(macOS 13.3, *)) {
        /* Subtract one to avoid contention with the real-time GPU module. */
        max_mtlcompiler_threads = max(2,
                                      int([mtlDevice maximumConcurrentCompilationTaskCount]) - 1);
      }
#  endif

      metal_printf("Spawning %d Cycles kernel compilation threads", max_mtlcompiler_threads);
      for (int i = 0; i < max_mtlcompiler_threads; i++) {
        compile_threads.emplace_back([this] { this->compile_thread_func(); });
      }
    }
  }

  if (!should_load_kernel(device_kernel, device, pso_type)) {
    return;
  }

  incomplete_requests++;
  if (pso_type != PSO_GENERIC) {
    incomplete_specialization_requests++;
  }

  unique_ptr<MetalKernelPipeline> pipeline = make_unique<MetalKernelPipeline>();

  /* Keep track of the originating device's ID so that we can cancel requests if the device ceases
   * to be active. */
  pipeline->pipeline_id = g_next_pipeline_id.fetch_add(1);
  pipeline->originating_device_id = device->device_id;
  pipeline->kernel_data_ = device->launch_params->data;
  pipeline->pso_type = pso_type;
  pipeline->mtlDevice = mtlDevice;
  pipeline->kernels_md5 = device->kernels_md5[pso_type];
  pipeline->mtlLibrary = device->mtlLibrary[pso_type];
  pipeline->device_kernel = device_kernel;
  pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;

  if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
    pipeline->threads_per_threadgroup = occupancy_tuning[device_kernel].threads_per_threadgroup;
    pipeline->num_threads_per_block = occupancy_tuning[device_kernel].num_threads_per_block;
  }

  /* metalrt options */
  pipeline->use_metalrt = device->use_metalrt;
  pipeline->kernel_features = device->kernel_features;

  {
    thread_scoped_lock lock(cache_mutex);
    request_queue.push_back(std::move(pipeline));
  }
  cond_var.notify_one();
}

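/* Return the most specialized loaded pipeline whose checksum matches the device's kernel source,
 * polling until one becomes available or the cache / device shuts down. */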
MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device)
{
  while (running && !device->has_error) {
    /* Search all loaded pipelines with matching kernels_md5 checksums. */
    MetalKernelPipeline *best_match = nullptr;
    {
      thread_scoped_lock lock(cache_mutex);
      for (auto &candidate : pipelines[kernel]) {
        if (candidate->loaded &&
            candidate->kernels_md5 == device->kernels_md5[candidate->pso_type])
        {
          /* Replace existing match if candidate is more specialized. */
          if (!best_match || candidate->pso_type > best_match->pso_type) {
            best_match = candidate.get();
          }
        }
      }
    }

    if (best_match) {
      if (best_match->usage_count == 0 && best_match->pso_type != PSO_GENERIC) {
        metal_printf("Swapping in %s version of %s",
                     kernel_type_as_string(best_match->pso_type),
                     device_kernel_as_string(kernel));
      }
      best_match->usage_count += 1;
      return best_match;
    }

    /* Spin until a matching kernel is loaded, or we're shutting down. */
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
  return nullptr;
}

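/* Decide whether this pipeline is worth serializing to a MTLBinaryArchive on disk. Only kernels
 * that are slow to compile (the generic and shade kernels) are archived. */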
bool MetalKernelPipeline::should_use_binary_archive() const
{
  /* Issues with binary archives in older macOS versions. */
  if (@available(macOS 15.4, *)) {
    if (auto *str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
      if (atoi(str) != 0) {
        /* Don't archive if we have opted out by env var. */
        return false;
      }
    }

    if (use_metalrt && device_kernel_has_intersection(device_kernel)) {
      /* Binary linked functions aren't supported in binary archives. */
      return false;
    }

    if (pso_type == PSO_GENERIC) {
      /* Archive the generic kernels. */
      return true;
    }

    if ((device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
         device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) ||
        (device_kernel >= DEVICE_KERNEL_SHADER_EVAL_DISPLACE &&
         device_kernel <= DEVICE_KERNEL_SHADER_EVAL_VOLUME_DENSITY))
    {
      /* Archive all shade kernels - they take a long time to compile. */
      return true;
    }

    /* The remaining kernels are all fast to compile. They may get cached by the system shader
     * cache, but will be quick to regenerate if not. */
  }
  return false;
}

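/* Build the MTLFunctionConstantValues used to specialize a pipeline. When no KernelData is given
 * (the generic case), zeroed values are bound for every member. */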
static MTLFunctionConstantValues *GetConstantValues(const KernelData *data = nullptr)
{
  MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new];

  MTLDataType MTLDataType_int = MTLDataTypeInt;
  MTLDataType MTLDataType_float = MTLDataTypeFloat;
  MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
  KernelData zero_data = {0};
  if (!data) {
    data = &zero_data;
  }
  [constant_values setConstantValue:&zero_data type:MTLDataType_int atIndex:Kernel_DummyConstant];

  bool next_member_is_specialized = true;

#  define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;

#  define KERNEL_STRUCT_MEMBER(parent, _type, name) \
    [constant_values setConstantValue:next_member_is_specialized ? (void *)&data->parent.name : \
                                      (void *)&zero_data \
                                 type:MTLDataType_##_type \
                              atIndex:KernelData_##parent##_##name]; \
    next_member_is_specialized = true;

#  include "kernel/data_template.h"

  [constant_values setConstantValue:&data->kernel_features
                               type:MTLDataTypeInt
                            atIndex:KernelData_kernel_features];

  return constant_values;
}

void MetalDispatchPipeline::free_intersection_function_tables()
{
  for (int table = 0; table < METALRT_TABLE_NUM; table++) {
    if (intersection_func_table[table]) {
      [intersection_func_table[table] release];
      intersection_func_table[table] = nil;
    }
  }
}

MetalDispatchPipeline::~MetalDispatchPipeline()
{
  free_intersection_function_tables();
}

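/* Adopt the best available pipeline for the given kernel. If a different pipeline is swapped in,
 * the MTLIntersectionFunctionTables are rebuilt from its table functions. */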
bool MetalDispatchPipeline::update(MetalDevice *metal_device, DeviceKernel kernel)
{
  const MetalKernelPipeline *best_pipeline = MetalDeviceKernels::get_best_pipeline(metal_device,
                                                                                   kernel);
  if (!best_pipeline) {
    return false;
  }

  if (pipeline_id == best_pipeline->pipeline_id) {
    /* The best pipeline is already active - nothing to do. */
    return true;
  }
  pipeline_id = best_pipeline->pipeline_id;
  pipeline = best_pipeline->pipeline;
  pso_type = best_pipeline->pso_type;
  num_threads_per_block = best_pipeline->num_threads_per_block;

  /* Create the MTLIntersectionFunctionTables if needed. */
  if (best_pipeline->use_metalrt && device_kernel_has_intersection(best_pipeline->device_kernel)) {
    free_intersection_function_tables();

    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
      @autoreleasepool {
        MTLIntersectionFunctionTableDescriptor *ift_desc =
            [[MTLIntersectionFunctionTableDescriptor alloc] init];
        ift_desc.functionCount = best_pipeline->table_functions[table].count;
        intersection_func_table[table] = [this->pipeline
            newIntersectionFunctionTableWithDescriptor:ift_desc];

        /* Finally write the function handles into this pipeline's table */
        int size = int([best_pipeline->table_functions[table] count]);
        for (int i = 0; i < size; i++) {
          id<MTLFunctionHandle> handle = [pipeline
              functionHandleWithFunction:best_pipeline->table_functions[table][i]];
          [intersection_func_table[table] setFunction:handle atIndex:i];
        }
      }
    }
  }

  return true;
}

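/* Create a named MetalRT intersection function, specialized with the same function constants as
 * the parent kernel. Failures are recorded in error_str. */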
id<MTLFunction> MetalKernelPipeline::make_intersection_function(const char *function_name)
{
  MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
  desc.name = [@(function_name) copy];

  if (pso_type != PSO_GENERIC) {
    desc.constantValues = GetConstantValues(&kernel_data_);
  }
  else {
    desc.constantValues = GetConstantValues();
  }

  NSError *error = nullptr;
  id<MTLFunction> rt_intersection_function = [mtlLibrary newFunctionWithDescriptor:desc
                                                                             error:&error];

  if (rt_intersection_function == nil) {
    NSString *err = [error localizedDescription];
    string errors = [err UTF8String];

    error_str = string_printf(
        "Error getting intersection function \"%s\": %s", function_name, errors.c_str());
  }
  else {
    rt_intersection_function.label = [@(function_name) copy];
  }
  return rt_intersection_function;
}

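/* Compile this request into a MTLComputePipelineState: build the (optionally specialized) kernel
 * function, link the MetalRT intersection functions when required, and load from or serialize to
 * a binary archive when should_use_binary_archive() allows it. */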
void MetalKernelPipeline::compile()
{
  const std::string function_name = std::string("cycles_metal_") +
                                    device_kernel_as_string(device_kernel);

  NSError *error = nullptr;

  MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
  func_desc.name = [@(function_name.c_str()) copy];

  if (pso_type != PSO_GENERIC) {
    func_desc.constantValues = GetConstantValues(&kernel_data_);
  }
  else {
    func_desc.constantValues = GetConstantValues();
  }

  function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];

  if (function == nil) {
    NSString *err = [error localizedDescription];
    string errors = [err UTF8String];
    metal_printf("Error getting function \"%s\": %s", function_name.c_str(), errors.c_str());
    return;
  }

  function.label = [@(function_name.c_str()) copy];

  NSArray *linked_functions = nil;

  if (use_metalrt && device_kernel_has_intersection(device_kernel)) {

    NSMutableSet *unique_functions = [[NSMutableSet alloc] init];

    auto add_intersection_functions = [&](int table_index,
                                          const char *tri_fn,
                                          const char *curve_fn = nullptr,
                                          const char *point_fn = nullptr) {
      table_functions[table_index] = [NSArray
          arrayWithObjects:make_intersection_function(tri_fn),
                           curve_fn ? make_intersection_function(curve_fn) : nil,
                           point_fn ? make_intersection_function(point_fn) : nil,
                           nil];

      [unique_functions addObjectsFromArray:table_functions[table_index]];
    };

    add_intersection_functions(METALRT_TABLE_DEFAULT,
                               "__intersection__tri",
                               "__intersection__curve",
                               "__intersection__point");
    add_intersection_functions(METALRT_TABLE_SHADOW,
                               "__intersection__tri_shadow",
                               "__intersection__curve_shadow",
                               "__intersection__point_shadow");
    add_intersection_functions(METALRT_TABLE_SHADOW_ALL,
                               "__intersection__tri_shadow_all",
                               "__intersection__curve_shadow_all",
                               "__intersection__point_shadow_all");
    add_intersection_functions(METALRT_TABLE_VOLUME, "__intersection__volume_tri");
    add_intersection_functions(METALRT_TABLE_LOCAL, "__intersection__local_tri");
    add_intersection_functions(METALRT_TABLE_LOCAL_MBLUR, "__intersection__local_tri_mblur");
    add_intersection_functions(METALRT_TABLE_LOCAL_SINGLE_HIT,
                               "__intersection__local_tri_single_hit");
    add_intersection_functions(METALRT_TABLE_LOCAL_SINGLE_HIT_MBLUR,
                               "__intersection__local_tri_single_hit_mblur");

    linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
        sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
          return [f1.label compare:f2.label];
        }];
    unique_functions = nil;
  }

  MTLComputePipelineDescriptor *computePipelineStateDescriptor =
      [[MTLComputePipelineDescriptor alloc] init];

  computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
  computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
  computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;

  computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = threads_per_threadgroup;
  computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;

  computePipelineStateDescriptor.computeFunction = function;

  /* Attach the additional functions to an MTLLinkedFunctions object */
  if (linked_functions) {
    computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init];
    computePipelineStateDescriptor.linkedFunctions.functions = linked_functions;
  }
  computePipelineStateDescriptor.maxCallStackDepth = 1;
  if (use_metalrt && device_kernel_has_intersection(device_kernel)) {
    computePipelineStateDescriptor.maxCallStackDepth = 2;
  }

  MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;

  bool use_binary_archive = should_use_binary_archive();
  bool loading_existing_archive = false;
  bool creating_new_archive = false;

  id<MTLBinaryArchive> archive = nil;
  string metalbin_path;
  string metalbin_name;
  if (use_binary_archive) {
    NSProcessInfo *processInfo = [NSProcessInfo processInfo];
    string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
    MD5Hash local_md5;
    local_md5.append(kernels_md5);
    local_md5.append(osVersion);
    local_md5.append((uint8_t *)&this->threads_per_threadgroup,
                     sizeof(this->threads_per_threadgroup));

    /* Replace non-alphanumerical characters with underscores. */
    string device_name = [mtlDevice.name UTF8String];
    for (char &c : device_name) {
      if ((c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
        c = '_';
      }
    }

    metalbin_name = device_name;
    metalbin_name = path_join(metalbin_name, device_kernel_as_string(device_kernel));
    metalbin_name = path_join(metalbin_name, kernel_type_as_string(pso_type));
    metalbin_name = path_join(metalbin_name, local_md5.get_hex() + ".bin");

    metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
    path_create_directories(metalbin_path);

    /* Check if shader binary exists on disk, and if so, update the file timestamp for LRU purging
     * to work as intended. */
    loading_existing_archive = path_cache_kernel_exists_and_mark_used(metalbin_path);
    creating_new_archive = !loading_existing_archive;

    MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
    if (loading_existing_archive) {
      archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
    }
    NSError *error = nil;
    archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:&error];
    if (!archive) {
      const char *err = error ? [[error localizedDescription] UTF8String] : nullptr;
      metal_printf("newBinaryArchiveWithDescriptor failed: %s", err ? err : "nil");
    }
    [archiveDesc release];

    if (loading_existing_archive) {
      pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
      computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
    }
  }

  bool recreate_archive = false;

  /* Lambda to do the actual pipeline compilation. */
  auto do_compilation = [&]() {
    __block bool compilation_finished = false;
    __block string error_str;

    if (loading_existing_archive || !DebugFlags().metal.use_async_pso_creation) {
      /* Use the blocking variant of newComputePipelineStateWithDescriptor if an archive exists on
       * disk. It should load almost instantaneously, and will fail gracefully when loading a
       * corrupt archive (unlike the async variant). */
      NSError *error = nil;
      pipeline = [mtlDevice newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
                                                          options:pipelineOptions
                                                       reflection:nullptr
                                                            error:&error];
      const char *err = error ? [[error localizedDescription] UTF8String] : nullptr;
      error_str = err ? err : "nil";
    }
    else {
      /* Use the async variant of newComputePipelineStateWithDescriptor if no archive exists on
       * disk. This allows us to respond to app shutdown. */
      [mtlDevice
          newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
                                        options:pipelineOptions
                              completionHandler:^(id<MTLComputePipelineState> computePipelineState,
                                                  MTLComputePipelineReflection * /*reflection*/,
                                                  NSError *error) {
                                pipeline = computePipelineState;

                                /* Retain the pipeline so we can use it safely past the completion
                                 * handler. */
                                if (pipeline) {
                                  [pipeline retain];
                                }
                                const char *err = error ?
                                                      [[error localizedDescription] UTF8String] :
                                                      nullptr;
                                error_str = err ? err : "nil";

                                compilation_finished = true;
                              }];

      /* Immediately wait for either the compilation to finish or for app shutdown. */
      while (ShaderCache::running && !compilation_finished) {
        std::this_thread::sleep_for(std::chrono::milliseconds(5));
      }
    }

    if (creating_new_archive && pipeline) {
      /* Add pipeline into the new archive. */
      NSError *error;
      if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
                                                        error:&error])
      {
        NSString *errStr = [error localizedDescription];
        metal_printf("Failed to add PSO to archive:\n%s", errStr ? [errStr UTF8String] : "nil");
      }
    }

    if (!pipeline) {
      metal_printf(
          "newComputePipelineStateWithDescriptor failed for \"%s\"%s. "
          "Error:\n%s\n",
          device_kernel_as_string(device_kernel),
          (archive && !recreate_archive) ? " Archive may be incomplete or corrupt - attempting "
                                           "recreation.." :
                                           "",
          error_str.c_str());
    }
  };

  double starttime = time_dt();

  do_compilation();

  /* An archive might have a corrupt entry and fail to materialize the pipeline. This shouldn't
   * happen, but if it does we recreate it. */
  if (pipeline == nil && archive) {
    recreate_archive = true;
    pipelineOptions = MTLPipelineOptionNone;
    path_remove(metalbin_path);

    do_compilation();
  }

  double duration = time_dt() - starttime;

  if (pipeline == nil) {
    metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!",
                 kernel_type_as_string(pso_type),
                 device_kernel,
                 device_kernel_as_string(device_kernel),
                 duration);
    return;
  }

  if (!num_threads_per_block) {
    num_threads_per_block = round_down(pipeline.maxTotalThreadsPerThreadgroup,
                                       pipeline.threadExecutionWidth);
    num_threads_per_block = std::max(num_threads_per_block, (int)pipeline.threadExecutionWidth);
  }

  if (ShaderCache::running) {
    if (creating_new_archive || recreate_archive) {
      if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())] error:&error])
      {
        metal_printf("Failed to save binary archive to %s, error:\n%s",
                     metalbin_path.c_str(),
                     [[error localizedDescription] UTF8String]);
      }
      else {
        path_cache_kernel_mark_added_and_clear_old(metalbin_path);
      }
    }
  }

  this->loaded = true;
  [computePipelineStateDescriptor release];
  computePipelineStateDescriptor = nil;

  if (!use_binary_archive) {
    metal_printf("%16s | %2d | %-55s | %7.2fs",
                 kernel_type_as_string(pso_type),
                 int(device_kernel),
                 device_kernel_as_string(device_kernel),
                 duration);
  }
  else {
    metal_printf("%16s | %2d | %-55s | %7.2fs | %s: %s",
                 kernel_type_as_string(pso_type),
                 device_kernel,
                 device_kernel_as_string(device_kernel),
                 duration,
                 creating_new_archive ? " new" : "load",
                 metalbin_name.c_str());
  }
}

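/* Queue loading of all kernels for the given device. Requests are serviced asynchronously by the
 * ShaderCache's compile threads. */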
bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
{
  auto *shader_cache = get_shader_cache(device->mtlDevice);
  for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
    shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
  }
  return true;
}

void MetalDeviceKernels::wait_for_all()
{
  for (int i = 0; i < g_shaderCacheCount; i++) {
    g_shaderCache[i].second->wait_for_all();
  }
}

int MetalDeviceKernels::num_incomplete_specialization_requests()
{
  /* Return the total number of incomplete specialization requests across all ShaderCaches
   * (typically there will be only 1 cache). */
  int total = 0;
  for (int i = 0; i < g_shaderCacheCount; i++) {
    total += g_shaderCache[i].second->incomplete_specialization_requests;
  }
  return total;
}

int MetalDeviceKernels::get_loaded_kernel_count(const MetalDevice *device,
                                                MetalPipelineType pso_type)
{
  auto *shader_cache = get_shader_cache(device->mtlDevice);
  int loaded_count = DEVICE_KERNEL_NUM;
  for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
    if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
      loaded_count -= 1;
    }
  }
  return loaded_count;
}

bool MetalDeviceKernels::should_load_kernels(const MetalDevice *device, MetalPipelineType pso_type)
{
  return get_loaded_kernel_count(device, pso_type) != DEVICE_KERNEL_NUM;
}

const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
                                                                 DeviceKernel kernel)
{
  return get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device);
}

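/* Detect a benchmark warm-up pass by scanning the process arguments for "--warm-up". */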
bool MetalDeviceKernels::is_benchmark_warmup()
{
  NSArray *args = [[NSProcessInfo processInfo] arguments];
  for (int i = 0; i < args.count; i++) {
    if (const char *arg = [[args objectAtIndex:i] cStringUsingEncoding:NSASCIIStringEncoding]) {
      if (!strcmp(arg, "--warm-up")) {
        return true;
      }
    }
  }
  return false;
}

void MetalDeviceKernels::static_deinitialize()
{
  for (int i = 0; i < g_shaderCacheCount; i++) {
    g_shaderCache[i] = DeviceShaderCache();
  }
}

CCL_NAMESPACE_END

#endif /* WITH_METAL */