Blender V4.3
kernel.mm
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#ifdef WITH_METAL
6
7# include "device/metal/kernel.h"
10# include "util/md5.h"
11# include "util/path.h"
12# include "util/tbb.h"
13# include "util/time.h"
14# include "util/unique_ptr.h"
15
17
/* Return a human-readable name for the given pipeline specialization type. */
const char *kernel_type_as_string(MetalPipelineType pso_type)
{
  if (pso_type == PSO_GENERIC) {
    return "PSO_GENERIC";
  }
  if (pso_type == PSO_SPECIALIZED_INTERSECT) {
    return "PSO_SPECIALIZED_INTERSECT";
  }
  if (pso_type == PSO_SPECIALIZED_SHADE) {
    return "PSO_SPECIALIZED_SHADE";
  }
  /* Unknown enum value: programmer error. */
  assert(0);
  return "";
}
32
/* Per-GPU cache of compiled Metal kernel pipelines.
 *
 * Compilation requests are queued via load_kernel() and serviced asynchronously
 * by a pool of compile threads (compile_thread_func); finished pipelines are
 * published into `pipelines`, where get_best_pipeline() can find them. */
struct ShaderCache {
  ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
  {
    /* Initialize occupancy tuning LUT. */

    // TODO: Look into tuning for DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT and
    // DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT.

    /* Per-architecture {threads_per_threadgroup, num_threads_per_block} overrides.
     * Kernels not listed here keep zeroed entries and fall back to device defaults
     * (see load_kernel). */
    switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
      default:
      case APPLE_M3:
        /* Peak occupancy is achieved through Dynamic Caching on M3 GPUs. */
        for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
          occupancy_tuning[i] = {64, 64};
        }
        break;
      case APPLE_M2_BIG:
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {384, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {640, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {1024, 64};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {704, 704};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {640, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {896, 768};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {32, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {768, 576};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {896, 768};
        break;
      case APPLE_M2:
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024};
        break;
      case APPLE_M1:
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832};
        break;
    }

    /* Sort passes use the same tuning on all architectures. */
    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
  }
  ~ShaderCache();

  /* Get the fastest available pipeline for the specified kernel. */
  MetalKernelPipeline *get_best_pipeline(DeviceKernel kernel, const MetalDevice *device);

  /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
   * device. */
  void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);

  /* True if the kernel is relevant for this scene/device and not yet requested or cached. */
  bool should_load_kernel(DeviceKernel device_kernel,
                          MetalDevice const *device,
                          MetalPipelineType pso_type);

  /* Block until every queued compilation request has completed. */
  void wait_for_all();

  friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice);

  /* Worker loop run by each thread in `compile_threads`. */
  void compile_thread_func();

  using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>;

  /* Per-kernel dispatch-size overrides; zeroed entries mean "use device default". */
  struct OccupancyTuningParameters {
    int threads_per_threadgroup = 0;
    int num_threads_per_block = 0;
  } occupancy_tuning[DEVICE_KERNEL_NUM];

  /* Guards `pipelines` and `request_queue`. */
  std::mutex cache_mutex;

  PipelineCollection pipelines[DEVICE_KERNEL_NUM];
  id<MTLDevice> mtlDevice;

  /* Static shutdown flag shared by all caches; cleared by the destructor so the
   * compile threads (and async compilation waits) can exit. */
  static bool running;
  std::condition_variable cond_var;
  std::deque<MetalKernelPipeline *> request_queue;
  std::vector<std::thread> compile_threads;
  /* Outstanding request counters, used by wait_for_all() and the progress UI. */
  std::atomic_int incomplete_requests = 0;
  std::atomic_int incomplete_specialization_requests = 0;
};
128
bool ShaderCache::running = true;

/* Fixed-size registry mapping each MTLDevice to its ShaderCache. */
const int MAX_POSSIBLE_GPUS_ON_SYSTEM = 8;
using DeviceShaderCache = std::pair<id<MTLDevice>, unique_ptr<ShaderCache>>;
int g_shaderCacheCount = 0;
DeviceShaderCache g_shaderCache[MAX_POSSIBLE_GPUS_ON_SYSTEM];

/* Next UID for associating a MetalDispatchPipeline with an originating MetalKernelPipeline. */
static std::atomic_int g_next_pipeline_id = 0;
138
/* Return the ShaderCache associated with the given Metal device, creating it on
 * first use. There is one cache per GPU, up to MAX_POSSIBLE_GPUS_ON_SYSTEM. */
ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
{
  /* Fast path: linear scan of the small fixed-size registry. */
  for (int i = 0; i < g_shaderCacheCount; i++) {
    if (g_shaderCache[i].first == mtlDevice) {
      return g_shaderCache[i].second.get();
    }
  }

  /* Reserve a slot index under the mutex. */
  static thread_mutex g_shaderCacheCountMutex;
  g_shaderCacheCountMutex.lock();
  int index = g_shaderCacheCount++;
  g_shaderCacheCountMutex.unlock();

  /* NOTE(review): only the counter increment is serialized; the unlocked scan
   * above and the slot writes below appear to assume caches are created during
   * device initialization without racing lookups — confirm against callers. */
  assert(index < MAX_POSSIBLE_GPUS_ON_SYSTEM);
  g_shaderCache[index].first = mtlDevice;
  g_shaderCache[index].second = make_unique<ShaderCache>(mtlDevice);
  return g_shaderCache[index].second.get();
}
157
/* Shut down the compile thread pool: clear the static `running` flag so worker
 * loops exit, wake any threads waiting on the request queue, then join them. */
ShaderCache::~ShaderCache()
{
  running = false;
  cond_var.notify_all();

  metal_printf("Waiting for ShaderCache threads... (incomplete_requests = %d)\n",
               int(incomplete_requests));
  for (auto &thread : compile_threads) {
    thread.join();
  }
  metal_printf("ShaderCache shut down.\n");
}
170
/* Block the calling thread until every queued compilation request has been
 * serviced (polls the outstanding-request counter at 100ms intervals). */
void ShaderCache::wait_for_all()
{
  for (;;) {
    if (incomplete_requests <= 0) {
      break;
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
}
177
/* Worker loop run by each compile thread: pops requests off `request_queue`,
 * compiles them, and publishes finished pipelines into `pipelines`. Exits when
 * the static `running` flag is cleared by ~ShaderCache(). */
void ShaderCache::compile_thread_func()
{
  while (running) {

    /* wait for / acquire next request */
    MetalKernelPipeline *pipeline;
    {
      thread_scoped_lock lock(cache_mutex);
      cond_var.wait(lock, [&] { return !running || !request_queue.empty(); });
      if (!running || request_queue.empty()) {
        continue;
      }

      pipeline = request_queue.front();
      request_queue.pop_front();
    }

    /* Service the request. */
    DeviceKernel device_kernel = pipeline->device_kernel;
    MetalPipelineType pso_type = pipeline->pso_type;

    if (MetalDevice::is_device_cancelled(pipeline->originating_device_id)) {
      /* The originating MetalDevice is no longer active, so this request is obsolete. */
      metal_printf("Cancelling compilation of %s (%s)\n",
                   device_kernel_as_string(device_kernel),
                   kernel_type_as_string(pso_type));
      /* This thread took ownership when popping the request; the pipeline will
       * never enter the cache, so free it here to avoid leaking it. */
      delete pipeline;
    }
    else {
      /* Do the actual compilation. */
      pipeline->compile();

      thread_scoped_lock lock(cache_mutex);
      auto &collection = pipelines[device_kernel];

      /* Cache up to 3 kernel variants with the same pso_type in memory, purging oldest first. */
      int max_entries_of_same_pso_type = 3;
      for (int i = (int)collection.size() - 1; i >= 0; i--) {
        if (collection[i]->pso_type == pso_type) {
          max_entries_of_same_pso_type -= 1;
          if (max_entries_of_same_pso_type == 0) {
            metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
                         kernel_type_as_string(pso_type),
                         device_kernel_as_string(device_kernel));
            collection.erase(collection.begin() + i);
            break;
          }
        }
      }
      /* Publish the finished pipeline; the cache now owns it. */
      collection.push_back(unique_ptr<MetalKernelPipeline>(pipeline));
    }
    incomplete_requests--;
    if (pso_type != PSO_GENERIC) {
      incomplete_specialization_requests--;
    }
  }
}
234
/* Return true if `device_kernel` should be compiled for this scene/device and
 * specialization type — i.e. it is relevant (feature flags), specializable (for
 * non-generic PSO types) and not already requested or cached.
 *
 * Note: the extraction of this file had dropped the SHADE_SURFACE_RAYTRACE guard
 * and the upper bound of the specialization range check; both are restored. */
bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
                                     MetalDevice const *device,
                                     MetalPipelineType pso_type)
{
  if (!running) {
    return false;
  }

  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
    /* Skip megakernel. */
    return false;
  }

  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
    if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) {
      /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */
      return false;
    }
  }

  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
    if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) {
      /* Skip shade_surface_mnee kernel if the scene doesn't require it. */
      return false;
    }
  }

  if (pso_type != PSO_GENERIC) {
    /* Only specialize kernels where it can make an impact. */
    if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
        device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL)
    {
      return false;
    }

    /* Only specialize shading / intersection kernels as requested. */
    bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
    bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
    if (is_shade_pso != is_shade_kernel) {
      return false;
    }
  }

  {
    /* check whether the kernel has already been requested / cached */
    thread_scoped_lock lock(cache_mutex);
    for (auto &pipeline : pipelines[device_kernel]) {
      if (pipeline->kernels_md5 == device->kernels_md5[pso_type]) {
        return false;
      }
    }
  }

  return true;
}
290
/* Queue an asynchronous compilation request for `device_kernel`, optionally
 * specialized to the scene currently rendered by `device`. Spawns the compile
 * thread pool on first use; no-op if should_load_kernel() says the kernel is
 * unneeded or already requested/cached. */
void ShaderCache::load_kernel(DeviceKernel device_kernel,
                              MetalDevice *device,
                              MetalPipelineType pso_type)
{
  {
    /* create compiler threads on first run */
    thread_scoped_lock lock(cache_mutex);
    if (compile_threads.empty()) {
      /* Limit to 2 MTLCompiler instances by default. In macOS >= 13.3 we can query the upper
       * limit. */
      int max_mtlcompiler_threads = 2;

# if defined(MAC_OS_VERSION_13_3)
      if (@available(macOS 13.3, *)) {
        /* Subtract one to avoid contention with the real-time GPU module. */
        max_mtlcompiler_threads = max(2,
                                      int([mtlDevice maximumConcurrentCompilationTaskCount]) - 1);
      }
# endif

      metal_printf("Spawning %d Cycles kernel compilation threads\n", max_mtlcompiler_threads);
      for (int i = 0; i < max_mtlcompiler_threads; i++) {
        compile_threads.push_back(std::thread([this] { this->compile_thread_func(); }));
      }
    }
  }

  if (!should_load_kernel(device_kernel, device, pso_type)) {
    return;
  }

  /* Count the request before queuing so wait_for_all() observes it immediately. */
  incomplete_requests++;
  if (pso_type != PSO_GENERIC) {
    incomplete_specialization_requests++;
  }

  /* Ownership passes to the request queue; a compile thread either publishes the
   * pipeline into the cache or disposes of it on cancellation. */
  MetalKernelPipeline *pipeline = new MetalKernelPipeline;

  /* Keep track of the originating device's ID so that we can cancel requests if the device ceases
   * to be active. */
  pipeline->pipeline_id = g_next_pipeline_id.fetch_add(1);
  pipeline->originating_device_id = device->device_id;
  memcpy(&pipeline->kernel_data_, &device->launch_params.data, sizeof(pipeline->kernel_data_));
  pipeline->pso_type = pso_type;
  pipeline->mtlDevice = mtlDevice;
  pipeline->kernels_md5 = device->kernels_md5[pso_type];
  pipeline->mtlLibrary = device->mtlLibrary[pso_type];
  pipeline->device_kernel = device_kernel;
  pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;

  /* Apply per-kernel occupancy tuning overrides where present (non-zero). */
  if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
    pipeline->threads_per_threadgroup = occupancy_tuning[device_kernel].threads_per_threadgroup;
    pipeline->num_threads_per_block = occupancy_tuning[device_kernel].num_threads_per_block;
  }

  /* metalrt options */
  pipeline->use_metalrt = device->use_metalrt;
  pipeline->kernel_features = device->kernel_features;

  {
    thread_scoped_lock lock(cache_mutex);
    request_queue.push_back(pipeline);
  }
  cond_var.notify_one();
}
356
/* Return the most specialized loaded pipeline matching the scene's kernels_md5,
 * blocking (with a 100ms polling sleep) until one is available, the cache shuts
 * down, or the device reports an error (then returns nullptr).
 *
 * Note: the extraction had dropped the third metal_printf argument line
 * (`device_kernel_as_string(kernel))`), leaving the call unbalanced; restored. */
MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device)
{
  while (running && !device->has_error) {
    /* Search all loaded pipelines with matching kernels_md5 checksums. */
    MetalKernelPipeline *best_match = nullptr;
    {
      thread_scoped_lock lock(cache_mutex);
      for (auto &candidate : pipelines[kernel]) {
        if (candidate->loaded &&
            candidate->kernels_md5 == device->kernels_md5[candidate->pso_type])
        {
          /* Replace existing match if candidate is more specialized. */
          if (!best_match || candidate->pso_type > best_match->pso_type) {
            best_match = candidate.get();
          }
        }
      }
    }

    if (best_match) {
      if (best_match->usage_count == 0 && best_match->pso_type != PSO_GENERIC) {
        metal_printf("Swapping in %s version of %s\n",
                     kernel_type_as_string(best_match->pso_type),
                     device_kernel_as_string(kernel));
      }
      best_match->usage_count += 1;
      return best_match;
    }

    /* Spin until a matching kernel is loaded, or we're shutting down. */
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
  return nullptr;
}
391
/* Decide whether this pipeline should be loaded from / saved to an on-disk
 * MTLBinaryArchive. Only kernels that are slow to compile are worth archiving.
 *
 * Note: the extraction had dropped the upper bound of the shader-eval range
 * check (`device_kernel <= DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY`),
 * leaving the condition unbalanced; restored. */
bool MetalKernelPipeline::should_use_binary_archive() const
{
  /* Issues with binary archives in older macOS versions. */
  if (@available(macOS 13.0, *)) {
    if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
      if (atoi(str) != 0) {
        /* Don't archive if we have opted out by env var. */
        return false;
      }
    }

    if (use_metalrt && device_kernel_has_intersection(device_kernel)) {
      /* Binary linked functions aren't supported in binary archives. */
      return false;
    }

    if (pso_type == PSO_GENERIC) {
      /* Archive the generic kernels. */
      return true;
    }

    if ((device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
         device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) ||
        (device_kernel >= DEVICE_KERNEL_SHADER_EVAL_DISPLACE &&
         device_kernel <= DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY))
    {
      /* Archive all shade kernels - they take a long time to compile. */
      return true;
    }

    /* The remaining kernels are all fast to compile. They may get cached by the system shader
     * cache, but will be quick to regenerate if not. */
  }
  return false;
}
427
/* Build the MTLFunctionConstantValues used to specialize kernel compilation.
 * When `data` is null (generic PSOs), every constant is bound to zeroed data. */
static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nullptr)
{
  MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new];

  /* Local aliases named so the KERNEL_STRUCT_MEMBER macro below can paste
   * "MTLDataType_" with the member's declared type token. */
  MTLDataType MTLDataType_int = MTLDataTypeInt;
  MTLDataType MTLDataType_float = MTLDataTypeFloat;
  MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
  KernelData zero_data = {0};
  if (!data) {
    data = &zero_data;
  }
  /* Dummy constant is always bound to zeroed data. */
  [constant_values setConstantValue:&zero_data type:MTLDataType_int atIndex:Kernel_DummyConstant];

  bool next_member_is_specialized = true;

# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;

  /* Bind each KernelData member as a function constant; members flagged as
   * not-specialized receive zeroed data instead of scene-specific values. */
# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
   [constant_values setConstantValue:next_member_is_specialized ? (void *)&data->parent.name : \
                                                                  (void *)&zero_data \
                                type:MTLDataType_##_type \
                             atIndex:KernelData_##parent##_##name]; \
   next_member_is_specialized = true;

# include "kernel/data_template.h"

  return constant_values;
}
456
/* Release and clear every MTLIntersectionFunctionTable held by this pipeline. */
void MetalDispatchPipeline::free_intersection_function_tables()
{
  for (int slot = 0; slot < METALRT_TABLE_NUM; slot++) {
    if (intersection_func_table[slot] == nil) {
      continue;
    }
    [intersection_func_table[slot] release];
    intersection_func_table[slot] = nil;
  }
}
466
/* Release any intersection function tables owned by this dispatch pipeline. */
MetalDispatchPipeline::~MetalDispatchPipeline()
{
  free_intersection_function_tables();
}
471
/* Point this dispatch pipeline at the best available compiled pipeline for
 * `kernel`, rebuilding the MetalRT intersection-function tables when the active
 * pipeline changes. Returns false if no matching pipeline is available yet. */
bool MetalDispatchPipeline::update(MetalDevice *metal_device, DeviceKernel kernel)
{
  const MetalKernelPipeline *best_pipeline = MetalDeviceKernels::get_best_pipeline(metal_device,
                                                                                  kernel);
  if (!best_pipeline) {
    return false;
  }

  if (pipeline_id == best_pipeline->pipeline_id) {
    /* The best pipeline is already active - nothing to do. */
    return true;
  }
  /* Adopt the new pipeline's state. */
  pipeline_id = best_pipeline->pipeline_id;
  pipeline = best_pipeline->pipeline;
  pso_type = best_pipeline->pso_type;
  num_threads_per_block = best_pipeline->num_threads_per_block;

  /* Create the MTLIntersectionFunctionTables if needed. */
  if (best_pipeline->use_metalrt && device_kernel_has_intersection(best_pipeline->device_kernel)) {
    free_intersection_function_tables();

    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
      @autoreleasepool {
        /* NOTE(review): ift_desc is alloc'd (not autoreleased) and never released
         * in this scope — looks like a small per-update leak; confirm against
         * Metal ownership rules. */
        MTLIntersectionFunctionTableDescriptor *ift_desc =
            [[MTLIntersectionFunctionTableDescriptor alloc] init];
        ift_desc.functionCount = best_pipeline->table_functions[table].count;
        intersection_func_table[table] = [this->pipeline
            newIntersectionFunctionTableWithDescriptor:ift_desc];

        /* Finally write the function handles into this pipeline's table */
        int size = int([best_pipeline->table_functions[table] count]);
        for (int i = 0; i < size; i++) {
          id<MTLFunctionHandle> handle = [pipeline
              functionHandleWithFunction:best_pipeline->table_functions[table][i]];
          [intersection_func_table[table] setFunction:handle atIndex:i];
        }
      }
    }
  }

  return true;
}
514
/* Create an MTLFunction for the named intersection routine from this pipeline's
 * library, applying scene-specialized function constants for non-generic PSOs.
 * Returns nil on failure and records the message in error_str. */
id<MTLFunction> MetalKernelPipeline::make_intersection_function(const char *function_name)
{
  MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
  desc.name = [@(function_name) copy];
  desc.constantValues = (pso_type == PSO_GENERIC) ? GetConstantValues() :
                                                    GetConstantValues(&kernel_data_);

  NSError *error = NULL;
  id<MTLFunction> intersect_fn = [mtlLibrary newFunctionWithDescriptor:desc error:&error];

  if (intersect_fn != nil) {
    intersect_fn.label = [@(function_name) copy];
  }
  else {
    NSString *err = [error localizedDescription];
    string errors = [err UTF8String];
    error_str = string_printf(
        "Error getting intersection function \"%s\": %s", function_name, errors.c_str());
  }
  return intersect_fn;
}
543
/* Compile the MTLComputePipelineState for this kernel, optionally loading from /
 * saving to an on-disk binary archive. Runs on a ShaderCache compile thread.
 *
 * Note: the extraction had dropped four lines — the kernel-name argument in the
 * failure printf inside do_compilation, the kernel-name argument in both status
 * printfs, and the path_cache_kernel_mark_added_and_clear_old() call after a
 * successful archive serialize; all restored. */
void MetalKernelPipeline::compile()
{
  const std::string function_name = std::string("cycles_metal_") +
                                    device_kernel_as_string(device_kernel);

  NSError *error = NULL;

  MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
  func_desc.name = [@(function_name.c_str()) copy];

  /* Specialized PSOs bake the scene's KernelData into function constants. */
  if (pso_type != PSO_GENERIC) {
    func_desc.constantValues = GetConstantValues(&kernel_data_);
  }
  else {
    func_desc.constantValues = GetConstantValues();
  }

  function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];

  if (function == nil) {
    NSString *err = [error localizedDescription];
    string errors = [err UTF8String];
    metal_printf("Error getting function \"%s\": %s", function_name.c_str(), errors.c_str());
    return;
  }

  function.label = [@(function_name.c_str()) copy];

  NSArray *linked_functions = nil;

  if (use_metalrt && device_kernel_has_intersection(device_kernel)) {

    NSMutableSet *unique_functions = [[NSMutableSet alloc] init];

    /* Populate the intersection-function list for one METALRT_TABLE_* slot. */
    auto add_intersection_functions = [&](int table_index,
                                          const char *tri_fn,
                                          const char *curve_fn = nullptr,
                                          const char *point_fn = nullptr) {
      table_functions[table_index] = [NSArray
          arrayWithObjects:make_intersection_function(tri_fn),
                           curve_fn ? make_intersection_function(curve_fn) : nil,
                           point_fn ? make_intersection_function(point_fn) : nil,
                           nil];

      [unique_functions addObjectsFromArray:table_functions[table_index]];
    };

    add_intersection_functions(METALRT_TABLE_DEFAULT,
                               "__intersection__tri",
                               "__intersection__curve",
                               "__intersection__point");
    add_intersection_functions(METALRT_TABLE_SHADOW,
                               "__intersection__tri_shadow",
                               "__intersection__curve_shadow",
                               "__intersection__point_shadow");
    add_intersection_functions(METALRT_TABLE_SHADOW_ALL,
                               "__intersection__tri_shadow_all",
                               "__intersection__curve_shadow_all",
                               "__intersection__point_shadow_all");
    add_intersection_functions(METALRT_TABLE_VOLUME, "__intersection__volume_tri");
    add_intersection_functions(METALRT_TABLE_LOCAL, "__intersection__local_tri");
    add_intersection_functions(METALRT_TABLE_LOCAL_MBLUR, "__intersection__local_tri_mblur");
    add_intersection_functions(METALRT_TABLE_LOCAL_SINGLE_HIT,
                               "__intersection__local_tri_single_hit");
    add_intersection_functions(METALRT_TABLE_LOCAL_SINGLE_HIT_MBLUR,
                               "__intersection__local_tri_single_hit_mblur");

    /* Sort by label so the linked-function list has a deterministic order. */
    linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
        sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
          return [f1.label compare:f2.label];
        }];
    unique_functions = nil;
  }

  MTLComputePipelineDescriptor *computePipelineStateDescriptor =
      [[MTLComputePipelineDescriptor alloc] init];

  computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
  computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
  computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;

  computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = threads_per_threadgroup;
  computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;

  computePipelineStateDescriptor.computeFunction = function;

  /* Attach the additional functions to an MTLLinkedFunctions object */
  if (linked_functions) {
    computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init];
    computePipelineStateDescriptor.linkedFunctions.functions = linked_functions;
  }
  computePipelineStateDescriptor.maxCallStackDepth = 1;
  if (use_metalrt && device_kernel_has_intersection(device_kernel)) {
    computePipelineStateDescriptor.maxCallStackDepth = 2;
  }

  MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;

  bool use_binary_archive = should_use_binary_archive();
  bool loading_existing_archive = false;
  bool creating_new_archive = false;

  id<MTLBinaryArchive> archive = nil;
  string metalbin_path;
  string metalbin_name;
  if (use_binary_archive) {
    /* The archive filename hashes everything that could invalidate a cached
     * binary: kernel sources, OS version and the threadgroup size. */
    NSProcessInfo *processInfo = [NSProcessInfo processInfo];
    string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
    MD5Hash local_md5;
    local_md5.append(kernels_md5);
    local_md5.append(osVersion);
    local_md5.append((uint8_t *)&this->threads_per_threadgroup,
                     sizeof(this->threads_per_threadgroup));

    /* Replace non-alphanumerical characters with underscores. */
    string device_name = [mtlDevice.name UTF8String];
    for (char &c : device_name) {
      if ((c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
        c = '_';
      }
    }

    metalbin_name = device_name;
    metalbin_name = path_join(metalbin_name, device_kernel_as_string(device_kernel));
    metalbin_name = path_join(metalbin_name, kernel_type_as_string(pso_type));
    metalbin_name = path_join(metalbin_name, local_md5.get_hex() + ".bin");

    metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
    path_create_directories(metalbin_path);

    /* Check if shader binary exists on disk, and if so, update the file timestamp for LRU purging
     * to work as intended. */
    loading_existing_archive = path_cache_kernel_exists_and_mark_used(metalbin_path);
    creating_new_archive = !loading_existing_archive;

    MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
    if (loading_existing_archive) {
      archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
    }
    NSError *error = nil;
    archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:&error];
    if (!archive) {
      const char *err = error ? [[error localizedDescription] UTF8String] : nullptr;
      metal_printf("newBinaryArchiveWithDescriptor failed: %s\n", err ? err : "nil");
    }
    [archiveDesc release];

    if (loading_existing_archive) {
      pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
      computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
    }
  }

  bool recreate_archive = false;

  /* Lambda to do the actual pipeline compilation. */
  auto do_compilation = [&]() {
    __block bool compilation_finished = false;
    __block string error_str;

    if (loading_existing_archive || !DebugFlags().metal.use_async_pso_creation) {
      /* Use the blocking variant of newComputePipelineStateWithDescriptor if an archive exists on
       * disk. It should load almost instantaneously, and will fail gracefully when loading a
       * corrupt archive (unlike the async variant). */
      NSError *error = nil;
      pipeline = [mtlDevice newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
                                                          options:pipelineOptions
                                                       reflection:nullptr
                                                            error:&error];
      const char *err = error ? [[error localizedDescription] UTF8String] : nullptr;
      error_str = err ? err : "nil";
    }
    else {
      /* Use the async variant of newComputePipelineStateWithDescriptor if no archive exists on
       * disk. This allows us to respond to app shutdown. */
      [mtlDevice
          newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
                                        options:pipelineOptions
                              completionHandler:^(id<MTLComputePipelineState> computePipelineState,
                                                  MTLComputePipelineReflection * /*reflection*/,
                                                  NSError *error) {
                                pipeline = computePipelineState;

                                /* Retain the pipeline so we can use it safely past the completion
                                 * handler. */
                                if (pipeline) {
                                  [pipeline retain];
                                }
                                const char *err = error ?
                                                      [[error localizedDescription] UTF8String] :
                                                      nullptr;
                                error_str = err ? err : "nil";

                                compilation_finished = true;
                              }];

      /* Immediately wait for either the compilation to finish or for app shutdown. */
      while (ShaderCache::running && !compilation_finished) {
        std::this_thread::sleep_for(std::chrono::milliseconds(5));
      }
    }

    if (creating_new_archive && pipeline) {
      /* Add pipeline into the new archive. */
      NSError *error;
      if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
                                                        error:&error])
      {
        NSString *errStr = [error localizedDescription];
        metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil");
      }
    }

    if (!pipeline) {
      metal_printf(
          "newComputePipelineStateWithDescriptor failed for \"%s\"%s. "
          "Error:\n%s\n",
          device_kernel_as_string(device_kernel),
          (archive && !recreate_archive) ? " Archive may be incomplete or corrupt - attempting "
                                           "recreation.." :
                                           "",
          error_str.c_str());
    }
  };

  double starttime = time_dt();

  do_compilation();

  /* An archive might have a corrupt entry and fail to materialize the pipeline. This shouldn't
   * happen, but if it does we recreate it. */
  if (pipeline == nil && archive) {
    recreate_archive = true;
    pipelineOptions = MTLPipelineOptionNone;
    path_remove(metalbin_path);

    do_compilation();
  }

  double duration = time_dt() - starttime;

  if (pipeline == nil) {
    metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n",
                 kernel_type_as_string(pso_type),
                 device_kernel,
                 device_kernel_as_string(device_kernel),
                 duration);
    return;
  }

  if (!num_threads_per_block) {
    /* No occupancy tuning override: derive the dispatch size from the compiled
     * pipeline's limits, rounded down to a multiple of the execution width. */
    num_threads_per_block = round_down(pipeline.maxTotalThreadsPerThreadgroup,
                                       pipeline.threadExecutionWidth);
    num_threads_per_block = std::max(num_threads_per_block, (int)pipeline.threadExecutionWidth);
  }

  if (ShaderCache::running) {
    if (creating_new_archive || recreate_archive) {
      if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())] error:&error])
      {
        metal_printf("Failed to save binary archive to %s, error:\n%s\n",
                     metalbin_path.c_str(),
                     [[error localizedDescription] UTF8String]);
      }
      else {
        /* Register the new kernel binary with the LRU disk cache. */
        path_cache_kernel_mark_added_and_clear_old(metalbin_path);
      }
    }
  }

  this->loaded = true;
  [computePipelineStateDescriptor release];
  computePipelineStateDescriptor = nil;

  if (!use_binary_archive) {
    metal_printf("%16s | %2d | %-55s | %7.2fs\n",
                 kernel_type_as_string(pso_type),
                 int(device_kernel),
                 device_kernel_as_string(device_kernel),
                 duration);
  }
  else {
    metal_printf("%16s | %2d | %-55s | %7.2fs | %s: %s\n",
                 kernel_type_as_string(pso_type),
                 device_kernel,
                 device_kernel_as_string(device_kernel),
                 duration,
                 creating_new_archive ? " new" : "load",
                 metalbin_name.c_str());
  }
}
835
/* Queue compilation of every kernel for the given device and specialization
 * type. Returns true; compilation itself happens asynchronously. */
bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
{
  ShaderCache *cache = get_shader_cache(device->mtlDevice);
  for (int kernel = 0; kernel < DEVICE_KERNEL_NUM; kernel++) {
    cache->load_kernel(DeviceKernel(kernel), device, pso_type);
  }
  return true;
}
844
/* Block until every registered ShaderCache has drained its request queue. */
void MetalDeviceKernels::wait_for_all()
{
  for (int index = 0; index < g_shaderCacheCount; index++) {
    g_shaderCache[index].second->wait_for_all();
  }
}
851
/* Sum the outstanding specialization requests across all ShaderCaches
 * (typically there is only one cache). */
int MetalDeviceKernels::num_incomplete_specialization_requests()
{
  int total = 0;
  for (int index = 0; index < g_shaderCacheCount; index++) {
    total += g_shaderCache[index].second->incomplete_specialization_requests;
  }
  return total;
}
862
/* Count how many kernels are already requested/cached for this device and
 * specialization type (i.e. kernels that should_load_kernel() would skip). */
int MetalDeviceKernels::get_loaded_kernel_count(MetalDevice const *device,
                                                MetalPipelineType pso_type)
{
  ShaderCache *cache = get_shader_cache(device->mtlDevice);
  int pending = 0;
  for (int kernel = 0; kernel < DEVICE_KERNEL_NUM; kernel++) {
    if (cache->should_load_kernel(DeviceKernel(kernel), device, pso_type)) {
      pending += 1;
    }
  }
  return DEVICE_KERNEL_NUM - pending;
}
875
/* True if at least one kernel still needs to be requested for this device. */
bool MetalDeviceKernels::should_load_kernels(MetalDevice const *device, MetalPipelineType pso_type)
{
  const int loaded = get_loaded_kernel_count(device, pso_type);
  return loaded != DEVICE_KERNEL_NUM;
}
880
/* Fetch the best loaded pipeline for `kernel` from the device's ShaderCache. */
const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
                                                                 DeviceKernel kernel)
{
  ShaderCache *cache = get_shader_cache(device->mtlDevice);
  return cache->get_best_pipeline(kernel, device);
}
886
/* True if the process was launched with a "--warm-up" argument (benchmark
 * warm-up run). */
bool MetalDeviceKernels::is_benchmark_warmup()
{
  NSArray *args = [[NSProcessInfo processInfo] arguments];
  for (NSString *ns_arg in args) {
    const char *arg = [ns_arg cStringUsingEncoding:NSASCIIStringEncoding];
    if (arg && strcmp(arg, "--warm-up") == 0) {
      return true;
    }
  }
  return false;
}
899
/* Tear down all registered ShaderCaches at process shutdown. */
void MetalDeviceKernels::static_deinitialize()
{
  for (int index = 0; index < g_shaderCacheCount; index++) {
    /* Assigning a default-constructed pair destroys the owned ShaderCache,
     * which joins its compile threads. */
    g_shaderCache[index] = DeviceShaderCache();
  }
}
906
908
909#endif /* WITH_METAL */
volatile int lock
void init()
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition btDbvt.cpp:52
Metal metal
Definition debug.h:129
Definition md5.h:21
string get_hex()
Definition md5.cpp:354
void append(const uint8_t *data, int size)
Definition md5.cpp:255
bool join()
Definition thread.cpp:43
CCL_NAMESPACE_BEGIN struct Options options
DebugFlags & DebugFlags()
Definition debug.h:142
#define CCL_NAMESPACE_END
bool device_kernel_has_intersection(DeviceKernel kernel)
const char * device_kernel_as_string(DeviceKernel kernel)
#define NULL
draw_view push_constant(Type::INT, "radiance_src") .push_constant(Type capture_info_buf storage_buf(1, Qualifier::READ, "ObjectBounds", "bounds_buf[]") .push_constant(Type draw_view int
#define str(s)
@ Kernel_DummyConstant
int count
KernelData
#define KERNEL_FEATURE_NODE_RAYTRACE
#define KERNEL_FEATURE_MNEE
DeviceKernel
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE
@ DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS
@ DEVICE_KERNEL_SHADER_EVAL_DISPLACE
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE
@ DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE
@ DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL
@ DEVICE_KERNEL_NUM
@ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA
@ DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST
@ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND
static void error(const char *str)
static void copy(bNodeTree *dest_ntree, bNode *dest_node, const bNode *src_node)
string path_cache_get(const string &sub)
Definition path.cpp:362
string path_join(const string &dir, const string &file)
Definition path.cpp:417
bool path_cache_kernel_exists_and_mark_used(const string &path)
Definition path.cpp:992
void path_cache_kernel_mark_added_and_clear_old(const string &new_path, const size_t max_old_kernel_of_same_type)
Definition path.cpp:1003
void path_create_directories(const string &filepath)
Definition path.cpp:648
bool path_remove(const string &path)
Definition path.cpp:778
unsigned char uint8_t
Definition stdint.h:78
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
Definition string.cpp:23
bool use_async_pso_creation
Definition debug.h:103
std::unique_lock< std::mutex > thread_scoped_lock
Definition thread.h:30
CCL_NAMESPACE_BEGIN typedef std::mutex thread_mutex
Definition thread.h:29
CCL_NAMESPACE_BEGIN double time_dt()
Definition time.cpp:36
float max
ccl_device_inline size_t round_down(size_t x, size_t multiple)
Definition util/types.h:63