// Maps a MetalPipelineType value to the spelling of its enumerator. The result
// is used in log messages and as a path component of the binary archive name.
// Only the PSO_SPECIALIZED_INTERSECT and PSO_SPECIALIZED_SHADE cases are
// visible in this view.
// NOTE(review): the remaining cases and any fallback return are not shown here;
// confirm every enumerator is handled.
28const char *kernel_type_as_string(MetalPipelineType pso_type)
33 case PSO_SPECIALIZED_INTERSECT:
34 return "PSO_SPECIALIZED_INTERSECT";
35 case PSO_SPECIALIZED_SHADE:
36 return "PSO_SPECIALIZED_SHADE";
// Interior of the ShaderCache type: a cache of compiled kernel pipelines tied
// to one MTLDevice. The enclosing type header lies outside this view.
// Constructor: stores the device and switches on the Apple GPU architecture
// reported by MetalInfo to fill in occupancy_tuning.
44 ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
51 switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
// The two values follow the member order of OccupancyTuningParameters:
// threads_per_threadgroup first, then num_threads_per_block.
56 occupancy_tuning[
i] = {64, 64};
// Selects the pipeline to run for `kernel` on `device` (defined out of line).
103 MetalKernelPipeline *get_best_pipeline(
DeviceKernel kernel,
const MetalDevice *device);
// Queues `kernel` for compilation with the given pipeline type.
107 void load_kernel(
DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);
// Tail of a declaration whose first line is not visible here.
// NOTE(review): presumably should_load_kernel, whose out-of-line definition
// has the same trailing parameters; confirm.
110 const MetalDevice *device,
111 MetalPipelineType pso_type);
// get_shader_cache constructs the caches and hands them out.
115 friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice);
// Entry point run by every thread stored in compile_threads.
117 void compile_thread_func();
// Owning list of pipelines.
119 using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>;
// Per-kernel launch tuning. A zero threads_per_threadgroup means that no
// override is applied when load_kernel builds a request.
121 struct OccupancyTuningParameters {
122 int threads_per_threadgroup = 0;
123 int num_threads_per_block = 0;
126 std::mutex cache_mutex;
// Device this cache compiles for.
129 id<MTLDevice> mtlDevice;
// Signalled when a request is queued and when the cache is destroyed.
132 std::condition_variable cond_var;
// Pending compile requests; worker threads take them from the front.
133 std::deque<unique_ptr<MetalKernelPipeline>> request_queue;
134 std::vector<std::thread> compile_threads;
// Requests queued but not yet finished, across all pipeline types.
135 std::atomic_int incomplete_requests = 0;
// Same count restricted to requests whose pso_type is not PSO_GENERIC.
136 std::atomic_int incomplete_specialization_requests = 0;
// Flag polled by the compile worker threads and by the wait loops in this
// file; it starts out true.
139bool ShaderCache::running =
true;
// Capacity of the g_shaderCache table below.
141const int MAX_POSSIBLE_GPUS_ON_SYSTEM = 8;
// Number of g_shaderCache slots handed out so far.
143int g_shaderCacheCount = 0;
// One slot per GPU; .first holds the device and .second the owning pointer to
// its ShaderCache (see get_shader_cache).
// NOTE(review): DeviceShaderCache is declared outside this view.
144DeviceShaderCache g_shaderCache[MAX_POSSIBLE_GPUS_ON_SYSTEM];
// Source of pipeline_id values; load_kernel draws from it with fetch_add.
147static std::atomic_int g_next_pipeline_id = 0;
// Returns the ShaderCache associated with mtlDevice, creating one in the next
// free g_shaderCache slot when the device has not been seen before.
149ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
// Fast path: linear scan over the slots already handed out.
151 for (
int i = 0;
i < g_shaderCacheCount;
i++) {
152 if (g_shaderCache[
i].first == mtlDevice) {
153 return g_shaderCache[
i].second.get();
// Slow path: reserve a new slot index under the mutex.
// NOTE(review): only the increment is guarded. The scan above and the slot
// initialisation below run without the lock, so two threads requesting the
// same unseen device at the same time could each claim a slot; confirm that
// callers serialise access.
158 g_shaderCacheCountMutex.lock();
159 int index = g_shaderCacheCount++;
160 g_shaderCacheCountMutex.unlock();
// The table has a fixed size; this check is only active when assert is enabled.
162 assert(index < MAX_POSSIBLE_GPUS_ON_SYSTEM);
163 g_shaderCache[index].first = mtlDevice;
164 g_shaderCache[index].second = make_unique<ShaderCache>(mtlDevice);
165 return g_shaderCache[index].second.get();
// Shuts the cache down: wakes every worker blocked on cond_var, logs the
// number of outstanding requests, visits each compile thread and logs
// completion.
// NOTE(review): the statement that clears `running` and the body of the thread
// loop (presumably a join) fall on lines not visible in this view; confirm.
168ShaderCache::~ShaderCache()
171 cond_var.notify_all();
173 metal_printf(
"Waiting for ShaderCache threads... (incomplete_requests = %d)",
174 int(incomplete_requests));
175 for (
auto &
thread : compile_threads) {
178 metal_printf(
"ShaderCache shut down.");
// Blocks the calling thread until incomplete_requests drops to zero, polling
// the counter every 100 ms.
181void ShaderCache::wait_for_all()
183 while (incomplete_requests > 0) {
184 std::this_thread::sleep_for(std::chrono::milliseconds(100));
// Worker thread body: takes requests off request_queue, stores the resulting
// pipeline in the per-kernel collection and updates the outstanding-request
// counters.
// NOTE(review): the enclosing loop, the lock acquisition and the call that
// performs the compilation fall on lines not visible in this view.
188void ShaderCache::compile_thread_func()
// Sleep until there is work to do or the cache is shutting down.
196 cond_var.wait(
lock, [&] {
return !running || !request_queue.empty(); });
// Re-check after waking; the action taken in this branch is not visible here.
197 if (!running || request_queue.empty()) {
// Take ownership of the oldest pending request.
201 pipeline = std::move(request_queue.front());
202 request_queue.pop_front();
207 MetalPipelineType pso_type = pipeline->pso_type;
// Requests that belong to a cancelled device are logged as cancelled.
209 if (MetalDevice::is_device_cancelled(pipeline->originating_device_id)) {
211 metal_printf(
"Cancelling compilation of %s (%s)",
213 kernel_type_as_string(pso_type));
220 auto &collection = pipelines[device_kernel];
// Walk the collection from newest to oldest and erase the third entry found
// with the same pso_type, making room before the new pipeline is appended.
223 int max_entries_of_same_pso_type = 3;
224 for (
int i = (
int)collection.size() - 1;
i >= 0;
i--) {
225 if (collection[
i]->pso_type == pso_type) {
226 max_entries_of_same_pso_type -= 1;
227 if (max_entries_of_same_pso_type == 0) {
228 metal_printf(
"Purging oldest %s:%s kernel from ShaderCache",
229 kernel_type_as_string(pso_type),
231 collection.erase(collection.begin() +
i);
236 collection.push_back(std::move(pipeline));
// Mirror of the increments performed in load_kernel.
238 incomplete_requests--;
239 if (pso_type != PSO_GENERIC) {
240 incomplete_specialization_requests--;
// Decides whether a compile request is needed for device_kernel with the given
// pipeline type on this device.
// NOTE(review): every return statement of this function falls on a line that
// is not visible in this view; the notes below describe the visible tests only.
245bool ShaderCache::should_load_kernel(
DeviceKernel device_kernel,
246 const MetalDevice *device,
247 MetalPipelineType pso_type)
// Additional filtering applies to the non-generic pipeline types.
272 if (pso_type != PSO_GENERIC) {
// The shade pipeline type and the shade classification of the kernel are
// compared; is_shade_kernel is computed on a line not visible here.
282 bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
283 if (is_shade_pso != is_shade_kernel) {
// Look for a cached pipeline built from the same kernel source hash.
291 for (
auto &pipeline : pipelines[device_kernel]) {
292 if (pipeline->kernels_md5 == device->kernels_md5[pso_type]) {
// Queues a compile request for device_kernel with the given pipeline type.
// The worker threads are created on the first call, the request is filled in
// from the state of `device`, pushed onto request_queue and one worker is
// woken.
// NOTE(review): the locking around the queue and the allocation of `pipeline`
// fall on lines not visible in this view.
301void ShaderCache::load_kernel(
DeviceKernel device_kernel,
303 MetalPipelineType pso_type)
// Lazily start the worker threads.
308 if (compile_threads.empty()) {
// Default thread count, used when the device query below is unavailable.
311 int max_mtlcompiler_threads = 2;
313# if defined(MAC_OS_VERSION_13_3)
314 if (@available(macOS 13.3, *)) {
// One less than the device-reported concurrent compilation task count, but
// never fewer than two.
316 max_mtlcompiler_threads =
max(2,
317 int([mtlDevice maximumConcurrentCompilationTaskCount]) - 1);
321 metal_printf(
"Spawning %d Cycles kernel compilation threads", max_mtlcompiler_threads);
322 for (
int i = 0;
i < max_mtlcompiler_threads;
i++) {
323 compile_threads.emplace_back([
this] { this->compile_thread_func(); });
// Filter out requests that should_load_kernel rejects.
328 if (!should_load_kernel(device_kernel, device, pso_type)) {
// Counted here and decremented again by the worker thread.
332 incomplete_requests++;
333 if (pso_type != PSO_GENERIC) {
334 incomplete_specialization_requests++;
// Copy what the worker needs out of the device into the request.
341 pipeline->pipeline_id = g_next_pipeline_id.fetch_add(1);
342 pipeline->originating_device_id = device->device_id;
343 pipeline->kernel_data_ = device->launch_params->data;
344 pipeline->pso_type = pso_type;
345 pipeline->mtlDevice = mtlDevice;
346 pipeline->kernels_md5 = device->kernels_md5[pso_type];
347 pipeline->mtlLibrary = device->mtlLibrary[pso_type];
348 pipeline->device_kernel = device_kernel;
349 pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
// A non-zero tuning entry overrides the device default for this kernel.
351 if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
352 pipeline->threads_per_threadgroup = occupancy_tuning[device_kernel].threads_per_threadgroup;
353 pipeline->num_threads_per_block = occupancy_tuning[device_kernel].num_threads_per_block;
357 pipeline->use_metalrt = device->use_metalrt;
358 pipeline->kernel_features = device->kernel_features;
// Hand the request to the workers and wake one of them.
362 request_queue.push_back(std::move(pipeline));
364 cond_var.notify_one();
// Returns the most suitable loaded pipeline for `kernel` on `device`. The
// search repeats while the cache is running and the device reports no error.
// NOTE(review): the locking, the return statements and the condition that
// leads to the sleep at the end fall on lines not visible in this view.
367MetalKernelPipeline *ShaderCache::get_best_pipeline(
DeviceKernel kernel,
const MetalDevice *device)
369 while (running && !device->has_error) {
371 MetalKernelPipeline *best_match =
nullptr;
// Consider only pipelines that finished loading and whose source hash matches
// what the device currently expects for that pipeline type.
374 for (
auto &candidate : pipelines[kernel]) {
375 if (candidate->loaded &&
376 candidate->kernels_md5 == device->kernels_md5[candidate->pso_type])
// Among matches, the one with the greater pso_type value wins.
379 if (!best_match || candidate->pso_type > best_match->pso_type) {
380 best_match = candidate.get();
// Log the first use of a non-generic pipeline.
387 if (best_match->usage_count == 0 && best_match->pso_type != PSO_GENERIC) {
388 metal_printf(
"Swapping in %s version of %s",
389 kernel_type_as_string(best_match->pso_type),
392 best_match->usage_count += 1;
// Pause for 100 ms before the next pass of the loop.
397 std::this_thread::sleep_for(std::chrono::milliseconds(100));
// Decides whether this pipeline should be loaded from or saved to an on-disk
// Metal binary archive.
// NOTE(review): the return statements fall on lines not visible in this view,
// so the outcome of each test below is not shown.
402bool MetalKernelPipeline::should_use_binary_archive()
const
// The environment variable is only consulted on macOS 15.4 and newer.
405 if (@available(macOS 15.4, *)) {
// CYCLES_METAL_DISABLE_BINARY_ARCHIVES is parsed with atoi and tested against
// zero.
406 if (
auto *
str = getenv(
"CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
407 if (atoi(
str) != 0) {
// Generic pipelines are treated as a separate case.
418 if (pso_type == PSO_GENERIC) {
// Builds the MTLFunctionConstantValues used when creating kernel and
// intersection functions. When `data` is provided, members of KernelData are
// supplied as function constants; callers in this file pass no argument for
// PSO_GENERIC pipelines.
// NOTE(review): the object is created with +new; whether it is autoreleased
// before being returned is not visible here, so confirm who owns the result.
438static MTLFunctionConstantValues *GetConstantValues(
const KernelData *
data =
nullptr)
440 MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues
new];
// Local aliases that let the macro below build a type name by token pasting
// (MTLDataType_##_type).
442 MTLDataType MTLDataType_int = MTLDataTypeInt;
443 MTLDataType MTLDataType_float = MTLDataTypeFloat;
444 MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
// Zero-filled stand-in used as the value source for the dummy constant below.
445 KernelData zero_data = {0};
449 [constant_values setConstantValue:&zero_data type:MTLDataType_int atIndex:
Kernel_DummyConstant];
// Tracks whether the next member should take its value from `data`; it is
// reset to true after every member.
451 bool next_member_is_specialized =
true;
453# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;
// Emits one setConstantValue call per KernelData member.
// NOTE(review): the alternative operand of the conditional sits on a line not
// visible in this view (presumably the zero_data stand-in); confirm.
455# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
456 [constant_values setConstantValue:next_member_is_specialized ? (void *)&data->parent.name : \
458 type:MTLDataType_##_type \
459 atIndex:KernelData_##parent##_##name]; \
460 next_member_is_specialized = true;
464 [constant_values setConstantValue:&
data->kernel_features
468 return constant_values;
// Releases every intersection function table held by this dispatch pipeline
// and resets the slot to nil, so calling it again is harmless.
471void MetalDispatchPipeline::free_intersection_function_tables()
473 for (
int table = 0; table < METALRT_TABLE_NUM; table++) {
474 if (intersection_func_table[table]) {
475 [intersection_func_table[table] release];
476 intersection_func_table[table] = nil;
// Releases the intersection function tables owned by this object.
481MetalDispatchPipeline::~MetalDispatchPipeline()
483 free_intersection_function_tables();
// Refreshes this dispatch pipeline from the best pipeline currently available
// for `kernel`: copies the pipeline state and launch parameters, then rebuilds
// the intersection function tables.
// NOTE(review): the return statements and any condition guarding the table
// rebuild fall on lines not visible in this view.
486bool MetalDispatchPipeline::update(MetalDevice *metal_device,
DeviceKernel kernel)
488 const MetalKernelPipeline *best_pipeline = MetalDeviceKernels::get_best_pipeline(metal_device,
490 if (!best_pipeline) {
// Same pipeline as last time: the cached state is still current.
494 if (pipeline_id == best_pipeline->pipeline_id) {
498 pipeline_id = best_pipeline->pipeline_id;
499 pipeline = best_pipeline->pipeline;
500 pso_type = best_pipeline->pso_type;
501 num_threads_per_block = best_pipeline->num_threads_per_block;
// Drop the tables created for the previous pipeline before building new ones.
505 free_intersection_function_tables();
507 for (
int table = 0; table < METALRT_TABLE_NUM; table++) {
// NOTE(review): ift_desc comes from alloc/init and no release is visible in
// this view; confirm it is released.
509 MTLIntersectionFunctionTableDescriptor *ift_desc =
510 [[MTLIntersectionFunctionTableDescriptor alloc]
init];
511 ift_desc.functionCount = best_pipeline->table_functions[table].count;
512 intersection_func_table[table] = [this->pipeline
513 newIntersectionFunctionTableWithDescriptor:ift_desc];
// Bind each function of the source pipeline to the same index in the table.
516 int size = int([best_pipeline->table_functions[table]
count]);
517 for (
int i = 0;
i <
size;
i++) {
518 id<MTLFunctionHandle> handle = [pipeline
519 functionHandleWithFunction:best_pipeline->table_functions[table][
i]];
520 [intersection_func_table[table] setFunction:handle atIndex:
i];
// Looks up the intersection function called function_name in mtlLibrary and
// returns it with its label set to that name. A failure is logged.
// NOTE(review): what is returned on failure is not visible in this view.
529id<MTLFunction> MetalKernelPipeline::make_intersection_function(
const char *function_name)
531 MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
532 desc.name = [@(function_name)
copy];
// Non-generic pipelines bake the kernel data in as function constants; the
// other call passes no data.
534 if (pso_type != PSO_GENERIC) {
535 desc.constantValues = GetConstantValues(&kernel_data_);
538 desc.constantValues = GetConstantValues();
541 NSError *
error =
nullptr;
542 id<MTLFunction> rt_intersection_function = [mtlLibrary newFunctionWithDescriptor:desc
545 if (rt_intersection_function == nil) {
// NOTE(review): if `error` is still nil here, UTF8String yields a null pointer
// and constructing a string from it is undefined; confirm the API always
// populates the error on failure.
546 NSString *err = [
error localizedDescription];
547 string errors = [err UTF8String];
550 "Error getting intersection function \"%s\": %s", function_name, errors.c_str());
553 rt_intersection_function.label = [@(function_name)
copy];
555 return rt_intersection_function;
// Compiles this request into a compute pipeline state: looks up the kernel
// function, gathers the intersection functions to link, optionally loads or
// creates a binary archive, builds the pipeline state and logs the outcome.
// NOTE(review): many statements of this function fall on lines not visible in
// this view; the comments below describe only what is shown.
558void MetalKernelPipeline::compile()
// Kernel entry points share the cycles_metal_ prefix.
560 const std::string function_name = std::string(
"cycles_metal_") +
563 NSError *
error =
nullptr;
// NOTE(review): the kernel function descriptor is created through
// MTLIntersectionFunctionDescriptor, like the intersection functions; confirm
// that this is intended.
565 MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
566 func_desc.name = [@(function_name.c_str())
copy];
// Non-generic pipelines bake the kernel data in as function constants.
568 if (pso_type != PSO_GENERIC) {
569 func_desc.constantValues = GetConstantValues(&kernel_data_);
572 func_desc.constantValues = GetConstantValues();
575 function = [mtlLibrary newFunctionWithDescriptor:func_desc
error:&
error];
// NOTE(review): as in make_intersection_function, a nil `error` here would
// feed a null pointer to the string constructor; confirm.
577 if (function == nil) {
578 NSString *err = [
error localizedDescription];
579 string errors = [err UTF8String];
580 metal_printf(
"Error getting function \"%s\": %s", function_name.c_str(), errors.c_str());
584 function.label = [@(function_name.c_str())
copy];
586 NSArray *linked_functions = nil;
// Set used to collect each intersection function once across all tables.
// NOTE(review): it comes from alloc/init and is later set to nil without a
// visible release; confirm.
590 NSMutableSet *unique_functions = [[NSMutableSet alloc]
init];
// Helper that builds the function array for one table (triangle function plus
// optional curve and point functions) and records the functions in the set.
592 auto add_intersection_functions = [&](
int table_index,
594 const char *curve_fn =
nullptr,
595 const char *point_fn =
nullptr) {
596 table_functions[table_index] = [NSArray
597 arrayWithObjects:make_intersection_function(tri_fn),
598 curve_fn ? make_intersection_function(curve_fn) : nil,
599 point_fn ? make_intersection_function(point_fn) : nil,
602 [unique_functions addObjectsFromArray:table_functions[table_index]];
// One call per intersection table; the volume and local tables only take a
// triangle function.
605 add_intersection_functions(METALRT_TABLE_DEFAULT,
606 "__intersection__tri",
607 "__intersection__curve",
608 "__intersection__point");
609 add_intersection_functions(METALRT_TABLE_SHADOW,
610 "__intersection__tri_shadow",
611 "__intersection__curve_shadow",
612 "__intersection__point_shadow");
613 add_intersection_functions(METALRT_TABLE_SHADOW_ALL,
614 "__intersection__tri_shadow_all",
615 "__intersection__curve_shadow_all",
616 "__intersection__point_shadow_all");
617 add_intersection_functions(METALRT_TABLE_VOLUME,
"__intersection__volume_tri");
618 add_intersection_functions(METALRT_TABLE_LOCAL,
"__intersection__local_tri");
619 add_intersection_functions(METALRT_TABLE_LOCAL_MBLUR,
"__intersection__local_tri_mblur");
620 add_intersection_functions(METALRT_TABLE_LOCAL_SINGLE_HIT,
621 "__intersection__local_tri_single_hit");
622 add_intersection_functions(METALRT_TABLE_LOCAL_SINGLE_HIT_MBLUR,
623 "__intersection__local_tri_single_hit_mblur");
// The linked function list is the set contents sorted by function label.
625 linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
626 sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
627 return [f1.label compare:f2.label];
629 unique_functions = nil;
// Describe the compute pipeline state to build.
632 MTLComputePipelineDescriptor *computePipelineStateDescriptor =
633 [[MTLComputePipelineDescriptor alloc]
init];
// Buffer slots 0 to 2 are declared immutable.
635 computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
636 computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
637 computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;
639 computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = threads_per_threadgroup;
640 computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth =
true;
642 computePipelineStateDescriptor.computeFunction = function;
// Attach the intersection functions when there are any. The condition that
// selects between the two call stack depths is not visible in this view.
645 if (linked_functions) {
646 computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc]
init];
647 computePipelineStateDescriptor.linkedFunctions.functions = linked_functions;
649 computePipelineStateDescriptor.maxCallStackDepth = 1;
651 computePipelineStateDescriptor.maxCallStackDepth = 2;
654 MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
656 bool use_binary_archive = should_use_binary_archive();
657 bool loading_existing_archive =
false;
658 bool creating_new_archive =
false;
660 id<MTLBinaryArchive> archive = nil;
661 string metalbin_path;
662 string metalbin_name;
663 if (use_binary_archive) {
664 NSProcessInfo *processInfo = [NSProcessInfo processInfo];
665 string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
// The archive hash covers the kernel source hash, the OS version string and
// the threadgroup size.
667 local_md5.
append(kernels_md5);
668 local_md5.
append(osVersion);
669 local_md5.
append((uint8_t *)&this->threads_per_threadgroup,
670 sizeof(this->threads_per_threadgroup));
// Characters of the device name other than ASCII letters and digits are
// selected for replacement; the replacement itself is not visible here.
673 string device_name = [mtlDevice.name UTF8String];
674 for (
char &c : device_name) {
675 if ((c <
'0' || c >
'9') && (c <
'a' || c >
'z') && (c <
'A' || c >
'Z')) {
// The archive name starts with the device name followed by the pipeline type.
680 metalbin_name = device_name;
682 metalbin_name =
path_join(metalbin_name, kernel_type_as_string(pso_type));
// Either an existing archive is loaded or a new one is created.
691 creating_new_archive = !loading_existing_archive;
// The descriptor only gets a URL when an existing archive is being loaded.
693 MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc]
init];
694 if (loading_existing_archive) {
695 archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
697 NSError *
error = nil;
698 archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc
error:&
error];
700 const char *err =
error ? [[
error localizedDescription] UTF8String] :
nullptr;
701 metal_printf(
"newBinaryArchiveWithDescriptor failed: %s", err ? err :
"nil");
703 [archiveDesc release];
// With an existing archive, pipeline creation is told to fail on an archive
// miss, and the archive is attached to the descriptor.
705 if (loading_existing_archive) {
706 pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
707 computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
711 bool recreate_archive =
false;
// Builds the pipeline state, either through a direct call or through the
// completion handler variant followed by a polling loop.
714 auto do_compilation = [&]() {
715 __block
bool compilation_finished =
false;
716 __block
string error_str;
// Direct creation is used when loading an archive or when asynchronous
// creation is disabled through the debug flags.
718 if (loading_existing_archive || !
DebugFlags().metal.use_async_pso_creation) {
722 NSError *
error = nil;
723 pipeline = [mtlDevice newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
727 const char *err =
error ? [[
error localizedDescription] UTF8String] :
nullptr;
728 error_str = err ? err :
"nil";
734 newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
736 completionHandler:^(id<MTLComputePipelineState> computePipelineState,
737 MTLComputePipelineReflection * ,
739 pipeline = computePipelineState;
746 const char *err =
error ?
747 [[
error localizedDescription] UTF8String] :
749 error_str = err ? err :
"nil";
751 compilation_finished =
true;
// Poll every 5 ms until the handler reports completion or the cache stops.
755 while (ShaderCache::running && !compilation_finished) {
756 std::this_thread::sleep_for(std::chrono::milliseconds(5));
// A freshly built pipeline is added to the archive that is being created.
760 if (creating_new_archive && pipeline) {
763 if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
766 NSString *errStr = [
error localizedDescription];
767 metal_printf(
"Failed to add PSO to archive:\n%s", errStr ? [errStr UTF8String] :
"nil");
773 "newComputePipelineStateWithDescriptor failed for \"%s\"%s. "
776 (archive && !recreate_archive) ?
" Archive may be incomplete or corrupt - attempting "
// Creation failed while an archive was in use: flag the archive for recreation
// and drop the fail-on-miss option.
789 if (pipeline == nil && archive) {
790 recreate_archive =
true;
791 pipelineOptions = MTLPipelineOptionNone;
797 double duration =
time_dt() - starttime;
799 if (pipeline == nil) {
800 metal_printf(
"%16s | %2d | %-55s | %7.2fs | FAILED!",
801 kernel_type_as_string(pso_type),
// When no block size was set on the request, use the pipeline maximum rounded
// down to a multiple of the execution width, but at least one execution width.
808 if (!num_threads_per_block) {
809 num_threads_per_block =
round_down(pipeline.maxTotalThreadsPerThreadgroup,
810 pipeline.threadExecutionWidth);
811 num_threads_per_block = std::max(num_threads_per_block, (
int)pipeline.threadExecutionWidth);
// The archive is only written to disk while the cache is still running.
814 if (ShaderCache::running) {
815 if (creating_new_archive || recreate_archive) {
816 if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
error:&
error])
818 metal_printf(
"Failed to save binary archive to %s, error:\n%s",
819 metalbin_path.c_str(),
820 [[
error localizedDescription] UTF8String]);
829 [computePipelineStateDescriptor release];
830 computePipelineStateDescriptor = nil;
// Final log line; the archive variant also reports whether the archive was
// newly created or loaded, and its name.
832 if (!use_binary_archive) {
833 metal_printf(
"%16s | %2d | %-55s | %7.2fs",
834 kernel_type_as_string(pso_type),
840 metal_printf(
"%16s | %2d | %-55s | %7.2fs | %s: %s",
841 kernel_type_as_string(pso_type),
845 creating_new_archive ?
" new" :
"load",
846 metalbin_name.c_str());
// Requests compilation of kernels for `device` with the given pipeline type
// through the shader cache of its MTLDevice.
// NOTE(review): the loop that supplies `i` and the return value fall on lines
// not visible in this view.
850bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
852 auto *shader_cache = get_shader_cache(device->mtlDevice);
854 shader_cache->load_kernel((
DeviceKernel)
i, device, pso_type);
// Waits until every shader cache handed out so far has no outstanding
// compile requests.
859void MetalDeviceKernels::wait_for_all()
861 for (
int i = 0;
i < g_shaderCacheCount;
i++) {
862 g_shaderCache[
i].second->wait_for_all();
// Sums the outstanding non-generic compile requests over every shader cache
// handed out so far.
// NOTE(review): the declaration of `total` and the return statement fall on
// lines not visible in this view.
866int MetalDeviceKernels::num_incomplete_specialization_requests()
871 for (
int i = 0;
i < g_shaderCacheCount;
i++) {
872 total += g_shaderCache[
i].second->incomplete_specialization_requests;
// Reports a kernel count for `device` and the given pipeline type, derived
// from per-kernel should_load_kernel queries on the shader cache.
// NOTE(review): the loop that supplies `i`, the counting logic and the return
// statement fall on lines not visible in this view.
877int MetalDeviceKernels::get_loaded_kernel_count(
const MetalDevice *device,
878 MetalPipelineType pso_type)
880 auto *shader_cache = get_shader_cache(device->mtlDevice);
883 if (shader_cache->should_load_kernel((
DeviceKernel)
i, device, pso_type)) {
// Reports whether kernels of the given pipeline type should be loaded for
// `device`.
// NOTE(review): the body falls on lines not visible in this view.
890bool MetalDeviceKernels::should_load_kernels(
const MetalDevice *device, MetalPipelineType pso_type)
// Forwards to get_best_pipeline on the shader cache of the MTLDevice that
// `device` uses.
895const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(
const MetalDevice *device,
898 return get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device);
// Scans the process arguments for an exact --warm-up entry.
// NOTE(review): the return statements fall on lines not visible in this view.
901bool MetalDeviceKernels::is_benchmark_warmup()
903 NSArray *args = [[NSProcessInfo processInfo] arguments];
904 for (
int i = 0;
i < args.count;
i++) {
// Arguments that yield no ASCII C string are skipped by this test.
905 if (
const char *arg = [[args objectAtIndex:
i] cStringUsingEncoding:NSASCIIStringEncoding]) {
906 if (!strcmp(arg,
"--warm-up")) {
// Resets every shader cache slot handed out so far to a default-constructed
// entry, which drops the ShaderCache the slot owned.
// NOTE(review): whether g_shaderCacheCount is reset afterwards is not visible
// in this view.
914void MetalDeviceKernels::static_deinitialize()
916 for (
int i = 0;
i < g_shaderCacheCount;
i++) {
917 g_shaderCache[
i] = DeviceShaderCache();
BMesh const char void * data
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
void append(const uint8_t *data, const int nbytes)
CCL_NAMESPACE_BEGIN struct Options options
DebugFlags & DebugFlags()
#define KERNEL_FEATURE_NODE_RAYTRACE
#define KERNEL_FEATURE_MNEE
#define CCL_NAMESPACE_END
bool device_kernel_has_intersection(DeviceKernel kernel)
const char * device_kernel_as_string(DeviceKernel kernel)
@ KernelData_kernel_features
#define assert(assertion)
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE
@ DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS
@ DEVICE_KERNEL_SHADER_EVAL_DISPLACE
@ DEVICE_KERNEL_SHADER_EVAL_VOLUME_DENSITY
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE
@ DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE
@ DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL
@ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST
@ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND
static void error(const char *str)
static void init(bNodeTree *, bNode *node)
static void copy(bNodeTree *dest_ntree, bNode *dest_node, const bNode *src_node)
string path_cache_get(const string &sub)
string path_join(const string &dir, const string &file)
bool path_cache_kernel_exists_and_mark_used(const string &path)
void path_cache_kernel_mark_added_and_clear_old(const string &new_path, const size_t max_old_kernel_of_same_type)
void path_create_directories(const string &filepath)
bool path_remove(const string &path)
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
std::unique_lock< std::mutex > thread_scoped_lock
CCL_NAMESPACE_BEGIN double time_dt()
ccl_device_inline size_t round_down(const size_t x, const size_t multiple)