/* Constructor: binds this queue to its owning MetalDevice and sets up the
 * Metal command-submission and host/GPU synchronization machinery.
 * NOTE(review): source extraction has elided some original lines (closing
 * braces etc.); tokens below are preserved as-is. */
24MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
25 :
DeviceQueue(device), metal_device_(device), stats_(device->stats)
/* Request per-encoder execution status so GPU faults produce richer
 * command-buffer error information. */
28 command_buffer_desc_ = [[MTLCommandBufferDescriptor alloc]
init];
29 command_buffer_desc_.errorOptions = MTLCommandBufferErrorOptionEncoderExecutionStatus;
/* Borrow (do not own) the device's MTLDevice and compute command queue. */
31 mtlDevice_ = device->mtlDevice;
32 mtlCommandQueue_ = device->mtlComputeCommandQueue;
/* Shared event + listener + semaphore implement synchronize(): the GPU
 * signals the event at a given value and the listener block wakes the
 * host thread blocked on the semaphore. */
34 shared_event_ = [mtlDevice_ newSharedEvent];
38 event_queue_ = dispatch_queue_create(
"com.cycles.metal.event_queue",
nullptr);
39 shared_event_listener_ = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue_];
41 wait_semaphore_ = dispatch_semaphore_create(0);
/* CYCLES_METAL_PROFILING enables GPU timestamp sampling per encoder, but
 * only when stage-boundary counter sampling is supported by the device. */
43 if (
auto *
str = getenv(
"CYCLES_METAL_PROFILING")) {
44 if (atoi(
str) && [mtlDevice_ supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary])
47 profiling_enabled_ =
true;
48 label_command_encoders_ =
true;
/* CYCLES_METAL_DEBUG turns on verbose dispatch tracing and labels command
 * encoders so they are identifiable in GPU debug tools. */
51 if (getenv(
"CYCLES_METAL_DEBUG")) {
53 verbose_tracing_ =
true;
54 label_command_encoders_ =
true;
/* Configure optional Metal GPU-capture debugging, driven entirely by
 * environment variables:
 *   CYCLES_DEBUG_METAL_CAPTURE_KERNEL   - kernel enum index to capture
 *   CYCLES_DEBUG_METAL_CAPTURE_DISPATCH - capture the Nth dispatch of it
 *   CYCLES_DEBUG_METAL_CAPTURE_SAMPLES  - capture a whole sample block
 *   CYCLES_DEBUG_METAL_CAPTURE_LIMIT    - dispatch limit in samples mode
 *   CYCLES_DEBUG_METAL_CAPTURE_URL      - write a .gputrace to this path
 * NOTE(review): several original lines are elided by extraction. */
61void MetalDeviceQueue::setup_capture()
65 if (
auto *capture_kernel_str = getenv(
"CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
/* Kernel whose dispatches will be bracketed by the capture scope. */
67 capture_kernel_ =
DeviceKernel(atoi(capture_kernel_str));
70 capture_dispatch_counter_ = 0;
71 if (
auto *capture_dispatch_str = getenv(
"CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) {
72 capture_dispatch_counter_ = atoi(capture_dispatch_str);
74 printf(
"Capture dispatch number %d\n", capture_dispatch_counter_);
77 else if (
auto *capture_samples_str = getenv(
"CYCLES_DEBUG_METAL_CAPTURE_SAMPLES")) {
/* Samples mode: capture everything for one sample block, optionally
 * bounded by a dispatch-count limit (default effectively unbounded). */
80 capture_samples_ =
true;
81 capture_reset_counter_ = atoi(capture_samples_str);
83 capture_dispatch_counter_ = INT_MAX;
84 if (
auto *capture_limit_str = getenv(
"CYCLES_DEBUG_METAL_CAPTURE_LIMIT")) {
86 capture_dispatch_counter_ = atoi(capture_limit_str);
89 printf(
"Capturing sample block %d (dispatch limit: %d)\n",
90 capture_reset_counter_,
91 capture_dispatch_counter_);
/* Create and register a capture scope so begin/end_capture can bracket
 * individual kernel dispatches in Xcode's GPU debugger. */
99 MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
100 mtlCaptureScope_ = [captureManager newCaptureScopeWithDevice:mtlDevice_];
101 mtlCaptureScope_.label = [NSString stringWithFormat:
@"Cycles kernel dispatch"];
102 [captureManager setDefaultCaptureScope:mtlCaptureScope_];
104 label_command_encoders_ =
true;
/* Optionally stream the capture to a .gputrace document on disk instead
 * of requiring an attached Xcode session. */
106 if (
auto *capture_url = getenv(
"CYCLES_DEBUG_METAL_CAPTURE_URL")) {
107 if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {
109 MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc]
init];
110 captureDescriptor.captureObject = mtlCaptureScope_;
111 captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument;
112 captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)];
115 if (![captureManager startCaptureWithDescriptor:captureDescriptor
error:&
error]) {
116 NSString *err = [
error localizedDescription];
117 printf(
"Start capture failed: %s\n", [err UTF8String]);
120 printf(
"Capture started (URL: %s)\n", capture_url);
121 is_capturing_to_disk_ =
true;
125 printf(
"Capture to file is not supported\n");
/* Advance the capture state machine for one dispatch of `kernel`.
 * Counts down capture_dispatch_counter_ / capture_reset_counter_ so that
 * capture starts/stops at the configured dispatch or sample boundary.
 * NOTE(review): the begin/end-capture calls this counting gates are elided
 * by extraction (original lines between the fragments below). */
130void MetalDeviceQueue::update_capture(
DeviceKernel kernel)
/* One more dispatch consumed toward the capture window. */
134 capture_dispatch_counter_ -= 1;
/* Counter went negative: the capture window is over. */
142 if (capture_dispatch_counter_ < 0) {
/* Dispatch of the targeted kernel. */
148 if (kernel == capture_kernel_) {
150 if (capture_dispatch_counter_ == 0) {
153 capture_dispatch_counter_ -= 1;
/* Samples mode: count down whole sample blocks instead. */
158 if (capture_samples_) {
160 if (capture_reset_counter_ == 0) {
165 capture_reset_counter_ -= 1;
/* Open the Metal capture scope so subsequent dispatches are recorded.
 * NOTE(review): the body of the `if (mtlCommandBuffer_)` branch is elided
 * by extraction -- presumably it flushes pending work first; confirm
 * against the full source. */
171void MetalDeviceQueue::begin_capture()
174 if (mtlCommandBuffer_) {
177 [mtlCaptureScope_ beginScope];
178 printf(
"[mtlCaptureScope_ beginScope]\n");
179 is_capturing_ =
true;
/* Close the capture scope, and when capturing to a .gputrace file on disk,
 * also stop the MTLCaptureManager session.  has_captured_to_disk_ is later
 * checked by synchronize() to halt further rendering. */
182void MetalDeviceQueue::end_capture()
184 [mtlCaptureScope_ endScope];
185 is_capturing_ =
false;
186 printf(
"[mtlCaptureScope_ endScope]\n");
188 if (is_capturing_to_disk_) {
189 [[MTLCaptureManager sharedCaptureManager] stopCapture];
190 has_captured_to_disk_ =
true;
191 is_capturing_to_disk_ =
false;
192 is_capturing_ =
false;
193 printf(
"Capture stopped\n");
/* Destructor: verifies all submitted work completed, releases Metal
 * objects, and (when stats were gathered) prints a per-kernel dispatch
 * timing table to stdout.
 * NOTE(review): loop headers and several accumulator declarations
 * (num_dispatches, header, total_time, ...) are elided by extraction. */
197MetalDeviceQueue::~MetalDeviceQueue()
/* All command buffers must have been synchronized before destruction. */
201 assert(mtlCommandBuffer_ == nil);
202 assert(command_buffers_submitted_ == command_buffers_completed_);
204 close_compute_encoder();
205 close_blit_encoder();
207 [shared_event_listener_ release];
208 [shared_event_ release];
209 [command_buffer_desc_ release];
211 if (mtlCaptureScope_) {
212 [mtlCaptureScope_ release];
/* First pass over timing_stats_: total dispatch counts, split out the
 * path-tracing kernels (loop header elided by extraction). */
219 int64_t num_pathtracing_dispatches = 0;
221 auto &stat = timing_stats_[
i];
225 num_dispatches += stat.num_dispatches;
226 num_pathtracing_dispatches += pathtracing_kernel ? stat.num_dispatches : 0;
228 bool has_extra = (num_pathtracing_dispatches && num_dispatches > num_pathtracing_dispatches);
/* Second pass: print the stats table only if anything was dispatched. */
230 if (num_dispatches) {
231 printf(
"\nMetal %sdispatch stats:\n", num_pathtracing_dispatches ?
"path-tracing " :
"");
239 auto divider = string(header.length(),
'-');
240 printf(
"%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str());
243 auto &stat = timing_stats_[
i];
247 if ((pathtracing_kernel && num_pathtracing_dispatches) || stat.num_dispatches > 0) {
248 printf(
"%-40s %16llu %12llu %12llu %9.4f %9.2f\n",
250 stat.total_work_size,
252 stat.total_work_size / stat.num_dispatches,
257 printf(
"%s\n", divider.c_str());
/* Footer row with grand totals. */
260 printf(
"%s\n", divider.c_str());
261 printf(
"%-40s %16s %12llu %12s %9.4f %9.2f\n",
"",
"", num_dispatches,
"",
total_time, 100.0);
262 printf(
"%s\n\n", divider.c_str());
/* Choose how many integrator states to allocate for this state_size.
 * On non-M1 Apple GPUs the base count (computation elided by extraction)
 * is doubled when the recommended working set leaves enough headroom --
 * at least max(system_ram/8, 1GiB) -- after current allocations plus the
 * doubled state memory. */
266int MetalDeviceQueue::num_concurrent_states(
const size_t state_size)
const
276 if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) != APPLE_M1) {
/* Memory already allocated on the device vs. what Metal recommends. */
278 size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
279 size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];
/* Keep at least 1/8 of system RAM (>= 1 GiB) free for everything else. */
284 size_t min_headroom = std::max(system_ram / 8,
size_t(1024 * 1024 * 1024));
285 size_t total_state_size =
result * state_size;
286 if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
288 metal_printf(
"Doubling state count to exploit available RAM (new size = %d)",
result);
/* Number of states that should be kept busy before scheduling more work:
 * a quarter of the total concurrent state count for this state size. */
int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
{
  const int total_states = num_concurrent_states(state_size);
  return total_states / 4;
}
/* Decide how many partitions to use for shader sorting.  Partitioning is
 * only worthwhile for scenes with few shaders (< 300); otherwise the
 * elided fallthrough (extraction removed it) presumably returns 1. */
300int MetalDeviceQueue::num_sort_partitions(
int max_num_paths,
uint max_scene_shaders)
const
302 int sort_partition_elements = MetalInfo::optimal_sort_partition_elements();
307 if (max_scene_shaders < 300 && sort_partition_elements > 0) {
/* At least one partition, even when paths < elements-per-partition. */
308 return max(max_num_paths / sort_partition_elements, 1);
/* Whether local (threadgroup) atomic sorting is available; simply defers
 * to the owning device's capability flag. */
bool MetalDeviceQueue::supports_local_atomic_sort() const
{
  return metal_device_->use_local_atomic_sort();
}
/* Zero out slot `index` in an argument buffer before a resource handle is
 * written into it.  NOTE(review): the body is elided by extraction. */
320static void zero_resource(
void *address_in_arg_buffer,
int index = 0)
/* Write a GPU resource ID for `resource` into slot `index` of an argument
 * buffer (generic overload for textures, acceleration structures, function
 * tables, ...).  The slot is zeroed first so a null resource leaves a
 * clean entry.  NOTE(review): the declaration of `pptr` is elided by
 * extraction -- presumably a typed pointer into address_in_arg_buffer. */
326template<
class T>
void write_resource(
void *address_in_arg_buffer,
T resource,
int index = 0)
328 zero_resource(address_in_arg_buffer, index);
331 pptr[index] = metal_gpuResourceID(
resource);
/* Specialization for MTLBuffer: buffers are referenced by GPU address
 * rather than by resource ID.  NOTE(review): `pptr` declaration elided by
 * extraction. */
335template<>
void write_resource(
void *address_in_arg_buffer, id<MTLBuffer> buffer,
int index)
337 zero_resource(address_in_arg_buffer, index);
340 pptr[index] = metal_gpuAddress(buffer);
/* Replace a MetalMem* stored in an argument-buffer slot with the GPU
 * address of its underlying MTLBuffer, returning that buffer so the caller
 * can mark it resident (useResource:).  NOTE(review): `pptr` declaration
 * and the nil-return fallthrough are elided by extraction. */
344static id<MTLBuffer> patch_resource(
void *address_in_arg_buffer,
int index = 0)
347 if (MetalDevice::MetalMem *mmem = (MetalDevice::MetalMem *)pptr[index]) {
348 write_resource<id<MTLBuffer>>(address_in_arg_buffer, mmem->mtlBuffer, index);
349 return mmem->mtlBuffer;
/* One-time setup before kernel execution: publish the BLAS array and the
 * texture bindings into their GPU argument buffers.  Textures are written
 * as resource IDs; buffer-backed "textures" have their GPU address written
 * directly into the texture_info data field.
 * NOTE(review): declarations of `blas_array`, `bindings` and
 * `texture_info` are elided by extraction. */
354void MetalDeviceQueue::init_execution()
358 for (
uint64_t slot = 0; slot < metal_device_->blas_array.size(); ++slot) {
359 write_resource(blas_array, metal_device_->blas_array[slot], slot);
363 id<MTLBuffer> &texture_bindings = metal_device_->texture_bindings;
364 std::vector<id<MTLResource>> &texture_slot_map = metal_device_->texture_slot_map;
/* Start from a zeroed binding table so unused slots are well-defined. */
371 memset(bindings, 0, texture_bindings.length);
372 for (
int slot = 0; slot < texture_info.
size(); ++slot) {
373 if (texture_slot_map[slot]) {
374 if (metal_device_->is_texture(texture_info[slot])) {
375 write_resource(bindings, id<MTLTexture>(texture_slot_map[slot]), slot);
/* Non-texture slot: store the buffer's GPU address in texture_info. */
379 write_resource(&texture_info[slot].
data, id<MTLBuffer>(texture_slot_map[slot]), 0);
/* NOTE(review): this is the body of MetalDeviceQueue::enqueue(kernel,
 * work_size, args); the signature itself is elided by extraction.
 * Encodes one compute dispatch: builds the dynamic/ancillary argument
 * buffers, marks resources resident, dispatches, and installs a
 * completion handler for error reporting and tracing. */
393 update_capture(kernel);
/* Bail out early if the device is already in an error state. */
395 if (metal_device_->have_error()) {
404 id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
/* Remember which kernel ran in this encoder so flush_timing_stats() can
 * attribute GPU timestamps to it. */
406 if (profiling_enabled_) {
407 command_encoder_labels_.push_back({kernel,
work_size, current_encoder_idx_});
409 if (label_command_encoders_) {
/* Human-readable label for GPU debug tools. */
411 mtlComputeCommandEncoder.label = [NSString
412 stringWithFormat:
@"Metal queue launch %s, work_size %d",
/* Lazily (re)build the pipeline state for this kernel. */
417 if (!active_pipelines_[kernel].
update(metal_device_, kernel)) {
418 metal_device_->set_error(
422 MetalDispatchPipeline &active_pipeline = active_pipelines_[kernel];
/* Pack the caller's kernel arguments into a small stack buffer, aligning
 * each argument to its own size; MetalMem pointers are patched to GPU
 * buffer addresses and the buffers marked resident. */
424 uint8_t dynamic_args[512] = {0};
427 size_t dynamic_bytes_written = 0;
428 size_t max_size_in_bytes = 0;
429 for (
size_t i = 0;
i < args.
count;
i++) {
430 size_t size_in_bytes = args.
sizes[
i];
431 max_size_in_bytes =
max(max_size_in_bytes, size_in_bytes);
432 dynamic_bytes_written =
round_up(dynamic_bytes_written, size_in_bytes);
433 memcpy(dynamic_args + dynamic_bytes_written, args.
values[
i], size_in_bytes);
435 if (id<MTLBuffer> buffer = patch_resource(dynamic_args + dynamic_bytes_written)) {
436 [mtlComputeCommandEncoder useResource:buffer
437 usage:MTLResourceUsageRead | MTLResourceUsageWrite];
440 dynamic_bytes_written += size_in_bytes;
/* Round the total up to the largest member alignment. */
444 dynamic_bytes_written =
round_up(dynamic_bytes_written, max_size_in_bytes);
447 assert(dynamic_bytes_written <=
sizeof(dynamic_args));
/* Ancillary argument buffer: texture bindings, then (with MetalRT) the
 * TLAS, BLAS buffer and intersection function tables, in fixed slots. */
449 uint64_t ancillary_args[ANCILLARY_SLOT_COUNT] = {0};
452 int ancillary_index = 0;
453 write_resource(ancillary_args, metal_device_->texture_bindings, ancillary_index++);
455 if (metal_device_->use_metalrt) {
456 write_resource(ancillary_args, metal_device_->accel_struct, ancillary_index++);
457 write_resource(ancillary_args, metal_device_->blas_buffer, ancillary_index++);
460 for (
int table_idx = 0; table_idx < METALRT_TABLE_NUM; table_idx++) {
462 ancillary_args, active_pipeline.intersection_func_table[table_idx], ancillary_index++);
464 assert(ancillary_index == ANCILLARY_SLOT_COUNT);
/* Bind launch params into each intersection function table and mark the
 * tables resident for this encoder. */
468 if (metal_device_->use_metalrt) {
469 for (
int table = 0; table < METALRT_TABLE_NUM; table++) {
470 if (active_pipeline.intersection_func_table[table]) {
471 [active_pipeline.intersection_func_table[table]
472 setBuffer:metal_device_->launch_params_buffer
475 [mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
476 usage:MTLResourceUsageRead];
/* Buffer bindings: 0 = dynamic args, 1 = launch params, 2 = ancillary. */
481 [mtlComputeCommandEncoder setBytes:dynamic_args
length:dynamic_bytes_written atIndex:0];
482 [mtlComputeCommandEncoder setBuffer:metal_device_->launch_params_buffer offset:0 atIndex:1];
483 [mtlComputeCommandEncoder setBytes:ancillary_args
length:
sizeof(ancillary_args) atIndex:2];
/* Mark the acceleration structures resident (macOS 12+ API). */
486 if (@available(macos 12.0, *)) {
488 if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
490 [mtlComputeCommandEncoder useResource:accel_struct usage:MTLResourceUsageRead];
491 if (metal_device_->blas_buffer) {
492 [mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
493 usage:MTLResourceUsageRead];
495 [mtlComputeCommandEncoder useResources:metal_device_->unique_blas_array.data()
496 count:metal_device_->unique_blas_array.size()
497 usage:MTLResourceUsageRead];
502 [mtlComputeCommandEncoder setComputePipelineState:active_pipeline.pipeline];
505 const int num_threads_per_block = active_pipeline.num_threads_per_block;
/* Threadgroup memory: sort kernels need scratch sized either per-thread
 * (+1) or per-shader-key, rounded to 16 bytes (kernel cases elided). */
507 int shared_mem_bytes = 0;
520 shared_mem_bytes = (int)
round_up((num_threads_per_block + 1) *
sizeof(
int), 16);
525 int key_count = metal_device_->launch_params->data.max_shaders;
526 shared_mem_bytes = (int)
round_up(key_count *
sizeof(
int), 16);
534 if (shared_mem_bytes) {
535 assert(shared_mem_bytes <= 32 * 1024);
536 [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
/* Dispatch exactly work_size threads in 1D. */
539 MTLSize size_threads_per_dispatch = MTLSizeMake(
work_size, 1, 1);
540 MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
541 [mtlComputeCommandEncoder dispatchThreads:size_threads_per_dispatch
542 threadsPerThreadgroup:size_threads_per_threadgroup];
/* Completion handler: surface GPU errors and, with verbose tracing,
 * print per-dispatch timing and integrator queue-counter buckets. */
544 [mtlCommandBuffer_ addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
547 if (command_buffer.status != MTLCommandBufferStatusCompleted) {
549 int(command_buffer.status));
551 if (command_buffer.error) {
553 const char *errCStr = [[NSString stringWithFormat:
@"%@", command_buffer.error]
556 kernel_type_as_string(active_pipeline.pso_type),
562 metal_device_->set_error(
str);
566 if (verbose_tracing_ || is_capturing_) {
571 if (verbose_tracing_) {
574 "_____________________________________.____________________.______________._________"
576 "______________________________________\n");
579 printf(
"%-40s| %7d threads |%5.2fms | buckets [",
582 last_completion_time_ * 1000.0);
/* Guard metal_mem_map while searching for the queue-counter buffer. */
583 std::lock_guard<std::recursive_mutex>
lock(metal_device_->metal_mem_map_mutex);
584 for (
auto &it : metal_device_->metal_mem_map) {
585 const string c_integrator_queue_counter =
"integrator_queue_counter";
586 if (it.first->name == c_integrator_queue_counter) {
588 it.first->host_pointer)
591 printf(
"%s%d",
i == 0 ?
"" :
",", queue_counter->num_queued[
i]);
603 return !(metal_device_->have_error());
/* Resolve GPU counter timestamps recorded for each labelled compute
 * encoder and fold them into the per-kernel timing_stats_ table, then
 * clear the pending labels. */
607void MetalDeviceQueue::flush_timing_stats()
609 for (
auto label : command_encoder_labels_) {
610 TimingStats &stat = timing_stats_[label.kernel];
612 double completion_time_gpu;
/* Each encoder wrote a start/end timestamp pair at label.timing_id. */
613 NSData *computeTimeStamps = [metal_device_->mtlCounterSampleBuffer
614 resolveCounterRange:NSMakeRange(label.timing_id, 2)];
615 MTLCounterResultTimestamp *timestamps = (MTLCounterResultTimestamp *)(computeTimeStamps.bytes);
617 uint64_t begTime = timestamps[0].timestamp;
618 uint64_t endTime = timestamps[1].timestamp;
/* Timestamps are in nanoseconds; convert the delta to seconds. */
619 completion_time_gpu = (endTime - begTime) / (
double)NSEC_PER_SEC;
621 stat.num_dispatches++;
622 stat.total_time += completion_time_gpu;
623 stat.total_work_size += label.work_size;
624 last_completion_time_ = completion_time_gpu;
626 command_encoder_labels_.clear();
/* Block the host until all work submitted on this queue has completed.
 * Uses a shared-event value signalled at the end of the command buffer
 * plus a listener block that releases wait_semaphore_.  Returns false if
 * the device is in an error state (or a disk capture finished). */
629bool MetalDeviceQueue::synchronize()
632 if (has_captured_to_disk_ || metal_device_->have_error()) {
/* Open encoders must be ended before the command buffer can commit. */
636 close_compute_encoder();
637 close_blit_encoder();
639 if (mtlCommandBuffer_) {
/* Monotonic event value identifying this synchronization point. */
642 uint64_t shared_event_id_ = this->shared_event_id_++;
644 __block dispatch_semaphore_t block_sema = wait_semaphore_;
645 [shared_event_ notifyListener:shared_event_listener_
646 atValue:shared_event_id_
647 block:^(id<MTLSharedEvent> ,
uint64_t ) {
648 dispatch_semaphore_signal(block_sema);
/* GPU signals the event after all prior encoded work finishes. */
651 [mtlCommandBuffer_ encodeSignalEvent:shared_event_ value:shared_event_id_];
652 [mtlCommandBuffer_ commit];
653 dispatch_semaphore_wait(wait_semaphore_, DISPATCH_TIME_FOREVER);
655 [mtlCommandBuffer_ release];
/* Safe point to free memory that was deferred while the GPU ran. */
657 metal_device_->flush_delayed_free_list();
659 mtlCommandBuffer_ = nil;
660 flush_timing_stats();
665 return !(metal_device_->have_error());
/* NOTE(review): function signature elided by extraction -- presumably
 * MetalDeviceQueue::zero_to_device(device_memory &mem).  Allocates the
 * buffer if needed, then zeroes it with a blit fill when it lives in a
 * MTLBuffer, falling back to the device's mem_zero otherwise. */
672 if (metal_device_->have_error()) {
684 metal_device_->mem_alloc(mem);
/* Guard metal_mem_map while looking up the backing MetalMem. */
690 std::lock_guard<std::recursive_mutex>
lock(metal_device_->metal_mem_map_mutex);
691 MetalDevice::MetalMem &mmem = *metal_device_->metal_mem_map.at(&mem);
692 if (mmem.mtlBuffer) {
693 id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
694 [blitEncoder fillBuffer:mmem.mtlBuffer range:NSMakeRange(mmem.offset, mmem.size) value:0];
697 metal_device_->mem_zero(mem);
/* NOTE(review): fragment of another memory-transfer method (signature and
 * most of the body elided by extraction); allocates on demand after an
 * error check, like the zeroing path above. */
705 if (metal_device_->have_error()) {
715 metal_device_->mem_alloc(mem);
/* NOTE(review): body of MetalDeviceQueue::prepare_resources (signature
 * elided by extraction).  Marks every tracked buffer/texture resident on
 * the current compute encoder with the appropriate usage flags. */
731 std::lock_guard<std::recursive_mutex>
lock(metal_device_->metal_mem_map_mutex);
734 for (
auto &it : metal_device_->metal_mem_map) {
737 MTLResourceUsage usage = MTLResourceUsageRead;
/* Writable memory additionally gets write usage (condition elided). */
739 usage |= MTLResourceUsageWrite;
742 if (it.second->mtlBuffer) {
744 [mtlComputeEncoder_ useResource:it.second->mtlBuffer usage:usage];
746 else if (it.second->mtlTexture) {
/* Textures may also be sampled by the kernels. */
748 [mtlComputeEncoder_ useResource:it.second->mtlTexture usage:usage | MTLResourceUsageSample];
/* The texture-bindings argument buffer itself is read-only. */
753 [mtlComputeEncoder_ useResource:metal_device_->texture_bindings usage:MTLResourceUsageRead];
/* Return a compute encoder ready for `kernel`, reusing the open one when
 * its dispatch type matches, otherwise closing encoders and creating a
 * fresh one (with counter sampling attached when profiling).
 * NOTE(review): the `concurrent` flag's computation is elided by
 * extraction, and the reuse condition at original line 766 appears
 * truncated mid-expression. */
756id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(
DeviceKernel kernel)
/* Profiling samples per encoder, so never reuse an encoder then. */
760 if (profiling_enabled_) {
762 close_compute_encoder();
765 if (mtlComputeEncoder_) {
766 if (mtlComputeEncoder_.dispatchType == concurrent ? MTLDispatchTypeConcurrent :
767 MTLDispatchTypeSerial)
/* Matching dispatch type: just refresh resource residency. */
770 prepare_resources(kernel);
772 return mtlComputeEncoder_;
774 close_compute_encoder();
777 close_blit_encoder();
/* Lazily create and retain the command buffer on first use. */
779 if (!mtlCommandBuffer_) {
780 mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
781 [mtlCommandBuffer_ retain];
784 if (profiling_enabled_) {
/* Attach a start/end timestamp pair from the device's counter sample
 * buffer; indices wrap modulo MAX_SAMPLE_BUFFER_LENGTH. */
785 MTLComputePassDescriptor *desc = [[MTLComputePassDescriptor alloc]
init];
787 current_encoder_idx_ = (counter_sample_buffer_curr_idx_.fetch_add(2) %
788 MAX_SAMPLE_BUFFER_LENGTH);
789 [desc.sampleBufferAttachments[0] setSampleBuffer:metal_device_->mtlCounterSampleBuffer];
790 [desc.sampleBufferAttachments[0] setStartOfEncoderSampleIndex:current_encoder_idx_];
791 [desc.sampleBufferAttachments[0] setEndOfEncoderSampleIndex:current_encoder_idx_ + 1];
793 [desc setDispatchType:concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial];
795 mtlComputeEncoder_ = [mtlCommandBuffer_ computeCommandEncoderWithDescriptor:desc];
798 mtlComputeEncoder_ = [mtlCommandBuffer_
799 computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
800 MTLDispatchTypeSerial];
803 [mtlComputeEncoder_ retain];
807 prepare_resources(kernel);
809 return mtlComputeEncoder_;
/* Return a blit encoder for buffer-to-buffer operations, opening one on
 * demand.  Only one encoder kind may be open at a time, so any active
 * compute encoder is closed first. */
id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
{
  /* Reuse the already-open blit encoder when possible. */
  if (mtlBlitEncoder_) {
    return mtlBlitEncoder_;
  }

  close_compute_encoder();

  /* Lazily create and retain the command buffer on first use. */
  if (mtlCommandBuffer_ == nil) {
    mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
    [mtlCommandBuffer_ retain];
  }

  mtlBlitEncoder_ = [mtlCommandBuffer_ blitCommandEncoder];
  [mtlBlitEncoder_ retain];
  return mtlBlitEncoder_;
}
/* End and release the active compute encoder, if any. */
void MetalDeviceQueue::close_compute_encoder()
{
  if (mtlComputeEncoder_ == nil) {
    return;
  }
  [mtlComputeEncoder_ endEncoding];
  [mtlComputeEncoder_ release];
  mtlComputeEncoder_ = nil;
}
/* End and release the active blit encoder, if any. */
void MetalDeviceQueue::close_blit_encoder()
{
  if (mtlBlitEncoder_ == nil) {
    return;
  }
  [mtlBlitEncoder_ endEncoding];
  [mtlBlitEncoder_ release];
  mtlBlitEncoder_ = nil;
}
/* Expose the underlying MTLCommandQueue as an opaque pointer for
 * interop with code outside the Cycles device abstraction. */
void *MetalDeviceQueue::native_queue()
{
  return mtlCommandQueue_;
}
/* NOTE(review): fragment of a factory method (signature elided by
 * extraction -- presumably graphics_interop_create()); constructs a
 * Metal graphics-interop object bound to this queue. */
 855 return make_unique<MetalDeviceGraphicsInterop>(
this);
BMesh const char void * data
unsigned long long int uint64_t
device_ptr device_pointer
#define CCL_NAMESPACE_END
bool device_kernel_has_intersection(DeviceKernel kernel)
const char * device_kernel_as_string(DeviceKernel kernel)
#define assert(assertion)
float length(VecOp< float, D >) RET
ccl_gpu_kernel_postfix const ccl_global int ccl_global float const int work_size
@ DEVICE_KERNEL_INTEGRATOR_NUM
@ DEVICE_KERNEL_INTEGRATOR_RESET
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS
@ DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL
@ DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY
static void error(const char *str)
static void update(bNodeTree *ntree)
static void init(bNodeTree *, bNode *node)
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
size_t system_physical_ram()
ccl_device_inline size_t round_up(const size_t x, const size_t multiple)