#define MAX_SAMPLE_BUFFER_LENGTH 4096
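/* counter_sample_buffer_ records a begin/end GPU timestamp pair per compute encoder, so it
 * holds MAX_SAMPLE_BUFFER_LENGTH / 2 encoder timings before the indices wrap (see
 * get_compute_encoder, where current_encoder_idx_ advances by 2 modulo this length). */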
MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
    : DeviceQueue(device), metal_device_(device), stats_(device->stats)
{
  command_buffer_desc_ = [[MTLCommandBufferDescriptor alloc] init];
  command_buffer_desc_.errorOptions = MTLCommandBufferErrorOptionEncoderExecutionStatus;

  mtlDevice_ = device->mtlDevice;
  mtlCommandQueue_ = device->mtlComputeCommandQueue;

  shared_event_ = [mtlDevice_ newSharedEvent];

  event_queue_ = dispatch_queue_create("com.cycles.metal.event_queue", NULL);
  shared_event_listener_ = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue_];

  wait_semaphore_ = dispatch_semaphore_create(0);
  if (auto str = getenv("CYCLES_METAL_PROFILING")) {
    if (atoi(str) && [mtlDevice_ supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary])
    {
      /* Enable a per-kernel timing breakdown, requires stage-boundary counter sampling. */
      profiling_enabled_ = true;
      label_command_encoders_ = true;
    }
  }
  NSArray<id<MTLCounterSet>> *counterSets = [mtlDevice_ counterSets];

  MTLCounterSampleBufferDescriptor *desc = [[MTLCounterSampleBufferDescriptor alloc] init];
  [desc setStorageMode:MTLStorageModeShared];
  [desc setLabel:@"CounterSampleBuffer"];
  [desc setSampleCount:MAX_SAMPLE_BUFFER_LENGTH];
  [desc setCounterSet:counterSets[0]];

  NSError *error = nil;
  counter_sample_buffer_ = [mtlDevice_ newCounterSampleBufferWithDescriptor:desc error:&error];
  [counter_sample_buffer_ retain];
  if (getenv("CYCLES_METAL_DEBUG")) {
    /* Enable very verbose tracing of every dispatch. */
    verbose_tracing_ = true;
    label_command_encoders_ = true;
  }

  setup_capture();
}
void MetalDeviceQueue::setup_capture()
{
  if (auto capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
    /* Capture dispatches of the specified kernel. */
    capture_kernel_ = DeviceKernel(atoi(capture_kernel_str));

    capture_dispatch_counter_ = 0;
    if (auto capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) {
      /* Optionally capture only the Nth dispatch of that kernel. */
      capture_dispatch_counter_ = atoi(capture_dispatch_str);
    }
    printf("Capture dispatch number %d\n", capture_dispatch_counter_);
  }
  else if (auto capture_samples_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_SAMPLES")) {
    /* Capture a whole block of dispatches, starting at the specified sample. */
    capture_samples_ = true;
    capture_reset_counter_ = atoi(capture_samples_str);

    capture_dispatch_counter_ = INT_MAX;
    if (auto capture_limit_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_LIMIT")) {
      /* Optionally cap the number of dispatches captured. */
      capture_dispatch_counter_ = atoi(capture_limit_str);
    }
    printf("Capturing sample block %d (dispatch limit: %d)\n",
           capture_reset_counter_,
           capture_dispatch_counter_);
  }
  else {
    /* No capturing requested. */
    return;
  }

  MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
  mtlCaptureScope_ = [captureManager newCaptureScopeWithDevice:mtlDevice_];
  mtlCaptureScope_.label = [NSString stringWithFormat:@"Cycles kernel dispatch"];
  [captureManager setDefaultCaptureScope:mtlCaptureScope_];

  label_command_encoders_ = true;
  if (auto capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) {
    if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {
      MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init];
      captureDescriptor.captureObject = mtlCaptureScope_;
      captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument;
      captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)];

      NSError *error;
      if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) {
        NSString *err = [error localizedDescription];
        printf("Start capture failed: %s\n", [err UTF8String]);
      }
      else {
        printf("Capture started (URL: %s)\n", capture_url);
        is_capturing_to_disk_ = true;
      }
    }
    else {
      printf("Capture to file is not supported\n");
    }
  }
}
void MetalDeviceQueue::update_capture(DeviceKernel kernel)
{
  /* While capturing, count down the dispatch budget and stop when it is exhausted. */
  if (is_capturing_) {
    capture_dispatch_counter_ -= 1;
    if (capture_dispatch_counter_ <= 0) {
      end_capture();
    }
    return;
  }

  if (capture_dispatch_counter_ < 0) {
    return;
  }

  /* Start a single-kernel capture at the Nth dispatch of the requested kernel. */
  if (kernel == capture_kernel_) {
    if (capture_dispatch_counter_ == 0) {
      begin_capture();
    }
    capture_dispatch_counter_ -= 1;
    return;
  }

  /* Start a sample-block capture once the reset countdown reaches zero. */
  if (capture_samples_) {
    if (capture_reset_counter_ == 0) {
      begin_capture();
    }
    capture_reset_counter_ -= 1;
  }
}
void MetalDeviceQueue::begin_capture()
{
  /* Flush any in-flight work before the capture scope opens. */
  if (mtlCommandBuffer_) {
    synchronize();
  }
  [mtlCaptureScope_ beginScope];
  printf("[mtlCaptureScope_ beginScope]\n");
  is_capturing_ = true;
}
void MetalDeviceQueue::end_capture()
{
  [mtlCaptureScope_ endScope];
  is_capturing_ = false;
  printf("[mtlCaptureScope_ endScope]\n");

  if (is_capturing_to_disk_) {
    [[MTLCaptureManager sharedCaptureManager] stopCapture];
    has_captured_to_disk_ = true;
    is_capturing_to_disk_ = false;
    is_capturing_ = false;
    printf("Capture stopped\n");
  }
}
MetalDeviceQueue::~MetalDeviceQueue()
{
  /* The queue is expected to be empty at this point. */
  assert(mtlCommandBuffer_ == nil);
  assert(command_buffers_submitted_ == command_buffers_completed_);

  close_compute_encoder();
  close_blit_encoder();

  [shared_event_listener_ release];
  [shared_event_ release];
  [command_buffer_desc_ release];

  if (counter_sample_buffer_) {
    [counter_sample_buffer_ release];
  }

  if (mtlCaptureScope_) {
    [mtlCaptureScope_ release];
  }
  /* Print the per-kernel dispatch stats gathered under CYCLES_METAL_PROFILING. */
  double total_time = 0.0;
  int64_t num_dispatches = 0;
  int64_t num_pathtracing_dispatches = 0;
  for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
    auto &stat = timing_stats_[i];
    bool pathtracing_kernel = device_kernel_has_intersection(DeviceKernel(i));
    total_time += stat.total_time;
    num_dispatches += stat.num_dispatches;
    num_pathtracing_dispatches += pathtracing_kernel ? stat.num_dispatches : 0;
  }
  bool has_extra = (num_pathtracing_dispatches && num_dispatches > num_pathtracing_dispatches);

  if (num_dispatches) {
    printf("\nMetal %sdispatch stats:\n", num_pathtracing_dispatches ? "path-tracing " : "");
    auto header = string_printf("%-40s %16s %12s %12s %9s %9s",
                                "Kernel",
                                "Total threads",
                                "Dispatches",
                                "Avg. T/D",
                                "Time",
                                "Time%");
    auto divider = string(header.length(), '-');
    printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str());

    for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
      auto &stat = timing_stats_[i];
      bool pathtracing_kernel = device_kernel_has_intersection(DeviceKernel(i));
      if ((pathtracing_kernel && num_pathtracing_dispatches) || stat.num_dispatches > 0) {
        printf("%-40s %16llu %12llu %12llu %9.4f %9.2f\n",
               device_kernel_as_string(DeviceKernel(i)),
               stat.total_work_size,
               stat.num_dispatches,
               stat.num_dispatches ? stat.total_work_size / stat.num_dispatches : 0,
               stat.total_time,
               total_time ? stat.total_time * 100.0 / total_time : 0.0);
      }
    }
    if (has_extra) {
      /* Divider between the path-tracing kernels and any extra kernels shown. */
      printf("%s\n", divider.c_str());
    }

    printf("%s\n", divider.c_str());
    printf("%-40s %16s %12llu %12s %9.4f %9.2f\n",
           "",
           "",
           num_dispatches,
           "",
           total_time,
           100.0);
    printf("%s\n\n", divider.c_str());
  }
}
int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
{
  static int result = 0;
  if (result) {
    return result;
  }

  result = 1048576;

  /* Increasing the state count doesn't notably benefit M1-family systems. */
  if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) != APPLE_M1) {
    size_t system_ram = system_physical_ram();
    size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
    size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];

    /* Leave at least min_headroom of available GPU memory (1/8 of system RAM or 1 GiB,
     * whichever is larger). If doubling the state count still fits within that budget, do so
     * to keep dispatch sizes high and minimize work submission overheads. */
    size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
    size_t total_state_size = result * state_size;
    if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
      result *= 2;
      metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
    }
  }
  return result;
}
int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
{
  return num_concurrent_states(state_size) / 4;
}
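/* The quarter divisor above reports the threshold at which the queue is considered busy
 * enough; the scheduler uses this to decide when to enqueue more work. It is a tuning
 * heuristic rather than a hard architectural limit. */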
int MetalDeviceQueue::num_sort_partition_elements() const
{
  return MetalInfo::optimal_sort_partition_elements();
}

bool MetalDeviceQueue::supports_local_atomic_sort() const
{
  return metal_device_->use_local_atomic_sort();
}

void MetalDeviceQueue::init_execution()
{
  /* Synchronize texture info before executing tasks. */
  metal_device_->load_texture_info();
}

bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
                               const int work_size,
                               DeviceKernelArguments const &args)
{
  update_capture(kernel);
  if (metal_device_->have_error()) {
    return false;
  }

  VLOG_DEVICE_STATS << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
                    << work_size;

  id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);

  if (profiling_enabled_) {
    command_encoder_labels_.push_back({kernel, work_size, current_encoder_idx_});
  }
  /* Determine the size requirement for the argument buffer. */
  size_t arg_buffer_length = 0;
  for (size_t i = 0; i < args.count; i++) {
    size_t size_in_bytes = args.sizes[i];
    arg_buffer_length = round_up(arg_buffer_length, size_in_bytes) + size_in_bytes;
  }
  /* Align to 256 bytes. */
  arg_buffer_length = round_up(arg_buffer_length, 256);

  /* Globals follow the kernel arguments. */
  size_t globals_offsets = arg_buffer_length;
  arg_buffer_length += sizeof(KernelParamsMetal);
  arg_buffer_length = round_up(arg_buffer_length, 256);

  /* Ancillary argument-encoder data follows the globals. */
  size_t metal_offsets = arg_buffer_length;
  arg_buffer_length += metal_device_->mtlAncillaryArgEncoder.encodedLength;
  arg_buffer_length = round_up(arg_buffer_length,
                               metal_device_->mtlAncillaryArgEncoder.alignment);
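  /* The argument buffer sized above has three 256-byte-aligned sections, bound to the
   * encoder at buffer indices 0-2 further down:
   *   [0, globals_offsets)              - flattened kernel arguments
   *   [globals_offsets, metal_offsets)  - KernelParamsMetal globals
   *   [metal_offsets, ...)              - ancillary argument encoder data */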
  /* Stage the argument buffer contents on the host first. */
  std::vector<uint8_t> arg_buffer_host(arg_buffer_length);
  uint8_t *init_arg_buffer = arg_buffer_host.data();
  memset(init_arg_buffer, 0, arg_buffer_length);

  /* Write the kernel arguments. Pointer arguments are encoded separately below. */
  size_t bytes_written = 0;
  for (size_t i = 0; i < args.count; i++) {
    size_t size_in_bytes = args.sizes[i];
    bytes_written = round_up(bytes_written, size_in_bytes);
    if (args.types[i] != DeviceKernelArguments::POINTER) {
      memcpy(init_arg_buffer + bytes_written, args.values[i], size_in_bytes);
    }
    bytes_written += size_in_bytes;
  }

  /* Write the plain-old-data tail of launch_params into the globals section. Everything
   * before it is the block of device pointers, which is encoded separately below. */
  const size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
                                              sizeof(IntegratorStateGPU);
  size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
  memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
         (uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
         plain_old_launch_data_size);
  /* Allocate an argument buffer from the temp pool. */
  MTLResourceOptions arg_buffer_options = MTLResourceStorageModeManaged;
  if ([mtlDevice_ hasUnifiedMemory]) {
    arg_buffer_options = MTLResourceStorageModeShared;
  }

  id<MTLBuffer> arg_buffer = temp_buffer_pool_.get_buffer(mtlDevice_,
                                                          mtlCommandBuffer_,
                                                          arg_buffer_length,
                                                          arg_buffer_options,
                                                          init_arg_buffer,
                                                          stats_);
  /* Encode the pointer arguments. */
  bytes_written = 0;
  for (size_t i = 0; i < args.count; i++) {
    size_t size_in_bytes = args.sizes[i];
    bytes_written = round_up(bytes_written, size_in_bytes);
    if (args.types[i] == DeviceKernelArguments::POINTER) {
      [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
                                                              offset:bytes_written];
      if (MetalDevice::MetalMem *mmem = *(MetalDevice::MetalMem **)args.values[i]) {
        [mtlComputeCommandEncoder useResource:mmem->mtlBuffer
                                        usage:MTLResourceUsageRead | MTLResourceUsageWrite];
        [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
                                                        offset:0
                                                       atIndex:0];
      }
      else {
        if (@available(macos 12.0, *)) {
          [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0];
        }
      }
    }
    bytes_written += size_in_bytes;
  }

  /* Encode the KernelParamsMetal globals. */
  [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
                                                          offset:globals_offsets];
  if (label_command_encoders_) {
    /* Add human-readable labels to aid Metal debugging and captures. */
    mtlComputeCommandEncoder.label = [NSString
        stringWithFormat:@"Metal queue launch %s, work_size %d",
                         device_kernel_as_string(kernel),
                         work_size];
  }
  /* Encode the block of device pointers at the start of launch_params. */
  const size_t pointer_block_end = plain_old_launch_data_offset;
  for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
    int pointer_index = int(offset / sizeof(device_ptr));
    MetalDevice::MetalMem *mmem = *(
        MetalDevice::MetalMem **)((uint8_t *)&metal_device_->launch_params + offset);
    if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) {
      [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
                                                      offset:0
                                                     atIndex:pointer_index];
    }
    else {
      if (@available(macos 12.0, *)) {
        [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil
                                                        offset:0
                                                       atIndex:pointer_index];
      }
    }
  }
  /* Lazily build/update the pipeline for this kernel. */
  if (!active_pipelines_[kernel].update(metal_device_, kernel)) {
    metal_device_->set_error(
        string_printf("Failed to activate pipeline for %s", device_kernel_as_string(kernel)));
    return false;
  }

  MetalDispatchPipeline &active_pipeline = active_pipelines_[kernel];
  /* Encode ancillaries: texture/buffer bindings and acceleration structures. */
  [metal_device_->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
  [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_2d
                                            offset:0
                                           atIndex:0];
  [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_3d
                                            offset:0
                                           atIndex:1];
  [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->buffer_bindings_1d
                                            offset:0
                                           atIndex:2];

  if (@available(macos 12.0, *)) {
    if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
      [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:3];
      [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
                                                offset:0
                                               atIndex:(METALRT_TABLE_NUM + 4)];
    }
    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
      if (active_pipeline.intersection_func_table[table]) {
        [active_pipeline.intersection_func_table[table] setBuffer:arg_buffer
                                                           offset:globals_offsets
                                                          atIndex:1];
        [metal_device_->mtlAncillaryArgEncoder
            setIntersectionFunctionTable:active_pipeline.intersection_func_table[table]
                                 atIndex:(table + 4)];
        [mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
                                        usage:MTLResourceUsageRead];
      }
      else {
        [metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
                                                                    atIndex:(table + 4)];
      }
    }
  }
  bytes_written = metal_offsets + metal_device_->mtlAncillaryArgEncoder.encodedLength;

  if (arg_buffer.storageMode == MTLStorageModeManaged) {
    [arg_buffer didModifyRange:NSMakeRange(0, bytes_written)];
  }

  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:0 atIndex:0];
  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:globals_offsets atIndex:1];
  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:metal_offsets atIndex:2];
  if (@available(macos 12.0, *)) {
    if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
      /* Mark the top-level structure and all BLAS resources as used. */
      [mtlComputeCommandEncoder useResource:accel_struct usage:MTLResourceUsageRead];
      [mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
                                      usage:MTLResourceUsageRead];
      [mtlComputeCommandEncoder useResources:metal_device_->unique_blas_array.data()
                                       count:metal_device_->unique_blas_array.size()
                                       usage:MTLResourceUsageRead];
    }
  }
  [mtlComputeCommandEncoder setComputePipelineState:active_pipeline.pipeline];

  /* Compute the required amount of threadgroup memory. */
  const int num_threads_per_block = active_pipeline.num_threads_per_block;

  int shared_mem_bytes = 0;
  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
      /* See parallel_active_index.h for why this amount of shared memory is needed. */
      shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
      break;

    case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
    case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
      int key_count = metal_device_->launch_params.data.max_shaders;
      shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
      break;
    }

    default:
      break;
  }

  if (shared_mem_bytes) {
    assert(shared_mem_bytes <= 32 * 1024);
    [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
  }

  MTLSize size_threads_per_dispatch = MTLSizeMake(work_size, 1, 1);
  MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
  [mtlComputeCommandEncoder dispatchThreads:size_threads_per_dispatch
                      threadsPerThreadgroup:size_threads_per_threadgroup];
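  /* dispatchThreads (rather than dispatchThreadgroups) permits a non-uniform final
   * threadgroup, so work_size need not be padded to a multiple of num_threads_per_block.
   * Non-uniform threadgroup sizes are supported on all Apple-silicon GPUs. */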
  [mtlCommandBuffer_ addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
    if (command_buffer.status != MTLCommandBufferStatusCompleted) {
      metal_printf("Command buffer not completed (status = %d)\n", int(command_buffer.status));
    }
    if (command_buffer.error) {
      const char *errCStr = [[NSString stringWithFormat:@"%@", command_buffer.error] UTF8String];
      string str = string_printf("Error executing %s %s kernel: %s",
                                 kernel_type_as_string(active_pipeline.pso_type),
                                 device_kernel_as_string(kernel),
                                 errCStr);
      metal_device_->set_error(str);
    }
  }];
  if (verbose_tracing_ || is_capturing_) {
    /* Force a synchronization point so the timing and counters below are up to date. */
    synchronize();

    if (verbose_tracing_) {
      printf(
          "_____________________________________.____________________.______________._________"
          "______________________________________\n");

      printf("%-40s| %7d threads |%5.2fms | buckets [",
             device_kernel_as_string(kernel),
             work_size,
             last_completion_time_ * 1000.0);
      std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
      for (auto &it : metal_device_->metal_mem_map) {
        const string c_integrator_queue_counter = "integrator_queue_counter";
        if (it.first->name == c_integrator_queue_counter) {
          /* Workaround for "device_copy_from" being protected. */
          struct MyDeviceMemory : device_memory {
            void device_copy_from__IntegratorQueueCounter()
            {
              device_copy_from(0, data_width, 1, sizeof(IntegratorQueueCounter));
            }
          };
          ((MyDeviceMemory *)it.first)->device_copy_from__IntegratorQueueCounter();

          if (IntegratorQueueCounter *queue_counter = (IntegratorQueueCounter *)
                  it.first->host_pointer)
          {
            for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
              printf("%s%d", i == 0 ? "" : ",", int(queue_counter->num_queued[i]));
            }
          }
        }
      }
      printf("]\n");
    }
  }

  return !(metal_device_->have_error());
}
void MetalDeviceQueue::flush_timing_stats()
{
  /* Resolve the begin/end GPU timestamps recorded for each encoder and accumulate stats. */
  for (auto label : command_encoder_labels_) {
    TimingStats &stat = timing_stats_[label.kernel];

    double completion_time_gpu = 0.0;
    NSData *computeTimeStamps = [counter_sample_buffer_
        resolveCounterRange:NSMakeRange(label.timing_id, 2)];
    MTLCounterResultTimestamp *timestamps = (MTLCounterResultTimestamp *)(computeTimeStamps.bytes);

    uint64_t begTime = timestamps[0].timestamp;
    uint64_t endTime = timestamps[1].timestamp;
    completion_time_gpu = (endTime - begTime) / (double)NSEC_PER_SEC;

    stat.num_dispatches++;
    stat.total_time += completion_time_gpu;
    stat.total_work_size += label.work_size;
    last_completion_time_ = completion_time_gpu;
  }
  command_encoder_labels_.clear();
}
bool MetalDeviceQueue::synchronize()
{
  if (has_captured_to_disk_ || metal_device_->have_error()) {
    return false;
  }

  close_compute_encoder();
  close_blit_encoder();

  if (mtlCommandBuffer_) {
    /* Commit the command buffer and block until the GPU signals completion. */
    uint64_t shared_event_id_ = this->shared_event_id_++;

    __block dispatch_semaphore_t block_sema = wait_semaphore_;
    [shared_event_ notifyListener:shared_event_listener_
                          atValue:shared_event_id_
                            block:^(id<MTLSharedEvent>, uint64_t) {
                              dispatch_semaphore_signal(block_sema);
                            }];

    [mtlCommandBuffer_ encodeSignalEvent:shared_event_ value:shared_event_id_];
    [mtlCommandBuffer_ commit];
    dispatch_semaphore_wait(wait_semaphore_, DISPATCH_TIME_FOREVER);

    [mtlCommandBuffer_ release];

    /* Honor any outstanding copy-back requests now that the GPU work is done. */
    for (const CopyBack &mmem : copy_back_mem_) {
      memcpy((uchar *)mmem.host_pointer, (uchar *)mmem.gpu_mem, mmem.size);
    }
    copy_back_mem_.clear();

    temp_buffer_pool_.process_command_buffer_completion(mtlCommandBuffer_);
    metal_device_->flush_delayed_free_list();

    mtlCommandBuffer_ = nil;
    flush_timing_stats();
  }

  return !(metal_device_->have_error());
}
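/* The CPU/GPU handshake in synchronize() above is the standard MTLSharedEvent pattern: the
 * command buffer signals the event with a monotonically increasing value, the listener's
 * block fires on event_queue_ once the GPU reaches that value, and the blocked dispatch
 * semaphore wakes the calling thread. A minimal sketch of the same idea (names are
 * illustrative, not the members used above):
 *
 *   [cmd_buf encodeSignalEvent:event value:n];
 *   [event notifyListener:listener atValue:n block:^(id<MTLSharedEvent>, uint64_t) {
 *     dispatch_semaphore_signal(sema);
 *   }];
 *   [cmd_buf commit];
 *   dispatch_semaphore_wait(sema, DISPATCH_TIME_FOREVER);
 */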
void MetalDeviceQueue::zero_to_device(device_memory &mem)
{
  if (metal_device_->have_error()) {
    return;
  }

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    metal_device_->mem_alloc(mem);
  }

  /* Zero via a blit encoder when the allocation is backed by an MTLBuffer. */
  std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
  MetalDevice::MetalMem &mmem = *metal_device_->metal_mem_map.at(&mem);
  if (mmem.mtlBuffer) {
    id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
    [blitEncoder fillBuffer:mmem.mtlBuffer range:NSMakeRange(mmem.offset, mmem.size) value:0];
  }
  else {
    metal_device_->mem_zero(mem);
  }
}
void MetalDeviceQueue::copy_to_device(device_memory &mem)
{
  if (metal_device_->have_error()) {
    return;
  }

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    metal_device_->mem_alloc(mem);
  }

  std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
  auto result = metal_device_->metal_mem_map.find(&mem);
  if (result != metal_device_->metal_mem_map.end()) {
    /* Stage the host data in a shared temp buffer, then blit it into the device buffer. */
    MetalDevice::MetalMem &mmem = *result->second;
    id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();

    id<MTLBuffer> buffer = temp_buffer_pool_.get_buffer(mtlDevice_,
                                                        mtlCommandBuffer_,
                                                        mmem.size,
                                                        MTLResourceStorageModeShared,
                                                        mem.host_pointer,
                                                        stats_);

    [blitEncoder copyFromBuffer:buffer
                   sourceOffset:0
                       toBuffer:mmem.mtlBuffer
              destinationOffset:mmem.offset
                           size:mmem.size];
  }
  else {
    metal_device_->mem_copy_to(mem);
  }
}
void MetalDeviceQueue::copy_from_device(device_memory &mem)
{
  if (metal_device_->have_error()) {
    return;
  }

  if (mem.memory_size() == 0) {
    return;
  }

  std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
  MetalDevice::MetalMem &mmem = *metal_device_->metal_mem_map.at(&mem);
  if (mmem.mtlBuffer) {
    const size_t size = mem.memory_size();

    if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
      /* Managed buffers must be synchronized back to host-visible memory first. */
      id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
      [blitEncoder synchronizeResource:mmem.mtlBuffer];
    }
    if (mtlCommandBuffer_) {
      /* A command buffer is in flight - defer the host copy until it completes. */
      copy_back_mem_.push_back({mem.host_pointer, mmem.hostPtr, size});
    }
    else {
      memcpy((uchar *)mem.host_pointer, (uchar *)mmem.hostPtr, size);
    }
  }
  else {
    metal_device_->mem_copy_from(mem);
  }
}
void MetalDeviceQueue::prepare_resources(DeviceKernel kernel)
{
  std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);

  /* Declare usage of all allocations so Metal makes them resident for this encoder. */
  for (auto &it : metal_device_->metal_mem_map) {
    MTLResourceUsage usage = MTLResourceUsageRead;
    if (it.first->type != MEM_GLOBAL && it.first->type != MEM_READ_ONLY) {
      usage |= MTLResourceUsageWrite;
    }

    if (it.second->mtlBuffer) {
      [mtlComputeEncoder_ useResource:it.second->mtlBuffer usage:usage];
    }
    else if (it.second->mtlTexture) {
      [mtlComputeEncoder_ useResource:it.second->mtlTexture usage:usage | MTLResourceUsageSample];
    }
  }

  /* Ancillaries. */
  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead];
  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings_3d usage:MTLResourceUsageRead];
  [mtlComputeEncoder_ useResource:metal_device_->buffer_bindings_1d usage:MTLResourceUsageRead];
}
id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
{
  bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM);

  if (profiling_enabled_) {
    /* Close the current encoder so every dispatch gets its own timestamp pair. */
    close_compute_encoder();
  }

  if (mtlComputeEncoder_) {
    if (mtlComputeEncoder_.dispatchType == (concurrent ? MTLDispatchTypeConcurrent :
                                                         MTLDispatchTypeSerial))
    {
      /* Reuse the existing encoder. */
      prepare_resources(kernel);
      return mtlComputeEncoder_;
    }
    close_compute_encoder();
  }

  close_blit_encoder();

  if (!mtlCommandBuffer_) {
    mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
    [mtlCommandBuffer_ retain];
  }

  if (profiling_enabled_) {
    /* Attach the counter sample buffer so encoder begin/end GPU timestamps are recorded. */
    MTLComputePassDescriptor *desc = [[MTLComputePassDescriptor alloc] init];

    current_encoder_idx_ = (counter_sample_buffer_curr_idx_.fetch_add(2) %
                            MAX_SAMPLE_BUFFER_LENGTH);
    [desc.sampleBufferAttachments[0] setSampleBuffer:counter_sample_buffer_];
    [desc.sampleBufferAttachments[0] setStartOfEncoderSampleIndex:current_encoder_idx_];
    [desc.sampleBufferAttachments[0] setEndOfEncoderSampleIndex:current_encoder_idx_ + 1];

    [desc setDispatchType:concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial];

    mtlComputeEncoder_ = [mtlCommandBuffer_ computeCommandEncoderWithDescriptor:desc];
  }
  else {
    mtlComputeEncoder_ = [mtlCommandBuffer_
        computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
                                                           MTLDispatchTypeSerial];
  }

  [mtlComputeEncoder_ retain];

  prepare_resources(kernel);

  return mtlComputeEncoder_;
}
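/* Encoders are cached across dispatches above and only closed when the dispatch type
 * changes, a blit is needed, or profiling is enabled (per-encoder timestamps require one
 * encoder per dispatch). This keeps encoder churn low on the common path. */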
id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
{
  if (mtlBlitEncoder_) {
    return mtlBlitEncoder_;
  }

  close_compute_encoder();

  if (!mtlCommandBuffer_) {
    mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
    [mtlCommandBuffer_ retain];
  }

  mtlBlitEncoder_ = [mtlCommandBuffer_ blitCommandEncoder];
  [mtlBlitEncoder_ retain];
  return mtlBlitEncoder_;
}
void MetalDeviceQueue::close_compute_encoder()
{
  if (mtlComputeEncoder_) {
    [mtlComputeEncoder_ endEncoding];
    [mtlComputeEncoder_ release];
    mtlComputeEncoder_ = nil;
  }
}

void MetalDeviceQueue::close_blit_encoder()
{
  if (mtlBlitEncoder_) {
    [mtlBlitEncoder_ endEncoding];
    [mtlBlitEncoder_ release];
    mtlBlitEncoder_ = nil;
  }
}

void *MetalDeviceQueue::native_queue()
{
  return mtlCommandQueue_;
}