Blender V5.0
queue.mm
/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#ifdef WITH_METAL

#  include <algorithm>
#  include <mutex>

#  include "device/metal/queue.h"

#  include "device/metal/device_impl.h"
#  include "device/metal/graphics_interop.h"
#  include "device/metal/kernel.h"

#  include "util/path.h"
#  include "util/string.h"
#  include "util/time.h"

CCL_NAMESPACE_BEGIN

/* MetalDeviceQueue */

MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
    : DeviceQueue(device), metal_device_(device), stats_(device->stats)
{
  @autoreleasepool {
    command_buffer_desc_ = [[MTLCommandBufferDescriptor alloc] init];
    command_buffer_desc_.errorOptions = MTLCommandBufferErrorOptionEncoderExecutionStatus;

    mtlDevice_ = device->mtlDevice;
    mtlCommandQueue_ = device->mtlComputeCommandQueue;

    shared_event_ = [mtlDevice_ newSharedEvent];
    shared_event_id_ = 1;

    /* Shareable event listener */
    event_queue_ = dispatch_queue_create("com.cycles.metal.event_queue", nullptr);
    shared_event_listener_ = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue_];

    wait_semaphore_ = dispatch_semaphore_create(0);

    if (auto *str = getenv("CYCLES_METAL_PROFILING")) {
      if (atoi(str) && [mtlDevice_ supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary])
      {
        /* Enable per-kernel timing breakdown (shown at end of render). */
        profiling_enabled_ = true;
        label_command_encoders_ = true;
      }
    }
    if (getenv("CYCLES_METAL_DEBUG")) {
      /* Enable very verbose tracing (shows every dispatch). */
      verbose_tracing_ = true;
      label_command_encoders_ = true;
    }

    setup_capture();
  }
}
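
/* Usage note (editorial, hedged): both environment switches above are read once, at
 * queue construction. An illustrative invocation - scene.blend and the frame number
 * are placeholders:
 *
 *   CYCLES_METAL_PROFILING=1 blender -b scene.blend -f 1   # per-kernel timing table
 *   CYCLES_METAL_DEBUG=1     blender -b scene.blend -f 1   # per-dispatch verbose trace
 *
 * Profiling additionally requires the device to support
 * MTLCounterSamplingPointAtStageBoundary, as checked above. */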

void MetalDeviceQueue::setup_capture()
{
  capture_kernel_ = DeviceKernel(-1);

  if (auto *capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
    /* CYCLES_DEBUG_METAL_CAPTURE_KERNEL captures a single dispatch of the specified kernel. */
    capture_kernel_ = DeviceKernel(atoi(capture_kernel_str));
    printf("Capture kernel: %d = %s\n", capture_kernel_, device_kernel_as_string(capture_kernel_));

    capture_dispatch_counter_ = 0;
    if (auto *capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) {
      capture_dispatch_counter_ = atoi(capture_dispatch_str);

      printf("Capture dispatch number %d\n", capture_dispatch_counter_);
    }
  }
  else if (auto *capture_samples_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_SAMPLES")) {
    /* CYCLES_DEBUG_METAL_CAPTURE_SAMPLES captures a block of dispatches from reset#(N) to
     * reset#(N+1). */
    capture_samples_ = true;
    capture_reset_counter_ = atoi(capture_samples_str);

    capture_dispatch_counter_ = INT_MAX;
    if (auto *capture_limit_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_LIMIT")) {
      /* CYCLES_DEBUG_METAL_CAPTURE_LIMIT sets the maximum number of dispatches to capture. */
      capture_dispatch_counter_ = atoi(capture_limit_str);
    }

    printf("Capturing sample block %d (dispatch limit: %d)\n",
           capture_reset_counter_,
           capture_dispatch_counter_);
  }
  else {
    /* No capturing requested. */
    return;
  }

  /* Enable .gputrace capture for the specified DeviceKernel. */
  MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
  mtlCaptureScope_ = [captureManager newCaptureScopeWithDevice:mtlDevice_];
  mtlCaptureScope_.label = [NSString stringWithFormat:@"Cycles kernel dispatch"];
  [captureManager setDefaultCaptureScope:mtlCaptureScope_];

  label_command_encoders_ = true;

  if (auto *capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) {
    if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {

      MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init];
      captureDescriptor.captureObject = mtlCaptureScope_;
      captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument;
      captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)];

      NSError *error;
      if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) {
        NSString *err = [error localizedDescription];
        printf("Start capture failed: %s\n", [err UTF8String]);
      }
      else {
        printf("Capture started (URL: %s)\n", capture_url);
        is_capturing_to_disk_ = true;
      }
    }
    else {
      printf("Capture to file is not supported\n");
    }
  }
}
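
/* Example (editorial sketch, not part of the original source): capturing the third
 * dispatch of kernel 2 directly to a .gputrace document could look like:
 *
 *   CYCLES_DEBUG_METAL_CAPTURE_KERNEL=2 \
 *   CYCLES_DEBUG_METAL_CAPTURE_DISPATCH=3 \
 *   CYCLES_DEBUG_METAL_CAPTURE_URL=/tmp/cycles.gputrace \
 *   blender -b scene.blend -f 1
 *
 * When CYCLES_DEBUG_METAL_CAPTURE_URL is unset, startCaptureWithDescriptor is never
 * called here, so the capture scope is only useful when triggered from Xcode's GPU
 * frame-capture tooling. */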

void MetalDeviceQueue::update_capture(DeviceKernel kernel)
{
  /* Handle capture end triggers. */
  if (is_capturing_) {
    capture_dispatch_counter_ -= 1;
    if (capture_dispatch_counter_ <= 0 || kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
      /* End capture if we've hit the dispatch limit or we hit a "reset". */
      end_capture();
    }
    return;
  }

  if (capture_dispatch_counter_ < 0) {
    /* We finished capturing. */
    return;
  }

  /* Handle single-capture start trigger. */
  if (kernel == capture_kernel_) {
    /* Start capturing when we hit the Nth dispatch of the specified kernel. */
    if (capture_dispatch_counter_ == 0) {
      begin_capture();
    }
    capture_dispatch_counter_ -= 1;
    return;
  }

  /* Handle multi-capture start trigger. */
  if (capture_samples_) {
    /* Start capturing when the reset countdown is at 0. */
    if (capture_reset_counter_ == 0) {
      begin_capture();
    }

    if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
      capture_reset_counter_ -= 1;
    }
    return;
  }
}
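
/* Worked trigger sequence (editorial): with CYCLES_DEBUG_METAL_CAPTURE_SAMPLES=1,
 * dispatches pass through untouched until a DEVICE_KERNEL_INTEGRATOR_RESET decrements
 * capture_reset_counter_ to 0; the next dispatch then calls begin_capture(), and the
 * capture ends at the following reset, or earlier once capture_dispatch_counter_
 * (the CYCLES_DEBUG_METAL_CAPTURE_LIMIT value, INT_MAX by default) runs out. */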

void MetalDeviceQueue::begin_capture()
{
  /* Start gputrace capture. */
  if (mtlCommandBuffer_) {
    synchronize();
  }
  [mtlCaptureScope_ beginScope];
  printf("[mtlCaptureScope_ beginScope]\n");
  is_capturing_ = true;
}

void MetalDeviceQueue::end_capture()
{
  [mtlCaptureScope_ endScope];
  is_capturing_ = false;
  printf("[mtlCaptureScope_ endScope]\n");

  if (is_capturing_to_disk_) {
    [[MTLCaptureManager sharedCaptureManager] stopCapture];
    has_captured_to_disk_ = true;
    is_capturing_to_disk_ = false;
    is_capturing_ = false;
    printf("Capture stopped\n");
  }
}

MetalDeviceQueue::~MetalDeviceQueue()
{
  /* Tidying up here isn't really practical - we should expect and require the work
   * queue to be empty here. */
  assert(mtlCommandBuffer_ == nil);
  assert(command_buffers_submitted_ == command_buffers_completed_);

  close_compute_encoder();
  close_blit_encoder();

  [shared_event_listener_ release];
  [shared_event_ release];
  [command_buffer_desc_ release];

  if (mtlCaptureScope_) {
    [mtlCaptureScope_ release];
  }

  double total_time = 0.0;

  /* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */
  int64_t num_dispatches = 0;
  int64_t num_pathtracing_dispatches = 0;
  for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
    auto &stat = timing_stats_[i];
    bool pathtracing_kernel = (i <= DEVICE_KERNEL_INTEGRATOR_RESET) &&
                              (i != DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL);
    total_time += stat.total_time;
    num_dispatches += stat.num_dispatches;
    num_pathtracing_dispatches += pathtracing_kernel ? stat.num_dispatches : 0;
  }
  bool has_extra = (num_pathtracing_dispatches && num_dispatches > num_pathtracing_dispatches);

  if (num_dispatches) {
    printf("\nMetal %sdispatch stats:\n", num_pathtracing_dispatches ? "path-tracing " : "");
    auto header = string_printf("%-40s %16s %12s %12s %9s %9s",
                                "Kernel name",
                                "Total threads",
                                "Dispatches",
                                "Avg. T/D",
                                "Time/s",
                                "Time/%");
    auto divider = string(header.length(), '-');
    printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str());

    for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
      auto &stat = timing_stats_[i];

      bool pathtracing_kernel = (i <= DEVICE_KERNEL_INTEGRATOR_RESET) &&
                                (i != DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL);
      if ((pathtracing_kernel && num_pathtracing_dispatches) || stat.num_dispatches > 0) {
        printf("%-40s %16llu %12llu %12llu %9.4f %9.2f\n",
               device_kernel_as_string(DeviceKernel(i)),
               stat.total_work_size,
               stat.num_dispatches,
               stat.num_dispatches ? stat.total_work_size / stat.num_dispatches : 0,
               stat.total_time,
               stat.total_time * 100.0 / total_time);
      }
      if (has_extra && i == DEVICE_KERNEL_INTEGRATOR_RESET) {
        printf("%s\n", divider.c_str());
      }
    }
    printf("%s\n", divider.c_str());
    printf("%-40s %16s %12llu %12s %9.4f %9.2f\n", "", "", num_dispatches, "", total_time, 100.0);
    printf("%s\n\n", divider.c_str());
  }
}

int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
{
  static int result = 0;
  if (result) {
    return result;
  }

  result = 4194304;

  /* Increasing the state count doesn't notably benefit M1-family systems. */
  if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) != APPLE_M1) {
    size_t system_ram = system_physical_ram();
    size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
    size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];

    /* Determine whether we can double the state count while still leaving enough
     * GPU-available memory (1/8 of system RAM or 1GB, whichever is larger). Enlarging the
     * state pool lets us keep dispatch sizes high and minimize work submission overheads. */
    size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
    size_t total_state_size = result * state_size;
    if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
      result *= 2;
      metal_printf("Doubling state count to exploit available RAM (new size = %d)", result);
    }
  }
  return result;
}
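
/* Worked example (editorial, illustrative numbers): on a 64 GB machine,
 * min_headroom = max(64 GB / 8, 1 GB) = 8 GB. Assuming a 560-byte integrator state,
 * the base pool of 4194304 states occupies ~2.3 GB and the doubled pool ~4.7 GB, so
 * doubling is permitted only if recommendedMaxWorkingSetSize minus current allocations
 * still leaves the 4.7 GB plus the 8 GB headroom available. The 560-byte figure is an
 * assumption for illustration, not a value taken from this file. */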

int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
{
  /* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */
  return num_concurrent_states(state_size) / 4;
}

int MetalDeviceQueue::num_sort_partitions(int max_num_paths, uint max_scene_shaders) const
{
  int sort_partition_elements = MetalInfo::optimal_sort_partition_elements();
  /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of
   * a more sophisticated heuristic we simply disable sort partitioning if the shader count is
   * high. */
  if (max_scene_shaders < 300 && sort_partition_elements > 0) {
    return max(max_num_paths / sort_partition_elements, 1);
  }
  else {
    return 1;
  }
}
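
/* Worked example (editorial): with max_num_paths = 1048576 and an optimal partition
 * size of 65536 elements, a scene with fewer than 300 shaders is split into
 * 1048576 / 65536 = 16 sort partitions; shader-heavy scenes fall back to a single
 * partition. Both inputs here are illustrative values. */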

bool MetalDeviceQueue::supports_local_atomic_sort() const
{
  return metal_device_->use_local_atomic_sort();
}

static void zero_resource(void *address_in_arg_buffer, int index = 0)
{
  uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
  pptr[index] = 0;
}

template<class T> void write_resource(void *address_in_arg_buffer, T resource, int index = 0)
{
  zero_resource(address_in_arg_buffer, index);
  uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
  if (resource) {
    pptr[index] = metal_gpuResourceID(resource);
  }
}

template<> void write_resource(void *address_in_arg_buffer, id<MTLBuffer> buffer, int index)
{
  zero_resource(address_in_arg_buffer, index);
  uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
  if (buffer) {
    pptr[index] = metal_gpuAddress(buffer);
  }
}

static id<MTLBuffer> patch_resource(void *address_in_arg_buffer, int index = 0)
{
  uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
  if (MetalDevice::MetalMem *mmem = (MetalDevice::MetalMem *)pptr[index]) {
    write_resource<id<MTLBuffer>>(address_in_arg_buffer, mmem->mtlBuffer, index);
    return mmem->mtlBuffer;
  }
  return nil;
}
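
/* Recap of the argument-buffer patching scheme above (editorial sketch; `slot` is a
 * hypothetical stand-in for one 8-byte argument-buffer entry):
 *
 *   uint64_t slot = (uint64_t)mmem;             // host writes a MetalDevice::MetalMem*
 *   id<MTLBuffer> buf = patch_resource(&slot);  // slot now holds the buffer's gpuAddress
 *
 * write_resource() is the forward path: it stores either a gpuResourceID (textures,
 * function tables, acceleration structures) or a gpuAddress (id<MTLBuffer>), zeroing
 * the slot first so nil resources encode as 0. */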

void MetalDeviceQueue::init_execution()
{
  /* Populate blas_array. */
  uint64_t *blas_array = (uint64_t *)metal_device_->blas_buffer.contents;
  for (uint64_t slot = 0; slot < metal_device_->blas_array.size(); ++slot) {
    write_resource(blas_array, metal_device_->blas_array[slot], slot);
  }

  device_vector<TextureInfo> &texture_info = metal_device_->texture_info;
  id<MTLBuffer> &texture_bindings = metal_device_->texture_bindings;
  std::vector<id<MTLResource>> &texture_slot_map = metal_device_->texture_slot_map;

  /* Ensure texture_info is allocated before populating. */
  texture_info.copy_to_device();

  /* Populate texture bindings. */
  uint64_t *bindings = (uint64_t *)texture_bindings.contents;
  memset(bindings, 0, texture_bindings.length);
  for (int slot = 0; slot < texture_info.size(); ++slot) {
    if (texture_slot_map[slot]) {
      if (metal_device_->is_texture(texture_info[slot])) {
        write_resource(bindings, id<MTLTexture>(texture_slot_map[slot]), slot);
      }
      else {
        /* The GPU address of a 1D buffer texture is written into the slot data field. */
        write_resource(&texture_info[slot].data, id<MTLBuffer>(texture_slot_map[slot]), 0);
      }
    }
  }

  /* Synchronize memory copies. */
  synchronize();
}

bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
                               const int work_size,
                               const DeviceKernelArguments &args)
{
  @autoreleasepool {
    update_capture(kernel);

    if (metal_device_->have_error()) {
      return false;
    }

    debug_enqueue_begin(kernel, work_size);

    LOG_TRACE << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
              << work_size;

    id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);

    if (profiling_enabled_) {
      command_encoder_labels_.push_back({kernel, work_size, current_encoder_idx_});
    }
    if (label_command_encoders_) {
      /* Add human-readable labels if we're doing any form of debugging / profiling. */
      mtlComputeCommandEncoder.label = [NSString
          stringWithFormat:@"Metal queue launch %s, work_size %d",
                           device_kernel_as_string(kernel),
                           work_size];
    }

    if (!active_pipelines_[kernel].update(metal_device_, kernel)) {
      metal_device_->set_error(
          string_printf("Could not activate pipeline for %s\n", device_kernel_as_string(kernel)));
      return false;
    }
    MetalDispatchPipeline &active_pipeline = active_pipelines_[kernel];

    uint8_t dynamic_args[512] = {0};

    /* Prepare the dynamic "enqueue" arguments. */
    size_t dynamic_bytes_written = 0;
    size_t max_size_in_bytes = 0;
    for (size_t i = 0; i < args.count; i++) {
      size_t size_in_bytes = args.sizes[i];
      max_size_in_bytes = max(max_size_in_bytes, size_in_bytes);
      dynamic_bytes_written = round_up(dynamic_bytes_written, size_in_bytes);
      memcpy(dynamic_args + dynamic_bytes_written, args.values[i], size_in_bytes);
      if (args.types[i] == DeviceKernelArguments::POINTER) {
        if (id<MTLBuffer> buffer = patch_resource(dynamic_args + dynamic_bytes_written)) {
          [mtlComputeCommandEncoder useResource:buffer
                                          usage:MTLResourceUsageRead | MTLResourceUsageWrite];
        }
      }
      dynamic_bytes_written += size_in_bytes;
    }
    /* Apply conventional struct alignment (stops asserts firing when API validation is
     * enabled). */
    dynamic_bytes_written = round_up(dynamic_bytes_written, max_size_in_bytes);

    /* Check that the dynamic args didn't overflow. */
    assert(dynamic_bytes_written <= sizeof(dynamic_args));

    uint64_t ancillary_args[ANCILLARY_SLOT_COUNT] = {0};

    /* Encode ancillaries. */
    int ancillary_index = 0;
    write_resource(ancillary_args, metal_device_->texture_bindings, ancillary_index++);

    if (metal_device_->use_metalrt) {
      write_resource(ancillary_args, metal_device_->accel_struct, ancillary_index++);
      write_resource(ancillary_args, metal_device_->blas_buffer, ancillary_index++);

      /* Write the intersection function table. */
      for (int table_idx = 0; table_idx < METALRT_TABLE_NUM; table_idx++) {
        write_resource(
            ancillary_args, active_pipeline.intersection_func_table[table_idx], ancillary_index++);
      }
      assert(ancillary_index == ANCILLARY_SLOT_COUNT);
    }

    /* Bind the intersection function tables and declare their usage. */
    if (metal_device_->use_metalrt) {
      for (int table = 0; table < METALRT_TABLE_NUM; table++) {
        if (active_pipeline.intersection_func_table[table]) {
          [active_pipeline.intersection_func_table[table]
              setBuffer:metal_device_->launch_params_buffer
                 offset:0
                atIndex:1];
          [mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
                                          usage:MTLResourceUsageRead];
        }
      }
    }

    [mtlComputeCommandEncoder setBytes:dynamic_args length:dynamic_bytes_written atIndex:0];
    [mtlComputeCommandEncoder setBuffer:metal_device_->launch_params_buffer offset:0 atIndex:1];
    [mtlComputeCommandEncoder setBytes:ancillary_args length:sizeof(ancillary_args) atIndex:2];

    if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
      if (@available(macos 12.0, *)) {

        if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
          /* Mark all acceleration-structure resources as used. */
          [mtlComputeCommandEncoder useResource:accel_struct usage:MTLResourceUsageRead];
          if (metal_device_->blas_buffer) {
            [mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
                                            usage:MTLResourceUsageRead];
          }
          [mtlComputeCommandEncoder useResources:metal_device_->unique_blas_array.data()
                                           count:metal_device_->unique_blas_array.size()
                                           usage:MTLResourceUsageRead];
        }
      }
    }

    [mtlComputeCommandEncoder setComputePipelineState:active_pipeline.pipeline];

    /* Compute kernel launch parameters. */
    const int num_threads_per_block = active_pipeline.num_threads_per_block;

    int shared_mem_bytes = 0;

    switch (kernel) {
      case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
        /* See parallel_active_index.h for why this amount of shared memory is needed.
         * Rounded up to 16 bytes for Metal. */
        shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
        break;

      case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
      case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
        int key_count = metal_device_->launch_params->data.max_shaders;
        shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
        break;
      }

      default:
        break;
    }

    if (shared_mem_bytes) {
      assert(shared_mem_bytes <= 32 * 1024);
      [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
    }

    MTLSize size_threads_per_dispatch = MTLSizeMake(work_size, 1, 1);
    MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
    [mtlComputeCommandEncoder dispatchThreads:size_threads_per_dispatch
                        threadsPerThreadgroup:size_threads_per_threadgroup];

    [mtlCommandBuffer_ addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
      /* Enhanced command buffer errors. */
      string str;
      if (command_buffer.status != MTLCommandBufferStatusCompleted) {
        str = string_printf("Command buffer not completed. status = %d. ",
                            int(command_buffer.status));
      }
      if (command_buffer.error) {
        @autoreleasepool {
          const char *errCStr = [[NSString stringWithFormat:@"%@", command_buffer.error]
              UTF8String];
          str += string_printf("(%s.%s):\n%s\n",
                               kernel_type_as_string(active_pipeline.pso_type),
                               device_kernel_as_string(kernel),
                               errCStr);
        }
      }
      if (!str.empty()) {
        metal_device_->set_error(str);
      }
    }];

    if (verbose_tracing_ || is_capturing_) {
      /* Force a sync if we've enabled step-by-step verbose tracing or if we're capturing. */
      synchronize();

      /* Show queue counters and dispatch timing. */
      if (verbose_tracing_) {
        if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
          printf(
              "_____________________________________.____________________.______________."
              "___________________________________________\n");
        }

        printf("%-40s| %7d threads |%5.2fms | buckets [",
               device_kernel_as_string(kernel),
               work_size,
               last_completion_time_ * 1000.0);
        std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
        for (auto &it : metal_device_->metal_mem_map) {
          const string c_integrator_queue_counter = "integrator_queue_counter";
          if (it.first->name == c_integrator_queue_counter) {
            if (IntegratorQueueCounter *queue_counter = (IntegratorQueueCounter *)
                    it.first->host_pointer)
            {
              for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
                printf("%s%d", i == 0 ? "" : ",", queue_counter->num_queued[i]);
              }
            }
            break;
          }
        }
        printf("]\n");
      }
    }

    debug_enqueue_end();

    return !(metal_device_->have_error());
  }
}
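
/* Binding recap (editorial, derived from the calls above): buffer index 0 carries the
 * packed dynamic arguments, index 1 the persistent launch-params buffer, and index 2
 * the ancillary table (texture bindings, acceleration structure, BLAS array and the
 * intersection function tables). The Metal-side kernel entry points must declare
 * matching [[buffer(0)]]..[[buffer(2)]] parameters; that layout is assumed from this
 * file, not shown in it. */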

void MetalDeviceQueue::flush_timing_stats()
{
  for (auto label : command_encoder_labels_) {
    TimingStats &stat = timing_stats_[label.kernel];

    double completion_time_gpu;
    NSData *computeTimeStamps = [metal_device_->mtlCounterSampleBuffer
        resolveCounterRange:NSMakeRange(label.timing_id, 2)];
    MTLCounterResultTimestamp *timestamps = (MTLCounterResultTimestamp *)(computeTimeStamps.bytes);

    uint64_t begTime = timestamps[0].timestamp;
    uint64_t endTime = timestamps[1].timestamp;
    completion_time_gpu = (endTime - begTime) / (double)NSEC_PER_SEC;

    stat.num_dispatches++;
    stat.total_time += completion_time_gpu;
    stat.total_work_size += label.work_size;
    last_completion_time_ = completion_time_gpu;
  }
  command_encoder_labels_.clear();
}
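
/* Timing note (editorial assumption): the division by NSEC_PER_SEC assumes the
 * resolved MTLCounterResultTimestamp values tick in nanoseconds, which holds on
 * Apple-silicon GPUs; on a device with a different GPU timebase the raw delta would
 * need rescaling (e.g. correlating via -[MTLDevice sampleTimestamps:gpuTimestamp:])
 * before being accumulated into TimingStats. */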

bool MetalDeviceQueue::synchronize()
{
  @autoreleasepool {
    if (has_captured_to_disk_ || metal_device_->have_error()) {
      return false;
    }

    close_compute_encoder();
    close_blit_encoder();

    if (mtlCommandBuffer_) {

      uint64_t shared_event_id = this->shared_event_id_++;

      __block dispatch_semaphore_t block_sema = wait_semaphore_;
      [shared_event_ notifyListener:shared_event_listener_
                            atValue:shared_event_id
                              block:^(id<MTLSharedEvent> /*sharedEvent*/, uint64_t /*value*/) {
                                dispatch_semaphore_signal(block_sema);
                              }];

      [mtlCommandBuffer_ encodeSignalEvent:shared_event_ value:shared_event_id];
      [mtlCommandBuffer_ commit];
      dispatch_semaphore_wait(wait_semaphore_, DISPATCH_TIME_FOREVER);

      [mtlCommandBuffer_ release];

      metal_device_->flush_delayed_free_list();

      mtlCommandBuffer_ = nil;
      flush_timing_stats();
    }

    debug_synchronize();

    return !(metal_device_->have_error());
  }
}
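
/* Design note (editorial): the MTLSharedEvent + semaphore round-trip above is
 * functionally equivalent to the simpler
 *
 *   [mtlCommandBuffer_ commit];
 *   [mtlCommandBuffer_ waitUntilCompleted];
 *
 * but it funnels completion through shared_event_listener_'s dispatch queue and keeps
 * a monotonically increasing event value that other observers of shared_event_ could
 * also wait on. */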

void MetalDeviceQueue::zero_to_device(device_memory &mem)
{
  @autoreleasepool {
    if (metal_device_->have_error()) {
      return;
    }

    assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

    if (mem.memory_size() == 0) {
      return;
    }

    /* Allocate on demand. */
    if (mem.device_pointer == 0) {
      metal_device_->mem_alloc(mem);
    }

    /* Zero memory on device. */
    assert(mem.device_pointer != 0);

    std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
    MetalDevice::MetalMem &mmem = *metal_device_->metal_mem_map.at(&mem);
    if (mmem.mtlBuffer) {
      id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
      [blitEncoder fillBuffer:mmem.mtlBuffer range:NSMakeRange(mmem.offset, mmem.size) value:0];
    }
    else {
      metal_device_->mem_zero(mem);
    }
  }
}

void MetalDeviceQueue::copy_to_device(device_memory &mem)
{
  @autoreleasepool {
    if (metal_device_->have_error()) {
      return;
    }

    if (mem.memory_size() == 0) {
      return;
    }

    /* Allocate on demand. */
    if (mem.device_pointer == 0) {
      metal_device_->mem_alloc(mem);
    }

    assert(mem.device_pointer != 0);
    assert(mem.host_pointer != nullptr);
    /* No need to copy - Apple Silicon has Unified Memory Architecture. */
  }
}

void MetalDeviceQueue::copy_from_device(device_memory & /*mem*/)
{
  /* No need to copy - Apple Silicon has Unified Memory Architecture. */
}

void MetalDeviceQueue::prepare_resources(DeviceKernel /*kernel*/)
{
  std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);

  /* Declare resource usage. */
  for (auto &it : metal_device_->metal_mem_map) {
    device_memory *mem = it.first;

    MTLResourceUsage usage = MTLResourceUsageRead;
    if (mem->type != MEM_GLOBAL && mem->type != MEM_READ_ONLY && mem->type != MEM_TEXTURE) {
      usage |= MTLResourceUsageWrite;
    }

    if (it.second->mtlBuffer) {
      /* METAL_WIP - use array version (i.e. useResources) */
      [mtlComputeEncoder_ useResource:it.second->mtlBuffer usage:usage];
    }
    else if (it.second->mtlTexture) {
      /* METAL_WIP - use array version (i.e. useResources) */
      [mtlComputeEncoder_ useResource:it.second->mtlTexture usage:usage | MTLResourceUsageSample];
    }
  }

  /* Ancillaries. */
  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings usage:MTLResourceUsageRead];
}
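
/* Sketch of the batched alternative the METAL_WIP comments point at (editorial,
 * untested): gather the handles per usage mask and declare them in one call, e.g.
 *
 *   std::vector<id<MTLResource>> read_only;
 *   // ...partition metal_mem_map entries by usage...
 *   [mtlComputeEncoder_ useResources:read_only.data()
 *                              count:read_only.size()
 *                              usage:MTLResourceUsageRead];
 *
 * which would reduce per-resource Objective-C dispatch overhead for large scenes. */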

id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
{
  bool concurrent = int(kernel) < int(DEVICE_KERNEL_INTEGRATOR_NUM);

  if (profiling_enabled_) {
    /* Close the current encoder to ensure we're able to capture per-encoder timing data. */
    close_compute_encoder();
  }

  if (mtlComputeEncoder_) {
    if (mtlComputeEncoder_.dispatchType ==
        (concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial))
    {
      /* Declare usage of MTLBuffers etc. */
      prepare_resources(kernel);

      return mtlComputeEncoder_;
    }
    close_compute_encoder();
  }

  close_blit_encoder();

  if (!mtlCommandBuffer_) {
    mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
    [mtlCommandBuffer_ retain];
  }

  if (profiling_enabled_) {
    MTLComputePassDescriptor *desc = [[MTLComputePassDescriptor alloc] init];

    current_encoder_idx_ = (counter_sample_buffer_curr_idx_.fetch_add(2) %
                            MAX_SAMPLE_BUFFER_LENGTH);
    [desc.sampleBufferAttachments[0] setSampleBuffer:metal_device_->mtlCounterSampleBuffer];
    [desc.sampleBufferAttachments[0] setStartOfEncoderSampleIndex:current_encoder_idx_];
    [desc.sampleBufferAttachments[0] setEndOfEncoderSampleIndex:current_encoder_idx_ + 1];

    [desc setDispatchType:concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial];

    mtlComputeEncoder_ = [mtlCommandBuffer_ computeCommandEncoderWithDescriptor:desc];
  }
  else {
    mtlComputeEncoder_ = [mtlCommandBuffer_
        computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
                                                           MTLDispatchTypeSerial];
  }

  [mtlComputeEncoder_ retain];
  [mtlComputeEncoder_ setLabel:@(device_kernel_as_string(kernel))];

  /* Declare usage of MTLBuffers etc. */
  prepare_resources(kernel);

  return mtlComputeEncoder_;
}
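
/* Counter-buffer bookkeeping (editorial recap): each profiled encoder claims a pair of
 * timestamp slots, so counter_sample_buffer_curr_idx_ advances 0, 2, 4, ... and wraps
 * modulo MAX_SAMPLE_BUFFER_LENGTH; flush_timing_stats() later resolves each pair with
 * resolveCounterRange:NSMakeRange(idx, 2). If more encoders were submitted than the
 * sample buffer holds before a flush, older samples would be overwritten - an inferred
 * limitation, not one stated in this file. */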

id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
{
  if (mtlBlitEncoder_) {
    return mtlBlitEncoder_;
  }

  close_compute_encoder();

  if (!mtlCommandBuffer_) {
    mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
    [mtlCommandBuffer_ retain];
  }

  mtlBlitEncoder_ = [mtlCommandBuffer_ blitCommandEncoder];
  [mtlBlitEncoder_ retain];
  return mtlBlitEncoder_;
}

void MetalDeviceQueue::close_compute_encoder()
{
  if (mtlComputeEncoder_) {
    [mtlComputeEncoder_ endEncoding];
    [mtlComputeEncoder_ release];
    mtlComputeEncoder_ = nil;
  }
}

void MetalDeviceQueue::close_blit_encoder()
{
  if (mtlBlitEncoder_) {
    [mtlBlitEncoder_ endEncoding];
    [mtlBlitEncoder_ release];
    mtlBlitEncoder_ = nil;
  }
}

void *MetalDeviceQueue::native_queue()
{
  return mtlCommandQueue_;
}

unique_ptr<DeviceGraphicsInterop> MetalDeviceQueue::graphics_interop_create()
{
  return make_unique<MetalDeviceGraphicsInterop>(this);
}

CCL_NAMESPACE_END

#endif /* WITH_METAL */