queue.mm (Blender 4.3)
/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#ifdef WITH_METAL

#  include "device/metal/queue.h"

#  include "device/metal/device_impl.h"
#  include "device/metal/kernel.h"

#  include "util/path.h"
#  include "util/string.h"
#  include "util/time.h"

CCL_NAMESPACE_BEGIN

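/* Capacity of the global counter-sample buffer used for per-encoder GPU timings
 * (see CYCLES_METAL_PROFILING below). Each compute encoder consumes two slots, a
 * start and an end timestamp, and slot indices wrap modulo this length. */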
#  define MAX_SAMPLE_BUFFER_LENGTH 4096

/* MetalDeviceQueue */

MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
    : DeviceQueue(device), metal_device_(device), stats_(device->stats)
{
  @autoreleasepool {
    command_buffer_desc_ = [[MTLCommandBufferDescriptor alloc] init];
    command_buffer_desc_.errorOptions = MTLCommandBufferErrorOptionEncoderExecutionStatus;

    mtlDevice_ = device->mtlDevice;
    mtlCommandQueue_ = device->mtlComputeCommandQueue;

    shared_event_ = [mtlDevice_ newSharedEvent];
    shared_event_id_ = 1;

    /* Shareable event listener */
    event_queue_ = dispatch_queue_create("com.cycles.metal.event_queue", NULL);
    shared_event_listener_ = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue_];

    wait_semaphore_ = dispatch_semaphore_create(0);

    if (auto str = getenv("CYCLES_METAL_PROFILING")) {
      if (atoi(str) && [mtlDevice_ supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary])
      {
        /* Enable per-kernel timing breakdown (shown at end of render). */
        profiling_enabled_ = true;
        label_command_encoders_ = true;

        /* Create a global counter sampling buffer. */
        NSArray<id<MTLCounterSet>> *counterSets = [mtlDevice_ counterSets];

        NSError *error = nil;
        MTLCounterSampleBufferDescriptor *desc = [[MTLCounterSampleBufferDescriptor alloc] init];
        [desc setStorageMode:MTLStorageModeShared];
        [desc setLabel:@"CounterSampleBuffer"];
        [desc setSampleCount:MAX_SAMPLE_BUFFER_LENGTH];
        [desc setCounterSet:counterSets[0]];
        counter_sample_buffer_ = [mtlDevice_ newCounterSampleBufferWithDescriptor:desc
                                                                            error:&error];
        [counter_sample_buffer_ retain];
      }
    }
    if (getenv("CYCLES_METAL_DEBUG")) {
      /* Enable very verbose tracing (shows every dispatch). */
      verbose_tracing_ = true;
      label_command_encoders_ = true;
    }

    setup_capture();
  }
}

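/* Configure GPU frame capture (.gputrace) from environment variables. Two modes are
 * supported: CYCLES_DEBUG_METAL_CAPTURE_KERNEL=<DeviceKernel index> captures a single
 * dispatch of one kernel (optionally the Nth dispatch, via
 * CYCLES_DEBUG_METAL_CAPTURE_DISPATCH=N), while CYCLES_DEBUG_METAL_CAPTURE_SAMPLES=N
 * captures the block of dispatches between integrator reset N and reset N+1
 * (optionally truncated by CYCLES_DEBUG_METAL_CAPTURE_LIMIT). Captures are opened in
 * Xcode unless CYCLES_DEBUG_METAL_CAPTURE_URL redirects them to disk. */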
void MetalDeviceQueue::setup_capture()
{
  capture_kernel_ = DeviceKernel(-1);

  if (auto capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
    /* CYCLES_DEBUG_METAL_CAPTURE_KERNEL captures a single dispatch of the specified kernel. */
    capture_kernel_ = DeviceKernel(atoi(capture_kernel_str));
    printf("Capture kernel: %d = %s\n", capture_kernel_, device_kernel_as_string(capture_kernel_));

    capture_dispatch_counter_ = 0;
    if (auto capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) {
      capture_dispatch_counter_ = atoi(capture_dispatch_str);

      printf("Capture dispatch number %d\n", capture_dispatch_counter_);
    }
  }
  else if (auto capture_samples_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_SAMPLES")) {
    /* CYCLES_DEBUG_METAL_CAPTURE_SAMPLES captures a block of dispatches from reset#(N) to
     * reset#(N+1). */
    capture_samples_ = true;
    capture_reset_counter_ = atoi(capture_samples_str);

    capture_dispatch_counter_ = INT_MAX;
    if (auto capture_limit_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_LIMIT")) {
      /* CYCLES_DEBUG_METAL_CAPTURE_LIMIT sets the maximum number of dispatches to capture. */
      capture_dispatch_counter_ = atoi(capture_limit_str);
    }

    printf("Capturing sample block %d (dispatch limit: %d)\n",
           capture_reset_counter_,
           capture_dispatch_counter_);
  }
  else {
    /* No capturing requested. */
    return;
  }

  /* Enable .gputrace capture for the specified DeviceKernel. */
  MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
  mtlCaptureScope_ = [captureManager newCaptureScopeWithDevice:mtlDevice_];
  mtlCaptureScope_.label = [NSString stringWithFormat:@"Cycles kernel dispatch"];
  [captureManager setDefaultCaptureScope:mtlCaptureScope_];

  label_command_encoders_ = true;

  if (auto capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) {
    if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {
      MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init];
      captureDescriptor.captureObject = mtlCaptureScope_;
      captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument;
      captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)];

      NSError *error;
      if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) {
        NSString *err = [error localizedDescription];
        printf("Start capture failed: %s\n", [err UTF8String]);
      }
      else {
        printf("Capture started (URL: %s)\n", capture_url);
        is_capturing_to_disk_ = true;
      }
    }
    else {
      printf("Capture to file is not supported\n");
    }
  }
}

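/* Advance the capture state machine once per enqueued kernel: counts down the dispatch
 * and reset counters set up in setup_capture() and begins/ends the capture scope when a
 * trigger fires. */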
void MetalDeviceQueue::update_capture(DeviceKernel kernel)
{
  /* Handle capture end triggers. */
  if (is_capturing_) {
    capture_dispatch_counter_ -= 1;
    if (capture_dispatch_counter_ <= 0 || kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
      /* End capture if we've hit the dispatch limit or we hit a "reset". */
      end_capture();
    }
    return;
  }

  if (capture_dispatch_counter_ < 0) {
    /* We finished capturing. */
    return;
  }

  /* Handle single-capture start trigger. */
  if (kernel == capture_kernel_) {
    /* Start capturing when we hit the Nth dispatch of the specified kernel. */
    if (capture_dispatch_counter_ == 0) {
      begin_capture();
    }
    capture_dispatch_counter_ -= 1;
    return;
  }

  /* Handle multi-capture start trigger. */
  if (capture_samples_) {
    /* Start capturing when the reset countdown is at 0. */
    if (capture_reset_counter_ == 0) {
      begin_capture();
    }

    if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
      capture_reset_counter_ -= 1;
    }
    return;
  }
}

void MetalDeviceQueue::begin_capture()
{
  /* Start gputrace capture. */
  if (mtlCommandBuffer_) {
    synchronize();
  }
  [mtlCaptureScope_ beginScope];
  printf("[mtlCaptureScope_ beginScope]\n");
  is_capturing_ = true;
}

void MetalDeviceQueue::end_capture()
{
  [mtlCaptureScope_ endScope];
  is_capturing_ = false;
  printf("[mtlCaptureScope_ endScope]\n");

  if (is_capturing_to_disk_) {
    [[MTLCaptureManager sharedCaptureManager] stopCapture];
    has_captured_to_disk_ = true;
    is_capturing_to_disk_ = false;
    is_capturing_ = false;
    printf("Capture stopped\n");
  }
}

MetalDeviceQueue::~MetalDeviceQueue()
{
  /* Tidying up here isn't really practical - we should expect and require the work
   * queue to be empty here. */
  assert(mtlCommandBuffer_ == nil);
  assert(command_buffers_submitted_ == command_buffers_completed_);

  close_compute_encoder();
  close_blit_encoder();

  [shared_event_listener_ release];
  [shared_event_ release];
  [command_buffer_desc_ release];

  if (counter_sample_buffer_) {
    [counter_sample_buffer_ release];
  }

  if (mtlCaptureScope_) {
    [mtlCaptureScope_ release];
  }

  double total_time = 0.0;

  /* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */
  int64_t num_dispatches = 0;
  int64_t num_pathtracing_dispatches = 0;
  for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
    auto &stat = timing_stats_[i];
    bool pathtracing_kernel = (i <= DEVICE_KERNEL_INTEGRATOR_RESET) &&
                              (i != DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL);
    total_time += stat.total_time;
    num_dispatches += stat.num_dispatches;
    num_pathtracing_dispatches += pathtracing_kernel ? stat.num_dispatches : 0;
  }
  bool has_extra = (num_pathtracing_dispatches && num_dispatches > num_pathtracing_dispatches);

  if (num_dispatches) {
    printf("\nMetal %sdispatch stats:\n", num_pathtracing_dispatches ? "path-tracing " : "");
    auto header = string_printf("%-40s %16s %12s %12s %9s %9s",
                                "Kernel name",
                                "Total threads",
                                "Dispatches",
                                "Avg. T/D",
                                "Time/s",
                                "Time/%");
    auto divider = string(header.length(), '-');
    printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str());

    for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
      auto &stat = timing_stats_[i];

      bool pathtracing_kernel = (i <= DEVICE_KERNEL_INTEGRATOR_RESET) &&
                                (i != DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL);
      if ((pathtracing_kernel && num_pathtracing_dispatches) || stat.num_dispatches > 0) {
        printf("%-40s %16llu %12llu %12llu %9.4f %9.2f\n",
               device_kernel_as_string(DeviceKernel(i)),
               stat.total_work_size,
               stat.num_dispatches,
               stat.num_dispatches ? stat.total_work_size / stat.num_dispatches : 0,
               stat.total_time,
               stat.total_time * 100.0 / total_time);
      }
      if (has_extra && i == DEVICE_KERNEL_INTEGRATOR_RESET) {
        printf("%s\n", divider.c_str());
      }
    }
    printf("%s\n", divider.c_str());
    printf("%-40s %16s %12llu %12s %9.4f %9.2f\n", "", "", num_dispatches, "", total_time, 100.0);
    printf("%s\n\n", divider.c_str());
  }
}

int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
{
  static int result = 0;
  if (result) {
    return result;
  }

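  /* Default to 2^22 = 4,194,304 concurrent path states; the headroom check below may
   * double this on non-M1 systems when enough GPU-available memory remains. */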
  result = 4194304;

  /* Increasing the state count doesn't notably benefit M1-family systems. */
  if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) != APPLE_M1) {
    size_t system_ram = system_physical_ram();
    size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
    size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];

    /* Determine whether we can double the state count, and leave enough GPU-available memory
     * (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to
     * keep dispatch sizes high and minimize work submission overheads. */
    size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
    size_t total_state_size = result * state_size;
    if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
      result *= 2;
      metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
    }
  }
  return result;
}

int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
{
  /* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */
  return num_concurrent_states(state_size) / 4;
}

int MetalDeviceQueue::num_sort_partition_elements() const
{
  return MetalInfo::optimal_sort_partition_elements();
}

bool MetalDeviceQueue::supports_local_atomic_sort() const
{
  return metal_device_->use_local_atomic_sort();
}

void MetalDeviceQueue::init_execution()
{
  /* Synchronize all textures and memory copies before executing task. */
  metal_device_->load_texture_info();

  synchronize();
}

bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
                               const int work_size,
                               DeviceKernelArguments const &args)
{
  @autoreleasepool {
    update_capture(kernel);

    if (metal_device_->have_error()) {
      return false;
    }

    VLOG_DEVICE_STATS << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
                      << work_size;

    id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);

    if (profiling_enabled_) {
      command_encoder_labels_.push_back({kernel, work_size, current_encoder_idx_});
    }

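    /* The argument buffer built below is laid out as three 256-byte-aligned regions:
     * the "vanilla" kernel arguments, then the KernelParamsMetal globals, then the
     * ancillary argument buffer; globals_offsets and metal_offsets record where the
     * last two regions start. */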
    /* Determine size requirement for argument buffer. */
    size_t arg_buffer_length = 0;
    for (size_t i = 0; i < args.count; i++) {
      size_t size_in_bytes = args.sizes[i];
      arg_buffer_length = round_up(arg_buffer_length, size_in_bytes) + size_in_bytes;
    }
    /* 256 is the Metal offset alignment for constant address space bindings. */
    arg_buffer_length = round_up(arg_buffer_length, 256);

    /* Globals placed after "vanilla" arguments. */
    size_t globals_offsets = arg_buffer_length;
    arg_buffer_length += sizeof(KernelParamsMetal);
    arg_buffer_length = round_up(arg_buffer_length, 256);

    /* Metal ancillary bindless pointers. */
    size_t metal_offsets = arg_buffer_length;
    arg_buffer_length += metal_device_->mtlAncillaryArgEncoder.encodedLength;
    arg_buffer_length = round_up(arg_buffer_length,
                                 metal_device_->mtlAncillaryArgEncoder.alignment);

    /* Temporary buffer used to prepare arg_buffer. */
    uint8_t *init_arg_buffer = (uint8_t *)alloca(arg_buffer_length);
    memset(init_arg_buffer, 0, arg_buffer_length);

    /* Prepare the non-pointer "enqueue" arguments. */
    size_t bytes_written = 0;
    for (size_t i = 0; i < args.count; i++) {
      size_t size_in_bytes = args.sizes[i];
      bytes_written = round_up(bytes_written, size_in_bytes);
      if (args.types[i] != DeviceKernelArguments::POINTER) {
        memcpy(init_arg_buffer + bytes_written, args.values[i], size_in_bytes);
      }
      bytes_written += size_in_bytes;
    }

    /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data.
     * The plain-old-data is contiguous, continuing to the end of KernelParamsMetal. */
    size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
                                          offsetof(IntegratorStateGPU, sort_partition_divisor);
    size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
    memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
           (uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
           plain_old_launch_data_size);

    /* Allocate an argument buffer. */
    MTLResourceOptions arg_buffer_options = MTLResourceStorageModeManaged;
    if ([mtlDevice_ hasUnifiedMemory]) {
      arg_buffer_options = MTLResourceStorageModeShared;
    }

    id<MTLBuffer> arg_buffer = temp_buffer_pool_.get_buffer(mtlDevice_,
                                                            mtlCommandBuffer_,
                                                            arg_buffer_length,
                                                            arg_buffer_options,
                                                            init_arg_buffer,
                                                            stats_);

    /* Encode the pointer "enqueue" arguments. */
    bytes_written = 0;
    for (size_t i = 0; i < args.count; i++) {
      size_t size_in_bytes = args.sizes[i];
      bytes_written = round_up(bytes_written, size_in_bytes);
      if (args.types[i] == DeviceKernelArguments::POINTER) {
        [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
                                                                offset:bytes_written];
        if (MetalDevice::MetalMem *mmem = *(MetalDevice::MetalMem **)args.values[i]) {
          [mtlComputeCommandEncoder useResource:mmem->mtlBuffer
                                          usage:MTLResourceUsageRead | MTLResourceUsageWrite];
          [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
                                                          offset:0
                                                         atIndex:0];
        }
        else {
          if (@available(macos 12.0, *)) {
            [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0];
          }
        }
      }
      bytes_written += size_in_bytes;
    }

    /* Encode KernelParamsMetal buffers. */
    [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
                                                            offset:globals_offsets];

    if (label_command_encoders_) {
      /* Add human-readable labels if we're doing any form of debugging / profiling. */
      mtlComputeCommandEncoder.label = [NSString
          stringWithFormat:@"Metal queue launch %s, work_size %d",
                           device_kernel_as_string(kernel),
                           work_size];
    }

    /* This relies on IntegratorStateGPU layout being contiguous device_ptrs. */
    const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
                                     offsetof(IntegratorStateGPU, sort_partition_divisor);
    for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
      int pointer_index = int(offset / sizeof(device_ptr));
      MetalDevice::MetalMem *mmem = *(
          MetalDevice::MetalMem **)((uint8_t *)&metal_device_->launch_params + offset);
      if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) {
        [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
                                                        offset:0
                                                       atIndex:pointer_index];
      }
      else {
        if (@available(macos 12.0, *)) {
          [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil
                                                          offset:0
                                                         atIndex:pointer_index];
        }
      }
    }
    bytes_written = globals_offsets + sizeof(KernelParamsMetal);

    if (!active_pipelines_[kernel].update(metal_device_, kernel)) {
      metal_device_->set_error(
          string_printf("Could not activate pipeline for %s\n", device_kernel_as_string(kernel)));
      return false;
    }
    MetalDispatchPipeline &active_pipeline = active_pipelines_[kernel];

    /* Encode ancillaries. */
    [metal_device_->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
    [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_2d
                                              offset:0
                                             atIndex:0];
    [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_3d
                                              offset:0
                                             atIndex:1];
    [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->buffer_bindings_1d
                                              offset:0
                                             atIndex:2];

    if (@available(macos 12.0, *)) {
      if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
        if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
          [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:3];
          [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
                                                    offset:0
                                                   atIndex:(METALRT_TABLE_NUM + 4)];
        }

        for (int table = 0; table < METALRT_TABLE_NUM; table++) {
          if (active_pipeline.intersection_func_table[table]) {
            [active_pipeline.intersection_func_table[table] setBuffer:arg_buffer
                                                               offset:globals_offsets
                                                              atIndex:1];
            [metal_device_->mtlAncillaryArgEncoder
                setIntersectionFunctionTable:active_pipeline.intersection_func_table[table]
                                     atIndex:4 + table];
            [mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
                                            usage:MTLResourceUsageRead];
          }
          else {
            [metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
                                                                        atIndex:4 + table];
          }
        }
      }
      bytes_written = metal_offsets + metal_device_->mtlAncillaryArgEncoder.encodedLength;
    }

    if (arg_buffer.storageMode == MTLStorageModeManaged) {
      [arg_buffer didModifyRange:NSMakeRange(0, bytes_written)];
    }

    [mtlComputeCommandEncoder setBuffer:arg_buffer offset:0 atIndex:0];
    [mtlComputeCommandEncoder setBuffer:arg_buffer offset:globals_offsets atIndex:1];
    [mtlComputeCommandEncoder setBuffer:arg_buffer offset:metal_offsets atIndex:2];

    if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
      if (@available(macos 12.0, *)) {
        if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
          /* Mark all acceleration structure resources as used. */
          [mtlComputeCommandEncoder useResource:accel_struct usage:MTLResourceUsageRead];
          [mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
                                          usage:MTLResourceUsageRead];
          [mtlComputeCommandEncoder useResources:metal_device_->unique_blas_array.data()
                                           count:metal_device_->unique_blas_array.size()
                                           usage:MTLResourceUsageRead];
        }
      }
    }

    [mtlComputeCommandEncoder setComputePipelineState:active_pipeline.pipeline];

    /* Compute kernel launch parameters. */
    const int num_threads_per_block = active_pipeline.num_threads_per_block;

    int shared_mem_bytes = 0;

    switch (kernel) {
      case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
        /* See parallel_active_index.h for why this amount of shared memory is needed.
         * Rounded up to 16 bytes for Metal. */
        shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
        break;

      case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
      case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
        int key_count = metal_device_->launch_params.data.max_shaders;
        shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
        break;
      }

      default:
        break;
    }

    if (shared_mem_bytes) {
      assert(shared_mem_bytes <= 32 * 1024);
      [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
    }

    MTLSize size_threads_per_dispatch = MTLSizeMake(work_size, 1, 1);
    MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
    [mtlComputeCommandEncoder dispatchThreads:size_threads_per_dispatch
                        threadsPerThreadgroup:size_threads_per_threadgroup];

    [mtlCommandBuffer_ addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
      /* Enhanced command buffer errors. */
      string str;
      if (command_buffer.status != MTLCommandBufferStatusCompleted) {
        str = string_printf("Command buffer not completed. status = %d. ",
                            int(command_buffer.status));
      }
      if (command_buffer.error) {
        @autoreleasepool {
          const char *errCStr = [[NSString stringWithFormat:@"%@", command_buffer.error]
              UTF8String];
          str += string_printf("(%s.%s):\n%s\n",
                               kernel_type_as_string(active_pipeline.pso_type),
                               device_kernel_as_string(kernel),
                               errCStr);
        }
      }
      if (!str.empty()) {
        metal_device_->set_error(str);
      }
    }];

    if (verbose_tracing_ || is_capturing_) {
      /* Force a sync if we've enabled step-by-step verbose tracing or if we're capturing. */
      synchronize();

      /* Show queue counters and dispatch timing. */
      if (verbose_tracing_) {
        if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
          printf(
              "_____________________________________.____________________.______________.___________"
              "______________________________________\n");
        }

        printf("%-40s| %7d threads |%5.2fms | buckets [",
               device_kernel_as_string(kernel),
               work_size,
               last_completion_time_ * 1000.0);
        std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
        for (auto &it : metal_device_->metal_mem_map) {
          const string c_integrator_queue_counter = "integrator_queue_counter";
          if (it.first->name == c_integrator_queue_counter) {
            /* Workaround for "device_copy_from" being protected. */
            struct MyDeviceMemory : device_memory {
              void device_copy_from__IntegratorQueueCounter()
              {
                device_copy_from(0, data_width, 1, sizeof(IntegratorQueueCounter));
              }
            };
            ((MyDeviceMemory *)it.first)->device_copy_from__IntegratorQueueCounter();

            if (IntegratorQueueCounter *queue_counter = (IntegratorQueueCounter *)
                    it.first->host_pointer)
            {
              for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
                printf("%s%d", i == 0 ? "" : ",", int(queue_counter->num_queued[i]));
              }
            }
            break;
          }
        }
        printf("]\n");
      }
    }

    return !(metal_device_->have_error());
  }
}

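/* Resolve GPU timestamps for every encoder label recorded since the last flush and
 * fold them into the per-kernel timing stats. Called after each command buffer
 * completes (see synchronize()). */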
void MetalDeviceQueue::flush_timing_stats()
{
  for (auto label : command_encoder_labels_) {
    TimingStats &stat = timing_stats_[label.kernel];

    double completion_time_gpu;
    NSData *computeTimeStamps = [counter_sample_buffer_
        resolveCounterRange:NSMakeRange(label.timing_id, 2)];
    MTLCounterResultTimestamp *timestamps = (MTLCounterResultTimestamp *)(computeTimeStamps.bytes);

    uint64_t begTime = timestamps[0].timestamp;
    uint64_t endTime = timestamps[1].timestamp;
    completion_time_gpu = (endTime - begTime) / (double)NSEC_PER_SEC;

    stat.num_dispatches++;
    stat.total_time += completion_time_gpu;
    stat.total_work_size += label.work_size;
    last_completion_time_ = completion_time_gpu;
  }
  command_encoder_labels_.clear();
}

bool MetalDeviceQueue::synchronize()
{
  @autoreleasepool {
    if (has_captured_to_disk_ || metal_device_->have_error()) {
      return false;
    }

    close_compute_encoder();
    close_blit_encoder();

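    /* Drain the queue: encode a signal of the shared event at the end of the command
     * buffer, register a listener that signals a semaphore when that value is reached,
     * then commit and block until the GPU has signalled. */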
    if (mtlCommandBuffer_) {
      scoped_timer timer;

      uint64_t shared_event_id_ = this->shared_event_id_++;

      __block dispatch_semaphore_t block_sema = wait_semaphore_;
      [shared_event_ notifyListener:shared_event_listener_
                            atValue:shared_event_id_
                              block:^(id<MTLSharedEvent> /*sharedEvent*/, uint64_t /*value*/) {
                                dispatch_semaphore_signal(block_sema);
                              }];

      [mtlCommandBuffer_ encodeSignalEvent:shared_event_ value:shared_event_id_];
      [mtlCommandBuffer_ commit];
      dispatch_semaphore_wait(wait_semaphore_, DISPATCH_TIME_FOREVER);

      [mtlCommandBuffer_ release];

      for (const CopyBack &mmem : copy_back_mem_) {
        memcpy((uchar *)mmem.host_pointer, (uchar *)mmem.gpu_mem, mmem.size);
      }
      copy_back_mem_.clear();

      temp_buffer_pool_.process_command_buffer_completion(mtlCommandBuffer_);
      metal_device_->flush_delayed_free_list();

      mtlCommandBuffer_ = nil;
      flush_timing_stats();
    }

    return !(metal_device_->have_error());
  }
}

void MetalDeviceQueue::zero_to_device(device_memory &mem)
{
  @autoreleasepool {
    if (metal_device_->have_error()) {
      return;
    }

    assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

    if (mem.memory_size() == 0) {
      return;
    }

    /* Allocate on demand. */
    if (mem.device_pointer == 0) {
      metal_device_->mem_alloc(mem);
    }

    /* Zero memory on device. */
    assert(mem.device_pointer != 0);

    std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
    MetalDevice::MetalMem &mmem = *metal_device_->metal_mem_map.at(&mem);
    if (mmem.mtlBuffer) {
      id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
      [blitEncoder fillBuffer:mmem.mtlBuffer range:NSMakeRange(mmem.offset, mmem.size) value:0];
    }
    else {
      metal_device_->mem_zero(mem);
    }
  }
}

void MetalDeviceQueue::copy_to_device(device_memory &mem)
{
  @autoreleasepool {
    if (metal_device_->have_error()) {
      return;
    }

    if (mem.memory_size() == 0) {
      return;
    }

    /* Allocate on demand. */
    if (mem.device_pointer == 0) {
      metal_device_->mem_alloc(mem);
    }

    assert(mem.device_pointer != 0);
    assert(mem.host_pointer != nullptr);

    std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
    auto result = metal_device_->metal_mem_map.find(&mem);
    if (result != metal_device_->metal_mem_map.end()) {
      if (mem.host_pointer == mem.shared_pointer) {
        return;
      }

      MetalDevice::MetalMem &mmem = *result->second;
      id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();

      id<MTLBuffer> buffer = temp_buffer_pool_.get_buffer(mtlDevice_,
                                                          mtlCommandBuffer_,
                                                          mmem.size,
                                                          MTLResourceStorageModeShared,
                                                          mem.host_pointer,
                                                          stats_);

      [blitEncoder copyFromBuffer:buffer
                     sourceOffset:0
                         toBuffer:mmem.mtlBuffer
                destinationOffset:mmem.offset
                             size:mmem.size];
    }
    else {
      metal_device_->mem_copy_to(mem);
    }
  }
}

void MetalDeviceQueue::copy_from_device(device_memory &mem)
{
  @autoreleasepool {
    if (metal_device_->have_error()) {
      return;
    }

    assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

    if (mem.memory_size() == 0) {
      return;
    }

    assert(mem.device_pointer != 0);
    assert(mem.host_pointer != nullptr);

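    /* For managed buffers a blit "synchronizeResource" is encoded so the CPU view is
     * updated after GPU writes; while a command buffer is open, the host-side memcpy is
     * deferred (via copy_back_mem_) until synchronize() has waited for completion. */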
    std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
    MetalDevice::MetalMem &mmem = *metal_device_->metal_mem_map.at(&mem);
    if (mmem.mtlBuffer) {
      const size_t size = mem.memory_size();

      if (mem.device_pointer) {
        if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
          id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
          [blitEncoder synchronizeResource:mmem.mtlBuffer];
        }
        if (mem.host_pointer != mmem.hostPtr) {
          if (mtlCommandBuffer_) {
            copy_back_mem_.push_back({mem.host_pointer, mmem.hostPtr, size});
          }
          else {
            memcpy((uchar *)mem.host_pointer, (uchar *)mmem.hostPtr, size);
          }
        }
      }
      else {
        memset((char *)mem.host_pointer, 0, size);
      }
    }
    else {
      metal_device_->mem_copy_from(mem);
    }
  }
}

void MetalDeviceQueue::prepare_resources(DeviceKernel /*kernel*/)
{
  std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);

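  /* Resources that the kernels reach only indirectly (through argument buffers) are
   * not tracked automatically by Metal, so their residency and usage must be declared
   * explicitly on the encoder with useResource. */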
  /* Declare resource usage. */
  for (auto &it : metal_device_->metal_mem_map) {
    device_memory *mem = it.first;

    MTLResourceUsage usage = MTLResourceUsageRead;
    if (mem->type != MEM_GLOBAL && mem->type != MEM_READ_ONLY && mem->type != MEM_TEXTURE) {
      usage |= MTLResourceUsageWrite;
    }

    if (it.second->mtlBuffer) {
      /* METAL_WIP - use array version (i.e. useResources) */
      [mtlComputeEncoder_ useResource:it.second->mtlBuffer usage:usage];
    }
    else if (it.second->mtlTexture) {
      /* METAL_WIP - use array version (i.e. useResources) */
      [mtlComputeEncoder_ useResource:it.second->mtlTexture usage:usage | MTLResourceUsageSample];
    }
  }

  /* Ancillaries. */
  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead];
  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings_3d usage:MTLResourceUsageRead];
  [mtlComputeEncoder_ useResource:metal_device_->buffer_bindings_1d usage:MTLResourceUsageRead];
}

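/* Return an open compute encoder for this kernel, reusing the current one when its
 * dispatch type matches (integrator kernels are encoded concurrently, the rest
 * serially). With profiling enabled, a fresh encoder is opened per kernel so that
 * begin/end timestamps can be attached to it. */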
id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
{
  bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM);

  if (profiling_enabled_) {
    /* Close the current encoder to ensure we're able to capture per-encoder timing data. */
    close_compute_encoder();
  }

  if (mtlComputeEncoder_) {
    if (mtlComputeEncoder_.dispatchType ==
        (concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial))
    {
      /* Declare usage of MTLBuffers etc. */
      prepare_resources(kernel);

      return mtlComputeEncoder_;
    }
    close_compute_encoder();
  }

  close_blit_encoder();

  if (!mtlCommandBuffer_) {
    mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
    [mtlCommandBuffer_ retain];
  }

  if (profiling_enabled_) {
    MTLComputePassDescriptor *desc = [[MTLComputePassDescriptor alloc] init];

    current_encoder_idx_ = (counter_sample_buffer_curr_idx_.fetch_add(2) %
                            MAX_SAMPLE_BUFFER_LENGTH);
    [desc.sampleBufferAttachments[0] setSampleBuffer:counter_sample_buffer_];
    [desc.sampleBufferAttachments[0] setStartOfEncoderSampleIndex:current_encoder_idx_];
    [desc.sampleBufferAttachments[0] setEndOfEncoderSampleIndex:current_encoder_idx_ + 1];

    [desc setDispatchType:concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial];

    mtlComputeEncoder_ = [mtlCommandBuffer_ computeCommandEncoderWithDescriptor:desc];
  }
  else {
    mtlComputeEncoder_ = [mtlCommandBuffer_
        computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
                                                           MTLDispatchTypeSerial];
  }

  [mtlComputeEncoder_ retain];
  [mtlComputeEncoder_ setLabel:@(device_kernel_as_string(kernel))];

  /* Declare usage of MTLBuffers etc. */
  prepare_resources(kernel);

  return mtlComputeEncoder_;
}

id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
{
  if (mtlBlitEncoder_) {
    return mtlBlitEncoder_;
  }

  close_compute_encoder();

  if (!mtlCommandBuffer_) {
    mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
    [mtlCommandBuffer_ retain];
  }

  mtlBlitEncoder_ = [mtlCommandBuffer_ blitCommandEncoder];
  [mtlBlitEncoder_ retain];
  return mtlBlitEncoder_;
}

void MetalDeviceQueue::close_compute_encoder()
{
  if (mtlComputeEncoder_) {
    [mtlComputeEncoder_ endEncoding];
    [mtlComputeEncoder_ release];
    mtlComputeEncoder_ = nil;
  }
}

void MetalDeviceQueue::close_blit_encoder()
{
  if (mtlBlitEncoder_) {
    [mtlBlitEncoder_ endEncoding];
    [mtlBlitEncoder_ release];
    mtlBlitEncoder_ = nil;
  }
}

void *MetalDeviceQueue::native_queue()
{
  return mtlCommandQueue_;
}

CCL_NAMESPACE_END

#endif /* WITH_METAL */