Blender V5.0
oneapi/device_impl.cpp
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2021-2025 Intel Corporation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#ifdef WITH_ONEAPI
6
7/* <algorithm> is needed until included upstream in sycl/detail/property_list_base.hpp */
8# include <algorithm>
9# include <sycl/sycl.hpp>
10
12
13# include "util/log.h"
14
15# ifdef WITH_EMBREE_GPU
16# include "bvh/embree.h"
17# endif
18
19# if defined(WITH_OPENIMAGEDENOISE)
20# include <OpenImageDenoise/config.h>
21# if OIDN_VERSION >= 20300
22# include "util/openimagedenoise.h" // IWYU pragma: keep
23# endif
24# endif
25
28
30
31# if defined(WITH_EMBREE_GPU) && defined(EMBREE_SYCL_SUPPORT) && !defined(SYCL_LANGUAGE_VERSION)
32/* These declarations are missing from embree headers when compiling from a compiler that doesn't
33 * support SYCL. */
34extern "C" RTCDevice rtcNewSYCLDevice(sycl::context context, const char *config);
35extern "C" bool rtcIsSYCLDeviceSupported(const sycl::device sycl_device);
36extern "C" void rtcSetDeviceSYCLDevice(RTCDevice device, const sycl::device sycl_device);
37# endif
38
40
41static std::vector<sycl::device> available_sycl_devices(bool *multiple_dgpus_detected);
42static int parse_driver_build_version(const sycl::device &device);
43
44static void queue_error_cb(const char *message, void *user_ptr)
45{
46 if (user_ptr) {
47 *reinterpret_cast<std::string *>(user_ptr) = message;
48 }
49}
50
51OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
52 : GPUDevice(info, stats, profiler, headless)
53{
54 /* Verify that base class types can be used with specific backend types */
55 static_assert(sizeof(texMemObject) ==
56 sizeof(sycl::ext::oneapi::experimental::sampled_image_handle));
57 static_assert(sizeof(arrayMemObject) ==
58 sizeof(sycl::ext::oneapi::experimental::image_mem_handle));
59
60 need_texture_info = false;
61 use_hardware_raytracing = info.use_hardware_raytracing;
62
63 oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
64
65 bool is_finished_ok = create_queue(device_queue_,
66 info.num,
67# ifdef WITH_EMBREE_GPU
68 use_hardware_raytracing ? (void *)&embree_device : nullptr,
69# else
70 nullptr,
71# endif
72 &is_several_intel_dgpu_devices_detected);
73
74 if (is_finished_ok == false) {
75 set_error("oneAPI queue initialization error: got runtime exception \"" +
76 oneapi_error_string_ + "\"");
77 }
78 else {
79 LOG_TRACE << "oneAPI queue has been successfully created for the device \"" << info.description
80 << "\"";
81 assert(device_queue_);
82 }
83
84# ifdef WITH_EMBREE_GPU
85 use_hardware_raytracing = use_hardware_raytracing && (embree_device != nullptr);
86# else
87 use_hardware_raytracing = false;
88# endif
89
90 if (use_hardware_raytracing) {
91 LOG_INFO << "oneAPI will use hardware ray tracing for intersection acceleration.";
92 }
93
94 size_t globals_segment_size;
95 is_finished_ok = kernel_globals_size(globals_segment_size);
96 if (is_finished_ok == false) {
97 set_error("oneAPI constant memory initialization got runtime exception \"" +
98 oneapi_error_string_ + "\"");
99 }
100 else {
101 LOG_TRACE << "Successfully created global/constant memory segment (kernel globals object)";
102 }
103
104 kg_memory_ = usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
105 usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
106
107 kg_memory_device_ = usm_alloc_device(device_queue_, globals_segment_size);
108
109 kg_memory_size_ = globals_segment_size;
110
111 max_memory_on_device_ = get_memcapacity();
112 init_host_memory();
113 can_map_host = true;
114
115 const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM");
116 if (headroom_str != nullptr) {
117 const long long override_headroom = (float)atoll(headroom_str);
118 device_working_headroom = override_headroom;
119 device_texture_headroom = override_headroom;
120 }
121 LOG_TRACE << "oneAPI memory headroom size: "
122 << string_human_readable_size(device_working_headroom);
123}
124
125OneapiDevice::~OneapiDevice()
126{
127# ifdef WITH_EMBREE_GPU
128 if (embree_device) {
129 rtcReleaseDevice(embree_device);
130 }
131# endif
132
133 texture_info.free();
134 usm_free(device_queue_, kg_memory_);
135 usm_free(device_queue_, kg_memory_device_);
136
137 const_mem_map_.clear();
138
139 if (device_queue_) {
140 free_queue(device_queue_);
141 }
142}
143
144bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
145{
146 return false;
147}
148
149bool OneapiDevice::can_use_hardware_raytracing_for_features(const uint requested_features) const
150{
151 /* MNEE and Ray-trace kernels work correctly with Hardware Ray-tracing starting with Embree 4.1.
152 */
153# if defined(RTC_VERSION) && RTC_VERSION < 40100
154 return !(requested_features & (KERNEL_FEATURE_MNEE | KERNEL_FEATURE_NODE_RAYTRACE));
155# else
156 (void)requested_features;
157 return true;
158# endif
159}
160
161BVHLayoutMask OneapiDevice::get_bvh_layout_mask(const uint requested_features) const
162{
163 return (use_hardware_raytracing &&
164 can_use_hardware_raytracing_for_features(requested_features)) ?
167}
168
169# ifdef WITH_EMBREE_GPU
170void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
171{
172 if (embree_device && bvh->params.bvh_layout == BVH_LAYOUT_EMBREEGPU) {
173 BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
174 if (refit) {
175 bvh_embree->refit(progress);
176 }
177 else {
178 bvh_embree->build(progress, &stats, embree_device, true);
179 }
180
181# if RTC_VERSION >= 40302
182 thread_scoped_lock lock(scene_data_mutex);
183 all_embree_scenes.push_back(bvh_embree->scene);
184# endif
185
186 if (bvh->params.top_level) {
187# if RTC_VERSION >= 40400
188 embree_traversable = rtcGetSceneTraversable(bvh_embree->scene);
189# else
190 embree_traversable = bvh_embree->scene;
191# endif
192# if RTC_VERSION >= 40302
193 RTCError error_code = bvh_embree->offload_scenes_to_gpu(all_embree_scenes);
194 if (error_code != RTC_ERROR_NONE) {
195 set_error(
196 string_printf("BVH failed to migrate to the GPU due to Embree library error (%s)",
197 bvh_embree->get_error_string(error_code)));
198 }
199 all_embree_scenes.clear();
200# endif
201 }
202 }
203 else {
204 Device::build_bvh(bvh, progress, refit);
205 }
206}
207# endif
208
209size_t OneapiDevice::get_free_mem() const
210{
211 /* Accurate: Use device info, which is practically useful only on dGPU.
212 * This is because for non-discrete GPUs, all GPU memory allocations would
213 * be in the RAM, thus having the same performance for device and host pointers,
214 * so there is no need to be very accurate about what would end where. */
215 const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
216 const bool is_integrated_gpu = device.get_info<sycl::info::device::host_unified_memory>();
217 if (device.has(sycl::aspect::ext_intel_free_memory) && is_integrated_gpu == false) {
218 return device.get_info<sycl::ext::intel::info::device::free_memory>();
219 }
220 /* Estimate: Capacity - in use. */
221 if (device_mem_in_use < max_memory_on_device_) {
222 return max_memory_on_device_ - device_mem_in_use;
223 }
224 return 0;
225}
226
227bool OneapiDevice::load_kernels(const uint requested_features)
228{
229 assert(device_queue_);
230
231 /* Kernel loading is expected to be a cumulative operation; for example, if
232 * a device is asked to load kernel A and then kernel B, then after these
233 * operations, both A and B should be available for use. So we need to store
234 * and use a cumulative mask of the requested kernel features, and not just
235 * the latest requested features.
236 */
237 kernel_features |= requested_features;
238
239 bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
240 if (is_finished_ok == false) {
241 set_error("oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ +
242 "\"");
243 return false;
244 }
245 LOG_INFO << "Test kernel has been executed successfully for \"" << info.description << "\"";
246 assert(device_queue_);
247
248 if (use_hardware_raytracing && !can_use_hardware_raytracing_for_features(requested_features)) {
250 << "Hardware ray tracing disabled, not supported yet by oneAPI for requested features.";
251 use_hardware_raytracing = false;
252 }
253
254 is_finished_ok = oneapi_load_kernels(
255 device_queue_, (const unsigned int)requested_features, use_hardware_raytracing);
256 if (is_finished_ok == false) {
257 set_error("oneAPI kernels loading: got a runtime exception \"" + oneapi_error_string_ + "\"");
258 }
259 else {
260 LOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\"";
261 }
262
263 if (is_finished_ok) {
264 reserve_private_memory(requested_features);
265 is_finished_ok = !have_error();
266 }
267
268 return is_finished_ok;
269}
270
271void OneapiDevice::reserve_private_memory(const uint kernel_features)
272{
273 size_t free_before = get_free_mem();
274
275 /* Use the biggest kernel for estimation. */
276 const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
278 (kernel_features & KERNEL_FEATURE_MNEE) ?
281
282 {
283 unique_ptr<DeviceQueue> queue = gpu_queue_create();
284
285 device_ptr d_path_index = 0;
286 device_ptr d_render_buffer = 0;
287 int d_work_size = 0;
288 DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
289
290 queue->init_execution();
291 /* Launch of the kernel seems to be sufficient to reserve all
292 * needed memory regardless of the execution global size.
293 * So, the smallest possible size is used here. */
294 queue->enqueue(test_kernel, 1, args);
295 queue->synchronize();
296 }
297
298 size_t free_after = get_free_mem();
299
300 LOG_INFO << "For kernel execution were reserved "
301 << string_human_readable_number(free_before - free_after) << " bytes. ("
302 << string_human_readable_size(free_before - free_after) << ")";
303}
304
305void OneapiDevice::get_device_memory_info(size_t &total, size_t &free)
306{
307 free = get_free_mem();
308 total = max_memory_on_device_;
309}
310
311bool OneapiDevice::alloc_device(void *&device_pointer, const size_t size)
312{
313 bool allocation_success = false;
314 device_pointer = usm_alloc_device(device_queue_, size);
315 if (device_pointer != nullptr) {
316 allocation_success = true;
317 /* Due to lazy memory initialization in GPU runtime we will force memory to
318 * appear in device memory via execution of a kernel using this memory. */
319 if (!oneapi_zero_memory_on_device(device_queue_, device_pointer, size)) {
320 set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
321 "\"");
322 usm_free(device_queue_, device_pointer);
323
324 device_pointer = nullptr;
325 allocation_success = false;
326 }
327 }
328
329 return allocation_success;
330}
331
332void OneapiDevice::free_device(void *device_pointer)
333{
334 usm_free(device_queue_, device_pointer);
335}
336
337bool OneapiDevice::shared_alloc(void *&shared_pointer, const size_t size)
338{
339 shared_pointer = usm_aligned_alloc_host(device_queue_, size, 64);
340 return shared_pointer != nullptr;
341}
342
343void OneapiDevice::shared_free(void *shared_pointer)
344{
345 usm_free(device_queue_, shared_pointer);
346}
347
348void *OneapiDevice::shared_to_device_pointer(const void *shared_pointer)
349{
350 /* Device and host pointer are in the same address space
351 * as we're using Unified Shared Memory. */
352 return const_cast<void *>(shared_pointer);
353}
354
355void OneapiDevice::copy_host_to_device(void *device_pointer, void *host_pointer, const size_t size)
356{
357 usm_memcpy(device_queue_, device_pointer, host_pointer, size);
358}
359
360/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
361SyclQueue *OneapiDevice::sycl_queue()
362{
363 return device_queue_;
364}
365
366string OneapiDevice::oneapi_error_message()
367{
368 return string(oneapi_error_string_);
369}
370
371int OneapiDevice::scene_max_shaders()
372{
373 return scene_max_shaders_;
374}
375
376void *OneapiDevice::kernel_globals_device_pointer()
377{
378 return kg_memory_device_;
379}
380
381void *OneapiDevice::host_alloc(const MemoryType type, const size_t size)
382{
383 void *host_pointer = GPUDevice::host_alloc(type, size);
384
385# ifdef SYCL_EXT_ONEAPI_COPY_OPTIMIZE
386 /* This extension is not working fully correctly with several
387 * Intel dGPUs present in the system, so it would be turned off in such cases. */
388 if (is_several_intel_dgpu_devices_detected == false && host_pointer) {
389 /* Import host_pointer into USM memory for faster host<->device data transfers. */
390 if (type == MEM_READ_WRITE || type == MEM_READ_ONLY) {
391 sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);
392 /* This API is properly implemented only in Level-Zero backend at the moment and we don't
393 * want it to fail at runtime, so we conservatively use it only for L0. */
394 if (queue->get_backend() == sycl::backend::ext_oneapi_level_zero) {
395 sycl::ext::oneapi::experimental::prepare_for_device_copy(host_pointer, size, *queue);
396 }
397 }
398 }
399# endif
400
401 return host_pointer;
402}
403
404void OneapiDevice::host_free(const MemoryType type, void *host_pointer, const size_t size)
405{
406# ifdef SYCL_EXT_ONEAPI_COPY_OPTIMIZE
407 if (is_several_intel_dgpu_devices_detected == false) {
408 if (type == MEM_READ_WRITE || type == MEM_READ_ONLY) {
409 sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);
410 /* This API is properly implemented only in Level-Zero backend at the moment and we don't
411 * want it to fail at runtime, so we conservatively use it only for L0. */
412 if (queue->get_backend() == sycl::backend::ext_oneapi_level_zero) {
413 sycl::ext::oneapi::experimental::release_from_device_copy(host_pointer, *queue);
414 }
415 }
416 }
417# endif
418
419 GPUDevice::host_free(type, host_pointer, size);
420}
421
422void OneapiDevice::mem_alloc(device_memory &mem)
423{
424 if (mem.type == MEM_TEXTURE) {
425 assert(!"mem_alloc not supported for textures.");
426 }
427 else if (mem.type == MEM_GLOBAL) {
428 assert(!"mem_alloc not supported for global memory.");
429 }
430 else {
431 if (mem.name) {
432 LOG_TRACE << "OneapiDevice::mem_alloc: \"" << mem.name << "\", "
433 << string_human_readable_number(mem.memory_size()) << " bytes. ("
435 }
436 generic_alloc(mem);
437 }
438}
439
440void OneapiDevice::mem_copy_to(device_memory &mem)
441{
442 if (mem.name) {
443 LOG_TRACE << "OneapiDevice::mem_copy_to: \"" << mem.name << "\", "
444 << string_human_readable_number(mem.memory_size()) << " bytes. ("
446 }
447
448 /* After getting runtime errors we need to avoid performing oneAPI runtime operations
449 * because the associated GPU context may be in an invalid state at this point. */
450 if (have_error()) {
451 return;
452 }
453
454 if (mem.type == MEM_GLOBAL) {
455 global_copy_to(mem);
456 }
457 else if (mem.type == MEM_TEXTURE) {
458 tex_copy_to((device_texture &)mem);
459 }
460 else {
461 if (!mem.device_pointer) {
462 generic_alloc(mem);
463 }
464 generic_copy_to(mem);
465 }
466}
467
468void OneapiDevice::mem_move_to_host(device_memory &mem)
469{
470 if (mem.name) {
471 LOG_TRACE << "OneapiDevice::mem_move_to_host: \"" << mem.name << "\", "
472 << string_human_readable_number(mem.memory_size()) << " bytes. ("
474 }
475
476 /* After getting runtime errors we need to avoid performing oneAPI runtime operations
477 * because the associated GPU context may be in an invalid state at this point. */
478 if (have_error()) {
479 return;
480 }
481
482 if (mem.type == MEM_GLOBAL) {
483 global_free(mem);
484 global_alloc(mem);
485 }
486 else if (mem.type == MEM_TEXTURE) {
487 tex_free((device_texture &)mem);
488 tex_alloc((device_texture &)mem);
489 }
490 else {
491 assert(0);
492 }
493}
494
495void OneapiDevice::mem_copy_from(
496 device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem)
497{
498 if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
499 assert(!"mem_copy_from not supported for textures.");
500 }
501 else if (mem.host_pointer) {
502 const size_t size = (w > 0 || h > 0 || elem > 0) ? (elem * w * h) : mem.memory_size();
503 const size_t offset = elem * y * w;
504
505 if (mem.name) {
506 LOG_TRACE << "OneapiDevice::mem_copy_from: \"" << mem.name << "\" object of "
507 << string_human_readable_number(mem.memory_size()) << " bytes. ("
508 << string_human_readable_size(mem.memory_size()) << ") from offset " << offset
509 << " data " << size << " bytes";
510 }
511
512 /* After getting runtime errors we need to avoid performing oneAPI runtime operations
513 * because the associated GPU context may be in an invalid state at this point. */
514 if (have_error()) {
515 return;
516 }
517
518 assert(device_queue_);
519
520 assert(size != 0);
521 if (mem.device_pointer) {
522 char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset;
523 char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset;
524 bool is_finished_ok = usm_memcpy(device_queue_, shifted_host, shifted_device, size);
525 if (is_finished_ok == false) {
526 set_error("oneAPI memory operation error: got runtime exception \"" +
527 oneapi_error_string_ + "\"");
528 }
529 }
530 }
531}
532
533void OneapiDevice::mem_zero(device_memory &mem)
534{
535 if (mem.name) {
536 LOG_TRACE << "OneapiDevice::mem_zero: \"" << mem.name << "\", "
537 << string_human_readable_number(mem.memory_size()) << " bytes. ("
538 << string_human_readable_size(mem.memory_size()) << ")\n";
539 }
540
541 /* After getting runtime errors we need to avoid performing oneAPI runtime operations
542 * because the associated GPU context may be in an invalid state at this point. */
543 if (have_error()) {
544 return;
545 }
546
547 if (!mem.device_pointer) {
548 mem_alloc(mem);
549 }
550 if (!mem.device_pointer) {
551 return;
552 }
553
554 assert(device_queue_);
555 bool is_finished_ok = usm_memset(
556 device_queue_, (void *)mem.device_pointer, 0, mem.memory_size());
557 if (is_finished_ok == false) {
558 set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
559 "\"");
560 }
561}
562
563void OneapiDevice::mem_free(device_memory &mem)
564{
565 if (mem.name) {
566 LOG_TRACE << "OneapiDevice::mem_free: \"" << mem.name << "\", "
567 << string_human_readable_number(mem.device_size) << " bytes. ("
569 }
570
571 if (mem.type == MEM_GLOBAL) {
572 global_free(mem);
573 }
574 else if (mem.type == MEM_TEXTURE) {
575 tex_free((device_texture &)mem);
576 }
577 else {
578 generic_free(mem);
579 }
580}
581
582device_ptr OneapiDevice::mem_alloc_sub_ptr(device_memory &mem,
583 const size_t offset,
584 size_t /*size*/)
585{
586 return reinterpret_cast<device_ptr>(reinterpret_cast<char *>(mem.device_pointer) +
587 mem.memory_elements_size(offset));
588}
589
590void OneapiDevice::const_copy_to(const char *name, void *host, const size_t size)
591{
592 assert(name);
593
594 LOG_TRACE << "OneapiDevice::const_copy_to \"" << name << "\" object "
595 << string_human_readable_number(size) << " bytes. ("
597
598 if (strcmp(name, "data") == 0) {
599 assert(size <= sizeof(KernelData));
600 KernelData *const data = static_cast<KernelData *>(host);
601
602 /* We need this value when allocating local memory for integrator_sort_bucket_pass
603 * and integrator_sort_write_pass kernels. */
604 scene_max_shaders_ = data->max_shaders;
605
606# ifdef WITH_EMBREE_GPU
607 if (embree_traversable != nullptr) {
608 /* Update scene handle (since it is different for each device on multi devices).
609 * This must be a raw pointer copy since at some points during scene update this
610 * pointer may be invalid. */
611 data->device_bvh = embree_traversable;
612 }
613# endif
614 }
615
616 ConstMemMap::iterator i = const_mem_map_.find(name);
618
619 if (i == const_mem_map_.end()) {
620 unique_ptr<device_vector<uchar>> data_ptr = make_unique<device_vector<uchar>>(
621 this, name, MEM_READ_ONLY);
622 data_ptr->alloc(size);
623 data = data_ptr.get();
624 const_mem_map_.insert(ConstMemMap::value_type(name, std::move(data_ptr)));
625 }
626 else {
627 data = i->second.get();
628 }
629
630 assert(data->memory_size() <= size);
631 memcpy(data->data(), host, size);
632 data->copy_to_device();
633
634 set_global_memory(device_queue_, kg_memory_, name, (void *)data->device_pointer);
635
636 usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
637}
638
639void OneapiDevice::global_alloc(device_memory &mem)
640{
641 assert(mem.name);
642
643 size_t size = mem.memory_size();
644 LOG_TRACE << "OneapiDevice::global_alloc \"" << mem.name << "\" object "
645 << string_human_readable_number(size) << " bytes. ("
647
648 generic_alloc(mem);
649 generic_copy_to(mem);
650
651 set_global_memory(device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);
652
653 usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
654}
655
656void OneapiDevice::global_copy_to(device_memory &mem)
657{
658 if (!mem.device_pointer) {
659 global_alloc(mem);
660 }
661 else {
662 generic_copy_to(mem);
663 }
664}
665
666void OneapiDevice::global_free(device_memory &mem)
667{
668 if (mem.device_pointer) {
669 generic_free(mem);
670 }
671}
672
673static sycl::ext::oneapi::experimental::image_descriptor image_desc(const device_texture &mem)
674{
675 /* Image Texture Storage */
676 sycl::image_channel_type channel_type;
677
678 switch (mem.data_type) {
679 case TYPE_UCHAR:
680 channel_type = sycl::image_channel_type::unorm_int8;
681 break;
682 case TYPE_UINT16:
683 channel_type = sycl::image_channel_type::unorm_int16;
684 break;
685 case TYPE_FLOAT:
686 channel_type = sycl::image_channel_type::fp32;
687 break;
688 case TYPE_HALF:
689 channel_type = sycl::image_channel_type::fp16;
690 break;
691 default:
692 assert(0);
693 }
694
695 sycl::ext::oneapi::experimental::image_descriptor param;
696 param.width = mem.data_width;
697 param.height = mem.data_height;
698 param.num_channels = mem.data_elements;
699 param.channel_type = channel_type;
700
701 param.verify();
702
703 return param;
704}
705
706void OneapiDevice::tex_alloc(device_texture &mem)
707{
708 assert(device_queue_);
709
710 size_t size = mem.memory_size();
711
712 sycl::addressing_mode address_mode = sycl::addressing_mode::none;
713 switch (mem.info.extension) {
714 case EXTENSION_REPEAT:
715 address_mode = sycl::addressing_mode::repeat;
716 break;
717 case EXTENSION_EXTEND:
718 address_mode = sycl::addressing_mode::clamp_to_edge;
719 break;
720 case EXTENSION_CLIP:
721 address_mode = sycl::addressing_mode::clamp;
722 break;
723 case EXTENSION_MIRROR:
724 address_mode = sycl::addressing_mode::mirrored_repeat;
725 break;
726 default:
727 assert(0);
728 break;
729 }
730
731 sycl::filtering_mode filter_mode;
733 filter_mode = sycl::filtering_mode::nearest;
734 }
735 else {
736 filter_mode = sycl::filtering_mode::linear;
737 }
738
739 /* Image Texture Storage */
740 sycl::image_channel_type channel_type;
741
742 switch (mem.data_type) {
743 case TYPE_UCHAR:
744 channel_type = sycl::image_channel_type::unorm_int8;
745 break;
746 case TYPE_UINT16:
747 channel_type = sycl::image_channel_type::unorm_int16;
748 break;
749 case TYPE_FLOAT:
750 channel_type = sycl::image_channel_type::fp32;
751 break;
752 case TYPE_HALF:
753 channel_type = sycl::image_channel_type::fp16;
754 break;
755 default:
756 assert(0);
757 return;
758 }
759
760 sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);
761
762 try {
763 Mem *cmem = nullptr;
764 sycl::ext::oneapi::experimental::image_mem_handle memHandle{0};
765 sycl::ext::oneapi::experimental::image_descriptor desc{};
766
767 if (mem.data_height > 0) {
768 const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
769 const size_t max_width = device.get_info<sycl::info::device::image2d_max_width>();
770 const size_t max_height = device.get_info<sycl::info::device::image2d_max_height>();
771
772 if (mem.data_width > max_width || mem.data_height > max_height) {
773 set_error(
774 string_printf("Maximum GPU 2D texture size exceeded (max %zux%zu, found %zux%zu)",
775 max_width,
776 max_height,
777 mem.data_width,
778 mem.data_height));
779 return;
780 }
781
782 /* 2D texture -- Tile optimized */
783 desc = sycl::ext::oneapi::experimental::image_descriptor(
784 {mem.data_width, mem.data_height, 0}, mem.data_elements, channel_type);
785
786 LOG_DEBUG << "Array 2D/3D allocate: " << mem.name << ", "
787 << string_human_readable_number(mem.memory_size()) << " bytes. ("
789
790 sycl::ext::oneapi::experimental::image_mem_handle memHandle =
791 sycl::ext::oneapi::experimental::alloc_image_mem(desc, *queue);
792 if (!memHandle.raw_handle) {
793 set_error("GPU texture allocation failed: Raw handle is null");
794 return;
795 }
796
797 /* Copy data from host to the texture properly based on the texture description */
798 queue->ext_oneapi_copy(mem.host_pointer, memHandle, desc);
799
800 mem.device_pointer = (device_ptr)memHandle.raw_handle;
801 mem.device_size = size;
802 stats.mem_alloc(size);
803
804 thread_scoped_lock lock(device_mem_map_mutex);
805 cmem = &device_mem_map[&mem];
806 cmem->texobject = 0;
807 cmem->array = (arrayMemObject)(memHandle.raw_handle);
808 }
809 else {
810 /* 1D texture -- Linear memory */
811 desc = sycl::ext::oneapi::experimental::image_descriptor(
812 {mem.data_width}, mem.data_elements, channel_type);
813 cmem = generic_alloc(mem);
814 if (!cmem) {
815 return;
816 }
817
818 queue->memcpy((void *)mem.device_pointer, mem.host_pointer, size);
819 }
820
821 queue->wait_and_throw();
822
823 /* Set Mapping and tag that we need to (re-)upload to device */
824 TextureInfo tex_info = mem.info;
825
826 sycl::ext::oneapi::experimental::bindless_image_sampler samp(
827 address_mode, sycl::coordinate_normalization_mode::normalized, filter_mode);
828
829 if (!is_nanovdb_type(mem.info.data_type)) {
830 sycl::ext::oneapi::experimental::sampled_image_handle imgHandle;
831
832 if (memHandle.raw_handle) {
833 /* Create 2D/3D texture handle */
834 imgHandle = sycl::ext::oneapi::experimental::create_image(memHandle, samp, desc, *queue);
835 }
836 else {
837 /* Create 1D texture */
838 imgHandle = sycl::ext::oneapi::experimental::create_image(
839 (void *)mem.device_pointer, 0, samp, desc, *queue);
840 }
841
842 thread_scoped_lock lock(device_mem_map_mutex);
843 cmem = &device_mem_map[&mem];
844 cmem->texobject = (texMemObject)(imgHandle.raw_handle);
845
846 tex_info.data = (uint64_t)cmem->texobject;
847 }
848 else {
849 tex_info.data = (uint64_t)mem.device_pointer;
850 }
851
852 {
853 /* Update texture info. */
854 thread_scoped_lock lock(texture_info_mutex);
855 const uint slot = mem.slot;
856 if (slot >= texture_info.size()) {
857 /* Allocate some slots in advance, to reduce amount of re-allocations. */
858 texture_info.resize(slot + 128);
859 }
860 texture_info[slot] = tex_info;
861 need_texture_info = true;
862 }
863 }
864 catch (sycl::exception const &e) {
865 set_error("GPU texture allocation failed: runtime exception \"" + string(e.what()) + "\"");
866 }
867}
868
869void OneapiDevice::tex_copy_to(device_texture &mem)
870{
871 if (!mem.device_pointer) {
872 tex_alloc(mem);
873 }
874 else {
875 if (mem.data_height > 0) {
876 /* 2D/3D texture -- Tile optimized */
877 sycl::ext::oneapi::experimental::image_descriptor desc = image_desc(mem);
878
879 sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);
880
881 try {
882 /* Copy data from host to the texture properly based on the texture description */
883 thread_scoped_lock lock(device_mem_map_mutex);
884 const Mem &cmem = device_mem_map[&mem];
885 sycl::ext::oneapi::experimental::image_mem_handle image_handle{
886 (sycl::ext::oneapi::experimental::image_mem_handle::raw_handle_type)cmem.array};
887 queue->ext_oneapi_copy(mem.host_pointer, image_handle, desc);
888
889# ifdef WITH_CYCLES_DEBUG
890 queue->wait_and_throw();
891# endif
892 }
893 catch (sycl::exception const &e) {
894 set_error("oneAPI texture copy error: got runtime exception \"" + string(e.what()) + "\"");
895 }
896 }
897 else {
898 generic_copy_to(mem);
899 }
900 }
901}
902
903void OneapiDevice::tex_free(device_texture &mem)
904{
905 if (mem.device_pointer) {
906 thread_scoped_lock lock(device_mem_map_mutex);
907 DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
908 const Mem &cmem = device_mem_map[&mem];
909
910 sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);
911
912 if (cmem.texobject) {
913 /* Free bindless texture itself. */
914 sycl::ext::oneapi::experimental::sampled_image_handle image(cmem.texobject);
915 sycl::ext::oneapi::experimental::destroy_image_handle(image, *queue);
916 }
917
918 if (cmem.array) {
919 /* Free texture memory. */
920 sycl::ext::oneapi::experimental::image_mem_handle imgHandle{
921 (sycl::ext::oneapi::experimental::image_mem_handle::raw_handle_type)cmem.array};
922
923 try {
924 /* We have allocated only standard textures, so we also deallocate only them. */
925 sycl::ext::oneapi::experimental::free_image_mem(
926 imgHandle, sycl::ext::oneapi::experimental::image_type::standard, *queue);
927 }
928 catch (sycl::exception const &e) {
929 set_error("oneAPI texture deallocation error: got runtime exception \"" +
930 string(e.what()) + "\"");
931 }
932
933 stats.mem_free(mem.memory_size());
934 mem.device_pointer = 0;
935 mem.device_size = 0;
936 device_mem_map.erase(device_mem_map.find(&mem));
937 }
938 else {
939 lock.unlock();
940 generic_free(mem);
941 }
942 }
943}
944
945unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
946{
947 return make_unique<OneapiDeviceQueue>(this);
948}
949
950bool OneapiDevice::should_use_graphics_interop(const GraphicsInteropDevice &interop_device,
951 const bool log)
952{
953# ifdef SYCL_LINEAR_MEMORY_INTEROP_AVAILABLE
954 if (interop_device.type != GraphicsInteropDevice::VULKAN) {
955 /* SYCL only supports interop with Vulkan and D3D. */
956 return false;
957 }
958
959 try {
960 const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
961 if (!device.has(sycl::aspect::ext_oneapi_external_memory_import)) {
962 return false;
963 }
964
965 /* This extension is in the namespace "sycl::ext::intel",
966 * but also available on non-Intel GPUs. */
967 sycl::detail::uuid_type uuid = device.get_info<sycl::ext::intel::info::device::uuid>();
968 const bool found = (uuid.size() == interop_device.uuid.size() &&
969 memcmp(uuid.data(), interop_device.uuid.data(), uuid.size()) == 0);
970
971 if (log) {
972 if (found) {
973 LOG_INFO << "Graphics interop: found matching Vulkan device for oneAPI";
974 }
975 else {
976 LOG_INFO << "Graphics interop: no matching Vulkan device for oneAPI";
977 }
978
979 LOG_INFO << "Graphics Interop: oneAPI UUID " << string_hex(uuid.data(), uuid.size())
980 << ", Vulkan UUID "
981 << string_hex(interop_device.uuid.data(), interop_device.uuid.size());
982 }
983
984 return found;
985 }
986 catch (sycl::exception &e) {
987 LOG_ERROR << "Could not release external Vulkan memory: " << e.what();
988 }
989# endif
990 return false;
991}
992
993void *OneapiDevice::usm_aligned_alloc_host(const size_t memory_size, const size_t alignment)
994{
995 assert(device_queue_);
996 return usm_aligned_alloc_host(device_queue_, memory_size, alignment);
997}
998
999void OneapiDevice::usm_free(void *usm_ptr)
1000{
1001 assert(device_queue_);
1002 usm_free(device_queue_, usm_ptr);
1003}
1004
1005void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
1006{
1007# ifndef NDEBUG
1008 sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
1009 sycl::info::device_type device_type =
1010 queue->get_device().get_info<sycl::info::device::device_type>();
1011 sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
1012 (void)usm_type;
1013# ifndef WITH_ONEAPI_SYCL_HOST_TASK
1014 const sycl::usm::alloc main_memory_type = sycl::usm::alloc::device;
1015# else
1016 const sycl::usm::alloc main_memory_type = sycl::usm::alloc::host;
1017# endif
1018 assert(usm_type == main_memory_type ||
1019 (usm_type == sycl::usm::alloc::host &&
1020 (allow_host || device_type == sycl::info::device_type::cpu)) ||
1021 usm_type == sycl::usm::alloc::unknown);
1022# else
1023 /* Silence warning about unused arguments. */
1024 (void)queue_;
1025 (void)usm_ptr;
1026 (void)allow_host;
1027# endif
1028}
1029
1030bool OneapiDevice::create_queue(SyclQueue *&external_queue,
1031 const int device_index,
1032 void *embree_device_pointer,
1033 bool *is_several_intel_dgpu_devices_detected_pointer)
1034{
1035 bool finished_correct = true;
1036 *is_several_intel_dgpu_devices_detected_pointer = false;
1037
1038 try {
1039 std::vector<sycl::device> devices = available_sycl_devices(
1040 is_several_intel_dgpu_devices_detected_pointer);
1041 if (device_index < 0 || device_index >= devices.size()) {
1042 return false;
1043 }
1044
1045 sycl::queue *created_queue = nullptr;
1046 if (*is_several_intel_dgpu_devices_detected_pointer == false) {
1047 created_queue = new sycl::queue(devices[device_index], sycl::property::queue::in_order());
1048 }
1049 else {
1050 sycl::context device_context(devices[device_index]);
1051 created_queue = new sycl::queue(
1052 device_context, devices[device_index], sycl::property::queue::in_order());
1053 LOG_TRACE << "Separate context was generated for the new queue, as several available SYCL "
1054 "devices were detected";
1055 }
1056 external_queue = reinterpret_cast<SyclQueue *>(created_queue);
1057
1058# ifdef WITH_EMBREE_GPU
1059 if (embree_device_pointer) {
1060 RTCDevice *device_object_ptr = reinterpret_cast<RTCDevice *>(embree_device_pointer);
1061 *device_object_ptr = rtcNewSYCLDevice(created_queue->get_context(), "");
1062 if (*device_object_ptr == nullptr) {
1063 finished_correct = false;
1064 oneapi_error_string_ =
1065 "Hardware Raytracing is not available; please install "
1066 "\"intel-level-zero-gpu-raytracing\" to enable it or disable Embree on GPU.";
1067 }
1068 else {
1069 rtcSetDeviceSYCLDevice(*device_object_ptr, devices[device_index]);
1070 }
1071 }
1072# else
1073 (void)embree_device_pointer;
1074# endif
1075 }
1076 catch (const sycl::exception &e) {
1077 finished_correct = false;
1078 oneapi_error_string_ = e.what();
1079 }
1080 return finished_correct;
1081}
1082
1083void OneapiDevice::free_queue(SyclQueue *queue_)
1084{
1085 assert(queue_);
1086 sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
1087 delete queue;
1088}
1089
1090void *OneapiDevice::usm_aligned_alloc_host(SyclQueue *queue_,
1091 size_t memory_size,
1092 const size_t alignment)
1093{
1094 assert(queue_);
1095 sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
1096 return sycl::aligned_alloc_host(alignment, memory_size, *queue);
1097}
1098
1099void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
1100{
1101 assert(queue_);
1102 sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
1103 /* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
1104 * and shared. For new project it could more beneficial to use USM shared memory, because it
1105 * provides automatic migration mechanism in order to allow to use the same pointer on host and
1106 * on device, without need to worry about explicit memory transfer operations, although usage of
1107 * USM shared imply some documented limitations on the memory usage in regards of parallel access
1108 * from different threads. But for Blender/Cycles this type of memory is not very suitable in
1109 * current application architecture, because Cycles is multi-thread application and already uses
1110 * two different pointer for host activity and device activity, and also has to perform all
1111 * needed memory transfer operations. So, USM device memory type has been used for oneAPI device
1112 * in order to better fit in Cycles architecture. */
1113# ifndef WITH_ONEAPI_SYCL_HOST_TASK
1114 return sycl::malloc_device(memory_size, *queue);
1115# else
1116 return sycl::malloc_host(memory_size, *queue);
1117# endif
1118}
1119
1120void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
1121{
1122 assert(queue_);
1123 sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
1124 OneapiDevice::check_usm(queue_, usm_ptr, true);
1125 sycl::free(usm_ptr, *queue);
1126}
1127
1128bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, const size_t num_bytes)
1129{
1130 assert(queue_);
1131 /* sycl::queue::memcpy may crash if the queue is in an invalid state due to previous
1132 * runtime errors. It's better to avoid running memory operations in that case.
1133 * The render will be canceled and the queue will be destroyed anyway. */
1134 if (have_error()) {
1135 return false;
1136 }
1137
1138 sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
1139 OneapiDevice::check_usm(queue_, dest, true);
1140 OneapiDevice::check_usm(queue_, src, true);
1141 sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
1142 sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
1143 /* Unknown here means, that this is not an USM allocation, which implies that this is
1144 * some generic C++ allocation, so we could use C++ memcpy directly with USM host. */
1145 if ((dest_type == sycl::usm::alloc::host || dest_type == sycl::usm::alloc::unknown) &&
1146 (src_type == sycl::usm::alloc::host || src_type == sycl::usm::alloc::unknown))
1147 {
1148 memcpy(dest, src, num_bytes);
1149 return true;
1150 }
1151
1152 try {
1153 sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
1154# ifdef WITH_CYCLES_DEBUG
1155 /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
1156 * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
1157 */
1158 mem_event.wait_and_throw();
1159 return true;
1160# else
1161 bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
1162 src_type == sycl::usm::alloc::device;
1163 bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
1164 src_type == sycl::usm::alloc::unknown;
1165 /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
1166 * may not wait until the end of the transfer before using the memory.
1167 */
1168 if (from_device_to_host || host_or_device_memop_with_offset) {
1169 mem_event.wait();
1170 }
1171 return true;
1172# endif
1173 }
1174 catch (const sycl::exception &e) {
1175 oneapi_error_string_ = e.what();
1176 return false;
1177 }
1178}
1179
1180bool OneapiDevice::usm_memset(SyclQueue *queue_,
1181 void *usm_ptr,
1182 unsigned char value,
1183 const size_t num_bytes)
1184{
1185 assert(queue_);
1186 /* sycl::queue::memset may crash if the queue is in an invalid state due to previous
1187 * runtime errors. It's better to avoid running memory operations in that case.
1188 * The render will be canceled and the queue will be destroyed anyway. */
1189 if (have_error()) {
1190 return false;
1191 }
1192
1193 sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
1194 OneapiDevice::check_usm(queue_, usm_ptr, true);
1195 try {
1196 sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
1197# ifdef WITH_CYCLES_DEBUG
1198 /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
1199 * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
1200 */
1201 mem_event.wait_and_throw();
1202# else
1203 (void)mem_event;
1204# endif
1205 return true;
1206 }
1207 catch (const sycl::exception &e) {
1208 oneapi_error_string_ = e.what();
1209 return false;
1210 }
1211}
1212
1213bool OneapiDevice::queue_synchronize(SyclQueue *queue_)
1214{
1215 assert(queue_);
1216 sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
1217 try {
1218 queue->wait_and_throw();
1219 return true;
1220 }
1221 catch (const sycl::exception &e) {
1222 oneapi_error_string_ = e.what();
1223 return false;
1224 }
1225}
1226
1227bool OneapiDevice::kernel_globals_size(size_t &kernel_global_size)
1228{
1229 kernel_global_size = sizeof(KernelGlobalsGPU);
1230
1231 return true;
1232}
1233
1234void OneapiDevice::set_global_memory(SyclQueue *queue_,
1235 void *kernel_globals,
1236 const char *memory_name,
1237 void *memory_device_pointer)
1238{
1239 assert(queue_);
1240 assert(kernel_globals);
1241 assert(memory_name);
1242 assert(memory_device_pointer);
1243 KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
1244 OneapiDevice::check_usm(queue_, memory_device_pointer, true);
1245 OneapiDevice::check_usm(queue_, kernel_globals, true);
1246
1247 std::string matched_name(memory_name);
1248
1249/* This macro will change global ptr of KernelGlobals via name matching. */
1250# define KERNEL_DATA_ARRAY(type, name) \
1251 else if (#name == matched_name) { \
1252 globals->__##name = (type *)memory_device_pointer; \
1253 return; \
1254 }
1255 if (false) {
1256 }
1257 else if ("integrator_state" == matched_name) {
1258 globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
1259 return;
1260 }
1261 KERNEL_DATA_ARRAY(KernelData, data)
1262# include "kernel/data_arrays.h"
1263 else {
1264 std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
1265 << std::endl;
1266 assert(false);
1267 }
1268# undef KERNEL_DATA_ARRAY
1269}
1270
1271bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
1272 const int kernel,
1273 const size_t global_size,
1274 const size_t local_size,
1275 void **args)
1276{
1277 return oneapi_enqueue_kernel(kernel_context,
1278 kernel,
1279 global_size,
1280 local_size,
1281 kernel_features,
1282 use_hardware_raytracing,
1283 args);
1284}
1285
1286void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
1287 const DeviceKernel kernel,
1288 size_t &kernel_global_size,
1289 size_t &kernel_local_size)
1290{
1291 assert(queue);
1292 static const size_t preferred_work_group_size_intersect = 128;
1293 static const size_t preferred_work_group_size_shading = 256;
1294 static const size_t preferred_work_group_size_shading_simd8 = 64;
1295 /* Shader evaluation kernels seems to use some amount of shared memory, so better
1296 * to avoid usage of maximum work group sizes for them. */
1297 static const size_t preferred_work_group_size_shader_evaluation = 256;
1298 /* NOTE(@nsirgien): 1024 currently may lead to issues with cryptomatte kernels, so
1299 * for now their work-group size is restricted to 512. */
1300 static const size_t preferred_work_group_size_cryptomatte = 512;
1301 static const size_t preferred_work_group_size_default = 1024;
1302
1303 const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
1304 const size_t max_work_group_size = device.get_info<sycl::info::device::max_work_group_size>();
1305
1306 size_t preferred_work_group_size = 0;
1307 switch (kernel) {
1315 preferred_work_group_size = preferred_work_group_size_intersect;
1316 break;
1317
1327 const bool device_is_simd8 =
1328 (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
1329 device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() == 8);
1330 preferred_work_group_size = (device_is_simd8) ? preferred_work_group_size_shading_simd8 :
1331 preferred_work_group_size_shading;
1332 } break;
1333
1335 preferred_work_group_size = preferred_work_group_size_cryptomatte;
1336 break;
1337
1342 preferred_work_group_size = preferred_work_group_size_shader_evaluation;
1343 break;
1344
1345 default:
1346 /* Do nothing and keep initial zero value. */
1347 break;
1348 }
1349
1350 /* Such order of logic allow us to override Blender default values, if needed,
1351 * yet respect them otherwise. */
1352 if (preferred_work_group_size == 0) {
1353 preferred_work_group_size = oneapi_suggested_gpu_kernel_size((::DeviceKernel)kernel);
1354 }
1355
1356 /* If there is no recommendation, then use manual default value. */
1357 if (preferred_work_group_size == 0) {
1358 preferred_work_group_size = preferred_work_group_size_default;
1359 }
1360
1361 kernel_local_size = std::min(max_work_group_size, preferred_work_group_size);
1362
1363 /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
1364 * we extend work size to fit uniformity requirements. */
1365 kernel_global_size = round_up(kernel_global_size, kernel_local_size);
1366
1367# ifdef WITH_ONEAPI_SYCL_HOST_TASK
1368 /* Kernels listed below need a specific number of work groups. */
1376 {
1377 /* Path array implementation is serial in case of SYCL Host Task execution. */
1378 kernel_global_size = 1;
1379 kernel_local_size = 1;
1380 }
1381# endif
1382
1383 assert(kernel_global_size % kernel_local_size == 0);
1384}
1385
1386/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
1387 * since Windows driver 101.3268. */
1388static const int lowest_supported_driver_version_win = 1018132;
1389# ifdef _WIN32
1390/* For Windows driver 101.8132, compute-runtime version is 34938.
1391 * And for Windows Workstation driver 32.0.101.6979 Q3.25, it is 34177.
1392 * This information is returned by `ocloc query OCL_DRIVER_VERSION`. */
1393static const int lowest_supported_driver_version_neo = 34177;
1394# else
1395static const int lowest_supported_driver_version_neo = 34666;
1396# endif
1397
1398int parse_driver_build_version(const sycl::device &device)
1399{
1400 const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
1401 int driver_build_version = 0;
1402
1403 size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
1404 if (second_dot_position != std::string::npos) {
1405 try {
1406 size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
1407 if (third_dot_position != std::string::npos) {
1408 const std::string &third_number_substr = driver_version.substr(
1409 second_dot_position + 1, third_dot_position - second_dot_position - 1);
1410 const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
1411 if (third_number_substr.length() == 3 && forth_number_substr.length() == 4) {
1412 driver_build_version = std::stoi(third_number_substr) * 10000 +
1413 std::stoi(forth_number_substr);
1414 }
1415 /* This is actually not a correct version string (Major.Minor.Patch.Optional), see blender
1416 * bug report #137277, but there are several driver versions with this Intel bug existing
1417 * at this point, so it is worth working around this issue in Blender source code, allowing
1418 * users to actually use Intel GPU when it is possible. */
1419 else if (third_number_substr.length() == 5 && forth_number_substr.length() == 6) {
1420 driver_build_version = std::stoi(third_number_substr);
1421 }
1422 }
1423 else {
1424 const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
1425 driver_build_version = std::stoi(third_number_substr);
1426 }
1427 }
1428 catch (std::invalid_argument &) {
1429 }
1430 }
1431
1432 if (driver_build_version == 0) {
1433 LOG_WARNING << "Unable to parse unknown Intel GPU driver version. \"" << driver_version
1434 << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
1435 << " xx.xx.xxx.xxxx (Windows) for device \""
1436 << device.get_info<sycl::info::device::name>() << "\".";
1437 }
1438
1439 return driver_build_version;
1440}
1441
1442std::vector<sycl::device> available_sycl_devices(bool *multiple_dgpus_detected = nullptr)
1443{
1444 std::vector<sycl::device> available_devices;
1445 bool allow_all_devices = false;
1446 if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) {
1447 allow_all_devices = true;
1448 }
1449
1450 int level_zero_dgpu_counter = 0;
1451 try {
1452 const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
1453
1454 for (const sycl::platform &platform : oneapi_platforms) {
1455 /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and
1456 * OpenCL.
1457 */
1458 if (platform.get_backend() == sycl::backend::opencl) {
1459 continue;
1460 }
1461
1462 const std::vector<sycl::device> &oneapi_devices =
1463 (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) :
1464 platform.get_devices(sycl::info::device_type::gpu);
1465
1466 for (const sycl::device &device : oneapi_devices) {
1467 bool filter_out = false;
1468
1469 if (platform.get_backend() == sycl::backend::ext_oneapi_level_zero && device.is_gpu() &&
1470 device.get_info<sycl::info::device::host_unified_memory>() == false // dGPU
1471 )
1472 {
1473 level_zero_dgpu_counter++;
1474 }
1475
1476 if (!allow_all_devices) {
1477 /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
1478 * assuming they have either more than 96 Execution Units or not 7 threads per EU.
1479 * Official support can be broaden to older and smaller GPUs once ready. */
1480 if (!device.is_gpu() || platform.get_backend() != sycl::backend::ext_oneapi_level_zero) {
1481 filter_out = true;
1482 }
1483 else {
1484 /* Filtered-out defaults in-case these values aren't available. */
1485 int number_of_eus = 96;
1486 int threads_per_eu = 7;
1487 if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
1488 number_of_eus = device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
1489 }
1490 if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
1491 threads_per_eu =
1492 device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
1493 }
1494 /* This filters out all Level-Zero supported GPUs from older generation than Arc. */
1495 if (number_of_eus <= 96 && threads_per_eu == 7) {
1496 filter_out = true;
1497 }
1498 /* if not already filtered out, check driver version. */
1499 bool check_driver_version = !filter_out;
1500 /* We don't know how to check driver version strings for non-Intel GPUs. */
1501 if (check_driver_version &&
1502 device.get_info<sycl::info::device::vendor>().find("Intel") == std::string::npos)
1503 {
1504 check_driver_version = false;
1505 }
1506 /* Because of https://github.com/oneapi-src/unified-runtime/issues/1777, future drivers
1507 * may break parsing done by a SYCL runtime from before the fix we expect in major
1508 * version 8. Parsed driver version would start with something different than current
1509 * "1.3.". To avoid blocking a device by mistake in the case of new driver / old SYCL
1510 * runtime, we disable driver version check in case LIBSYCL_MAJOR_VERSION is below 8
1511 * and actual driver version doesn't start with 1.3. */
1512# if __LIBSYCL_MAJOR_VERSION < 8
1513 if (check_driver_version &&
1514 !string_startswith(device.get_info<sycl::info::device::driver_version>(), "1.3."))
1515 {
1516 check_driver_version = false;
1517 }
1518# endif
1519 if (check_driver_version) {
1520 int driver_build_version = parse_driver_build_version(device);
1521 const int lowest_supported_driver_version = (driver_build_version > 100000) ?
1522 lowest_supported_driver_version_win :
1523 lowest_supported_driver_version_neo;
1524 if (driver_build_version < lowest_supported_driver_version) {
1525 filter_out = true;
1526
1527 LOG_WARNING << "Driver version for device \""
1528 << device.get_info<sycl::info::device::name>()
1529 << "\" is too old. Expected \"" << lowest_supported_driver_version
1530 << "\" or newer, but got \"" << driver_build_version << "\".";
1531 }
1532 }
1533 }
1534 }
1535
1536 /* NOTE(sirgienko) Due to some changes in the latest Intel Drivers, the currently used
1537 * DPC++ compiler will duplicate devices on some platforms, which have a discrete Intel GPU
1538 * together with 11th-14th Gen CPUs, with iGPU enabled. This will be fixed in upstream
1539 * DPC++ 6.3, but for now, in order to not confuse our Blender end-users with several
1540 * duplicated GPUs, we will avoid adding duplicates into the device list. */
1541 /* The order of adding devices is not important, as both duplicated GPUs are fully
1542 * functional and performant, so we can pick up the first one we find. */
1543 if (!filter_out) {
1544 for (const sycl::device &already_available_device : available_devices) {
1545 std::array<sycl::device, 2> devices = {already_available_device, device};
1546 std::vector<sycl::ext::intel::info::device::uuid::return_type> uuids;
1547 for (int i = 0; i < 2; i++) {
1548 /* As this is an Intel-specific enumeration issue - we are collecting Intel UUID
1549 * expecting it to be supported on Intel GPUs. */
1550 if (devices[i].has(sycl::aspect::ext_intel_device_info_uuid)) {
1551 uuids.push_back(devices[i].get_info<sycl::ext::intel::info::device::uuid>());
1552 }
1553 else if (devices[i].get_platform().get_info<sycl::info::platform::vendor>() ==
1554 "Intel(R) Corporation")
1555 {
1556 /* Better to ensure that our expectation that all Intel devices support the UUID
1557 * extension is correct. If one day this is not true, then we will at least have a
1558 * warning message in the log. */
1559 const std::string &device_name = devices[i].get_info<sycl::info::device::name>();
1560 LOG_WARNING << "Despite expectation, Intel oneAPI device '" << device_name
1561 << "' is not supporting Intel SYCL UUID extension.";
1562 }
1563 }
1564 if (uuids.size() == 2) {
1565 if (uuids[0] == uuids[1]) {
1566 const std::string &device_name = device.get_info<sycl::info::device::name>();
1567 const std::string &platform_name =
1568 device.get_platform().get_info<sycl::info::platform::name>();
1569 LOG_DEBUG
1570 << "Detecting that oneAPI device '" << device_name << "' of platform '"
1571 << platform_name
1572 << "' is identical (by UUID comparison) to an already added device in the "
1573 "list of available devices, so it will not be added again.";
1574 filter_out = true;
1575 break;
1576 }
1577 }
1578 }
1579 }
1580
1581 if (!filter_out) {
1582 available_devices.push_back(device);
1583 }
1584 }
1585 }
1586 }
1587 catch (sycl::exception &e) {
1588 LOG_WARNING << "An error has been encountered while enumerating SYCL devices: " << e.what();
1589 }
1590
1591 if (multiple_dgpus_detected) {
1592 *multiple_dgpus_detected = level_zero_dgpu_counter > 1;
1593 }
1594
1595 return available_devices;
1596}
1597
1598void OneapiDevice::architecture_information(const SyclDevice *device,
1599 string &name,
1600 bool &is_optimized)
1601{
1602 const sycl::ext::oneapi::experimental::architecture arch =
1603 reinterpret_cast<const sycl::device *>(device)
1604 ->get_info<sycl::ext::oneapi::experimental::info::device::architecture>();
1605
1606# define FILL_ARCH_INFO(architecture_code, is_arch_optimised) \
1607 case sycl::ext::oneapi::experimental::architecture ::architecture_code: \
1608 name = #architecture_code; \
1609 is_optimized = is_arch_optimised; \
1610 break;
1611
1612 /* List of architectures that have been optimized by Intel and Blender developers.
1613 *
1614 * For example, Intel Rocket Lake iGPU (rkl) is not supported and not optimized,
1615 * while Intel Arc Alchemist dGPU (dg2) was optimized for.
1616 *
1617 * Devices can changed from unoptimized to optimized manually, after DPC++ has
1618 * been upgraded to support the architecture and CYCLES_ONEAPI_INTEL_BINARIES_ARCH
1619 * in CMake includes the architecture. */
1620 switch (arch) {
1621 FILL_ARCH_INFO(intel_gpu_bdw, false)
1622 FILL_ARCH_INFO(intel_gpu_skl, false)
1623 FILL_ARCH_INFO(intel_gpu_kbl, false)
1624 FILL_ARCH_INFO(intel_gpu_cfl, false)
1625 FILL_ARCH_INFO(intel_gpu_apl, false)
1626 FILL_ARCH_INFO(intel_gpu_glk, false)
1627 FILL_ARCH_INFO(intel_gpu_whl, false)
1628 FILL_ARCH_INFO(intel_gpu_aml, false)
1629 FILL_ARCH_INFO(intel_gpu_cml, false)
1630 FILL_ARCH_INFO(intel_gpu_icllp, false)
1631 FILL_ARCH_INFO(intel_gpu_ehl, false)
1632 FILL_ARCH_INFO(intel_gpu_tgllp, false)
1633 FILL_ARCH_INFO(intel_gpu_rkl, false)
1634 FILL_ARCH_INFO(intel_gpu_adl_s, false)
1635 FILL_ARCH_INFO(intel_gpu_adl_p, false)
1636 FILL_ARCH_INFO(intel_gpu_adl_n, false)
1637 FILL_ARCH_INFO(intel_gpu_dg1, false)
1638 FILL_ARCH_INFO(intel_gpu_dg2_g10, true)
1639 FILL_ARCH_INFO(intel_gpu_dg2_g11, true)
1640 FILL_ARCH_INFO(intel_gpu_dg2_g12, true)
1641 FILL_ARCH_INFO(intel_gpu_pvc, false)
1642 FILL_ARCH_INFO(intel_gpu_pvc_vg, false)
1643 /* intel_gpu_mtl_u == intel_gpu_mtl_s == intel_gpu_arl_u == intel_gpu_arl_s */
1644 FILL_ARCH_INFO(intel_gpu_mtl_u, true)
1645 FILL_ARCH_INFO(intel_gpu_mtl_h, true)
1646 FILL_ARCH_INFO(intel_gpu_bmg_g21, true)
1647 FILL_ARCH_INFO(intel_gpu_bmg_g31, true)
1648 FILL_ARCH_INFO(intel_gpu_lnl_m, true)
1649 FILL_ARCH_INFO(intel_gpu_ptl_h, true)
1650 FILL_ARCH_INFO(intel_gpu_ptl_u, true)
1651
1652 default:
1653 name = "unknown";
1654 is_optimized = false;
1655 break;
1656 }
1657}
1658
1659char *OneapiDevice::device_capabilities()
1660{
1661 std::stringstream capabilities;
1662
1663 const std::vector<sycl::device> &oneapi_devices = available_sycl_devices();
1664 for (const sycl::device &device : oneapi_devices) {
1665# ifndef WITH_ONEAPI_SYCL_HOST_TASK
1666 const std::string &name = device.get_info<sycl::info::device::name>();
1667# else
1668 const std::string &name = "SYCL Host Task (Debug)";
1669# endif
1670
1671 capabilities << std::string("\t") << name << "\n";
1672 capabilities << "\t\tsycl::info::platform::name\t\t\t"
1673 << device.get_platform().get_info<sycl::info::platform::name>() << "\n";
1674
1675 string arch_name;
1676 bool is_optimised_for_arch;
1677 architecture_information(
1678 reinterpret_cast<const SyclDevice *>(&device), arch_name, is_optimised_for_arch);
1679 capabilities << "\t\tsycl::info::device::architecture\t\t\t";
1680 capabilities << arch_name << "\n";
1681 capabilities << "\t\tsycl::info::device::is_cycles_optimized\t\t\t";
1682 capabilities << is_optimised_for_arch << "\n";
1683
1684# define WRITE_ATTR(attribute_name, attribute_variable) \
1685 capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
1686 << "\n";
1687# define GET_ATTR(attribute) \
1688 { \
1689 capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" \
1690 << device.get_info<sycl::info::device ::attribute>() << "\n"; \
1691 }
1692# define GET_INTEL_ATTR(attribute) \
1693 { \
1694 if (device.has(sycl::aspect::ext_intel_##attribute)) { \
1695 capabilities << "\t\tsycl::ext::intel::info::device::" #attribute "\t\t\t" \
1696 << device.get_info<sycl::ext::intel::info::device ::attribute>() << "\n"; \
1697 } \
1698 }
1699# define GET_ASPECT(aspect_) \
1700 { \
1701 capabilities << "\t\tdevice::has(" #aspect_ ")\t\t\t" << device.has(sycl::aspect ::aspect_) \
1702 << "\n"; \
1703 }
1704
1705 GET_ATTR(vendor)
1706 GET_ATTR(driver_version)
1707 GET_ATTR(max_compute_units)
1708 GET_ATTR(max_clock_frequency)
1709 GET_ATTR(global_mem_size)
1710 GET_INTEL_ATTR(pci_address)
1711 GET_INTEL_ATTR(gpu_eu_simd_width)
1712 GET_INTEL_ATTR(gpu_eu_count)
1713 GET_INTEL_ATTR(gpu_slices)
1714 GET_INTEL_ATTR(gpu_subslices_per_slice)
1715 GET_INTEL_ATTR(gpu_eu_count_per_subslice)
1716 GET_INTEL_ATTR(gpu_hw_threads_per_eu)
1717 GET_INTEL_ATTR(max_mem_bandwidth)
1718 GET_ATTR(max_work_group_size)
1719 GET_ATTR(max_work_item_dimensions)
1720 sycl::id<3> max_work_item_sizes =
1721 device.get_info<sycl::info::device::max_work_item_sizes<3>>();
1722 WRITE_ATTR(max_work_item_sizes[0], max_work_item_sizes.get(0))
1723 WRITE_ATTR(max_work_item_sizes[1], max_work_item_sizes.get(1))
1724 WRITE_ATTR(max_work_item_sizes[2], max_work_item_sizes.get(2))
1725
1726 GET_ATTR(max_num_sub_groups)
1727 for (size_t sub_group_size : device.get_info<sycl::info::device::sub_group_sizes>()) {
1728 WRITE_ATTR(sub_group_size[], sub_group_size)
1729 }
1730 GET_ATTR(sub_group_independent_forward_progress)
1731
1732 GET_ATTR(preferred_vector_width_char)
1733 GET_ATTR(preferred_vector_width_short)
1734 GET_ATTR(preferred_vector_width_int)
1735 GET_ATTR(preferred_vector_width_long)
1736 GET_ATTR(preferred_vector_width_float)
1737 GET_ATTR(preferred_vector_width_double)
1738 GET_ATTR(preferred_vector_width_half)
1739
1740 GET_ATTR(address_bits)
1741 GET_ATTR(max_mem_alloc_size)
1742 GET_ATTR(mem_base_addr_align)
1743 GET_ATTR(error_correction_support)
1744 GET_ATTR(is_available)
1745 GET_ATTR(host_unified_memory)
1746
1747 GET_ASPECT(cpu)
1748 GET_ASPECT(gpu)
1749 GET_ASPECT(fp16)
1750 GET_ASPECT(atomic64)
1751 GET_ASPECT(usm_host_allocations)
1752 GET_ASPECT(usm_device_allocations)
1753 GET_ASPECT(usm_shared_allocations)
1754 GET_ASPECT(usm_system_allocations)
1755
1756# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_non_uniform_groups__
1757 GET_ASPECT(ext_oneapi_non_uniform_groups)
1758# endif
1759# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_bindless_images__
1760 GET_ASPECT(ext_oneapi_bindless_images)
1761# endif
1762# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_import__
1763 GET_ASPECT(ext_oneapi_interop_semaphore_import)
1764# endif
1765# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_export__
1766 GET_ASPECT(ext_oneapi_interop_semaphore_export)
1767# endif
1768
1769# undef GET_INTEL_ATTR
1770# undef GET_ASPECT
1771# undef GET_ATTR
1772# undef WRITE_ATTR
1773 capabilities << "\n";
1774 }
1775
1776 return ::strdup(capabilities.str().c_str());
1777}
1778
1779void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
1780{
1781 int num = 0;
1782 std::vector<sycl::device> devices = available_sycl_devices();
1783 for (sycl::device &device : devices) {
1784 const std::string &platform_name =
1785 device.get_platform().get_info<sycl::info::platform::name>();
1786# ifndef WITH_ONEAPI_SYCL_HOST_TASK
1787 std::string name = device.get_info<sycl::info::device::name>();
1788# else
1789 std::string name = "SYCL Host Task (Debug)";
1790# endif
1791# ifdef WITH_EMBREE_GPU
1792 bool hwrt_support = rtcIsSYCLDeviceSupported(device);
1793# else
1794 bool hwrt_support = false;
1795# endif
1796# if defined(WITH_OPENIMAGEDENOISE) && OIDN_VERSION >= 20300
1797 bool oidn_support = oidnIsSYCLDeviceSupported(&device);
1798# else
1799 bool oidn_support = false;
1800# endif
1801 std::string id = "ONEAPI_" + platform_name + "_" + name;
1802
1803 string arch_name;
1804 bool is_optimised_for_arch;
1805 architecture_information(
1806 reinterpret_cast<const SyclDevice *>(&device), arch_name, is_optimised_for_arch);
1807
1808 if (device.has(sycl::aspect::ext_intel_pci_address)) {
1809 id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
1810 }
1811 (cb)(id.c_str(),
1812 name.c_str(),
1813 num,
1814 hwrt_support,
1815 oidn_support,
1816 is_optimised_for_arch,
1817 user_ptr);
1818 num++;
1819 }
1820}
1821
1822size_t OneapiDevice::get_memcapacity()
1823{
1824 return reinterpret_cast<sycl::queue *>(device_queue_)
1825 ->get_device()
1826 .get_info<sycl::info::device::global_mem_size>();
1827}
1828
1829int OneapiDevice::get_num_multiprocessors()
1830{
1831 const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
1832 if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
1833 return device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
1834 }
1835 return device.get_info<sycl::info::device::max_compute_units>();
1836}
1837
1838int OneapiDevice::get_max_num_threads_per_multiprocessor()
1839{
1840 const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
1841 if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
1842 device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu))
1843 {
1844 return device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() *
1845 device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
1846 }
1847 /* We'd want sycl::info::device::max_threads_per_compute_unit which doesn't exist yet.
1848 * max_work_group_size is the closest approximation but it can still be several times off. */
1849 return device.get_info<sycl::info::device::max_work_group_size>();
1850}
1851
1853
1854#endif
void BLI_kdtree_nd_ free(KDTree *tree)
ATTR_WARN_UNUSED_RESULT const size_t num
unsigned int uint
volatile int lock
BMesh const char void * data
return true
ATTR_WARN_UNUSED_RESULT const BMVert const BMEdge * e
unsigned long long int uint64_t
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition btDbvt.cpp:52
void refit(btStridingMeshInterface *triangles, const btVector3 &aabbMin, const btVector3 &aabbMax)
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
Definition btQuadWord.h:119
static PointerRNA * get_pointer_type(ButsContextPath *path, StructRNA *type)
BVHLayout bvh_layout
Definition params.h:83
bool top_level
Definition params.h:80
Definition bvh/bvh.h:67
BVHParams params
Definition bvh/bvh.h:69
string description
bool use_hardware_raytracing
virtual void host_free(const MemoryType type, void *host_pointer, const size_t size)
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit)
virtual void * host_alloc(const MemoryType type, const size_t size)
void mem_alloc(const size_t size)
Definition util/stats.h:18
void mem_free(const size_t size)
Definition util/stats.h:24
size_t memory_elements_size(const int elements)
nullptr float
@ MEM_TEXTURE
@ MEM_READ_WRITE
@ MEM_READ_ONLY
@ TYPE_UINT16
#define KERNEL_DATA_ARRAY(type, name)
Definition data_arrays.h:8
#define KERNEL_FEATURE_NODE_RAYTRACE
#define KERNEL_FEATURE_MNEE
#define CCL_NAMESPACE_END
#define assert(assertion)
#define log
@ BVH_LAYOUT_BVH2
@ BVH_LAYOUT_EMBREEGPU
DeviceKernel
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT
@ DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE
@ DEVICE_KERNEL_SHADER_EVAL_DISPLACE
@ DEVICE_KERNEL_SHADER_EVAL_VOLUME_DENSITY
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK
@ DEVICE_KERNEL_SHADER_EVAL_BACKGROUND
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE
@ DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA
@ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME_RAY_MARCHING
@ DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE
@ DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST
@ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND
#define DCHECK(expression)
Definition log.h:135
#define LOG_DEBUG
Definition log.h:107
#define LOG_ERROR
Definition log.h:101
#define LOG_WARNING
Definition log.h:103
#define LOG_INFO
Definition log.h:106
#define LOG_TRACE
Definition log.h:108
int BVHLayoutMask
Definition params.h:50
const char * name
string string_human_readable_size(size_t size)
Definition string.cpp:257
string string_hex(const uint8_t *data, const size_t size)
Definition string.cpp:191
string string_human_readable_number(size_t num)
Definition string.cpp:276
bool string_startswith(const string_view s, const string_view start)
Definition string.cpp:104
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
Definition string.cpp:23
CCL_NAMESPACE_END IntegratorStateGPU * integrator_state
uint64_t data
Definition texture.h:86
uint data_type
Definition texture.h:88
uint extension
Definition texture.h:91
uint interpolation
Definition texture.h:90
i
Definition text_draw.cc:230
@ INTERPOLATION_CLOSEST
Definition texture.h:25
@ EXTENSION_REPEAT
Definition texture.h:73
@ EXTENSION_CLIP
Definition texture.h:77
@ EXTENSION_EXTEND
Definition texture.h:75
@ EXTENSION_MIRROR
Definition texture.h:79
ccl_device_inline bool is_nanovdb_type(int type)
Definition texture.h:51
std::unique_lock< std::mutex > thread_scoped_lock
Definition thread.h:28
ccl_device_inline size_t round_up(const size_t x, const size_t multiple)
Definition types_base.h:57
uint64_t device_ptr
Definition types_base.h:44