Blender V4.5
oneapi/device_impl.cpp
/* SPDX-FileCopyrightText: 2021-2022 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#ifdef WITH_ONEAPI

/* <algorithm> is needed until included upstream in sycl/detail/property_list_base.hpp */
#  include <algorithm>
#  include <sycl/sycl.hpp>

#  include "device/oneapi/device_impl.h"

#  include "util/log.h"

#  ifdef WITH_EMBREE_GPU
#    include "bvh/embree.h"
#  endif

#  if defined(WITH_OPENIMAGEDENOISE)
#    include <OpenImageDenoise/config.h>
#    if OIDN_VERSION >= 20300
#      include "util/openimagedenoise.h"  // IWYU pragma: keep
#    endif
#  endif

#  include "kernel/device/oneapi/globals.h"
#  include "kernel/device/oneapi/kernel.h"

#  if defined(WITH_EMBREE_GPU) && defined(EMBREE_SYCL_SUPPORT) && !defined(SYCL_LANGUAGE_VERSION)
/* These declarations are missing from embree headers when compiling from a compiler that doesn't
 * support SYCL. */
extern "C" RTCDevice rtcNewSYCLDevice(sycl::context context, const char *config);
extern "C" bool rtcIsSYCLDeviceSupported(const sycl::device sycl_device);
#  endif

CCL_NAMESPACE_BEGIN

static std::vector<sycl::device> available_sycl_devices(bool *multiple_dgpus_detected);
static int parse_driver_build_version(const sycl::device &device);

static void queue_error_cb(const char *message, void *user_ptr)
{
  if (user_ptr) {
    *reinterpret_cast<std::string *>(user_ptr) = message;
  }
}

OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
    : GPUDevice(info, stats, profiler, headless)
{
  /* Verify that base class types can be used with specific backend types */
  static_assert(sizeof(texMemObject) ==
                sizeof(sycl::ext::oneapi::experimental::sampled_image_handle));
  static_assert(sizeof(arrayMemObject) ==
                sizeof(sycl::ext::oneapi::experimental::image_mem_handle));

  need_texture_info = false;
  use_hardware_raytracing = info.use_hardware_raytracing;

  oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);

  bool is_finished_ok = create_queue(device_queue_,
                                     info.num,
#  ifdef WITH_EMBREE_GPU
                                     use_hardware_raytracing ? (void *)&embree_device : nullptr,
#  else
                                     nullptr,
#  endif
                                     &is_several_intel_dgpu_devices_detected);

  if (is_finished_ok == false) {
    set_error("oneAPI queue initialization error: got runtime exception \"" +
              oneapi_error_string_ + "\"");
  }
  else {
    VLOG_DEBUG << "oneAPI queue has been successfully created for the device \""
               << info.description << "\"";
    assert(device_queue_);
  }

#  ifdef WITH_EMBREE_GPU
  use_hardware_raytracing = use_hardware_raytracing && (embree_device != nullptr);
#  else
  use_hardware_raytracing = false;
#  endif

  if (use_hardware_raytracing) {
    VLOG_INFO << "oneAPI will use hardware ray tracing for intersection acceleration.";
  }

  size_t globals_segment_size;
  is_finished_ok = kernel_globals_size(globals_segment_size);
  if (is_finished_ok == false) {
    set_error("oneAPI constant memory initialization got runtime exception \"" +
              oneapi_error_string_ + "\"");
  }
  else {
    VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)";
  }

  kg_memory_ = usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
  usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);

  kg_memory_device_ = usm_alloc_device(device_queue_, globals_segment_size);

  kg_memory_size_ = globals_segment_size;

  max_memory_on_device_ = get_memcapacity();
  init_host_memory();
  can_map_host = true;

  const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM");
  if (headroom_str != nullptr) {
    const long long override_headroom = atoll(headroom_str);
    device_working_headroom = override_headroom;
    device_texture_headroom = override_headroom;
  }
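  /* Illustrative example (value assumed, not from the original source): setting
   * CYCLES_ONEAPI_MEMORY_HEADROOM=536870912 in the environment would reserve
   * 512 MiB of headroom for both working and texture memory. */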
  VLOG_DEBUG << "oneAPI memory headroom size: "
             << string_human_readable_size(device_working_headroom);
}

OneapiDevice::~OneapiDevice()
{
#  ifdef WITH_EMBREE_GPU
  if (embree_device) {
    rtcReleaseDevice(embree_device);
  }
#  endif

  texture_info.free();
  usm_free(device_queue_, kg_memory_);
  usm_free(device_queue_, kg_memory_device_);

  const_mem_map_.clear();

  if (device_queue_) {
    free_queue(device_queue_);
  }
}

bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
{
  return false;
}

bool OneapiDevice::can_use_hardware_raytracing_for_features(const uint requested_features) const
{
  /* MNEE and Ray-trace kernels work correctly with Hardware Ray-tracing starting with
   * Embree 4.1. */
#  if defined(RTC_VERSION) && RTC_VERSION < 40100
  return !(requested_features & (KERNEL_FEATURE_MNEE | KERNEL_FEATURE_NODE_RAYTRACE));
#  else
  (void)requested_features;
  return true;
#  endif
}

BVHLayoutMask OneapiDevice::get_bvh_layout_mask(const uint requested_features) const
{
  return (use_hardware_raytracing &&
          can_use_hardware_raytracing_for_features(requested_features)) ?
             BVH_LAYOUT_EMBREEGPU :
             BVH_LAYOUT_BVH2;
}

#  ifdef WITH_EMBREE_GPU
void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
{
  if (embree_device && bvh->params.bvh_layout == BVH_LAYOUT_EMBREEGPU) {
    BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
    if (refit) {
      bvh_embree->refit(progress);
    }
    else {
      bvh_embree->build(progress, &stats, embree_device, true);
    }

#    if RTC_VERSION >= 40302
    thread_scoped_lock lock(scene_data_mutex);
    all_embree_scenes.push_back(bvh_embree->scene);
#    endif

    if (bvh->params.top_level) {
#    if RTC_VERSION >= 40400
      embree_traversable = rtcGetSceneTraversable(bvh_embree->scene);
#    else
      embree_traversable = bvh_embree->scene;
#    endif
#    if RTC_VERSION >= 40302
      RTCError error_code = bvh_embree->offload_scenes_to_gpu(all_embree_scenes);
      if (error_code != RTC_ERROR_NONE) {
        set_error(
            string_printf("BVH failed to migrate to the GPU due to Embree library error (%s)",
                          bvh_embree->get_error_string(error_code)));
      }
      all_embree_scenes.clear();
#    endif
    }
  }
  else {
    Device::build_bvh(bvh, progress, refit);
  }
}
#  endif

size_t OneapiDevice::get_free_mem() const
{
  /* Accurate: Use device info, which is practically useful only on dGPU.
   * This is because for non-discrete GPUs, all GPU memory allocations would
   * be in the RAM, thus having the same performance for device and host pointers,
   * so there is no need to be very accurate about what would end where. */
  const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
  const bool is_integrated_gpu = device.get_info<sycl::info::device::host_unified_memory>();
  if (device.has(sycl::aspect::ext_intel_free_memory) && is_integrated_gpu == false) {
    return device.get_info<sycl::ext::intel::info::device::free_memory>();
  }
  /* Estimate: Capacity - in use. */
  if (device_mem_in_use < max_memory_on_device_) {
    return max_memory_on_device_ - device_mem_in_use;
  }
  return 0;
}

bool OneapiDevice::load_kernels(const uint requested_features)
{
  assert(device_queue_);

  /* Kernel loading is expected to be a cumulative operation; for example, if
   * a device is asked to load kernel A and then kernel B, then after these
   * operations, both A and B should be available for use. So we need to store
   * and use a cumulative mask of the requested kernel features, and not just
   * the latest requested features. */
  kernel_features |= requested_features;

  bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
  if (is_finished_ok == false) {
    set_error("oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ +
              "\"");
    return false;
  }
  VLOG_INFO << "Test kernel has been executed successfully for \"" << info.description << "\"";
  assert(device_queue_);

  if (use_hardware_raytracing && !can_use_hardware_raytracing_for_features(requested_features)) {
    VLOG_INFO
        << "Hardware ray tracing disabled, not supported yet by oneAPI for requested features.";
    use_hardware_raytracing = false;
  }

  is_finished_ok = oneapi_load_kernels(
      device_queue_, (const unsigned int)requested_features, use_hardware_raytracing);
  if (is_finished_ok == false) {
    set_error("oneAPI kernels loading: got a runtime exception \"" + oneapi_error_string_ + "\"");
  }
  else {
    VLOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\"";
  }

  if (is_finished_ok) {
    reserve_private_memory(requested_features);
    is_finished_ok = !have_error();
  }

  return is_finished_ok;
}

void OneapiDevice::reserve_private_memory(const uint kernel_features)
{
  size_t free_before = get_free_mem();

  /* Use the biggest kernel for estimation. */
  const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
                                       DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
                                   (kernel_features & KERNEL_FEATURE_MNEE) ?
                                       DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
                                       DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;

  {
    unique_ptr<DeviceQueue> queue = gpu_queue_create();

    device_ptr d_path_index = 0;
    device_ptr d_render_buffer = 0;
    int d_work_size = 0;
    DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);

    queue->init_execution();
    /* Launching the kernel seems to be sufficient to reserve all
     * needed memory regardless of the execution global size.
     * So, the smallest possible size is used here. */
    queue->enqueue(test_kernel, 1, args);
    queue->synchronize();
  }

  size_t free_after = get_free_mem();

  VLOG_INFO << "Reserved " << string_human_readable_number(free_before - free_after)
            << " bytes for kernel execution. ("
            << string_human_readable_size(free_before - free_after) << ")";
}

void OneapiDevice::get_device_memory_info(size_t &total, size_t &free)
{
  free = get_free_mem();
  total = max_memory_on_device_;
}

bool OneapiDevice::alloc_device(void *&device_pointer, const size_t size)
{
  bool allocation_success = false;
  device_pointer = usm_alloc_device(device_queue_, size);
  if (device_pointer != nullptr) {
    allocation_success = true;
    /* Due to lazy memory initialization in the GPU runtime, we force the memory to
     * appear in device memory by executing a kernel that uses this memory. */
    if (!oneapi_zero_memory_on_device(device_queue_, device_pointer, size)) {
      set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
                "\"");
      usm_free(device_queue_, device_pointer);

      device_pointer = nullptr;
      allocation_success = false;
    }
  }

  return allocation_success;
}

void OneapiDevice::free_device(void *device_pointer)
{
  usm_free(device_queue_, device_pointer);
}

bool OneapiDevice::shared_alloc(void *&shared_pointer, const size_t size)
{
  shared_pointer = usm_aligned_alloc_host(device_queue_, size, 64);
  return shared_pointer != nullptr;
}

void OneapiDevice::shared_free(void *shared_pointer)
{
  usm_free(device_queue_, shared_pointer);
}

void *OneapiDevice::shared_to_device_pointer(const void *shared_pointer)
{
  /* Device and host pointers are in the same address space
   * as we're using Unified Shared Memory. */
  return const_cast<void *>(shared_pointer);
}

void OneapiDevice::copy_host_to_device(void *device_pointer, void *host_pointer, const size_t size)
{
  usm_memcpy(device_queue_, device_pointer, host_pointer, size);
}

/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
SyclQueue *OneapiDevice::sycl_queue()
{
  return device_queue_;
}

string OneapiDevice::oneapi_error_message()
{
  return string(oneapi_error_string_);
}

int OneapiDevice::scene_max_shaders()
{
  return scene_max_shaders_;
}

void *OneapiDevice::kernel_globals_device_pointer()
{
  return kg_memory_device_;
}

void *OneapiDevice::host_alloc(const MemoryType type, const size_t size)
{
  void *host_pointer = GPUDevice::host_alloc(type, size);

#  ifdef SYCL_EXT_ONEAPI_COPY_OPTIMIZE
  /* This extension is not working fully correctly when several Intel dGPUs are
   * present in the system, so it is turned off in that case. */
  if (is_several_intel_dgpu_devices_detected == false && host_pointer) {
    /* Import host_pointer into USM memory for faster host<->device data transfers. */
    if (type == MEM_READ_WRITE || type == MEM_READ_ONLY) {
      sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);
      /* This API is properly implemented only in the Level-Zero backend at the moment and we
       * don't want it to fail at runtime, so we conservatively use it only for L0. */
      if (queue->get_backend() == sycl::backend::ext_oneapi_level_zero) {
        sycl::ext::oneapi::experimental::prepare_for_device_copy(host_pointer, size, *queue);
      }
    }
  }
#  endif

  return host_pointer;
}

void OneapiDevice::host_free(const MemoryType type, void *host_pointer, const size_t size)
{
#  ifdef SYCL_EXT_ONEAPI_COPY_OPTIMIZE
  if (is_several_intel_dgpu_devices_detected == false) {
    if (type == MEM_READ_WRITE || type == MEM_READ_ONLY) {
      sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);
      /* This API is properly implemented only in the Level-Zero backend at the moment and we
       * don't want it to fail at runtime, so we conservatively use it only for L0. */
      if (queue->get_backend() == sycl::backend::ext_oneapi_level_zero) {
        sycl::ext::oneapi::experimental::release_from_device_copy(host_pointer, *queue);
      }
    }
  }
#  endif

  GPUDevice::host_free(type, host_pointer, size);
}

void OneapiDevice::mem_alloc(device_memory &mem)
{
  if (mem.type == MEM_TEXTURE) {
    assert(!"mem_alloc not supported for textures.");
  }
  else if (mem.type == MEM_GLOBAL) {
    assert(!"mem_alloc not supported for global memory.");
  }
  else {
    if (mem.name) {
      VLOG_DEBUG << "OneapiDevice::mem_alloc: \"" << mem.name << "\", "
                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
                 << string_human_readable_size(mem.memory_size()) << ")";
    }
    generic_alloc(mem);
  }
}

void OneapiDevice::mem_copy_to(device_memory &mem)
{
  if (mem.name) {
    VLOG_DEBUG << "OneapiDevice::mem_copy_to: \"" << mem.name << "\", "
               << string_human_readable_number(mem.memory_size()) << " bytes. ("
               << string_human_readable_size(mem.memory_size()) << ")";
  }

  /* After getting runtime errors we need to avoid performing oneAPI runtime operations
   * because the associated GPU context may be in an invalid state at this point. */
  if (have_error()) {
    return;
  }

  if (mem.type == MEM_GLOBAL) {
    global_copy_to(mem);
  }
  else if (mem.type == MEM_TEXTURE) {
    tex_copy_to((device_texture &)mem);
  }
  else {
    if (!mem.device_pointer) {
      generic_alloc(mem);
    }
    generic_copy_to(mem);
  }
}

void OneapiDevice::mem_move_to_host(device_memory &mem)
{
  if (mem.name) {
    VLOG_DEBUG << "OneapiDevice::mem_move_to_host: \"" << mem.name << "\", "
               << string_human_readable_number(mem.memory_size()) << " bytes. ("
               << string_human_readable_size(mem.memory_size()) << ")";
  }

  /* After getting runtime errors we need to avoid performing oneAPI runtime operations
   * because the associated GPU context may be in an invalid state at this point. */
  if (have_error()) {
    return;
  }

  if (mem.type == MEM_GLOBAL) {
    global_free(mem);
    global_alloc(mem);
  }
  else if (mem.type == MEM_TEXTURE) {
    tex_free((device_texture &)mem);
    tex_alloc((device_texture &)mem);
  }
  else {
    assert(0);
  }
}

void OneapiDevice::mem_copy_from(
    device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem)
{
  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
    assert(!"mem_copy_from not supported for textures.");
  }
  else if (mem.host_pointer) {
    const size_t size = (w > 0 || h > 0 || elem > 0) ? (elem * w * h) : mem.memory_size();
    const size_t offset = elem * y * w;
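    /* Illustrative example (values assumed): with elem = 4 bytes per element,
     * w = 1024, h = 1 and y = 2, this copies size = 4 * 1024 * 1 = 4096 bytes
     * starting at byte offset = 4 * 2 * 1024 = 8192, i.e. one row of data
     * beginning at the third row of the buffer. */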

    if (mem.name) {
      VLOG_DEBUG << "OneapiDevice::mem_copy_from: \"" << mem.name << "\" object of "
                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
                 << string_human_readable_size(mem.memory_size()) << ") from offset " << offset
                 << " data " << size << " bytes";
    }

    /* After getting runtime errors we need to avoid performing oneAPI runtime operations
     * because the associated GPU context may be in an invalid state at this point. */
    if (have_error()) {
      return;
    }

    assert(device_queue_);

    assert(size != 0);
    if (mem.device_pointer) {
      char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset;
      char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset;
      bool is_finished_ok = usm_memcpy(device_queue_, shifted_host, shifted_device, size);
      if (is_finished_ok == false) {
        set_error("oneAPI memory operation error: got runtime exception \"" +
                  oneapi_error_string_ + "\"");
      }
    }
  }
}

void OneapiDevice::mem_zero(device_memory &mem)
{
  if (mem.name) {
    VLOG_DEBUG << "OneapiDevice::mem_zero: \"" << mem.name << "\", "
               << string_human_readable_number(mem.memory_size()) << " bytes. ("
               << string_human_readable_size(mem.memory_size()) << ")\n";
  }

  /* After getting runtime errors we need to avoid performing oneAPI runtime operations
   * because the associated GPU context may be in an invalid state at this point. */
  if (have_error()) {
    return;
  }

  if (!mem.device_pointer) {
    mem_alloc(mem);
  }
  if (!mem.device_pointer) {
    return;
  }

  assert(device_queue_);
  bool is_finished_ok = usm_memset(
      device_queue_, (void *)mem.device_pointer, 0, mem.memory_size());
  if (is_finished_ok == false) {
    set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
              "\"");
  }
}

void OneapiDevice::mem_free(device_memory &mem)
{
  if (mem.name) {
    VLOG_DEBUG << "OneapiDevice::mem_free: \"" << mem.name << "\", "
               << string_human_readable_number(mem.device_size) << " bytes. ("
               << string_human_readable_size(mem.device_size) << ")";
  }

  if (mem.type == MEM_GLOBAL) {
    global_free(mem);
  }
  else if (mem.type == MEM_TEXTURE) {
    tex_free((device_texture &)mem);
  }
  else {
    generic_free(mem);
  }
}

device_ptr OneapiDevice::mem_alloc_sub_ptr(device_memory &mem,
                                           const size_t offset,
                                           size_t /*size*/)
{
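  /* Sub-allocations share the parent allocation: only pointer arithmetic happens
   * here, no new device memory is allocated or tracked. */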
  return reinterpret_cast<device_ptr>(reinterpret_cast<char *>(mem.device_pointer) +
                                      mem.memory_elements_size(offset));
}

void OneapiDevice::const_copy_to(const char *name, void *host, const size_t size)
{
  assert(name);

  VLOG_DEBUG << "OneapiDevice::const_copy_to \"" << name << "\" object "
             << string_human_readable_number(size) << " bytes. ("
             << string_human_readable_size(size) << ")";

  if (strcmp(name, "data") == 0) {
    assert(size <= sizeof(KernelData));
    KernelData *const data = static_cast<KernelData *>(host);

    /* We need this value when allocating local memory for integrator_sort_bucket_pass
     * and integrator_sort_write_pass kernels. */
    scene_max_shaders_ = data->max_shaders;

#  ifdef WITH_EMBREE_GPU
    if (embree_traversable != nullptr) {
      /* Update scene handle (since it is different for each device on multi devices).
       * This must be a raw pointer copy since at some points during scene update this
       * pointer may be invalid. */
      data->device_bvh = embree_traversable;
    }
#  endif
  }

  ConstMemMap::iterator i = const_mem_map_.find(name);
  device_vector<uchar> *data;

  if (i == const_mem_map_.end()) {
    unique_ptr<device_vector<uchar>> data_ptr = make_unique<device_vector<uchar>>(
        this, name, MEM_READ_ONLY);
    data_ptr->alloc(size);
    data = data_ptr.get();
    const_mem_map_.insert(ConstMemMap::value_type(name, std::move(data_ptr)));
  }
  else {
    data = i->second.get();
  }

  assert(data->memory_size() <= size);
  memcpy(data->data(), host, size);
  data->copy_to_device();

  set_global_memory(device_queue_, kg_memory_, name, (void *)data->device_pointer);

  usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
}

void OneapiDevice::global_alloc(device_memory &mem)
{
  assert(mem.name);

  size_t size = mem.memory_size();
  VLOG_DEBUG << "OneapiDevice::global_alloc \"" << mem.name << "\" object "
             << string_human_readable_number(size) << " bytes. ("
             << string_human_readable_size(size) << ")";

  generic_alloc(mem);
  generic_copy_to(mem);

  set_global_memory(device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);

  usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
}

void OneapiDevice::global_copy_to(device_memory &mem)
{
  if (!mem.device_pointer) {
    global_alloc(mem);
  }
  else {
    generic_copy_to(mem);
  }
}

void OneapiDevice::global_free(device_memory &mem)
{
  if (mem.device_pointer) {
    generic_free(mem);
  }
}

static sycl::ext::oneapi::experimental::image_descriptor image_desc(const device_texture &mem)
{
  /* Image Texture Storage */
  sycl::image_channel_type channel_type;

  switch (mem.data_type) {
    case TYPE_UCHAR:
      channel_type = sycl::image_channel_type::unorm_int8;
      break;
    case TYPE_UINT16:
      channel_type = sycl::image_channel_type::unorm_int16;
      break;
    case TYPE_FLOAT:
      channel_type = sycl::image_channel_type::fp32;
      break;
    case TYPE_HALF:
      channel_type = sycl::image_channel_type::fp16;
      break;
    default:
      assert(0);
  }

  sycl::ext::oneapi::experimental::image_descriptor param;
  param.width = mem.data_width;
  param.height = mem.data_height;
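  /* A depth of 0 marks the descriptor as a 2D image for the bindless-images
   * extension; only true 3D textures (depth > 1) keep their depth. */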
  param.depth = mem.data_depth == 1 ? 0 : mem.data_depth;
  param.num_channels = mem.data_elements;
  param.channel_type = channel_type;

  param.verify();

  return param;
}

void OneapiDevice::tex_alloc(device_texture &mem)
{
  assert(device_queue_);

  size_t size = mem.memory_size();

  sycl::addressing_mode address_mode = sycl::addressing_mode::none;
  switch (mem.info.extension) {
    case EXTENSION_REPEAT:
      address_mode = sycl::addressing_mode::repeat;
      break;
    case EXTENSION_EXTEND:
      address_mode = sycl::addressing_mode::clamp_to_edge;
      break;
    case EXTENSION_CLIP:
      address_mode = sycl::addressing_mode::clamp;
      break;
    case EXTENSION_MIRROR:
      address_mode = sycl::addressing_mode::mirrored_repeat;
      break;
    default:
      assert(0);
      break;
  }

  sycl::filtering_mode filter_mode;
  if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
    filter_mode = sycl::filtering_mode::nearest;
  }
  else {
    filter_mode = sycl::filtering_mode::linear;
  }

  /* Image Texture Storage */
  sycl::image_channel_type channel_type;

  switch (mem.data_type) {
    case TYPE_UCHAR:
      channel_type = sycl::image_channel_type::unorm_int8;
      break;
    case TYPE_UINT16:
      channel_type = sycl::image_channel_type::unorm_int16;
      break;
    case TYPE_FLOAT:
      channel_type = sycl::image_channel_type::fp32;
      break;
    case TYPE_HALF:
      channel_type = sycl::image_channel_type::fp16;
      break;
    default:
      assert(0);
      return;
  }

  sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);

  try {
    Mem *cmem = nullptr;
    sycl::ext::oneapi::experimental::image_mem_handle memHandle{0};
    sycl::ext::oneapi::experimental::image_descriptor desc{};

    if (mem.data_height > 0) {
      const sycl::device &device = queue->get_device();
      if (mem.data_depth > 1) {
        const size_t max_width = device.get_info<sycl::info::device::image3d_max_width>();
        const size_t max_height = device.get_info<sycl::info::device::image3d_max_height>();
        const size_t max_depth = device.get_info<sycl::info::device::image3d_max_depth>();

        if (mem.data_width > max_width || mem.data_height > max_height ||
            mem.data_depth > max_depth)
        {
          set_error(string_printf(
              "Maximum GPU 3D texture size exceeded (max %zux%zux%zu, found %zux%zux%zu)",
              max_width,
              max_height,
              max_depth,
              mem.data_width,
              mem.data_height,
              mem.data_depth));
          return;
        }
      }
      else {
        const size_t max_width = device.get_info<sycl::info::device::image2d_max_width>();
        const size_t max_height = device.get_info<sycl::info::device::image2d_max_height>();

        if (mem.data_width > max_width || mem.data_height > max_height) {
          set_error(
              string_printf("Maximum GPU 2D texture size exceeded (max %zux%zu, found %zux%zu)",
                            max_width,
                            max_height,
                            mem.data_width,
                            mem.data_height));
          return;
        }
      }

      /* 2D/3D texture -- Tile optimized */
      size_t depth = mem.data_depth == 1 ? 0 : mem.data_depth;
      desc = sycl::ext::oneapi::experimental::image_descriptor(
          {mem.data_width, mem.data_height, depth}, mem.data_elements, channel_type);

      VLOG_WORK << "Array 2D/3D allocate: " << mem.name << ", "
                << string_human_readable_number(mem.memory_size()) << " bytes. ("
                << string_human_readable_size(mem.memory_size()) << ")";

      memHandle = sycl::ext::oneapi::experimental::alloc_image_mem(desc, *queue);
      if (!memHandle.raw_handle) {
        set_error("GPU texture allocation failed: Raw handle is null");
        return;
      }

      /* Copy data from host to the texture properly based on the texture description */
      queue->ext_oneapi_copy(mem.host_pointer, memHandle, desc);

      mem.device_pointer = (device_ptr)memHandle.raw_handle;
      mem.device_size = size;
      stats.mem_alloc(size);

      thread_scoped_lock lock(device_mem_map_mutex);
      cmem = &device_mem_map[&mem];
      cmem->texobject = 0;
      cmem->array = (arrayMemObject)(memHandle.raw_handle);
    }
    else {
      /* 1D texture -- Linear memory */
      desc = sycl::ext::oneapi::experimental::image_descriptor(
          {mem.data_width}, mem.data_elements, channel_type);
      cmem = generic_alloc(mem);
      if (!cmem) {
        return;
      }

      queue->memcpy((void *)mem.device_pointer, mem.host_pointer, size);
    }

    queue->wait_and_throw();

    /* Set Mapping and tag that we need to (re-)upload to device */
    TextureInfo tex_info = mem.info;

    sycl::ext::oneapi::experimental::bindless_image_sampler samp(
        address_mode, sycl::coordinate_normalization_mode::normalized, filter_mode);

    /* NanoVDB grids are sampled by Cycles itself, so no bindless sampled image is
     * created for them; their linear memory is referenced directly instead. */
    if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
        mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 &&
        mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FPN &&
        mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FP16)
    {
      sycl::ext::oneapi::experimental::sampled_image_handle imgHandle;

      if (memHandle.raw_handle) {
        /* Create 2D/3D texture handle */
        imgHandle = sycl::ext::oneapi::experimental::create_image(memHandle, samp, desc, *queue);
      }
      else {
        /* Create 1D texture */
        imgHandle = sycl::ext::oneapi::experimental::create_image(
            (void *)mem.device_pointer, 0, samp, desc, *queue);
      }

      thread_scoped_lock lock(device_mem_map_mutex);
      cmem = &device_mem_map[&mem];
      cmem->texobject = (texMemObject)(imgHandle.raw_handle);

      tex_info.data = (uint64_t)cmem->texobject;
    }
    else {
      tex_info.data = (uint64_t)mem.device_pointer;
    }

    {
      /* Update texture info. */
      thread_scoped_lock lock(texture_info_mutex);
      const uint slot = mem.slot;
      if (slot >= texture_info.size()) {
        /* Allocate some slots in advance, to reduce the amount of re-allocations. */
        texture_info.resize(slot + 128);
      }
      texture_info[slot] = tex_info;
      need_texture_info = true;
    }
  }
  catch (sycl::exception const &e) {
    set_error("GPU texture allocation failed: runtime exception \"" + string(e.what()) + "\"");
  }
}

void OneapiDevice::tex_copy_to(device_texture &mem)
{
  if (!mem.device_pointer) {
    tex_alloc(mem);
  }
  else {
    if (mem.data_height > 0) {
      /* 2D/3D texture -- Tile optimized */
      sycl::ext::oneapi::experimental::image_descriptor desc = image_desc(mem);

      sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);

      try {
        /* Copy data from host to the texture properly based on the texture description */
        thread_scoped_lock lock(device_mem_map_mutex);
        const Mem &cmem = device_mem_map[&mem];
        sycl::ext::oneapi::experimental::image_mem_handle image_handle{
            (sycl::ext::oneapi::experimental::image_mem_handle::raw_handle_type)cmem.array};
        queue->ext_oneapi_copy(mem.host_pointer, image_handle, desc);

#  ifdef WITH_CYCLES_DEBUG
        queue->wait_and_throw();
#  endif
      }
      catch (sycl::exception const &e) {
        set_error("oneAPI texture copy error: got runtime exception \"" + string(e.what()) + "\"");
      }
    }
    else {
      generic_copy_to(mem);
    }
  }
}

void OneapiDevice::tex_free(device_texture &mem)
{
  if (mem.device_pointer) {
    thread_scoped_lock lock(device_mem_map_mutex);
    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
    const Mem &cmem = device_mem_map[&mem];

    sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);

    if (cmem.texobject) {
      /* Free bindless texture itself. */
      sycl::ext::oneapi::experimental::sampled_image_handle image(cmem.texobject);
      sycl::ext::oneapi::experimental::destroy_image_handle(image, *queue);
    }

    if (cmem.array) {
      /* Free texture memory. */
      sycl::ext::oneapi::experimental::image_mem_handle imgHandle{
          (sycl::ext::oneapi::experimental::image_mem_handle::raw_handle_type)cmem.array};

      try {
        /* We have allocated only standard textures, so we also deallocate only them. */
        sycl::ext::oneapi::experimental::free_image_mem(
            imgHandle, sycl::ext::oneapi::experimental::image_type::standard, *queue);
      }
      catch (sycl::exception const &e) {
        set_error("oneAPI texture deallocation error: got runtime exception \"" +
                  string(e.what()) + "\"");
      }

      stats.mem_free(mem.memory_size());
      mem.device_pointer = 0;
      mem.device_size = 0;
      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else {
      lock.unlock();
      generic_free(mem);
    }
  }
}

unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
{
  return make_unique<OneapiDeviceQueue>(this);
}

bool OneapiDevice::should_use_graphics_interop(const GraphicsInteropDevice & /*interop_device*/,
                                               const bool /*log*/)
{
  /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so
   * return false. */
  return false;
}

void *OneapiDevice::usm_aligned_alloc_host(const size_t memory_size, const size_t alignment)
{
  assert(device_queue_);
  return usm_aligned_alloc_host(device_queue_, memory_size, alignment);
}

void OneapiDevice::usm_free(void *usm_ptr)
{
  assert(device_queue_);
  usm_free(device_queue_, usm_ptr);
}

void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
{
#  ifndef NDEBUG
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  sycl::info::device_type device_type =
      queue->get_device().get_info<sycl::info::device::device_type>();
  sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
  (void)usm_type;
#    ifndef WITH_ONEAPI_SYCL_HOST_TASK
  const sycl::usm::alloc main_memory_type = sycl::usm::alloc::device;
#    else
  const sycl::usm::alloc main_memory_type = sycl::usm::alloc::host;
#    endif
  assert(usm_type == main_memory_type ||
         (usm_type == sycl::usm::alloc::host &&
          (allow_host || device_type == sycl::info::device_type::cpu)) ||
         usm_type == sycl::usm::alloc::unknown);
#  else
  /* Silence warning about unused arguments. */
  (void)queue_;
  (void)usm_ptr;
  (void)allow_host;
#  endif
}

bool OneapiDevice::create_queue(SyclQueue *&external_queue,
                                const int device_index,
                                void *embree_device_pointer,
                                bool *is_several_intel_dgpu_devices_detected_pointer)
{
  bool finished_correct = true;
  *is_several_intel_dgpu_devices_detected_pointer = false;

  try {
    std::vector<sycl::device> devices = available_sycl_devices(
        is_several_intel_dgpu_devices_detected_pointer);
    if (device_index < 0 || device_index >= devices.size()) {
      return false;
    }

    sycl::queue *created_queue = nullptr;
    if (*is_several_intel_dgpu_devices_detected_pointer == false) {
      created_queue = new sycl::queue(devices[device_index], sycl::property::queue::in_order());
    }
    else {
      sycl::context device_context(devices[device_index]);
      created_queue = new sycl::queue(
          device_context, devices[device_index], sycl::property::queue::in_order());
      VLOG_DEBUG << "Separate context was generated for the new queue, as several available SYCL "
                    "devices were detected";
    }
    external_queue = reinterpret_cast<SyclQueue *>(created_queue);

#  ifdef WITH_EMBREE_GPU
    if (embree_device_pointer) {
      RTCDevice *device_object_ptr = reinterpret_cast<RTCDevice *>(embree_device_pointer);
      *device_object_ptr = rtcNewSYCLDevice(created_queue->get_context(), "");
      if (*device_object_ptr == nullptr) {
        finished_correct = false;
        oneapi_error_string_ =
            "Hardware Raytracing is not available; please install "
            "\"intel-level-zero-gpu-raytracing\" to enable it or disable Embree on GPU.";
      }
    }
#  else
    (void)embree_device_pointer;
#  endif
  }
  catch (const sycl::exception &e) {
    finished_correct = false;
    oneapi_error_string_ = e.what();
  }
  return finished_correct;
}

void OneapiDevice::free_queue(SyclQueue *queue_)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  delete queue;
}

void *OneapiDevice::usm_aligned_alloc_host(SyclQueue *queue_,
                                           size_t memory_size,
                                           const size_t alignment)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  return sycl::aligned_alloc_host(alignment, memory_size, *queue);
}

void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  /* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
   * and shared. For a new project it could be more beneficial to use USM shared memory, because
   * it provides an automatic migration mechanism that allows using the same pointer on the host
   * and on the device, without the need to worry about explicit memory transfer operations,
   * although the usage of USM shared implies some documented limitations on memory usage with
   * regard to parallel access from different threads. But for Blender/Cycles this type of memory
   * is not very suitable in the current application architecture, because Cycles is a
   * multi-threaded application that already uses two different pointers for host and device
   * activity, and also has to perform all needed memory transfer operations. So, the USM device
   * memory type has been used for the oneAPI device in order to better fit the Cycles
   * architecture. */
#  ifndef WITH_ONEAPI_SYCL_HOST_TASK
  return sycl::malloc_device(memory_size, *queue);
#  else
  return sycl::malloc_host(memory_size, *queue);
#  endif
}

void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  OneapiDevice::check_usm(queue_, usm_ptr, true);
  sycl::free(usm_ptr, *queue);
}

bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, const size_t num_bytes)
{
  assert(queue_);
  /* sycl::queue::memcpy may crash if the queue is in an invalid state due to previous
   * runtime errors. It's better to avoid running memory operations in that case.
   * The render will be canceled and the queue will be destroyed anyway. */
  if (have_error()) {
    return false;
  }

  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  OneapiDevice::check_usm(queue_, dest, true);
  OneapiDevice::check_usm(queue_, src, true);
  sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
  sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
  /* "Unknown" here means that this is not a USM allocation, which implies that it is
   * some generic C++ allocation, so we can use C++ memcpy directly with USM host. */
  if ((dest_type == sycl::usm::alloc::host || dest_type == sycl::usm::alloc::unknown) &&
      (src_type == sycl::usm::alloc::host || src_type == sycl::usm::alloc::unknown))
  {
    memcpy(dest, src, num_bytes);
    return true;
  }

  try {
    sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
#  ifdef WITH_CYCLES_DEBUG
    /* NOTE(@nsirgien): Waiting on a memory operation may give more precise error
     * messages. Due to the impact on occupancy, it makes sense to enable it only during
     * Cycles debug. */
    mem_event.wait_and_throw();
    return true;
#  else
    bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
                               src_type == sycl::usm::alloc::device;
    bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
                                            src_type == sycl::usm::alloc::unknown;
    /* NOTE(@sirgienko): A host-side blocking wait on this operation is mandatory, otherwise
     * the host may not wait until the end of the transfer before using the memory. */
    if (from_device_to_host || host_or_device_memop_with_offset) {
      mem_event.wait();
    }
    return true;
#  endif
  }
  catch (const sycl::exception &e) {
    oneapi_error_string_ = e.what();
    return false;
  }
}

bool OneapiDevice::usm_memset(SyclQueue *queue_,
                              void *usm_ptr,
                              unsigned char value,
                              const size_t num_bytes)
{
  assert(queue_);
  /* sycl::queue::memset may crash if the queue is in an invalid state due to previous
   * runtime errors. It's better to avoid running memory operations in that case.
   * The render will be canceled and the queue will be destroyed anyway. */
  if (have_error()) {
    return false;
  }

  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  OneapiDevice::check_usm(queue_, usm_ptr, true);
  try {
    sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
#  ifdef WITH_CYCLES_DEBUG
    /* NOTE(@nsirgien): Waiting on a memory operation may give more precise error
     * messages. Due to the impact on occupancy, it makes sense to enable it only during
     * Cycles debug. */
    mem_event.wait_and_throw();
#  else
    (void)mem_event;
#  endif
    return true;
  }
  catch (const sycl::exception &e) {
    oneapi_error_string_ = e.what();
    return false;
  }
}

bool OneapiDevice::queue_synchronize(SyclQueue *queue_)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  try {
    queue->wait_and_throw();
    return true;
  }
  catch (const sycl::exception &e) {
    oneapi_error_string_ = e.what();
    return false;
  }
}

bool OneapiDevice::kernel_globals_size(size_t &kernel_global_size)
{
  kernel_global_size = sizeof(KernelGlobalsGPU);

  return true;
}

void OneapiDevice::set_global_memory(SyclQueue *queue_,
                                     void *kernel_globals,
                                     const char *memory_name,
                                     void *memory_device_pointer)
{
  assert(queue_);
  assert(kernel_globals);
  assert(memory_name);
  assert(memory_device_pointer);
  KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
  OneapiDevice::check_usm(queue_, memory_device_pointer, true);
  OneapiDevice::check_usm(queue_, kernel_globals, true);

  std::string matched_name(memory_name);

/* This macro will change the global pointer of KernelGlobals via name matching. */
#  define KERNEL_DATA_ARRAY(type, name) \
    else if (#name == matched_name) { \
      globals->__##name = (type *)memory_device_pointer; \
      return; \
    }
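  /* Illustrative expansion: KERNEL_DATA_ARRAY(KernelData, data) becomes
   *   else if ("data" == matched_name) {
   *     globals->__data = (KernelData *)memory_device_pointer;
   *     return;
   *   }
   * so each entry of kernel/data_arrays.h contributes one name-matching branch below. */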
  if (false) {
  }
  else if ("integrator_state" == matched_name) {
    globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
    return;
  }
  KERNEL_DATA_ARRAY(KernelData, data)
#  include "kernel/data_arrays.h"
  else {
    std::cerr << "Cannot find global/constant memory with name \"" << matched_name << "\"!"
              << std::endl;
    assert(false);
  }
#  undef KERNEL_DATA_ARRAY
}

bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
                                  const int kernel,
                                  const size_t global_size,
                                  const size_t local_size,
                                  void **args)
{
  return oneapi_enqueue_kernel(kernel_context,
                               kernel,
                               global_size,
                               local_size,
                               kernel_features,
                               use_hardware_raytracing,
                               args);
}

void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
                                                       const DeviceKernel kernel,
                                                       size_t &kernel_global_size,
                                                       size_t &kernel_local_size)
{
  assert(queue);
  static const size_t preferred_work_group_size_intersect = 128;
  static const size_t preferred_work_group_size_shading = 256;
  static const size_t preferred_work_group_size_shading_simd8 = 64;
  /* Shader evaluation kernels seem to use some amount of shared memory, so it is better
   * to avoid using maximum work group sizes for them. */
  static const size_t preferred_work_group_size_shader_evaluation = 256;
  /* NOTE(@nsirgien): 1024 currently may lead to issues with cryptomatte kernels, so
   * for now their work-group size is restricted to 512. */
  static const size_t preferred_work_group_size_cryptomatte = 512;
  static const size_t preferred_work_group_size_default = 1024;

  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
  const size_t max_work_group_size = device.get_info<sycl::info::device::max_work_group_size>();

  size_t preferred_work_group_size = 0;
  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT:
      preferred_work_group_size = preferred_work_group_size_intersect;
      break;

    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT: {
      const bool device_is_simd8 =
          (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
           device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() == 8);
      preferred_work_group_size = (device_is_simd8) ? preferred_work_group_size_shading_simd8 :
                                                      preferred_work_group_size_shading;
    } break;

    case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
      preferred_work_group_size = preferred_work_group_size_cryptomatte;
      break;

    case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
    case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
    case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
      preferred_work_group_size = preferred_work_group_size_shader_evaluation;
      break;

    default:
      /* Do nothing and keep the initial zero value. */
      break;
  }

  /* This order of logic allows us to override Blender default values if needed,
   * yet respect them otherwise. */
  if (preferred_work_group_size == 0) {
    preferred_work_group_size = oneapi_suggested_gpu_kernel_size((::DeviceKernel)kernel);
  }

  /* If there is no recommendation, then use the manual default value. */
  if (preferred_work_group_size == 0) {
    preferred_work_group_size = preferred_work_group_size_default;
  }

  kernel_local_size = std::min(max_work_group_size, preferred_work_group_size);

  /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
   * we extend the work size to fit uniformity requirements. */
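  /* Illustrative example (values assumed): a global size of 1000 with a local size
   * of 128 is rounded up to 1024, i.e. 8 uniform work-groups. */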
  kernel_global_size = round_up(kernel_global_size, kernel_local_size);

#  ifdef WITH_ONEAPI_SYCL_HOST_TASK
  /* Kernels listed below need a specific number of work groups. */
  if (kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
      kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||
      kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY ||
      kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY ||
      kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY ||
      kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY ||
      kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY)
  {
    /* Path array implementation is serial in case of SYCL Host Task execution. */
    kernel_global_size = 1;
    kernel_local_size = 1;
  }
#  endif

  assert(kernel_global_size % kernel_local_size == 0);
}

/* The compute-runtime (i.e. NEO) version is what gets returned by SYCL/L0 on Windows
 * since Windows driver 101.3268. */
static const int lowest_supported_driver_version_win = 1016554;
#  ifdef _WIN32
/* For Windows driver 101.6557, the compute-runtime version is 31896.
 * This information is returned by `ocloc query OCL_DRIVER_VERSION`. */
static const int lowest_supported_driver_version_neo = 31896;
#  else
static const int lowest_supported_driver_version_neo = 31740;
#  endif

int parse_driver_build_version(const sycl::device &device)
{
  const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
  int driver_build_version = 0;
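  /* Illustrative examples of the parsing below (hypothetical version strings, following
   * the formats named in the warning message at the end of this function):
   *   "31.0.101.6557" (Windows)  -> 101 * 10000 + 6557 = 1016557
   *   "1.3.31740"     (Linux/L0) -> 31740 */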

  size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
  if (second_dot_position != std::string::npos) {
    try {
      size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
      if (third_dot_position != std::string::npos) {
        const std::string &third_number_substr = driver_version.substr(
            second_dot_position + 1, third_dot_position - second_dot_position - 1);
        const std::string &fourth_number_substr = driver_version.substr(third_dot_position + 1);
        if (third_number_substr.length() == 3 && fourth_number_substr.length() == 4) {
          driver_build_version = std::stoi(third_number_substr) * 10000 +
                                 std::stoi(fourth_number_substr);
        }
        /* This is actually not a correct version string (Major.Minor.Patch.Optional), see
         * Blender bug report #137277, but there are several driver versions with this Intel
         * bug existing at this point, so it is worth working around this issue in Blender
         * source code, allowing users to actually use an Intel GPU when it is possible. */
        else if (third_number_substr.length() == 5 && fourth_number_substr.length() == 6) {
          driver_build_version = std::stoi(third_number_substr);
        }
      }
      else {
        const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
        driver_build_version = std::stoi(third_number_substr);
      }
    }
    catch (std::invalid_argument &) {
    }
  }

  if (driver_build_version == 0) {
    VLOG_WARNING << "Unable to parse unknown Intel GPU driver version. \"" << driver_version
                 << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
                 << " xx.xx.xxx.xxxx (Windows) for device \""
                 << device.get_info<sycl::info::device::name>() << "\".";
  }

  return driver_build_version;
}

std::vector<sycl::device> available_sycl_devices(bool *multiple_dgpus_detected = nullptr)
{
  std::vector<sycl::device> available_devices;
  bool allow_all_devices = false;
  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) {
    allow_all_devices = true;
  }

  int level_zero_dgpu_counter = 0;
  try {
    const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();

    for (const sycl::platform &platform : oneapi_platforms) {
      /* Ignore OpenCL platforms to avoid using the same devices through both Level-Zero and
       * OpenCL. */
      if (platform.get_backend() == sycl::backend::opencl) {
        continue;
      }

      const std::vector<sycl::device> &oneapi_devices =
          (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) :
                                platform.get_devices(sycl::info::device_type::gpu);

      for (const sycl::device &device : oneapi_devices) {
        bool filter_out = false;

        if (platform.get_backend() == sycl::backend::ext_oneapi_level_zero && device.is_gpu() &&
            device.get_info<sycl::info::device::host_unified_memory>() == false /* dGPU */)
        {
          level_zero_dgpu_counter++;
        }

        if (!allow_all_devices) {
          /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
           * assuming they have either more than 96 Execution Units or not 7 threads per EU.
           * Official support can be broadened to older and smaller GPUs once ready. */
          if (!device.is_gpu() || platform.get_backend() != sycl::backend::ext_oneapi_level_zero) {
            filter_out = true;
          }
          else {
            /* Filtered-out defaults in case these values aren't available. */
            int number_of_eus = 96;
            int threads_per_eu = 7;
            if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
              number_of_eus = device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
            }
            if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
              threads_per_eu =
                  device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
            }
            /* This filters out all Level-Zero supported GPUs from older generations than Arc. */
            if (number_of_eus <= 96 && threads_per_eu == 7) {
              filter_out = true;
            }
            /* If not already filtered out, check the driver version. */
            bool check_driver_version = !filter_out;
            /* We don't know how to check driver version strings for non-Intel GPUs. */
            if (check_driver_version &&
                device.get_info<sycl::info::device::vendor>().find("Intel") == std::string::npos)
            {
              check_driver_version = false;
            }
            /* Because of https://github.com/oneapi-src/unified-runtime/issues/1777, future
             * drivers may break parsing done by a SYCL runtime from before the fix we expect in
             * major version 8. A parsed driver version would start with something different
             * than the current "1.3.". To avoid blocking a device by mistake in the case of a
             * new driver / old SYCL runtime, we disable the driver version check in case
             * __LIBSYCL_MAJOR_VERSION is below 8 and the actual driver version doesn't start
             * with 1.3. */
#  if __LIBSYCL_MAJOR_VERSION < 8
            if (check_driver_version &&
                !string_startswith(device.get_info<sycl::info::device::driver_version>(), "1.3."))
            {
              check_driver_version = false;
            }
#  endif
            if (check_driver_version) {
              int driver_build_version = parse_driver_build_version(device);
              const int lowest_supported_driver_version = (driver_build_version > 100000) ?
                                                              lowest_supported_driver_version_win :
                                                              lowest_supported_driver_version_neo;
              if (driver_build_version < lowest_supported_driver_version) {
                filter_out = true;

                VLOG_WARNING << "Driver version for device \""
                             << device.get_info<sycl::info::device::name>()
                             << "\" is too old. Expected \"" << lowest_supported_driver_version
                             << "\" or newer, but got \"" << driver_build_version << "\".";
              }
            }
          }
        }
        if (!filter_out) {
          available_devices.push_back(device);
        }
      }
    }
  }
  catch (sycl::exception &e) {
    VLOG_WARNING << "An error has been encountered while enumerating SYCL devices: " << e.what();
  }

  if (multiple_dgpus_detected) {
    *multiple_dgpus_detected = level_zero_dgpu_counter > 1;
  }

  return available_devices;
}

void OneapiDevice::architecture_information(const SyclDevice *device,
                                            string &name,
                                            bool &is_optimized)
{
  const sycl::ext::oneapi::experimental::architecture arch =
      reinterpret_cast<const sycl::device *>(device)
          ->get_info<sycl::ext::oneapi::experimental::info::device::architecture>();

#  define FILL_ARCH_INFO(architecture_code, is_arch_optimised) \
    case sycl::ext::oneapi::experimental::architecture ::architecture_code: \
      name = #architecture_code; \
      is_optimized = is_arch_optimised; \
      break;

  /* List of architectures that have been optimized by Intel and Blender developers.
   *
   * For example, the Intel Rocket Lake iGPU (rkl) is not supported and not optimized,
   * while the Intel Arc Alchemist dGPU (dg2) was optimized for.
   *
   * Devices can be changed from unoptimized to optimized manually, after DPC++ has
   * been upgraded to support the architecture and CYCLES_ONEAPI_INTEL_BINARIES_ARCH
   * in CMake includes the architecture. */
  switch (arch) {
    FILL_ARCH_INFO(intel_gpu_bdw, false)
    FILL_ARCH_INFO(intel_gpu_skl, false)
    FILL_ARCH_INFO(intel_gpu_kbl, false)
    FILL_ARCH_INFO(intel_gpu_cfl, false)
    FILL_ARCH_INFO(intel_gpu_apl, false)
    FILL_ARCH_INFO(intel_gpu_glk, false)
    FILL_ARCH_INFO(intel_gpu_whl, false)
    FILL_ARCH_INFO(intel_gpu_aml, false)
    FILL_ARCH_INFO(intel_gpu_cml, false)
    FILL_ARCH_INFO(intel_gpu_icllp, false)
    FILL_ARCH_INFO(intel_gpu_ehl, false)
    FILL_ARCH_INFO(intel_gpu_tgllp, false)
    FILL_ARCH_INFO(intel_gpu_rkl, false)
    FILL_ARCH_INFO(intel_gpu_adl_s, false)
    FILL_ARCH_INFO(intel_gpu_adl_p, false)
    FILL_ARCH_INFO(intel_gpu_adl_n, false)
    FILL_ARCH_INFO(intel_gpu_dg1, false)
    FILL_ARCH_INFO(intel_gpu_dg2_g10, true)
    FILL_ARCH_INFO(intel_gpu_dg2_g11, true)
    FILL_ARCH_INFO(intel_gpu_dg2_g12, true)
    FILL_ARCH_INFO(intel_gpu_pvc, false)
    FILL_ARCH_INFO(intel_gpu_pvc_vg, false)
    /* intel_gpu_mtl_u == intel_gpu_mtl_s == intel_gpu_arl_u == intel_gpu_arl_s */
    FILL_ARCH_INFO(intel_gpu_mtl_u, true)
    FILL_ARCH_INFO(intel_gpu_mtl_h, true)
    FILL_ARCH_INFO(intel_gpu_bmg_g21, true)
    FILL_ARCH_INFO(intel_gpu_lnl_m, true)

    default:
      name = "unknown";
      is_optimized = false;
      break;
  }
}

1593char *OneapiDevice::device_capabilities()
1594{
1595 std::stringstream capabilities;
1596
1597 const std::vector<sycl::device> &oneapi_devices = available_sycl_devices();
1598 for (const sycl::device &device : oneapi_devices) {
1599# ifndef WITH_ONEAPI_SYCL_HOST_TASK
1600 const std::string &name = device.get_info<sycl::info::device::name>();
1601# else
1602 const std::string &name = "SYCL Host Task (Debug)";
1603# endif
1604
1605 capabilities << std::string("\t") << name << "\n";
1606 capabilities << "\t\tsycl::info::platform::name\t\t\t"
1607 << device.get_platform().get_info<sycl::info::platform::name>() << "\n";
1608
1609 string arch_name;
1610 bool is_optimised_for_arch;
1611 architecture_information(
1612 reinterpret_cast<const SyclDevice *>(&device), arch_name, is_optimised_for_arch);
1613 capabilities << "\t\tsycl::info::device::architecture\t\t\t";
1614 capabilities << arch_name << "\n";
1615 capabilities << "\t\tsycl::info::device::is_cycles_optimized\t\t\t";
1616 capabilities << is_optimised_for_arch << "\n";
1617
1618# define WRITE_ATTR(attribute_name, attribute_variable) \
1619 capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
1620 << "\n";
1621# define GET_ATTR(attribute) \
1622 { \
1623 capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" \
1624 << device.get_info<sycl::info::device ::attribute>() << "\n"; \
1625 }
1626# define GET_INTEL_ATTR(attribute) \
1627 { \
1628 if (device.has(sycl::aspect::ext_intel_##attribute)) { \
1629 capabilities << "\t\tsycl::ext::intel::info::device::" #attribute "\t\t\t" \
1630 << device.get_info<sycl::ext::intel::info::device ::attribute>() << "\n"; \
1631 } \
1632 }
1633# define GET_ASPECT(aspect_) \
1634 { \
1635 capabilities << "\t\tdevice::has(" #aspect_ ")\t\t\t" << device.has(sycl::aspect ::aspect_) \
1636 << "\n"; \
1637 }
1638
1639 GET_ATTR(vendor)
1640 GET_ATTR(driver_version)
1641 GET_ATTR(max_compute_units)
1642 GET_ATTR(max_clock_frequency)
1643 GET_ATTR(global_mem_size)
1644 GET_INTEL_ATTR(pci_address)
1645 GET_INTEL_ATTR(gpu_eu_simd_width)
1646 GET_INTEL_ATTR(gpu_eu_count)
1647 GET_INTEL_ATTR(gpu_slices)
1648 GET_INTEL_ATTR(gpu_subslices_per_slice)
1649 GET_INTEL_ATTR(gpu_eu_count_per_subslice)
1650 GET_INTEL_ATTR(gpu_hw_threads_per_eu)
1651 GET_INTEL_ATTR(max_mem_bandwidth)
1652 GET_ATTR(max_work_group_size)
1653 GET_ATTR(max_work_item_dimensions)
1654 sycl::id<3> max_work_item_sizes =
1655 device.get_info<sycl::info::device::max_work_item_sizes<3>>();
1656 WRITE_ATTR(max_work_item_sizes[0], max_work_item_sizes.get(0))
1657 WRITE_ATTR(max_work_item_sizes[1], max_work_item_sizes.get(1))
1658 WRITE_ATTR(max_work_item_sizes[2], max_work_item_sizes.get(2))
1659
1660 GET_ATTR(max_num_sub_groups)
1661 for (size_t sub_group_size : device.get_info<sycl::info::device::sub_group_sizes>()) {
1662 WRITE_ATTR(sub_group_size[], sub_group_size)
1663 }
1664 GET_ATTR(sub_group_independent_forward_progress)
1665
1666 GET_ATTR(preferred_vector_width_char)
1667 GET_ATTR(preferred_vector_width_short)
1668 GET_ATTR(preferred_vector_width_int)
1669 GET_ATTR(preferred_vector_width_long)
1670 GET_ATTR(preferred_vector_width_float)
1671 GET_ATTR(preferred_vector_width_double)
1672 GET_ATTR(preferred_vector_width_half)
1673
1674 GET_ATTR(address_bits)
1675 GET_ATTR(max_mem_alloc_size)
1676 GET_ATTR(mem_base_addr_align)
1677 GET_ATTR(error_correction_support)
1678 GET_ATTR(is_available)
1679 GET_ATTR(host_unified_memory)
1680
1681 GET_ASPECT(cpu)
1682 GET_ASPECT(gpu)
1683 GET_ASPECT(fp16)
1684 GET_ASPECT(atomic64)
1685 GET_ASPECT(usm_host_allocations)
1686 GET_ASPECT(usm_device_allocations)
1687 GET_ASPECT(usm_shared_allocations)
1688 GET_ASPECT(usm_system_allocations)
1689
1690# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_non_uniform_groups__
1691 GET_ASPECT(ext_oneapi_non_uniform_groups)
1692# endif
1693# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_bindless_images__
1694 GET_ASPECT(ext_oneapi_bindless_images)
1695# endif
1696# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_import__
1697 GET_ASPECT(ext_oneapi_interop_semaphore_import)
1698# endif
1699# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_export__
1700 GET_ASPECT(ext_oneapi_interop_semaphore_export)
1701# endif
1702
1703# undef GET_INTEL_ATTR
1704# undef GET_ASPECT
1705# undef GET_ATTR
1706# undef WRITE_ATTR
1707 capabilities << "\n";
1708 }
1709
1710 return ::strdup(capabilities.str().c_str());
1711}
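/* Usage sketch, not part of the original file: device_capabilities() returns
 * a buffer allocated by ::strdup(), so ownership passes to the caller, who
 * must release it with free(). Assuming the method is callable without an
 * instance (its body touches no member state), a hypothetical caller could
 * look like:
 *
 *   char *caps = OneapiDevice::device_capabilities();
 *   if (caps) {
 *     fputs(caps, stdout);
 *     free(caps);
 *   }
 */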
1712
1713void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
1714{
1715 int num = 0;
1716 std::vector<sycl::device> devices = available_sycl_devices();
1717 for (sycl::device &device : devices) {
1718 const std::string &platform_name =
1719 device.get_platform().get_info<sycl::info::platform::name>();
1720# ifndef WITH_ONEAPI_SYCL_HOST_TASK
1721 std::string name = device.get_info<sycl::info::device::name>();
1722# else
1723 std::string name = "SYCL Host Task (Debug)";
1724# endif
1725# ifdef WITH_EMBREE_GPU
1726 bool hwrt_support = rtcIsSYCLDeviceSupported(device);
1727# else
1728 bool hwrt_support = false;
1729# endif
1730# if defined(WITH_OPENIMAGEDENOISE) && OIDN_VERSION >= 20300
1731 bool oidn_support = oidnIsSYCLDeviceSupported(&device);
1732# else
1733 bool oidn_support = false;
1734# endif
1735 std::string id = "ONEAPI_" + platform_name + "_" + name;
1736
1737 string arch_name;
1738 bool is_optimised_for_arch;
1739 architecture_information(
1740 reinterpret_cast<const SyclDevice *>(&device), arch_name, is_optimised_for_arch);
1741
1742 if (device.has(sycl::aspect::ext_intel_pci_address)) {
1743 id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
1744 }
1745 (cb)(id.c_str(),
1746 name.c_str(),
1747 num,
1748 hwrt_support,
1749 oidn_support,
1750 is_optimised_for_arch,
1751 user_ptr);
1752 num++;
1753 }
1754}
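/* Usage sketch, not part of the original file: judging from the call above,
 * an OneAPIDeviceIteratorCallback receives (id, name, num, hwrt_support,
 * oidn_support, is_optimized, user_ptr). A hypothetical callback that counts
 * the enumerated devices through the user pointer:
 *
 *   static void count_devices_cb(const char *id,
 *                                const char *name,
 *                                int num,
 *                                bool hwrt_support,
 *                                bool oidn_support,
 *                                bool is_optimized,
 *                                void *user_ptr)
 *   {
 *     (*static_cast<int *>(user_ptr))++;
 *   }
 *
 *   int count = 0;
 *   OneapiDevice::iterate_devices(count_devices_cb, &count);
 */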
1755
1756size_t OneapiDevice::get_memcapacity()
1757{
1758 return reinterpret_cast<sycl::queue *>(device_queue_)
1759 ->get_device()
1760 .get_info<sycl::info::device::global_mem_size>();
1761}
1762
1763int OneapiDevice::get_num_multiprocessors()
1764{
1765 const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
1766 if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
1767 return device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
1768 }
1769 return device.get_info<sycl::info::device::max_compute_units>();
1770}
1771
1772int OneapiDevice::get_max_num_threads_per_multiprocessor()
1773{
1774 const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
1775 if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
1776 device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu))
1777 {
1778 return device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() *
1779 device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
1780 }
1781 /* We'd want sycl::info::device::max_threads_per_compute_unit which doesn't exist yet.
1782 * max_work_group_size is the closest approximation but it can still be several times off. */
1783 return device.get_info<sycl::info::device::max_work_group_size>();
1784}
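/* Worked example with illustrative numbers, not from the original file: a GPU
 * reporting gpu_eu_simd_width = 8 and gpu_hw_threads_per_eu = 8 yields
 * 8 * 8 = 64 threads per EU here; combined with get_num_multiprocessors()
 * (the EU count, e.g. 512) that suggests roughly 512 * 64 = 32768 resident
 * threads when sizing GPU work. On devices without the Intel extensions, the
 * max_work_group_size fallback can, as the comment above notes, be several
 * times off. */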
1785
1786CCL_NAMESPACE_END
1787
1788#endif