Blender 4.5 — source listing of intern/cycles/device/optix/device_impl.cpp
(Cycles OptiX device implementation).
1/* SPDX-FileCopyrightText: 2019 NVIDIA Corporation
2 * SPDX-FileCopyrightText: 2019-2022 Blender Foundation
3 *
4 * SPDX-License-Identifier: Apache-2.0 */
5
6#ifdef WITH_OPTIX
7
9# include "device/optix/queue.h"
10
11# include "bvh/bvh.h"
12# include "bvh/optix.h"
13
14# include "scene/hair.h"
15# include "scene/mesh.h"
16# include "scene/object.h"
17# include "scene/pointcloud.h"
18# include "scene/scene.h"
19
20# include "util/debug.h"
21# include "util/log.h"
22# include "util/path.h"
23# include "util/progress.h"
24# include "util/task.h"
25
26# define __KERNEL_OPTIX__
28
30
31static void execute_optix_task(TaskPool &pool, OptixTask task, OptixResult &failure_reason)
32{
33 OptixTask additional_tasks[16];
34 unsigned int num_additional_tasks = 0;
35
36 const OptixResult result = optixTaskExecute(task, additional_tasks, 16, &num_additional_tasks);
37 if (result == OPTIX_SUCCESS) {
38 for (unsigned int i = 0; i < num_additional_tasks; ++i) {
39 pool.push([&pool, additional_task = additional_tasks[i], &failure_reason] {
40 execute_optix_task(pool, additional_task, failure_reason);
41 });
42 }
43 }
44 else {
45 failure_reason = result;
46 }
47}
48
/* Construct the OptiX device on top of the CUDA device. Creates the OptiX
 * device context and allocates the launch-parameter buffer. If CUDA context
 * creation already failed in the base class, construction stops early and
 * the device is left in its error state. */
OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
    : CUDADevice(info, stats, profiler, headless),
# ifdef WITH_OSL
      osl_colorsystem(this, "osl_colorsystem", MEM_READ_ONLY),
# endif
      sbt_data(this, "__sbt", MEM_READ_ONLY),
      launch_params(this, "kernel_params", false)
{
  /* Make the CUDA context current. */
  if (!cuContext) {
    /* Do not initialize if CUDA context creation failed already. */
    return;
  }
  const CUDAContextScope scope(this);

  /* Create OptiX context for this device. */
  OptixDeviceContextOptions options = {};
# ifdef WITH_CYCLES_LOGGING
  options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. */
  /* Route OptiX log messages into the logging system, mapping OptiX severity
   * levels onto the corresponding log severities. */
  options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) {
    switch (level) {
      case 1:
        LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
        break;
      case 2:
        LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
        break;
      case 3:
        LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
        break;
      case 4:
        LOG_IF(INFO, VLOG_IS_ON(1)) << message;
        break;
      default:
        break;
    }
  };
# endif
  if (DebugFlags().optix.use_debug) {
    VLOG_INFO << "Using OptiX debug mode.";
    options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
  }
  optix_assert(optixDeviceContextCreate(cuContext, &options, &context));
# ifdef WITH_CYCLES_LOGGING
  /* The callback set through the options is registered again explicitly here. */
  optix_assert(optixDeviceContextSetLogCallback(
      context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
# endif

  /* Fix weird compiler bug that assigns wrong size. */
  launch_params.data_elements = sizeof(KernelParamsOptiX);

  /* Allocate launch parameter buffer memory on device. */
  launch_params.alloc_to_device(1);
}
103
/* Tear down the OptiX device: free device buffers, destroy all modules,
 * pipelines and program groups, and finally the OptiX device context itself
 * (it must outlive everything created from it, so it is destroyed last). */
OptiXDevice::~OptiXDevice()
{
  /* Make CUDA context current. */
  const CUDAContextScope scope(this);

  free_bvh_memory_delayed();

  sbt_data.free();
  texture_info.free();
  launch_params.free();

  /* Unload modules. */
  if (optix_module != nullptr) {
    optixModuleDestroy(optix_module);
  }
  /* Built-in intersection modules (with and without motion blur). */
  for (int i = 0; i < 2; ++i) {
    if (builtin_modules[i] != nullptr) {
      optixModuleDestroy(builtin_modules[i]);
    }
  }
  for (int i = 0; i < NUM_PIPELINES; ++i) {
    if (pipelines[i] != nullptr) {
      optixPipelineDestroy(pipelines[i]);
    }
  }
  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
    if (groups[i] != nullptr) {
      optixProgramGroupDestroy(groups[i]);
    }
  }

# ifdef WITH_OSL
  if (osl_camera_module != nullptr) {
    optixModuleDestroy(osl_camera_module);
  }
  for (const OptixModule &module : osl_modules) {
    if (module != nullptr) {
      optixModuleDestroy(module);
    }
  }
  for (const OptixProgramGroup &group : osl_groups) {
    if (group != nullptr) {
      optixProgramGroupDestroy(group);
    }
  }
  osl_colorsystem.free();
# endif

  optixDeviceContextDestroy(context);
}
154
155unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create()
156{
157 return make_unique<OptiXDeviceQueue>(this);
158}
159
/* Report which BVH layouts this device supports. The requested kernel
 * features do not influence the answer, hence the unused parameter. */
BVHLayoutMask OptiXDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
{
  /* OptiX has its own internal acceleration structure format. */
  return BVH_LAYOUT_OPTIX;
}
165
166static string get_optix_include_dir()
167{
168 const char *env_dir = getenv("OPTIX_ROOT_DIR");
169 const char *default_dir = CYCLES_RUNTIME_OPTIX_ROOT_DIR;
170
171 if (env_dir && env_dir[0]) {
172 const string env_include_dir = path_join(env_dir, "include");
173 return env_include_dir;
174 }
175 if (default_dir[0]) {
176 const string default_include_dir = path_join(default_dir, "include");
177 return default_include_dir;
178 }
179
180 return string();
181}
182
183string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features)
184{
185 string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features);
186
187 /* Add OptiX SDK include directory to include paths. */
188 common_cflags += string_printf(" -I\"%s\"", get_optix_include_dir().c_str());
189
190 /* Specialization for shader ray-tracing. */
191 if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
192 common_cflags += " --keep-device-functions";
193 }
194
195 return common_cflags;
196}
197
/* Compile `ptx_data` into an OptiX module, distributing the compilation work
 * over `pool` through OptiX's task system. `result` receives the status of
 * module creation, or of the first failing sub-task. Callers must wait for
 * the pool to drain before inspecting `result` or using `module`. */
void OptiXDevice::create_optix_module(TaskPool &pool,
                                      OptixModuleCompileOptions &module_options,
                                      string &ptx_data,
                                      OptixModule &module,
                                      OptixResult &result)
{
  OptixTask task = nullptr;
  result = optixModuleCreateWithTasks(context,
                                      &module_options,
                                      &pipeline_options,
                                      ptx_data.data(),
                                      ptx_data.size(),
                                      nullptr, /* No log string buffer. */
                                      nullptr,
                                      &module,
                                      &task);
  if (result == OPTIX_SUCCESS) {
    /* Run the initial task; it recursively pushes any sub-tasks onto the pool. */
    execute_optix_task(pool, task, result);
  }
}
218
/* Load and compile the OptiX kernel modules and build the program groups and
 * pipelines required by the requested kernel features. Also loads the CUDA
 * utility kernels via the base class. Returns false and sets the device error
 * state on any failure. */
bool OptiXDevice::load_kernels(const uint kernel_features)
{
  if (have_error()) {
    /* Abort early if context creation failed already. */
    return false;
  }

# ifdef WITH_OSL
  /* TODO: Consider splitting kernels into an OSL-camera-only and a full-OSL variant. */
  const bool use_osl_shading = (kernel_features & KERNEL_FEATURE_OSL_SHADING);
  const bool use_osl_camera = (kernel_features & KERNEL_FEATURE_OSL_CAMERA);
# else
  const bool use_osl_shading = false;
  const bool use_osl_camera = false;
# endif

  /* Skip creating OptiX module if only doing denoising. */
  const bool need_optix_kernels = (kernel_features &
  /* NOTE(review): the continuation of this expression (the feature mask the
   * features are tested against) is missing from this copy of the file —
   * restore it from the original source. */

  /* Detect existence of OptiX kernel and SDK here early. So we can error out
   * before compiling the CUDA kernels, to avoid failing right after when
   * compiling the OptiX kernel. */
  string suffix = use_osl_shading ? "_osl" :
                  (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
                      "_shader_raytrace" :
                      "";
  string ptx_filename;
  if (need_optix_kernels) {
    ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx.zst");
    if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
      std::string optix_include_dir = get_optix_include_dir();
      if (optix_include_dir.empty()) {
        set_error(
            "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable "
            "to a directory containing the OptiX SDK.");
        return false;
      }
      if (!path_is_directory(optix_include_dir)) {
        set_error(string_printf(
            "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install "
            "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a "
            "directory containing the OptiX SDK.",
            optix_include_dir.c_str()));
        return false;
      }
    }
  }

  /* Load CUDA modules because we need some of the utility kernels. */
  if (!CUDADevice::load_kernels(kernel_features)) {
    return false;
  }

  if (!need_optix_kernels) {
    return true;
  }

  const CUDAContextScope scope(this);

  /* Unload existing OptiX module and pipelines first. */
  if (optix_module != nullptr) {
    optixModuleDestroy(optix_module);
    optix_module = nullptr;
  }
  for (int i = 0; i < 2; ++i) {
    if (builtin_modules[i] != nullptr) {
      optixModuleDestroy(builtin_modules[i]);
      builtin_modules[i] = nullptr;
    }
  }
  for (int i = 0; i < NUM_PIPELINES; ++i) {
    if (pipelines[i] != nullptr) {
      optixPipelineDestroy(pipelines[i]);
      pipelines[i] = nullptr;
    }
  }
  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
    if (groups[i] != nullptr) {
      optixProgramGroupDestroy(groups[i]);
      groups[i] = nullptr;
    }
  }

# ifdef WITH_OSL
  if (osl_camera_module != nullptr) {
    optixModuleDestroy(osl_camera_module);
    osl_camera_module = nullptr;
  }

  /* Recreating base OptiX module invalidates all OSL modules too, since they link against it. */
  for (const OptixModule &module : osl_modules) {
    if (module != nullptr) {
      optixModuleDestroy(module);
    }
  }
  osl_modules.clear();

  for (const OptixProgramGroup &group : osl_groups) {
    if (group != nullptr) {
      optixProgramGroupDestroy(group);
    }
  }
  osl_groups.clear();
# endif

  OptixModuleCompileOptions module_options = {};
  module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */

  /* Debug mode disables optimization so device code remains debuggable. */
  if (DebugFlags().optix.use_debug) {
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
  }
  else {
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
  }

  module_options.boundValues = nullptr;
  module_options.numBoundValues = 0;
  module_options.payloadTypes = nullptr;
  module_options.numPayloadTypes = 0;

  /* Default to no motion blur and two-level graph, since it is the fastest option. */
  pipeline_options.usesMotionBlur = false;
  pipeline_options.traversableGraphFlags =
      OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
  pipeline_options.numPayloadValues = 8;
  pipeline_options.numAttributeValues = 2; /* u, v */
  pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
  pipeline_options.pipelineLaunchParamsVariableName = "kernel_params"; /* See globals.h */

  /* Enable only the primitive types the feature set actually needs, so OptiX
   * can specialize traversal. */
  pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
  if (kernel_features & KERNEL_FEATURE_HAIR) {
    if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
      pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM;
    }
    else {
      /* Ribbon curves use a custom intersection program, hence custom primitives. */
      pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
    }
  }
  if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
    pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
  }

  /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
   * This is necessary since objects may be reported to have motion if the Vector pass is
   * active, but may still need to be rendered without motion blur if that isn't active as well. */
  if (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) {
    pipeline_options.usesMotionBlur = true;
    /* Motion blur can insert motion transforms into the traversal graph.
     * It is no longer a two-level graph then, so need to set flags to allow any configuration. */
    pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
  }

  { /* Load and compile PTX module with OptiX kernels. */
    string ptx_data;
    if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
      string cflags = compile_kernel_get_common_cflags(kernel_features);
      ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true);
    }
    if (ptx_filename.empty() || !path_read_compressed_text(ptx_filename, ptx_data)) {
      set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
      return false;
    }

    TaskPool pool;
    OptixResult result;
    create_optix_module(pool, module_options, ptx_data, optix_module, result);
    pool.wait_work();
    if (result != OPTIX_SUCCESS) {
      set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)",
                              ptx_filename.c_str(),
                              optixGetErrorName(result)));
      return false;
    }
  }

  /* Create program groups. */
  OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
  OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
  group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
  group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module;
  group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName =
      "__raygen__kernel_optix_integrator_intersect_closest";
  group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
  group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module;
  group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName =
      "__raygen__kernel_optix_integrator_intersect_shadow";
  group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
  group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module;
  group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName =
      "__raygen__kernel_optix_integrator_intersect_subsurface";
  group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
  group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module;
  group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName =
      "__raygen__kernel_optix_integrator_intersect_volume_stack";
  group_descs[PG_RGEN_INTERSECT_DEDICATED_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
  group_descs[PG_RGEN_INTERSECT_DEDICATED_LIGHT].raygen.module = optix_module;
  group_descs[PG_RGEN_INTERSECT_DEDICATED_LIGHT].raygen.entryFunctionName =
      "__raygen__kernel_optix_integrator_intersect_dedicated_light";
  group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
  group_descs[PG_MISS].miss.module = optix_module;
  group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
  group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
  group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
  group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
  group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
  group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
  group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
  group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
  group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
  group_descs[PG_HITV].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
  group_descs[PG_HITV].hitgroup.moduleCH = optix_module;
  group_descs[PG_HITV].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
  group_descs[PG_HITV].hitgroup.moduleAH = optix_module;
  group_descs[PG_HITV].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_volume_test";

  /* Shared hit-group description that ignores all hits; used for the motion
   * and point-cloud variants of hit groups that should not participate. */
  OptixProgramGroupDesc ignore_desc = {};
  ignore_desc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
  ignore_desc.hitgroup.moduleCH = optix_module;
  ignore_desc.hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_ignore";
  ignore_desc.hitgroup.moduleAH = optix_module;
  ignore_desc.hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_ignore";

  if (kernel_features & KERNEL_FEATURE_HAIR) {
    if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
      /* Built-in thick curve intersection. */
      OptixBuiltinISOptions builtin_options = {};
      builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM;
      builtin_options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE |
                                   OPTIX_BUILD_FLAG_ALLOW_COMPACTION |
                                   OPTIX_BUILD_FLAG_ALLOW_UPDATE;
      builtin_options.curveEndcapFlags = OPTIX_CURVE_ENDCAP_DEFAULT; /* Disable end-caps. */
      builtin_options.usesMotionBlur = false;

      optix_assert(optixBuiltinISModuleGet(
          context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));

      group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
      group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
      group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
      group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;

      if (pipeline_options.usesMotionBlur) {
        /* A second built-in IS module is needed for the motion-blur variant. */
        builtin_options.usesMotionBlur = true;

        optix_assert(optixBuiltinISModuleGet(
            context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));

        group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
        group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
        group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
        group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
        group_descs[PG_HITV_MOTION] = ignore_desc;
        group_descs[PG_HITL_MOTION] = ignore_desc;
      }
    }
    else {
      /* Custom ribbon intersection. */
      group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
      group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
      group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
      group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
    }
  }

  if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
    group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD];
    group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITD_POINTCLOUD].hitgroup.moduleIS = optix_module;
    group_descs[PG_HITD_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
    group_descs[PG_HITS_POINTCLOUD] = group_descs[PG_HITS];
    group_descs[PG_HITS_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITS_POINTCLOUD].hitgroup.moduleIS = optix_module;
    group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
    group_descs[PG_HITV_POINTCLOUD] = ignore_desc;
    group_descs[PG_HITL_POINTCLOUD] = ignore_desc;
  }

  /* Add hit group for local intersections. */
  /* NOTE(review): the opening `if (...) {` that guards this hit group is
   * missing from this copy of the file (line lost in extraction); the closing
   * brace below pairs with it — restore from the original source. */
    group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
    group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
  }

  /* Shader ray-tracing replaces some functions with direct callables. */
  if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
    group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_surface_raytrace";

    /* Kernels with OSL shading support are built without SVM, so can skip those direct callables
     * there. */
    if (!use_osl_shading) {
      group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
      group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
      group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
      group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
          "__direct_callable__svm_node_bevel";
    }
  }

  if (kernel_features & KERNEL_FEATURE_MNEE) {
    group_descs[PG_RGEN_SHADE_SURFACE_MNEE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_surface_mnee";
  }

  /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
  if (use_osl_shading) {
    group_descs[PG_RGEN_SHADE_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_background";
    group_descs[PG_RGEN_SHADE_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_LIGHT].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_LIGHT].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_light";
    group_descs[PG_RGEN_SHADE_SURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_SURFACE].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_SURFACE].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_surface";
    group_descs[PG_RGEN_SHADE_VOLUME].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_VOLUME].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_VOLUME].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_volume";
    group_descs[PG_RGEN_SHADE_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_SHADOW].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_SHADOW].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_shadow";
    group_descs[PG_RGEN_SHADE_DEDICATED_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_DEDICATED_LIGHT].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_DEDICATED_LIGHT].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_dedicated_light";
    group_descs[PG_RGEN_EVAL_DISPLACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_EVAL_DISPLACE].raygen.module = optix_module;
    group_descs[PG_RGEN_EVAL_DISPLACE].raygen.entryFunctionName =
        "__raygen__kernel_optix_shader_eval_displace";
    group_descs[PG_RGEN_EVAL_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.module = optix_module;
    group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.entryFunctionName =
        "__raygen__kernel_optix_shader_eval_background";
    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.module = optix_module;
    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.entryFunctionName =
        "__raygen__kernel_optix_shader_eval_curve_shadow_transparency";
  }

# ifdef WITH_OSL
  /* When using custom OSL cameras, integrator_init_from_camera is its own specialized module. */
  if (use_osl_camera) {
    /* Load and compile the OSL camera PTX module. */
    string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_camera.ptx.zst");
    if (!path_read_compressed_text(ptx_filename, ptx_data)) {
      set_error(
          string_printf("Failed to load OptiX OSL camera kernel from '%s'", ptx_filename.c_str()));
      return false;
    }

    TaskPool pool;
    OptixResult result;
    create_optix_module(pool, module_options, ptx_data, osl_camera_module, result);
    pool.wait_work();
    if (result != OPTIX_SUCCESS) {
      set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)",
                              ptx_filename.c_str(),
                              optixGetErrorName(result)));
      return false;
    }

    group_descs[PG_RGEN_INIT_FROM_CAMERA].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_INIT_FROM_CAMERA].raygen.module = osl_camera_module;
    group_descs[PG_RGEN_INIT_FROM_CAMERA].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_init_from_camera";
  }
# endif

  optix_assert(optixProgramGroupCreate(
      context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, nullptr, groups));

  /* Get program stack sizes. */
  OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
  /* Set up SBT, which in this case is used only to select between different programs. */
  sbt_data.alloc(NUM_PROGRAM_GROUPS);
  memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
    optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
    optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i], nullptr));
  }
  sbt_data.copy_to_device(); /* Upload SBT to device. */

  /* Calculate maximum trace continuation stack size. */
  unsigned int trace_css = stack_size[PG_HITD].cssCH;
  /* This is based on the maximum of closest-hit and any-hit/intersection programs. */
  trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
  trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
  trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
  trace_css = std::max(trace_css, stack_size[PG_HITV].cssIS + stack_size[PG_HITV].cssAH);
  trace_css = std::max(trace_css,
                       stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
  trace_css = std::max(trace_css,
                       stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
  trace_css = std::max(
      trace_css, stack_size[PG_HITD_POINTCLOUD].cssIS + stack_size[PG_HITD_POINTCLOUD].cssAH);
  trace_css = std::max(
      trace_css, stack_size[PG_HITS_POINTCLOUD].cssIS + stack_size[PG_HITS_POINTCLOUD].cssAH);

  OptixPipelineLinkOptions link_options = {};
  link_options.maxTraceDepth = 1;

  if (use_osl_shading || use_osl_camera) {
    /* OSL pipelines will be (re)created by the OSL manager. */
  }
  else if (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) {
    /* Create shader ray-tracing and MNEE pipeline. */
    vector<OptixProgramGroup> pipeline_groups;
    pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
    if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
      pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
      pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
      pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
    }
    if (kernel_features & KERNEL_FEATURE_MNEE) {
      pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
    }
    pipeline_groups.push_back(groups[PG_MISS]);
    pipeline_groups.push_back(groups[PG_HITD]);
    pipeline_groups.push_back(groups[PG_HITS]);
    pipeline_groups.push_back(groups[PG_HITL]);
    pipeline_groups.push_back(groups[PG_HITV]);
    if (pipeline_options.usesMotionBlur) {
      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
      pipeline_groups.push_back(groups[PG_HITV_MOTION]);
      pipeline_groups.push_back(groups[PG_HITL_MOTION]);
    }
    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
      pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
      pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
      pipeline_groups.push_back(groups[PG_HITV_POINTCLOUD]);
      pipeline_groups.push_back(groups[PG_HITL_POINTCLOUD]);
    }

    optix_assert(optixPipelineCreate(context,
                                     &pipeline_options,
                                     &link_options,
                                     pipeline_groups.data(),
                                     pipeline_groups.size(),
                                     nullptr,
                                     nullptr,
                                     &pipelines[PIP_SHADE]));

    /* Combine ray generation and trace continuation stack size. */
    const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
                                      stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG) +
                             link_options.maxTraceDepth * trace_css;
    const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
                                      stack_size[PG_CALL_SVM_BEVEL].dssDC);

    /* Set stack size depending on pipeline options. */
    optix_assert(optixPipelineSetStackSize(
        pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
  }

  { /* Create intersection-only pipeline. */
    vector<OptixProgramGroup> pipeline_groups;
    pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_DEDICATED_LIGHT]);
    pipeline_groups.push_back(groups[PG_MISS]);
    pipeline_groups.push_back(groups[PG_HITD]);
    pipeline_groups.push_back(groups[PG_HITS]);
    pipeline_groups.push_back(groups[PG_HITL]);
    pipeline_groups.push_back(groups[PG_HITV]);
    if (pipeline_options.usesMotionBlur) {
      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
    }
    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
      pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
      pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
    }

    optix_assert(optixPipelineCreate(context,
                                     &pipeline_options,
                                     &link_options,
                                     pipeline_groups.data(),
                                     pipeline_groups.size(),
                                     nullptr,
                                     nullptr,
                                     &pipelines[PIP_INTERSECT]));

    /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
    const unsigned int css =
        std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
                 std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
                          std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
                                   stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
        link_options.maxTraceDepth * trace_css;

    optix_assert(optixPipelineSetStackSize(
        pipelines[PIP_INTERSECT], 0, 0, css, pipeline_options.usesMotionBlur ? 3 : 2));
  }

  return !have_error();
}
734
735bool OptiXDevice::load_osl_kernels()
736{
737# ifdef WITH_OSL
738 if (have_error()) {
739 return false;
740 }
741
742 struct OSLKernel {
743 string ptx;
744 ustring fused_entry;
745 };
746
747 auto get_osl_kernel = [&](const OSL::ShaderGroupRef &group) {
748 if (!group) {
749 return OSLKernel{};
750 }
751 /* Other attribute access crashes when there are no layers. */
752 int num_layers = 0;
753 osl_globals.ss->getattribute(group.get(), "num_layers", num_layers);
754 if (num_layers == 0) {
755 return OSLKernel{};
756 }
757
758 string osl_ptx;
759 ustring fused_name;
760
761 osl_globals.ss->getattribute(group.get(), "group_fused_name", fused_name);
762 osl_globals.ss->getattribute(
763 group.get(), "ptx_compiled_version", OSL::TypeDesc::PTR, &osl_ptx);
764
765 int groupdata_size = 0;
766 osl_globals.ss->getattribute(group.get(), "llvm_groupdata_size", groupdata_size);
767 if (groupdata_size == 0) {
768 // Old attribute name from our patched OSL version as fallback.
769 osl_globals.ss->getattribute(group.get(), "groupdata_size", groupdata_size);
770 }
771 if (groupdata_size > 2048) { /* See 'group_data' array in kernel/osl/osl.h */
772 set_error(
773 string_printf("Requested OSL group data size (%d) is greater than the maximum "
774 "supported with OptiX (2048)",
775 groupdata_size));
776 return OSLKernel{};
777 }
778
779 return OSLKernel{std::move(osl_ptx), std::move(fused_name)};
780 };
781
782 /* This has to be in the same order as the ShaderType enum, so that the index calculation in
783 * osl_eval_nodes checks out */
784 vector<OSLKernel> osl_kernels;
785 osl_kernels.emplace_back(get_osl_kernel(osl_globals.camera_state));
786 for (const OSL::ShaderGroupRef &group : osl_globals.surface_state) {
787 osl_kernels.emplace_back(get_osl_kernel(group));
788 }
789 for (const OSL::ShaderGroupRef &group : osl_globals.volume_state) {
790 osl_kernels.emplace_back(get_osl_kernel(group));
791 }
792 for (const OSL::ShaderGroupRef &group : osl_globals.displacement_state) {
793 osl_kernels.emplace_back(get_osl_kernel(group));
794 }
795 for (const OSL::ShaderGroupRef &group : osl_globals.bump_state) {
796 osl_kernels.emplace_back(get_osl_kernel(group));
797 }
798
799 if (have_error()) {
800 return false;
801 }
802
803 const CUDAContextScope scope(this);
804
805 if (pipelines[PIP_SHADE]) {
806 optixPipelineDestroy(pipelines[PIP_SHADE]);
807 }
808
809 for (OptixModule &module : osl_modules) {
810 if (module != nullptr) {
811 optixModuleDestroy(module);
812 module = nullptr;
813 }
814 }
815 for (OptixProgramGroup &group : osl_groups) {
816 if (group != nullptr) {
817 optixProgramGroupDestroy(group);
818 group = nullptr;
819 }
820 }
821
822 /* We always need to reserve a spot for the camera shader group, but if it's unused
823 * and there are no other shader groups, we can skip creating the pipeline. */
824 if (osl_kernels.size() == 1 && osl_kernels[0].ptx.empty()) {
825 return true;
826 }
827
828 OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
829 OptixModuleCompileOptions module_options = {};
830 module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
831 module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
832
833 /* In addition to the modules for each OSL group, we need to load our own osl_services.ptx
834 * as well as the shadeops.ptx that's embedded in OSL. */
835 size_t id_osl_services = osl_kernels.size();
836 size_t id_osl_shadeops = osl_kernels.size() + 1;
837 osl_groups.resize(osl_kernels.size() + 2);
838 osl_modules.resize(osl_kernels.size() + 2);
839
840 { /* Load and compile PTX module with OSL services. */
841 string osl_services_ptx, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx.zst");
842 if (!path_read_compressed_text(ptx_filename, osl_services_ptx)) {
843 set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'",
844 ptx_filename.c_str()));
845 return false;
846 }
847
848 const char *shadeops_ptx_ptr = nullptr;
849 osl_globals.ss->getattribute("shadeops_cuda_ptx", OSL::TypeDesc::PTR, &shadeops_ptx_ptr);
850 int shadeops_ptx_size = 0;
851 osl_globals.ss->getattribute("shadeops_cuda_ptx_size", OSL::TypeDesc::INT, &shadeops_ptx_size);
852 string shadeops_ptx(shadeops_ptx_ptr, shadeops_ptx_size);
853
854 TaskPool pool;
855 OptixResult services_result, shadeops_result;
856 create_optix_module(
857 pool, module_options, osl_services_ptx, osl_modules[id_osl_services], services_result);
858 create_optix_module(
859 pool, module_options, shadeops_ptx, osl_modules[id_osl_shadeops], shadeops_result);
860 pool.wait_work();
861
862 {
863 if (services_result != OPTIX_SUCCESS) {
864 set_error(string_printf("Failed to load OptiX OSL services kernel from '%s' (%s)",
865 ptx_filename.c_str(),
866 optixGetErrorName(services_result)));
867 return false;
868 }
869 OptixProgramGroupDesc group_desc = {};
870 group_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
871 group_desc.callables.entryFunctionNameDC = "__direct_callable__dummy_services";
872 group_desc.callables.moduleDC = osl_modules[id_osl_services];
873
874 optix_assert(optixProgramGroupCreate(context,
875 &group_desc,
876 1,
877 &group_options,
878 nullptr,
879 nullptr,
880 &osl_groups[id_osl_services]));
881 }
882
883 {
884 if (shadeops_result != OPTIX_SUCCESS) {
885 set_error(string_printf("Failed to load OptiX OSL shadeops kernel (%s)",
886 optixGetErrorName(shadeops_result)));
887 return false;
888 }
889 OptixProgramGroupDesc group_desc = {};
890 group_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
891 group_desc.callables.entryFunctionNameDC = "__direct_callable__dummy_shadeops";
892 group_desc.callables.moduleDC = osl_modules[id_osl_shadeops];
893
894 optix_assert(optixProgramGroupCreate(context,
895 &group_desc,
896 1,
897 &group_options,
898 nullptr,
899 nullptr,
900 &osl_groups[id_osl_shadeops]));
901 }
902 }
903
904 TaskPool pool;
905 vector<OptixResult> results(osl_kernels.size(), OPTIX_SUCCESS);
906
907 for (size_t i = 0; i < osl_kernels.size(); ++i) {
908 if (osl_kernels[i].ptx.empty()) {
909 continue;
910 }
911
912 create_optix_module(pool, module_options, osl_kernels[i].ptx, osl_modules[i], results[i]);
913 }
914
915 pool.wait_work();
916
917 for (size_t i = 0; i < osl_kernels.size(); ++i) {
918 if (osl_kernels[i].ptx.empty()) {
919 continue;
920 }
921
922 if (results[i] != OPTIX_SUCCESS) {
923 set_error(string_printf("Failed to load OptiX OSL kernel for %s (%s)",
924 osl_kernels[i].fused_entry.c_str(),
925 optixGetErrorName(results[i])));
926 return false;
927 }
928
929 OptixProgramGroupDesc group_desc = {};
930 group_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
931 group_desc.callables.entryFunctionNameDC = osl_kernels[i].fused_entry.c_str();
932 group_desc.callables.moduleDC = osl_modules[i];
933
934 optix_assert(optixProgramGroupCreate(
935 context, &group_desc, 1, &group_options, nullptr, nullptr, &osl_groups[i]));
936 }
937
938 /* Update SBT with new entries. */
939 sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
940 for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
941 optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
942 }
943 for (size_t i = 0; i < osl_groups.size(); ++i) {
944 if (osl_groups[i] != nullptr) {
945 optix_assert(optixSbtRecordPackHeader(osl_groups[i], &sbt_data[NUM_PROGRAM_GROUPS + i]));
946 }
947 else {
948 /* Default to "__direct_callable__dummy_services", so that OSL evaluation for empty
949 * materials has direct callables to call and does not crash. */
950 optix_assert(optixSbtRecordPackHeader(osl_groups[id_osl_services],
951 &sbt_data[NUM_PROGRAM_GROUPS + i]));
952 }
953 }
954 sbt_data.copy_to_device(); /* Upload updated SBT to device. */
955
956 OptixPipelineLinkOptions link_options = {};
957 link_options.maxTraceDepth = 0;
958
959 {
960 vector<OptixProgramGroup> pipeline_groups;
961 pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
962 pipeline_groups.push_back(groups[PG_RGEN_SHADE_BACKGROUND]);
963 pipeline_groups.push_back(groups[PG_RGEN_SHADE_LIGHT]);
964 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE]);
965 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
966 pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
967 pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
968 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
969 pipeline_groups.push_back(groups[PG_RGEN_SHADE_VOLUME]);
970 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SHADOW]);
971 pipeline_groups.push_back(groups[PG_RGEN_SHADE_DEDICATED_LIGHT]);
972 pipeline_groups.push_back(groups[PG_RGEN_EVAL_DISPLACE]);
973 pipeline_groups.push_back(groups[PG_RGEN_EVAL_BACKGROUND]);
974 pipeline_groups.push_back(groups[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY]);
975 pipeline_groups.push_back(groups[PG_RGEN_INIT_FROM_CAMERA]);
976
977 for (const OptixProgramGroup &group : osl_groups) {
978 if (group != nullptr) {
979 pipeline_groups.push_back(group);
980 }
981 }
982
983 optix_assert(optixPipelineCreate(context,
984 &pipeline_options,
985 &link_options,
986 pipeline_groups.data(),
987 pipeline_groups.size(),
988 nullptr,
989 nullptr,
990 &pipelines[PIP_SHADE]));
991
992 /* Get program stack sizes. */
993 OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
994 vector<OptixStackSizes> osl_stack_size(osl_groups.size());
995
996 for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
997 optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i], nullptr));
998 }
999 for (size_t i = 0; i < osl_groups.size(); ++i) {
1000 if (osl_groups[i] != nullptr) {
1001 optix_assert(optixProgramGroupGetStackSize(
1002 osl_groups[i], &osl_stack_size[i], pipelines[PIP_SHADE]));
1003 }
1004 }
1005
1006 const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
1007 stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG);
1008 unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
1009 stack_size[PG_CALL_SVM_BEVEL].dssDC);
1010 for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
1011 dss = std::max(dss, osl_stack_size[i].dssDC);
1012 }
1013
1014 optix_assert(optixPipelineSetStackSize(
1015 pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
1016 }
1017
1018 /* Copy colorsystem data from OSL to the device. */
1019 {
1020 /* The interface here is somewhat complex, since the colorsystem contains strings whose
1021 * representation is different between CPU and GPU.
1022 * OSL's ColorSystem type therefore consists of two parts: First the "fixed data" (e.g. floats)
1023 * that is identical between both, and then the strings.
1024 * To perform this conversion, in addition to the pointer to the CPU data, we query two sizes:
1025 * The total size of the CPU data and the number of strings. */
1026 uint8_t *cpu_data = nullptr;
1027 size_t cpu_data_sizes[2] = {0, 0};
1028 osl_globals.ss->getattribute("colorsystem", OSL::TypeDesc::PTR, &cpu_data);
1029 osl_globals.ss->getattribute(
1030 "colorsystem:sizes", TypeDesc(TypeDesc::LONGLONG, 2), (void *)cpu_data_sizes);
1031
1032 size_t cpu_full_size = cpu_data_sizes[0];
1033 size_t num_strings = cpu_data_sizes[1];
1034 size_t fixed_data_size = cpu_full_size - sizeof(ustringhash) * num_strings;
1035
1036 /* Allocate a buffer to fit the fixed data, as well as all the strings in GPU form. */
1037 uint8_t *gpu_data = osl_colorsystem.alloc(fixed_data_size + sizeof(size_t) * num_strings);
1038
1039 /* Copy the fixed data as-is. */
1040 memcpy(gpu_data, cpu_data, fixed_data_size);
1041
1042 /* Convert each string to GPU format. */
1043 ustringhash *cpu_strings = reinterpret_cast<ustringhash *>(cpu_data + fixed_data_size);
1044 size_t *gpu_strings = reinterpret_cast<size_t *>(gpu_data + fixed_data_size);
1045 for (int i = 0; i < num_strings; i++) {
1046 gpu_strings[i] = cpu_strings[i].hash();
1047 }
1048
1049 /* Copy GPU form of the data to the device. */
1050 osl_colorsystem.copy_to_device();
1051
1052 update_launch_params(offsetof(KernelParamsOptiX, osl_colorsystem),
1053 &osl_colorsystem.device_pointer,
1054 sizeof(device_ptr));
1055 }
1056
1057 return !have_error();
1058# else
1059 return false;
1060# endif
1061}
1062
1063OSLGlobals *OptiXDevice::get_cpu_osl_memory()
1064{
1065# ifdef WITH_OSL
1066 return &osl_globals;
1067# else
1068 return nullptr;
1069# endif
1070}
1071
1072bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
1073 OptixBuildOperation operation,
1074 const OptixBuildInput &build_input,
1075 const uint16_t num_motion_steps)
1076{
1077 /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
1078 * from running out of memory (since both original and compacted acceleration structure memory
1079 * may be allocated at the same time for the duration of this function). The builds would
1080 * otherwise happen on the same CUDA stream anyway. */
1081 static thread_mutex mutex;
1083
1084 const CUDAContextScope scope(this);
1085
1086 bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
1087
1088 /* Compute memory usage. */
1089 OptixAccelBufferSizes sizes = {};
1090 OptixAccelBuildOptions options = {};
1091 options.operation = operation;
1092 if (build_input.type == OPTIX_BUILD_INPUT_TYPE_CURVES) {
1093 /* The build flags have to match the ones used to query the built-in curve intersection
1094 * program (see optixBuiltinISModuleGet above) */
1095 options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION |
1096 OPTIX_BUILD_FLAG_ALLOW_UPDATE;
1097 use_fast_trace_bvh = true;
1098 }
1099 else if (use_fast_trace_bvh) {
1100 VLOG_INFO << "Using fast to trace OptiX BVH";
1101 options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
1102 }
1103 else {
1104 VLOG_INFO << "Using fast to update OptiX BVH";
1105 options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
1106 }
1107
1108 options.motionOptions.numKeys = num_motion_steps;
1109 options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
1110 options.motionOptions.timeBegin = 0.0f;
1111 options.motionOptions.timeEnd = 1.0f;
1112
1113 optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
1114
1115 /* Allocate required output buffers. */
1116 device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
1117 temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
1118 if (!temp_mem.device_pointer) {
1119 /* Make sure temporary memory allocation succeeded. */
1120 return false;
1121 }
1122
1123 /* Acceleration structure memory has to be allocated on the device (not allowed on the host). */
1124 device_only_memory<char> &out_data = *bvh->as_data;
1125 if (operation == OPTIX_BUILD_OPERATION_BUILD) {
1126 assert(out_data.device == this);
1127 out_data.alloc_to_device(sizes.outputSizeInBytes);
1128 if (!out_data.device_pointer) {
1129 return false;
1130 }
1131 }
1132 else {
1133 assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
1134 }
1135
1136 /* Finally build the acceleration structure. */
1137 OptixAccelEmitDesc compacted_size_prop = {};
1138 compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
1139 /* A tiny space was allocated for this property at the end of the temporary buffer above.
1140 * Make sure this pointer is 8-byte aligned. */
1141 compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
1142
1143 OptixTraversableHandle out_handle = 0;
1144 optix_assert(optixAccelBuild(context,
1145 nullptr,
1146 &options,
1147 &build_input,
1148 1,
1149 temp_mem.device_pointer,
1150 sizes.tempSizeInBytes,
1151 out_data.device_pointer,
1152 sizes.outputSizeInBytes,
1153 &out_handle,
1154 use_fast_trace_bvh ? &compacted_size_prop : nullptr,
1155 use_fast_trace_bvh ? 1 : 0));
1156 bvh->traversable_handle = static_cast<uint64_t>(out_handle);
1157
1158 /* Wait for all operations to finish. */
1159 cuda_assert(cuStreamSynchronize(nullptr));
1160
1161 /* Compact acceleration structure to save memory (do not do this in viewport for faster builds).
1162 */
1163 if (use_fast_trace_bvh) {
1164 uint64_t compacted_size = sizes.outputSizeInBytes;
1165 cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
1166
1167 /* Temporary memory is no longer needed, so free it now to make space. */
1168 temp_mem.free();
1169
1170 /* There is no point compacting if the size does not change. */
1171 if (compacted_size < sizes.outputSizeInBytes) {
1172 device_only_memory<char> compacted_data(this, "optix compacted as", false);
1173 compacted_data.alloc_to_device(compacted_size);
1174 if (!compacted_data.device_pointer) {
1175 /* Do not compact if memory allocation for compacted acceleration structure fails.
1176 * Can just use the uncompacted one then, so succeed here regardless. */
1177 return !have_error();
1178 }
1179
1180 optix_assert(optixAccelCompact(context,
1181 nullptr,
1182 out_handle,
1183 compacted_data.device_pointer,
1184 compacted_size,
1185 &out_handle));
1186 bvh->traversable_handle = static_cast<uint64_t>(out_handle);
1187
1188 /* Wait for compaction to finish. */
1189 cuda_assert(cuStreamSynchronize(nullptr));
1190
1191 std::swap(out_data.device_size, compacted_data.device_size);
1192 std::swap(out_data.device_pointer, compacted_data.device_pointer);
1193 /* Original acceleration structure memory is freed when 'compacted_data' goes out of scope.
1194 */
1195 }
1196 }
1197
1198 return !have_error();
1199}
1200
1201void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
1202{
1203 const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
1204
1205 free_bvh_memory_delayed();
1206
1207 BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
1208
1209 progress.set_substatus("Building OptiX acceleration structure");
1210
1211 if (!bvh->params.top_level) {
1212 assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
1213
1214 /* Refit is only possible in viewport for now (because AS is built with
1215 * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */
1216 OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
1217 if (refit && !use_fast_trace_bvh) {
1218 assert(bvh_optix->traversable_handle != 0);
1219 operation = OPTIX_BUILD_OPERATION_UPDATE;
1220 }
1221 else {
1222 bvh_optix->as_data->free();
1223 bvh_optix->traversable_handle = 0;
1224 }
1225
1226 /* Build bottom level acceleration structures (BLAS). */
1227 Geometry *const geom = bvh->geometry[0];
1228 if (geom->is_hair()) {
1229 /* Build BLAS for curve primitives. */
1230 Hair *const hair = static_cast<Hair *const>(geom);
1231 if (hair->num_segments() == 0) {
1232 return;
1233 }
1234
1235 const size_t num_segments = hair->num_segments();
1236
1237 size_t num_motion_steps = 1;
1239 if (pipeline_options.usesMotionBlur && hair->get_use_motion_blur() && motion_keys) {
1240 num_motion_steps = hair->get_motion_steps();
1241 }
1242
1243 device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
1244 device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
1245 device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
1246 /* Four control points for each curve segment. */
1247 size_t num_vertices = num_segments * 4;
1248 if (hair->curve_shape == CURVE_THICK) {
1249 num_vertices = hair->num_keys() + 2 * hair->num_curves();
1250 index_data.alloc(num_segments);
1251 vertex_data.alloc(num_vertices * num_motion_steps);
1252 }
1253 else {
1254 aabb_data.alloc(num_segments * num_motion_steps);
1255 }
1256
1257 /* Get AABBs for each motion step. */
1258 for (size_t step = 0; step < num_motion_steps; ++step) {
1259 /* The center step for motion vertices is not stored in the attribute. */
1260 const float3 *keys = hair->get_curve_keys().data();
1261 size_t center_step = (num_motion_steps - 1) / 2;
1262 if (step != center_step) {
1263 size_t attr_offset = (step > center_step) ? step - 1 : step;
1264 /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). */
1265 keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
1266 }
1267
1268 if (hair->curve_shape == CURVE_THICK) {
1269 for (size_t curve_index = 0, segment_index = 0, vertex_index = step * num_vertices;
1270 curve_index < hair->num_curves();
1271 ++curve_index)
1272 {
1273 const Hair::Curve curve = hair->get_curve(curve_index);
1274 const array<float> &curve_radius = hair->get_curve_radius();
1275
1276 const int first_key_index = curve.first_key;
1277 {
1278 vertex_data[vertex_index++] = make_float4(keys[first_key_index].x,
1279 keys[first_key_index].y,
1280 keys[first_key_index].z,
1281 curve_radius[first_key_index]);
1282 }
1283
1284 for (int k = 0; k < curve.num_segments(); ++k) {
1285 if (step == 0) {
1286 index_data[segment_index++] = vertex_index - 1;
1287 }
1288 vertex_data[vertex_index++] = make_float4(keys[first_key_index + k].x,
1289 keys[first_key_index + k].y,
1290 keys[first_key_index + k].z,
1291 curve_radius[first_key_index + k]);
1292 }
1293
1294 const int last_key_index = first_key_index + curve.num_keys - 1;
1295 {
1296 vertex_data[vertex_index++] = make_float4(keys[last_key_index].x,
1297 keys[last_key_index].y,
1298 keys[last_key_index].z,
1299 curve_radius[last_key_index]);
1300 vertex_data[vertex_index++] = make_float4(keys[last_key_index].x,
1301 keys[last_key_index].y,
1302 keys[last_key_index].z,
1303 curve_radius[last_key_index]);
1304 }
1305 }
1306 }
1307 else {
1308 for (size_t curve_index = 0, i = 0; curve_index < hair->num_curves(); ++curve_index) {
1309 const Hair::Curve curve = hair->get_curve(curve_index);
1310
1311 for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
1313 curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
1314
1315 const size_t index = step * num_segments + i;
1316 aabb_data[index].minX = bounds.min.x;
1317 aabb_data[index].minY = bounds.min.y;
1318 aabb_data[index].minZ = bounds.min.z;
1319 aabb_data[index].maxX = bounds.max.x;
1320 aabb_data[index].maxY = bounds.max.y;
1321 aabb_data[index].maxZ = bounds.max.z;
1322 }
1323 }
1324 }
1325 }
1326
1327 /* Upload AABB data to GPU. */
1328 aabb_data.copy_to_device();
1329 index_data.copy_to_device();
1330 vertex_data.copy_to_device();
1331
1332 vector<device_ptr> aabb_ptrs;
1333 aabb_ptrs.reserve(num_motion_steps);
1334 vector<device_ptr> width_ptrs;
1335 vector<device_ptr> vertex_ptrs;
1336 width_ptrs.reserve(num_motion_steps);
1337 vertex_ptrs.reserve(num_motion_steps);
1338 for (size_t step = 0; step < num_motion_steps; ++step) {
1339 aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
1340 const device_ptr base_ptr = vertex_data.device_pointer +
1341 step * num_vertices * sizeof(float4);
1342 width_ptrs.push_back(base_ptr + 3 * sizeof(float)); /* Offset by vertex size. */
1343 vertex_ptrs.push_back(base_ptr);
1344 }
1345
1346 /* Force a single any-hit call, so shadow record-all behavior works correctly. */
1347 unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1348 OptixBuildInput build_input = {};
1349 if (hair->curve_shape == CURVE_THICK) {
1350 build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
1351 build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM;
1352 build_input.curveArray.numPrimitives = num_segments;
1353 build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
1354 build_input.curveArray.numVertices = num_vertices;
1355 build_input.curveArray.vertexStrideInBytes = sizeof(float4);
1356 build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
1357 build_input.curveArray.widthStrideInBytes = sizeof(float4);
1358 build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
1359 build_input.curveArray.indexStrideInBytes = sizeof(int);
1360 build_input.curveArray.flag = build_flags;
1361 build_input.curveArray.primitiveIndexOffset = hair->curve_segment_offset;
1362 }
1363 else {
1364 /* Disable visibility test any-hit program, since it is already checked during
1365 * intersection. Those trace calls that require any-hit can force it with a ray flag. */
1366 build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
1367
1368 build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
1369 build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1370 build_input.customPrimitiveArray.numPrimitives = num_segments;
1371 build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
1372 build_input.customPrimitiveArray.flags = &build_flags;
1373 build_input.customPrimitiveArray.numSbtRecords = 1;
1374 build_input.customPrimitiveArray.primitiveIndexOffset = hair->curve_segment_offset;
1375 }
1376
1377 if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1378 progress.set_error("Failed to build OptiX acceleration structure");
1379 }
1380 }
1381 else if (geom->is_mesh() || geom->is_volume()) {
1382 /* Build BLAS for triangle primitives. */
1383 Mesh *const mesh = static_cast<Mesh *const>(geom);
1384 if (mesh->num_triangles() == 0) {
1385 return;
1386 }
1387
1388 const size_t num_verts = mesh->get_verts().size();
1389
1390 size_t num_motion_steps = 1;
1392 if (pipeline_options.usesMotionBlur && mesh->get_use_motion_blur() && motion_keys) {
1393 num_motion_steps = mesh->get_motion_steps();
1394 }
1395
1396 device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
1397 index_data.alloc(mesh->get_triangles().size());
1398 memcpy(index_data.data(),
1399 mesh->get_triangles().data(),
1400 mesh->get_triangles().size() * sizeof(int));
1401 device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
1402 vertex_data.alloc(num_verts * num_motion_steps);
1403
1404 for (size_t step = 0; step < num_motion_steps; ++step) {
1405 const float3 *verts = mesh->get_verts().data();
1406
1407 size_t center_step = (num_motion_steps - 1) / 2;
1408 /* The center step for motion vertices is not stored in the attribute. */
1409 if (step != center_step) {
1410 verts = motion_keys->data_float3() + (step > center_step ? step - 1 : step) * num_verts;
1411 }
1412
1413 memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
1414 }
1415
1416 /* Upload triangle data to GPU. */
1417 index_data.copy_to_device();
1418 vertex_data.copy_to_device();
1419
1420 vector<device_ptr> vertex_ptrs;
1421 vertex_ptrs.reserve(num_motion_steps);
1422 for (size_t step = 0; step < num_motion_steps; ++step) {
1423 vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
1424 }
1425
1426 /* Force a single any-hit call, so shadow record-all behavior works correctly. */
1427 unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1428 OptixBuildInput build_input = {};
1429 build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
1430 build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
1431 build_input.triangleArray.numVertices = num_verts;
1432 build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
1433 build_input.triangleArray.vertexStrideInBytes = sizeof(float4);
1434 build_input.triangleArray.indexBuffer = index_data.device_pointer;
1435 build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
1436 build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
1437 build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
1438 build_input.triangleArray.flags = &build_flags;
1439 /* The SBT does not store per primitive data since Cycles already allocates separate
1440 * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
1441 * one and rely on that having the same meaning in this case. */
1442 build_input.triangleArray.numSbtRecords = 1;
1443 build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset;
1444
1445 if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1446 progress.set_error("Failed to build OptiX acceleration structure");
1447 }
1448 }
1449 else if (geom->is_pointcloud()) {
1450 /* Build BLAS for points primitives. */
1451 PointCloud *const pointcloud = static_cast<PointCloud *const>(geom);
1452 const size_t num_points = pointcloud->num_points();
1453 if (num_points == 0) {
1454 return;
1455 }
1456
1457 size_t num_motion_steps = 1;
1458 Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
1459 if (pipeline_options.usesMotionBlur && pointcloud->get_use_motion_blur() && motion_points) {
1460 num_motion_steps = pointcloud->get_motion_steps();
1461 }
1462
1463 device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
1464 aabb_data.alloc(num_points * num_motion_steps);
1465
1466 /* Get AABBs for each motion step. */
1467 for (size_t step = 0; step < num_motion_steps; ++step) {
1468 /* The center step for motion vertices is not stored in the attribute. */
1469 size_t center_step = (num_motion_steps - 1) / 2;
1470
1471 if (step == center_step) {
1472 const float3 *points = pointcloud->get_points().data();
1473 const float *radius = pointcloud->get_radius().data();
1474
1475 for (size_t i = 0; i < num_points; ++i) {
1476 const PointCloud::Point point = pointcloud->get_point(i);
1478 point.bounds_grow(points, radius, bounds);
1479
1480 const size_t index = step * num_points + i;
1481 aabb_data[index].minX = bounds.min.x;
1482 aabb_data[index].minY = bounds.min.y;
1483 aabb_data[index].minZ = bounds.min.z;
1484 aabb_data[index].maxX = bounds.max.x;
1485 aabb_data[index].maxY = bounds.max.y;
1486 aabb_data[index].maxZ = bounds.max.z;
1487 }
1488 }
1489 else {
1490 size_t attr_offset = (step > center_step) ? step - 1 : step;
1491 const float4 *points = motion_points->data_float4() + attr_offset * num_points;
1492
1493 for (size_t i = 0; i < num_points; ++i) {
1494 const PointCloud::Point point = pointcloud->get_point(i);
1496 point.bounds_grow(points[i], bounds);
1497
1498 const size_t index = step * num_points + i;
1499 aabb_data[index].minX = bounds.min.x;
1500 aabb_data[index].minY = bounds.min.y;
1501 aabb_data[index].minZ = bounds.min.z;
1502 aabb_data[index].maxX = bounds.max.x;
1503 aabb_data[index].maxY = bounds.max.y;
1504 aabb_data[index].maxZ = bounds.max.z;
1505 }
1506 }
1507 }
1508
1509 /* Upload AABB data to GPU. */
1510 aabb_data.copy_to_device();
1511
1512 vector<device_ptr> aabb_ptrs;
1513 aabb_ptrs.reserve(num_motion_steps);
1514 for (size_t step = 0; step < num_motion_steps; ++step) {
1515 aabb_ptrs.push_back(aabb_data.device_pointer + step * num_points * sizeof(OptixAabb));
1516 }
1517
1518 /* Disable visibility test any-hit program, since it is already checked during
1519 * intersection. Those trace calls that require anyhit can force it with a ray flag.
1520 * For those, force a single any-hit call, so shadow record-all behavior works correctly. */
1521 unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT |
1522 OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1523 OptixBuildInput build_input = {};
1524 build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
1525 build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1526 build_input.customPrimitiveArray.numPrimitives = num_points;
1527 build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
1528 build_input.customPrimitiveArray.flags = &build_flags;
1529 build_input.customPrimitiveArray.numSbtRecords = 1;
1530 build_input.customPrimitiveArray.primitiveIndexOffset = pointcloud->prim_offset;
1531
1532 if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1533 progress.set_error("Failed to build OptiX acceleration structure");
1534 }
1535 }
1536 }
1537 else {
1538 unsigned int num_instances = 0;
1539 unsigned int max_num_instances = 0xFFFFFFFF;
1540
1541 bvh_optix->as_data->free();
1542 bvh_optix->traversable_handle = 0;
1543 bvh_optix->motion_transform_data->free();
1544
1545 optixDeviceContextGetProperty(context,
1546 OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
1547 &max_num_instances,
1548 sizeof(max_num_instances));
1549 /* Do not count first bit, which is used to distinguish instanced and non-instanced objects. */
1550 max_num_instances >>= 1;
1551 if (bvh->objects.size() > max_num_instances) {
1552 progress.set_error(
1553 "Failed to build OptiX acceleration structure because there are too many instances");
1554 return;
1555 }
1556
1557 /* Fill instance descriptions. */
1558 device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
1559 instances.alloc(bvh->objects.size());
1560
1561 /* Calculate total motion transform size and allocate memory for them. */
1562 size_t motion_transform_offset = 0;
1563 if (pipeline_options.usesMotionBlur) {
1564 size_t total_motion_transform_size = 0;
1565 for (Object *const ob : bvh->objects) {
1566 if (ob->is_traceable() && ob->use_motion()) {
1567 total_motion_transform_size = align_up(total_motion_transform_size,
1568 OPTIX_TRANSFORM_BYTE_ALIGNMENT);
1569 const size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2;
1570 total_motion_transform_size = total_motion_transform_size +
1571 sizeof(OptixSRTMotionTransform) +
1572 motion_keys * sizeof(OptixSRTData);
1573 }
1574 }
1575
1576 assert(bvh_optix->motion_transform_data->device == this);
1577 bvh_optix->motion_transform_data->alloc_to_device(total_motion_transform_size);
1578 }
1579
1580 for (Object *ob : bvh->objects) {
1581 /* Skip non-traceable objects. */
1582 if (!ob->is_traceable()) {
1583 continue;
1584 }
1585
1586 BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh.get());
1587 OptixTraversableHandle handle = blas->traversable_handle;
1588 if (handle == 0) {
1589 continue;
1590 }
1591
1592 OptixInstance &instance = instances[num_instances++];
1593 memset(&instance, 0, sizeof(instance));
1594
1595 /* Clear transform to identity matrix. */
1596 instance.transform[0] = 1.0f;
1597 instance.transform[5] = 1.0f;
1598 instance.transform[10] = 1.0f;
1599
1600 /* Set user instance ID to object index. */
1601 instance.instanceId = ob->get_device_index();
1602
1603 /* Add some of the object visibility bits to the mask.
1604 * __prim_visibility contains the combined visibility bits of all instances, so is not
1605 * reliable if they differ between instances. But the OptiX visibility mask can only contain
1606 * 8 bits, so have to trade-off here and select just a few important ones.
1607 */
1608 instance.visibilityMask = ob->visibility_for_tracing() & 0xFF;
1609
1610 /* Have to have at least one bit in the mask, or else instance would always be culled. */
1611 if (0 == instance.visibilityMask) {
1612 instance.visibilityMask = 0xFF;
1613 }
1614
1615 if (ob->get_geometry()->is_hair() &&
1616 static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK)
1617 {
1618 if (pipeline_options.usesMotionBlur && ob->get_geometry()->has_motion_blur()) {
1619 /* Select between motion blur and non-motion blur built-in intersection module. */
1620 instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
1621 }
1622 }
1623 else if (ob->get_geometry()->is_pointcloud()) {
1624 /* Use the hit group that has an intersection program for point clouds. */
1625 instance.sbtOffset = PG_HITD_POINTCLOUD - PG_HITD;
1626
1627 /* Also skip point clouds in local trace calls. */
1628 instance.visibilityMask |= 4;
1629 }
1630 {
1631 /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves,
1632 * since it needs to filter out end-caps there).
1633 *
1634 * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit
1635 * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT.
1636 */
1637 instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT;
1638 }
1639
1640 /* Insert motion traversable if object has motion. */
1641 if (pipeline_options.usesMotionBlur && ob->use_motion()) {
1642 size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2;
1643 size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
1644 motion_keys * sizeof(OptixSRTData);
1645
1646 const CUDAContextScope scope(this);
1647
1648 motion_transform_offset = align_up(motion_transform_offset,
1649 OPTIX_TRANSFORM_BYTE_ALIGNMENT);
1650 CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data->device_pointer +
1651 motion_transform_offset;
1652 motion_transform_offset += motion_transform_size;
1653
1654 /* Allocate host side memory for motion transform and fill it with transform data. */
1655 array<uint8_t> motion_transform_storage(motion_transform_size);
1656 OptixSRTMotionTransform *motion_transform = reinterpret_cast<OptixSRTMotionTransform *>(
1657 motion_transform_storage.data());
1658 motion_transform->child = handle;
1659 motion_transform->motionOptions.numKeys = ob->get_motion().size();
1660 motion_transform->motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
1661 motion_transform->motionOptions.timeBegin = 0.0f;
1662 motion_transform->motionOptions.timeEnd = 1.0f;
1663
1664 OptixSRTData *const srt_data = motion_transform->srtData;
1665 array<DecomposedTransform> decomp(ob->get_motion().size());
1667 decomp.data(), ob->get_motion().data(), ob->get_motion().size());
1668
1669 for (size_t i = 0; i < ob->get_motion().size(); ++i) {
1670 /* Scale. */
1671 srt_data[i].sx = decomp[i].y.w; /* scale.x.x */
1672 srt_data[i].sy = decomp[i].z.w; /* scale.y.y */
1673 srt_data[i].sz = decomp[i].w.w; /* scale.z.z */
1674
1675 /* Shear. */
1676 srt_data[i].a = decomp[i].z.x; /* scale.x.y */
1677 srt_data[i].b = decomp[i].z.y; /* scale.x.z */
1678 srt_data[i].c = decomp[i].w.x; /* scale.y.z */
1679 assert(decomp[i].z.z == 0.0f); /* scale.y.x */
1680 assert(decomp[i].w.y == 0.0f); /* scale.z.x */
1681 assert(decomp[i].w.z == 0.0f); /* scale.z.y */
1682
1683 /* Pivot point. */
1684 srt_data[i].pvx = 0.0f;
1685 srt_data[i].pvy = 0.0f;
1686 srt_data[i].pvz = 0.0f;
1687
1688 /* Rotation. */
1689 srt_data[i].qx = decomp[i].x.x;
1690 srt_data[i].qy = decomp[i].x.y;
1691 srt_data[i].qz = decomp[i].x.z;
1692 srt_data[i].qw = decomp[i].x.w;
1693
1694 /* Translation. */
1695 srt_data[i].tx = decomp[i].y.x;
1696 srt_data[i].ty = decomp[i].y.y;
1697 srt_data[i].tz = decomp[i].y.z;
1698 }
1699
1700 /* Upload motion transform to GPU. */
1701 cuMemcpyHtoD(motion_transform_gpu, motion_transform, motion_transform_size);
1702 motion_transform = nullptr;
1703 motion_transform_storage.clear();
1704
1705 /* Get traversable handle to motion transform. */
1706 optixConvertPointerToTraversableHandle(context,
1707 motion_transform_gpu,
1708 OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
1709 &instance.traversableHandle);
1710 }
1711 else {
1712 instance.traversableHandle = handle;
1713
1714 if (ob->get_geometry()->is_instanced()) {
1715 /* Set transform matrix. */
1716 memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
1717 }
1718 }
1719 }
1720
1721 /* Upload instance descriptions. */
1722 instances.resize(num_instances);
1723 instances.copy_to_device();
1724
1725 /* Build top-level acceleration structure (TLAS) */
1726 OptixBuildInput build_input = {};
1727 build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
1728 build_input.instanceArray.instances = instances.device_pointer;
1729 build_input.instanceArray.numInstances = num_instances;
1730
1731 if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
1732 progress.set_error("Failed to build OptiX acceleration structure");
1733 }
1734 tlas_handle = bvh_optix->traversable_handle;
1735 }
1736}
1737
1738void OptiXDevice::release_bvh(BVH *bvh)
1739{
1740 thread_scoped_lock lock(delayed_free_bvh_mutex);
1741 /* Do delayed free of BVH memory, since geometry holding BVH might be deleted
1742 * while GPU is still rendering. */
1743 BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
1744
1745 delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
1746 delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
1747 bvh_optix->traversable_handle = 0;
1748}
1749
1750void OptiXDevice::free_bvh_memory_delayed()
1751{
1752 thread_scoped_lock lock(delayed_free_bvh_mutex);
1753 delayed_free_bvh_memory.free_memory();
1754}
1755
void OptiXDevice::const_copy_to(const char *name, void *host, const size_t size)
{
  /* Copy a named global-memory blob to the device. OptiX kernels read these
   * globals through the launch-parameters buffer rather than through CUDA
   * constant memory, so after the CUDA-side copy the matching slot inside the
   * launch parameters is updated as well. */

  /* Set constant memory for CUDA module. */
  CUDADevice::const_copy_to(name, host, size);

  if (strcmp(name, "data") == 0) {
    assert(size <= sizeof(KernelData));

    /* Update traversable handle (since it is different for each device on multi devices). */
    KernelData *const data = (KernelData *)host;
    *(OptixTraversableHandle *)&data->device_bvh = tlas_handle;

    update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
    return;
  }

  /* Update data storage pointers in launch parameters.
   * The macro below is expanded once per kernel data array: including
   * "kernel/data_arrays.h" instantiates a strcmp+update branch for every
   * KERNEL_DATA_ARRAY entry declared there. */
# define KERNEL_DATA_ARRAY(data_type, data_name) \
  if (strcmp(name, #data_name) == 0) { \
    update_launch_params(offsetof(KernelParamsOptiX, data_name), host, size); \
    return; \
  }
  KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
# include "kernel/data_arrays.h"
# undef KERNEL_DATA_ARRAY
}
1782
1783void OptiXDevice::update_launch_params(const size_t offset, void *data, const size_t data_size)
1784{
1785 const CUDAContextScope scope(this);
1786
1787 cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size));
1788}
1789
1791
1792#endif /* WITH_OPTIX */
unsigned int uint
float progress
Definition WM_types.hh:1019
volatile int lock
BMesh const char void * data
unsigned long long int uint64_t
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition btDbvt.cpp:52
static btDbvtVolume bounds(btDbvtNode **leaves, int count)
Definition btDbvt.cpp:299
void refit(btStridingMeshInterface *triangles, const btVector3 &aabbMin, const btVector3 &aabbMax)
SIMD_FORCE_INLINE const btScalar & z() const
Return the z value.
Definition btQuadWord.h:117
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
Definition btQuadWord.h:119
Attribute * find(ustring name) const
bool top_level
Definition params.h:80
int bvh_type
Definition params.h:105
Definition bvh/bvh.h:67
vector< Geometry * > geometry
Definition bvh/bvh.h:70
BVHParams params
Definition bvh/bvh.h:69
vector< Object * > objects
Definition bvh/bvh.h:71
bool is_volume() const
bool is_pointcloud() const
bool is_hair() const
size_t prim_offset
AttributeSet attributes
bool is_mesh() const
Definition hair.h:13
Curve get_curve(const size_t i) const
Definition hair.h:111
size_t curve_segment_offset
Definition hair.h:90
size_t num_curves() const
Definition hair.h:126
size_t num_segments() const
Definition hair.h:131
CurveShapeType curve_shape
Definition hair.h:91
size_t num_keys() const
Definition hair.h:121
void alloc_to_device(const size_t num, bool shrink_to_fit=true)
@ MEM_READ_ONLY
CCL_NAMESPACE_BEGIN struct Options options
#define KERNEL_DATA_ARRAY(type, name)
Definition data_arrays.h:8
DebugFlags & DebugFlags()
Definition debug.h:145
#define KERNEL_FEATURE_OBJECT_MOTION
#define KERNEL_FEATURE_OSL_SHADING
#define KERNEL_FEATURE_SUBSURFACE
#define KERNEL_FEATURE_HAIR_THICK
#define KERNEL_FEATURE_PATH_TRACING
#define KERNEL_FEATURE_OSL_CAMERA
#define KERNEL_FEATURE_HAIR
#define KERNEL_FEATURE_NODE_RAYTRACE
#define KERNEL_FEATURE_BAKING
#define KERNEL_FEATURE_MNEE
#define KERNEL_FEATURE_POINTCLOUD
#define CCL_NAMESPACE_END
ccl_device_forceinline float4 make_float4(const float x, const float y, const float z, const float w)
#define offsetof(t, d)
static float verts[][3]
ThreadMutex mutex
#define this
VecBase< float, 4 > float4
#define assert(assertion)
VecBase< float, D > step(VecOp< float, D >, VecOp< float, D >) RET
@ ATTR_STD_MOTION_VERTEX_POSITION
@ CURVE_THICK
@ BVH_LAYOUT_OPTIX
#define VLOG_INFO
Definition log.h:71
#define VLOG_IS_ON(severity)
Definition log.h:35
Segment< FEdge *, Vec3r > segment
int BVHLayoutMask
Definition params.h:50
@ BVH_TYPE_STATIC
Definition params.h:40
size_t path_file_size(const string &path)
Definition path.cpp:554
bool path_is_directory(const string &path)
Definition path.cpp:582
string path_get(const string &sub)
Definition path.cpp:337
string path_join(const string &dir, const string &file)
Definition path.cpp:415
bool path_read_compressed_text(const string &path, string &text)
Definition path.cpp:754
static struct PyModuleDef module
Definition python.cpp:796
long long TypeDesc
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
Definition string.cpp:23
float3 * data_float3()
float4 * data_float4()
void bounds_grow(const int k, const float3 *curve_keys, const float *curve_radius, BoundBox &bounds) const
Definition hair.cpp:44
int first_key
Definition hair.h:19
int num_segments() const
Definition hair.h:22
int num_keys
Definition hair.h:20
size_t num_triangles() const
Definition scene/mesh.h:77
bool use_motion() const
int get_device_index() const
bool is_traceable() const
uint visibility_for_tracing() const
void bounds_grow(const float3 *points, const float *radius, BoundBox &bounds) const
Point get_point(const int i) const
size_t num_points() const
void push(TaskRunFunction &&task)
Definition task.cpp:21
void wait_work(Summary *stats=nullptr)
Definition task.cpp:27
i
Definition text_draw.cc:230
max
Definition text_draw.cc:251
std::mutex thread_mutex
Definition thread.h:27
std::unique_lock< std::mutex > thread_scoped_lock
Definition thread.h:28
void transform_motion_decompose(DecomposedTransform *decomp, const Transform *motion, const size_t size)
uint64_t device_ptr
Definition types_base.h:44
ccl_device_inline size_t align_up(const size_t offset, const size_t alignment)
Definition types_base.h:47