Blender V4.3
optix/device_impl.cpp
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2019 NVIDIA Corporation
2 * SPDX-FileCopyrightText: 2019-2022 Blender Foundation
3 *
4 * SPDX-License-Identifier: Apache-2.0 */
5
6#ifdef WITH_OPTIX
7
9# include "device/optix/queue.h"
10
11# include "bvh/bvh.h"
12# include "bvh/optix.h"
13
14# include "scene/hair.h"
15# include "scene/mesh.h"
16# include "scene/object.h"
17# include "scene/pass.h"
18# include "scene/pointcloud.h"
19# include "scene/scene.h"
20
21# include "util/debug.h"
22# include "util/log.h"
23# include "util/md5.h"
24# include "util/path.h"
25# include "util/progress.h"
26# include "util/task.h"
27# include "util/time.h"
28
29# define __KERNEL_OPTIX__
31
33
34# if OPTIX_ABI_VERSION >= 55
35static void execute_optix_task(TaskPool &pool, OptixTask task, OptixResult &failure_reason)
36{
37 OptixTask additional_tasks[16];
38 unsigned int num_additional_tasks = 0;
39
40 const OptixResult result = optixTaskExecute(task, additional_tasks, 16, &num_additional_tasks);
41 if (result == OPTIX_SUCCESS) {
42 for (unsigned int i = 0; i < num_additional_tasks; ++i) {
44 &execute_optix_task, std::ref(pool), additional_tasks[i], std::ref(failure_reason)));
45 }
46 }
47 else {
48 failure_reason = result;
49 }
50}
51# endif
52
/* Construct the OptiX device on top of the CUDA device: create the OptiX
 * device context and allocate the launch-parameter buffer. Bails out early
 * (leaving the device in an errored, unusable state) if the underlying CUDA
 * context was never created. */
OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
    : CUDADevice(info, stats, profiler, headless),
      sbt_data(this, "__sbt", MEM_READ_ONLY),
      launch_params(this, "kernel_params", false)
{
  /* Make the CUDA context current. */
  if (!cuContext) {
    /* Do not initialize if CUDA context creation failed already. */
    return;
  }
  const CUDAContextScope scope(this);

  /* Create OptiX context for this device. */
  OptixDeviceContextOptions options = {};
# ifdef WITH_CYCLES_LOGGING
  options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. */
  /* Route OptiX log messages into the Cycles/glog logging machinery, mapping
   * the OptiX severity level onto the matching glog severity. Messages are
   * only emitted when verbose logging (VLOG level 1) is enabled. */
  options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) {
    switch (level) {
      case 1:
        LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
        break;
      case 2:
        LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
        break;
      case 3:
        LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
        break;
      case 4:
        LOG_IF(INFO, VLOG_IS_ON(1)) << message;
        break;
    }
  };
# endif
  if (DebugFlags().optix.use_debug) {
    VLOG_INFO << "Using OptiX debug mode.";
    options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
  }
  optix_assert(optixDeviceContextCreate(cuContext, &options, &context));
# ifdef WITH_CYCLES_LOGGING
  /* The callback set via the options struct must also be registered on the
   * created context for it to take effect. */
  optix_assert(optixDeviceContextSetLogCallback(
      context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
# endif

  /* Fix weird compiler bug that assigns wrong size. */
  launch_params.data_elements = sizeof(KernelParamsOptiX);

  /* Allocate launch parameter buffer memory on device. */
  launch_params.alloc_to_device(1);
}
102
/* Tear down all OptiX resources owned by the device. Buffers are freed first,
 * then modules, pipelines and program groups, and the device context last,
 * all while the CUDA context is current. */
OptiXDevice::~OptiXDevice()
{
  /* Make CUDA context current. */
  const CUDAContextScope scope(this);

  free_bvh_memory_delayed();

  sbt_data.free();
  texture_info.free();
  launch_params.free();

  /* Unload modules. */
  if (optix_module != NULL) {
    optixModuleDestroy(optix_module);
  }
  /* Two built-in intersection modules: with and without motion blur. */
  for (int i = 0; i < 2; ++i) {
    if (builtin_modules[i] != NULL) {
      optixModuleDestroy(builtin_modules[i]);
    }
  }
  for (int i = 0; i < NUM_PIPELINES; ++i) {
    if (pipelines[i] != NULL) {
      optixPipelineDestroy(pipelines[i]);
    }
  }
  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
    if (groups[i] != NULL) {
      optixProgramGroupDestroy(groups[i]);
    }
  }

# ifdef WITH_OSL
  /* Destroy any OSL modules and program groups created by load_osl_kernels. */
  for (const OptixModule &module : osl_modules) {
    if (module != NULL) {
      optixModuleDestroy(module);
    }
  }
  for (const OptixProgramGroup &group : osl_groups) {
    if (group != NULL) {
      optixProgramGroupDestroy(group);
    }
  }
# endif

  optixDeviceContextDestroy(context);
}
149
150unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create()
151{
152 return make_unique<OptiXDeviceQueue>(this);
153}
154
155BVHLayoutMask OptiXDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
156{
157 /* OptiX has its own internal acceleration structure format. */
158 return BVH_LAYOUT_OPTIX;
159}
160
161static string get_optix_include_dir()
162{
163 const char *env_dir = getenv("OPTIX_ROOT_DIR");
164 const char *default_dir = CYCLES_RUNTIME_OPTIX_ROOT_DIR;
165
166 if (env_dir && env_dir[0]) {
167 const string env_include_dir = path_join(env_dir, "include");
168 return env_include_dir;
169 }
170 else if (default_dir[0]) {
171 const string default_include_dir = path_join(default_dir, "include");
172 return default_include_dir;
173 }
174
175 return string();
176}
177
178string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features)
179{
180 string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features);
181
182 /* Add OptiX SDK include directory to include paths. */
183 common_cflags += string_printf(" -I\"%s\"", get_optix_include_dir().c_str());
184
185 /* Specialization for shader ray-tracing. */
186 if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
187 common_cflags += " --keep-device-functions";
188 }
189
190 return common_cflags;
191}
192
/* Load/compile the OptiX kernel module and build all program groups and
 * pipelines required by the given feature set. Returns false and records an
 * error message on the device if anything fails.
 *
 * NOTE(review): this copy of the file was scraped from an HTML listing and a
 * few source lines were dropped; the affected spots are marked with
 * NOTE(review) comments below and must be confirmed against the original. */
bool OptiXDevice::load_kernels(const uint kernel_features)
{
  if (have_error()) {
    /* Abort early if context creation failed already. */
    return false;
  }

# ifdef WITH_OSL
  const bool use_osl = (kernel_features & KERNEL_FEATURE_OSL);
# else
  const bool use_osl = false;
# endif

  /* Skip creating OptiX module if only doing denoising. */
  /* NOTE(review): the feature-mask continuation of this expression is
   * truncated in this copy of the file — confirm against the original. */
  const bool need_optix_kernels = (kernel_features &

  /* Detect existence of OptiX kernel and SDK here early. So we can error out
   * before compiling the CUDA kernels, to avoid failing right after when
   * compiling the OptiX kernel. */
  string suffix = use_osl ? "_osl" :
                  (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
                      "_shader_raytrace" :
                      "";
  string ptx_filename;
  if (need_optix_kernels) {
    ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx.zst");
    if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
      std::string optix_include_dir = get_optix_include_dir();
      if (optix_include_dir.empty()) {
        set_error(
            "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable "
            "to a directory containing the OptiX SDK.");
        return false;
      }
      else if (!path_is_directory(optix_include_dir)) {
        set_error(string_printf(
            "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install "
            "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a "
            "directory containing the OptiX SDK.",
            optix_include_dir.c_str()));
        return false;
      }
    }
  }

  /* Load CUDA modules because we need some of the utility kernels. */
  if (!CUDADevice::load_kernels(kernel_features)) {
    return false;
  }

  if (!need_optix_kernels) {
    return true;
  }

  const CUDAContextScope scope(this);

  /* Unload existing OptiX module and pipelines first. */
  if (optix_module != NULL) {
    optixModuleDestroy(optix_module);
    optix_module = NULL;
  }
  for (int i = 0; i < 2; ++i) {
    if (builtin_modules[i] != NULL) {
      optixModuleDestroy(builtin_modules[i]);
      builtin_modules[i] = NULL;
    }
  }
  for (int i = 0; i < NUM_PIPELINES; ++i) {
    if (pipelines[i] != NULL) {
      optixPipelineDestroy(pipelines[i]);
      pipelines[i] = NULL;
    }
  }
  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
    if (groups[i] != NULL) {
      optixProgramGroupDestroy(groups[i]);
      groups[i] = NULL;
    }
  }

# ifdef WITH_OSL
  /* Recreating base OptiX module invalidates all OSL modules too, since they link against it. */
  for (const OptixModule &module : osl_modules) {
    if (module != NULL) {
      optixModuleDestroy(module);
    }
  }
  osl_modules.clear();

  for (const OptixProgramGroup &group : osl_groups) {
    if (group != NULL) {
      optixProgramGroupDestroy(group);
    }
  }
  osl_groups.clear();
# endif

  OptixModuleCompileOptions module_options = {};
  module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */

  /* Debug builds disable optimization and enable full debug info. */
  if (DebugFlags().optix.use_debug) {
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
  }
  else {
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
  }

  module_options.boundValues = nullptr;
  module_options.numBoundValues = 0;
# if OPTIX_ABI_VERSION >= 55
  module_options.payloadTypes = nullptr;
  module_options.numPayloadTypes = 0;
# endif

  /* Default to no motion blur and two-level graph, since it is the fastest option. */
  pipeline_options.usesMotionBlur = false;
  pipeline_options.traversableGraphFlags =
      OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
  pipeline_options.numPayloadValues = 8;
  pipeline_options.numAttributeValues = 2; /* u, v */
  pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
  pipeline_options.pipelineLaunchParamsVariableName = "kernel_params"; /* See globals.h */

  /* Triangles are always supported; curve and point primitives are added
   * below depending on the feature set. */
  pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
  if (kernel_features & KERNEL_FEATURE_HAIR) {
    if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
# if OPTIX_ABI_VERSION >= 55
      pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM;
# else
      pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
# endif
    }
    else
      /* Thin (ribbon) curves use a custom intersection program. */
      pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
  }
  if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
    pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
  }

  /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
   * This is necessary since objects may be reported to have motion if the Vector pass is
   * active, but may still need to be rendered without motion blur if that isn't active as well. */
  if (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) {
    pipeline_options.usesMotionBlur = true;
    /* Motion blur can insert motion transforms into the traversal graph.
     * It is no longer a two-level graph then, so need to set flags to allow any configuration. */
    pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
  }

  { /* Load and compile PTX module with OptiX kernels. */
    string ptx_data;
    if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
      string cflags = compile_kernel_get_common_cflags(kernel_features);
      ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true);
    }
    if (ptx_filename.empty() || !path_read_compressed_text(ptx_filename, ptx_data)) {
      set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
      return false;
    }

# if OPTIX_ABI_VERSION >= 84
    OptixTask task = nullptr;
    OptixResult result = optixModuleCreateWithTasks(context,
                                                    &module_options,
                                                    &pipeline_options,
                                                    ptx_data.data(),
                                                    ptx_data.size(),
                                                    nullptr,
                                                    nullptr,
                                                    &optix_module,
                                                    &task);
    if (result == OPTIX_SUCCESS) {
      /* NOTE(review): the declaration of the task pool used on the next lines
       * appears to be truncated in this copy of the file. */
      execute_optix_task(pool, task, result);
      pool.wait_work();
    }
# elif OPTIX_ABI_VERSION >= 55
    OptixTask task = nullptr;
    OptixResult result = optixModuleCreateFromPTXWithTasks(context,
                                                           &module_options,
                                                           &pipeline_options,
                                                           ptx_data.data(),
                                                           ptx_data.size(),
                                                           nullptr,
                                                           nullptr,
                                                           &optix_module,
                                                           &task);
    if (result == OPTIX_SUCCESS) {
      /* NOTE(review): the declaration of the task pool used on the next lines
       * appears to be truncated in this copy of the file. */
      execute_optix_task(pool, task, result);
      pool.wait_work();
    }
# else
    const OptixResult result = optixModuleCreateFromPTX(context,
                                                        &module_options,
                                                        &pipeline_options,
                                                        ptx_data.data(),
                                                        ptx_data.size(),
                                                        nullptr,
                                                        0,
                                                        &optix_module);
# endif
    if (result != OPTIX_SUCCESS) {
      set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)",
                              ptx_filename.c_str(),
                              optixGetErrorName(result)));
      return false;
    }
  }

  /* Create program groups. */
  OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
  OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
  /* Ray-generation programs, one per intersection kernel. */
  group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
  group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module;
  group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName =
      "__raygen__kernel_optix_integrator_intersect_closest";
  group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
  group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module;
  group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName =
      "__raygen__kernel_optix_integrator_intersect_shadow";
  group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
  group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module;
  group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName =
      "__raygen__kernel_optix_integrator_intersect_subsurface";
  group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
  group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module;
  group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName =
      "__raygen__kernel_optix_integrator_intersect_volume_stack";
  group_descs[PG_RGEN_INTERSECT_DEDICATED_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
  group_descs[PG_RGEN_INTERSECT_DEDICATED_LIGHT].raygen.module = optix_module;
  group_descs[PG_RGEN_INTERSECT_DEDICATED_LIGHT].raygen.entryFunctionName =
      "__raygen__kernel_optix_integrator_intersect_dedicated_light";
  /* Miss and hit groups shared by the intersection kernels. */
  group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
  group_descs[PG_MISS].miss.module = optix_module;
  group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
  group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
  group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
  group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
  group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
  group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
  group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
  group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
  group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
  group_descs[PG_HITV].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
  group_descs[PG_HITV].hitgroup.moduleCH = optix_module;
  group_descs[PG_HITV].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
  group_descs[PG_HITV].hitgroup.moduleAH = optix_module;
  group_descs[PG_HITV].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_volume_test";

  if (kernel_features & KERNEL_FEATURE_HAIR) {
    if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
      /* Built-in thick curve intersection. */
      OptixBuiltinISOptions builtin_options = {};
# if OPTIX_ABI_VERSION >= 55
      builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM;
      builtin_options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE |
                                   OPTIX_BUILD_FLAG_ALLOW_COMPACTION |
                                   OPTIX_BUILD_FLAG_ALLOW_UPDATE;
      builtin_options.curveEndcapFlags = OPTIX_CURVE_ENDCAP_DEFAULT; /* Disable end-caps. */
# else
      builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
# endif
      builtin_options.usesMotionBlur = false;

      optix_assert(optixBuiltinISModuleGet(
          context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));

      group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
      group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
      group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
      group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;

      if (pipeline_options.usesMotionBlur) {
        /* A second built-in module is needed for motion-blurred curves. */
        builtin_options.usesMotionBlur = true;

        optix_assert(optixBuiltinISModuleGet(
            context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));

        group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
        group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
        group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
        group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
      }
    }
    else {
      /* Custom ribbon intersection. */
      group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
      group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
      group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
      group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
    }
  }

  if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
    /* Point clouds reuse the default/shadow hit groups with a custom
     * intersection program. */
    group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD];
    group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITD_POINTCLOUD].hitgroup.moduleIS = optix_module;
    group_descs[PG_HITD_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
    group_descs[PG_HITS_POINTCLOUD] = group_descs[PG_HITS];
    group_descs[PG_HITS_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITS_POINTCLOUD].hitgroup.moduleIS = optix_module;
    group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
  }

  /* Add hit group for local intersections. */
  /* NOTE(review): the `if (...) {` line guarding this block is truncated in
   * this copy of the file — confirm the feature mask against the original. */
    group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
    group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
  }

  /* Shader ray-tracing replaces some functions with direct callables. */
  if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
    group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_surface_raytrace";

    /* Kernels with OSL support are built without SVM, so can skip those direct callables there. */
    if (!use_osl) {
      group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
      group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
      group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
      group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
          "__direct_callable__svm_node_bevel";
    }
  }

  if (kernel_features & KERNEL_FEATURE_MNEE) {
    group_descs[PG_RGEN_SHADE_SURFACE_MNEE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_surface_mnee";
  }

  /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
  if (use_osl) {
    group_descs[PG_RGEN_SHADE_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_background";
    group_descs[PG_RGEN_SHADE_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_LIGHT].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_LIGHT].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_light";
    group_descs[PG_RGEN_SHADE_SURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_SURFACE].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_SURFACE].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_surface";
    group_descs[PG_RGEN_SHADE_VOLUME].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_VOLUME].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_VOLUME].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_volume";
    group_descs[PG_RGEN_SHADE_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_SHADOW].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_SHADOW].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_shadow";
    group_descs[PG_RGEN_SHADE_DEDICATED_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_SHADE_DEDICATED_LIGHT].raygen.module = optix_module;
    group_descs[PG_RGEN_SHADE_DEDICATED_LIGHT].raygen.entryFunctionName =
        "__raygen__kernel_optix_integrator_shade_dedicated_light";
    group_descs[PG_RGEN_EVAL_DISPLACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_EVAL_DISPLACE].raygen.module = optix_module;
    group_descs[PG_RGEN_EVAL_DISPLACE].raygen.entryFunctionName =
        "__raygen__kernel_optix_shader_eval_displace";
    group_descs[PG_RGEN_EVAL_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.module = optix_module;
    group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.entryFunctionName =
        "__raygen__kernel_optix_shader_eval_background";
    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.module = optix_module;
    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.entryFunctionName =
        "__raygen__kernel_optix_shader_eval_curve_shadow_transparency";
  }

  optix_assert(optixProgramGroupCreate(
      context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));

  /* Get program stack sizes. */
  OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
  /* Set up SBT, which in this case is used only to select between different programs. */
  sbt_data.alloc(NUM_PROGRAM_GROUPS);
  memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
    optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
# if OPTIX_ABI_VERSION >= 84
    optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i], nullptr));
# else
    optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
# endif
  }
  sbt_data.copy_to_device(); /* Upload SBT to device. */

  /* Calculate maximum trace continuation stack size. */
  unsigned int trace_css = stack_size[PG_HITD].cssCH;
  /* This is based on the maximum of closest-hit and any-hit/intersection programs. */
  trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
  trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
  trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
  trace_css = std::max(trace_css, stack_size[PG_HITV].cssIS + stack_size[PG_HITV].cssAH);
  trace_css = std::max(trace_css,
                       stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
  trace_css = std::max(trace_css,
                       stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
  trace_css = std::max(
      trace_css, stack_size[PG_HITD_POINTCLOUD].cssIS + stack_size[PG_HITD_POINTCLOUD].cssAH);
  trace_css = std::max(
      trace_css, stack_size[PG_HITS_POINTCLOUD].cssIS + stack_size[PG_HITS_POINTCLOUD].cssAH);

  OptixPipelineLinkOptions link_options = {};
  link_options.maxTraceDepth = 1;
# if OPTIX_ABI_VERSION < 84
  link_options.debugLevel = module_options.debugLevel;
# endif

  if (use_osl) {
    /* Re-create OSL pipeline in case kernels are reloaded after it has been created before. */
    load_osl_kernels();
  }
  else if (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) {
    /* Create shader ray-tracing and MNEE pipeline. */
    vector<OptixProgramGroup> pipeline_groups;
    pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
    if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
      pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
      pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
      pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
    }
    if (kernel_features & KERNEL_FEATURE_MNEE) {
      pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
    }
    pipeline_groups.push_back(groups[PG_MISS]);
    pipeline_groups.push_back(groups[PG_HITD]);
    pipeline_groups.push_back(groups[PG_HITS]);
    pipeline_groups.push_back(groups[PG_HITL]);
    pipeline_groups.push_back(groups[PG_HITV]);
    if (pipeline_options.usesMotionBlur) {
      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
    }
    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
      pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
      pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
    }

    optix_assert(optixPipelineCreate(context,
                                     &pipeline_options,
                                     &link_options,
                                     pipeline_groups.data(),
                                     pipeline_groups.size(),
                                     nullptr,
                                     0,
                                     &pipelines[PIP_SHADE]));

    /* Combine ray generation and trace continuation stack size. */
    const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
                                      stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG) +
                             link_options.maxTraceDepth * trace_css;
    const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
                                      stack_size[PG_CALL_SVM_BEVEL].dssDC);

    /* Set stack size depending on pipeline options. */
    optix_assert(optixPipelineSetStackSize(
        pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
  }

  { /* Create intersection-only pipeline. */
    vector<OptixProgramGroup> pipeline_groups;
    pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_DEDICATED_LIGHT]);
    pipeline_groups.push_back(groups[PG_MISS]);
    pipeline_groups.push_back(groups[PG_HITD]);
    pipeline_groups.push_back(groups[PG_HITS]);
    pipeline_groups.push_back(groups[PG_HITL]);
    pipeline_groups.push_back(groups[PG_HITV]);
    if (pipeline_options.usesMotionBlur) {
      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
    }
    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
      pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
      pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
    }

    optix_assert(optixPipelineCreate(context,
                                     &pipeline_options,
                                     &link_options,
                                     pipeline_groups.data(),
                                     pipeline_groups.size(),
                                     nullptr,
                                     0,
                                     &pipelines[PIP_INTERSECT]));

    /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
    const unsigned int css =
        std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
                 std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
                          std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
                                   stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
        link_options.maxTraceDepth * trace_css;

    optix_assert(optixPipelineSetStackSize(
        pipelines[PIP_INTERSECT], 0, 0, css, pipeline_options.usesMotionBlur ? 3 : 2));
  }

  return !have_error();
}
710
711bool OptiXDevice::load_osl_kernels()
712{
713# ifdef WITH_OSL
714 if (have_error()) {
715 return false;
716 }
717
718 struct OSLKernel {
719 string ptx;
720 string init_entry;
721 string exec_entry;
722 };
723
724 /* This has to be in the same order as the ShaderType enum, so that the index calculation in
725 * osl_eval_nodes checks out */
726 vector<OSLKernel> osl_kernels;
727
729 type = static_cast<ShaderType>(type + 1))
730 {
731 const vector<OSL::ShaderGroupRef> &groups = (type == SHADER_TYPE_SURFACE ?
732 osl_globals.surface_state :
733 type == SHADER_TYPE_VOLUME ?
734 osl_globals.volume_state :
736 osl_globals.displacement_state :
737 osl_globals.bump_state);
738 for (const OSL::ShaderGroupRef &group : groups) {
739 if (group) {
740 string osl_ptx, init_name, entry_name;
741 osl_globals.ss->getattribute(group.get(), "group_init_name", init_name);
742 osl_globals.ss->getattribute(group.get(), "group_entry_name", entry_name);
743 osl_globals.ss->getattribute(
744 group.get(), "ptx_compiled_version", OSL::TypeDesc::PTR, &osl_ptx);
745
746 int groupdata_size = 0;
747 osl_globals.ss->getattribute(group.get(), "llvm_groupdata_size", groupdata_size);
748 if (groupdata_size == 0) {
749 // Old attribute name from our patched OSL version as fallback.
750 osl_globals.ss->getattribute(group.get(), "groupdata_size", groupdata_size);
751 }
752 if (groupdata_size > 2048) { /* See 'group_data' array in kernel/osl/osl.h */
753 set_error(
754 string_printf("Requested OSL group data size (%d) is greater than the maximum "
755 "supported with OptiX (2048)",
756 groupdata_size));
757 return false;
758 }
759
760 osl_kernels.push_back({std::move(osl_ptx), std::move(init_name), std::move(entry_name)});
761 }
762 else {
763 /* Add empty entry for non-existent shader groups, so that the index stays stable. */
764 osl_kernels.emplace_back();
765 }
766 }
767 }
768
769 const CUDAContextScope scope(this);
770
771 if (pipelines[PIP_SHADE]) {
772 optixPipelineDestroy(pipelines[PIP_SHADE]);
773 }
774
775 for (OptixModule &module : osl_modules) {
776 if (module != NULL) {
777 optixModuleDestroy(module);
778 module = NULL;
779 }
780 }
781 for (OptixProgramGroup &group : osl_groups) {
782 if (group != NULL) {
783 optixProgramGroupDestroy(group);
784 group = NULL;
785 }
786 }
787
788 if (osl_kernels.empty()) {
789 /* No OSL shader groups, so no need to create a pipeline. */
790 return true;
791 }
792
793 OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
794 OptixModuleCompileOptions module_options = {};
795 module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
796 module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
797
798 osl_groups.resize(osl_kernels.size() * 2 + 1);
799 osl_modules.resize(osl_kernels.size() + 1);
800
801 { /* Load and compile PTX module with OSL services. */
802 string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx.zst");
803 if (!path_read_compressed_text(ptx_filename, ptx_data)) {
804 set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'",
805 ptx_filename.c_str()));
806 return false;
807 }
808
809# if OPTIX_ABI_VERSION >= 84
810 const OptixResult result = optixModuleCreate(context,
811 &module_options,
812 &pipeline_options,
813 ptx_data.data(),
814 ptx_data.size(),
815 nullptr,
816 0,
817 &osl_modules.back());
818# else
819 const OptixResult result = optixModuleCreateFromPTX(context,
820 &module_options,
821 &pipeline_options,
822 ptx_data.data(),
823 ptx_data.size(),
824 nullptr,
825 0,
826 &osl_modules.back());
827# endif
828 if (result != OPTIX_SUCCESS) {
829 set_error(string_printf("Failed to load OptiX OSL services kernel from '%s' (%s)",
830 ptx_filename.c_str(),
831 optixGetErrorName(result)));
832 return false;
833 }
834
835 OptixProgramGroupDesc group_desc = {};
836 group_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
837 group_desc.callables.entryFunctionNameDC = "__direct_callable__dummy_services";
838 group_desc.callables.moduleDC = osl_modules.back();
839
840 optix_assert(optixProgramGroupCreate(
841 context, &group_desc, 1, &group_options, nullptr, 0, &osl_groups.back()));
842 }
843
845 vector<OptixResult> results(osl_kernels.size(), OPTIX_SUCCESS);
846
847 for (size_t i = 0; i < osl_kernels.size(); ++i) {
848 if (osl_kernels[i].ptx.empty()) {
849 continue;
850 }
851
852# if OPTIX_ABI_VERSION >= 84
853 OptixTask task = nullptr;
854 results[i] = optixModuleCreateWithTasks(context,
855 &module_options,
856 &pipeline_options,
857 osl_kernels[i].ptx.data(),
858 osl_kernels[i].ptx.size(),
859 nullptr,
860 nullptr,
861 &osl_modules[i],
862 &task);
863 if (results[i] == OPTIX_SUCCESS) {
864 execute_optix_task(pool, task, results[i]);
865 }
866# elif OPTIX_ABI_VERSION >= 55
867 OptixTask task = nullptr;
868 results[i] = optixModuleCreateFromPTXWithTasks(context,
869 &module_options,
870 &pipeline_options,
871 osl_kernels[i].ptx.data(),
872 osl_kernels[i].ptx.size(),
873 nullptr,
874 nullptr,
875 &osl_modules[i],
876 &task);
877 if (results[i] == OPTIX_SUCCESS) {
878 execute_optix_task(pool, task, results[i]);
879 }
880# else
881 pool.push([this, &results, i, &module_options, &osl_kernels]() {
882 results[i] = optixModuleCreateFromPTX(context,
883 &module_options,
884 &pipeline_options,
885 osl_kernels[i].ptx.data(),
886 osl_kernels[i].ptx.size(),
887 nullptr,
888 0,
889 &osl_modules[i]);
890 });
891# endif
892 }
893
894 pool.wait_work();
895
896 for (size_t i = 0; i < osl_kernels.size(); ++i) {
897 if (osl_kernels[i].ptx.empty()) {
898 continue;
899 }
900
901 if (results[i] != OPTIX_SUCCESS) {
902 set_error(string_printf("Failed to load OptiX OSL kernel for %s (%s)",
903 osl_kernels[i].init_entry.c_str(),
904 optixGetErrorName(results[i])));
905 return false;
906 }
907
908 OptixProgramGroupDesc group_descs[2] = {};
909 group_descs[0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
910 group_descs[0].callables.entryFunctionNameDC = osl_kernels[i].init_entry.c_str();
911 group_descs[0].callables.moduleDC = osl_modules[i];
912 group_descs[1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
913 group_descs[1].callables.entryFunctionNameDC = osl_kernels[i].exec_entry.c_str();
914 group_descs[1].callables.moduleDC = osl_modules[i];
915
916 optix_assert(optixProgramGroupCreate(
917 context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
918 }
919
920 /* Update SBT with new entries. */
921 sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
922 for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
923 optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
924 }
925 for (size_t i = 0; i < osl_groups.size(); ++i) {
926 if (osl_groups[i] != NULL) {
927 optix_assert(optixSbtRecordPackHeader(osl_groups[i], &sbt_data[NUM_PROGRAM_GROUPS + i]));
928 }
929 else {
930 /* Default to "__direct_callable__dummy_services", so that OSL evaluation for empty
931 * materials has direct callables to call and does not crash. */
932 optix_assert(optixSbtRecordPackHeader(osl_groups.back(), &sbt_data[NUM_PROGRAM_GROUPS + i]));
933 }
934 }
935 sbt_data.copy_to_device(); /* Upload updated SBT to device. */
936
937 OptixPipelineLinkOptions link_options = {};
938 link_options.maxTraceDepth = 0;
939# if OPTIX_ABI_VERSION < 84
940 link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
941# endif
942
943 {
944 vector<OptixProgramGroup> pipeline_groups;
945 pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
946 pipeline_groups.push_back(groups[PG_RGEN_SHADE_BACKGROUND]);
947 pipeline_groups.push_back(groups[PG_RGEN_SHADE_LIGHT]);
948 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE]);
949 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
950 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
951 pipeline_groups.push_back(groups[PG_RGEN_SHADE_VOLUME]);
952 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SHADOW]);
953 pipeline_groups.push_back(groups[PG_RGEN_SHADE_DEDICATED_LIGHT]);
954 pipeline_groups.push_back(groups[PG_RGEN_EVAL_DISPLACE]);
955 pipeline_groups.push_back(groups[PG_RGEN_EVAL_BACKGROUND]);
956 pipeline_groups.push_back(groups[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY]);
957
958 for (const OptixProgramGroup &group : osl_groups) {
959 if (group != NULL) {
960 pipeline_groups.push_back(group);
961 }
962 }
963
964 optix_assert(optixPipelineCreate(context,
965 &pipeline_options,
966 &link_options,
967 pipeline_groups.data(),
968 pipeline_groups.size(),
969 nullptr,
970 0,
971 &pipelines[PIP_SHADE]));
972
973 /* Get program stack sizes. */
974 OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
975 vector<OptixStackSizes> osl_stack_size(osl_groups.size());
976
977 for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
978# if OPTIX_ABI_VERSION >= 84
979 optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i], nullptr));
980# else
981 optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
982# endif
983 }
984 for (size_t i = 0; i < osl_groups.size(); ++i) {
985 if (osl_groups[i] != NULL) {
986# if OPTIX_ABI_VERSION >= 84
987 optix_assert(optixProgramGroupGetStackSize(
988 osl_groups[i], &osl_stack_size[i], pipelines[PIP_SHADE]));
989# else
990 optix_assert(optixProgramGroupGetStackSize(osl_groups[i], &osl_stack_size[i]));
991# endif
992 }
993 }
994
995 const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
996 stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG);
997 unsigned int dss = 0;
998 for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
999 dss = std::max(dss, osl_stack_size[i].dssDC);
1000 }
1001
1002 optix_assert(optixPipelineSetStackSize(
1003 pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
1004 }
1005
1006 return !have_error();
1007# else
1008 return false;
1009# endif
1010}
1011
1012void *OptiXDevice::get_cpu_osl_memory()
1013{
1014# ifdef WITH_OSL
1015 return &osl_globals;
1016# else
1017 return NULL;
1018# endif
1019}
1020
1021bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
1022 OptixBuildOperation operation,
1023 const OptixBuildInput &build_input,
1024 uint16_t num_motion_steps)
1025{
1026 /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
1027 * from running out of memory (since both original and compacted acceleration structure memory
1028 * may be allocated at the same time for the duration of this function). The builds would
1029 * otherwise happen on the same CUDA stream anyway. */
1030 static thread_mutex mutex;
1032
1033 const CUDAContextScope scope(this);
1034
1035 bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
1036
1037 /* Compute memory usage. */
1038 OptixAccelBufferSizes sizes = {};
1039 OptixAccelBuildOptions options = {};
1040 options.operation = operation;
1041 if (build_input.type == OPTIX_BUILD_INPUT_TYPE_CURVES) {
1042 /* The build flags have to match the ones used to query the built-in curve intersection
1043 * program (see optixBuiltinISModuleGet above) */
1044 options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION |
1045 OPTIX_BUILD_FLAG_ALLOW_UPDATE;
1046 use_fast_trace_bvh = true;
1047 }
1048 else if (use_fast_trace_bvh) {
1049 VLOG_INFO << "Using fast to trace OptiX BVH";
1050 options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
1051 }
1052 else {
1053 VLOG_INFO << "Using fast to update OptiX BVH";
1054 options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
1055 }
1056
1057 options.motionOptions.numKeys = num_motion_steps;
1058 options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
1059 options.motionOptions.timeBegin = 0.0f;
1060 options.motionOptions.timeEnd = 1.0f;
1061
1062 optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
1063
1064 /* Allocate required output buffers. */
1065 device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
1066 temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
1067 if (!temp_mem.device_pointer) {
1068 /* Make sure temporary memory allocation succeeded. */
1069 return false;
1070 }
1071
1072 /* Acceleration structure memory has to be allocated on the device (not allowed on the host). */
1073 device_only_memory<char> &out_data = *bvh->as_data;
1074 if (operation == OPTIX_BUILD_OPERATION_BUILD) {
1075 assert(out_data.device == this);
1076 out_data.alloc_to_device(sizes.outputSizeInBytes);
1077 if (!out_data.device_pointer) {
1078 return false;
1079 }
1080 }
1081 else {
1082 assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
1083 }
1084
1085 /* Finally build the acceleration structure. */
1086 OptixAccelEmitDesc compacted_size_prop = {};
1087 compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
1088 /* A tiny space was allocated for this property at the end of the temporary buffer above.
1089 * Make sure this pointer is 8-byte aligned. */
1090 compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
1091
1092 OptixTraversableHandle out_handle = 0;
1093 optix_assert(optixAccelBuild(context,
1094 NULL,
1095 &options,
1096 &build_input,
1097 1,
1098 temp_mem.device_pointer,
1099 sizes.tempSizeInBytes,
1100 out_data.device_pointer,
1101 sizes.outputSizeInBytes,
1102 &out_handle,
1103 use_fast_trace_bvh ? &compacted_size_prop : NULL,
1104 use_fast_trace_bvh ? 1 : 0));
1105 bvh->traversable_handle = static_cast<uint64_t>(out_handle);
1106
1107 /* Wait for all operations to finish. */
1108 cuda_assert(cuStreamSynchronize(NULL));
1109
1110 /* Compact acceleration structure to save memory (do not do this in viewport for faster builds).
1111 */
1112 if (use_fast_trace_bvh) {
1113 uint64_t compacted_size = sizes.outputSizeInBytes;
1114 cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
1115
1116 /* Temporary memory is no longer needed, so free it now to make space. */
1117 temp_mem.free();
1118
1119 /* There is no point compacting if the size does not change. */
1120 if (compacted_size < sizes.outputSizeInBytes) {
1121 device_only_memory<char> compacted_data(this, "optix compacted as", false);
1122 compacted_data.alloc_to_device(compacted_size);
1123 if (!compacted_data.device_pointer) {
1124 /* Do not compact if memory allocation for compacted acceleration structure fails.
1125 * Can just use the uncompacted one then, so succeed here regardless. */
1126 return !have_error();
1127 }
1128
1129 optix_assert(optixAccelCompact(
1130 context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle));
1131 bvh->traversable_handle = static_cast<uint64_t>(out_handle);
1132
1133 /* Wait for compaction to finish. */
1134 cuda_assert(cuStreamSynchronize(NULL));
1135
1136 std::swap(out_data.device_size, compacted_data.device_size);
1137 std::swap(out_data.device_pointer, compacted_data.device_pointer);
1138 /* Original acceleration structure memory is freed when 'compacted_data' goes out of scope.
1139 */
1140 }
1141 }
1142
1143 return !have_error();
1144}
1145
1146void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
1147{
1148 const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
1149
1150 free_bvh_memory_delayed();
1151
1152 BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
1153
1154 progress.set_substatus("Building OptiX acceleration structure");
1155
1156 if (!bvh->params.top_level) {
1157 assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
1158
1159 /* Refit is only possible in viewport for now (because AS is built with
1160 * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */
1161 OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
1162 if (refit && !use_fast_trace_bvh) {
1163 assert(bvh_optix->traversable_handle != 0);
1164 operation = OPTIX_BUILD_OPERATION_UPDATE;
1165 }
1166 else {
1167 bvh_optix->as_data->free();
1168 bvh_optix->traversable_handle = 0;
1169 }
1170
1171 /* Build bottom level acceleration structures (BLAS). */
1172 Geometry *const geom = bvh->geometry[0];
1173 if (geom->geometry_type == Geometry::HAIR) {
1174 /* Build BLAS for curve primitives. */
1175 Hair *const hair = static_cast<Hair *const>(geom);
1176 if (hair->num_segments() == 0) {
1177 return;
1178 }
1179
1180 const size_t num_segments = hair->num_segments();
1181
1182 size_t num_motion_steps = 1;
1183 Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
1184 if (pipeline_options.usesMotionBlur && hair->get_use_motion_blur() && motion_keys) {
1185 num_motion_steps = hair->get_motion_steps();
1186 }
1187
1188 device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
1189 device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
1190 device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
1191 /* Four control points for each curve segment. */
1192 size_t num_vertices = num_segments * 4;
1193 if (hair->curve_shape == CURVE_THICK) {
1194# if OPTIX_ABI_VERSION >= 55
1195 num_vertices = hair->num_keys() + 2 * hair->num_curves();
1196# endif
1197 index_data.alloc(num_segments);
1198 vertex_data.alloc(num_vertices * num_motion_steps);
1199 }
1200 else {
1201 aabb_data.alloc(num_segments * num_motion_steps);
1202 }
1203
1204 /* Get AABBs for each motion step. */
1205 for (size_t step = 0; step < num_motion_steps; ++step) {
1206 /* The center step for motion vertices is not stored in the attribute. */
1207 const float3 *keys = hair->get_curve_keys().data();
1208 size_t center_step = (num_motion_steps - 1) / 2;
1209 if (step != center_step) {
1210 size_t attr_offset = (step > center_step) ? step - 1 : step;
1211 /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). */
1212 keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
1213 }
1214
1215# if OPTIX_ABI_VERSION >= 55
1216 if (hair->curve_shape == CURVE_THICK) {
1217 for (size_t curve_index = 0, segment_index = 0, vertex_index = step * num_vertices;
1218 curve_index < hair->num_curves();
1219 ++curve_index)
1220 {
1221 const Hair::Curve curve = hair->get_curve(curve_index);
1222 const array<float> &curve_radius = hair->get_curve_radius();
1223
1224 const int first_key_index = curve.first_key;
1225 {
1226 vertex_data[vertex_index++] = make_float4(keys[first_key_index].x,
1227 keys[first_key_index].y,
1228 keys[first_key_index].z,
1229 curve_radius[first_key_index]);
1230 }
1231
1232 for (int k = 0; k < curve.num_segments(); ++k) {
1233 if (step == 0) {
1234 index_data[segment_index++] = vertex_index - 1;
1235 }
1236 vertex_data[vertex_index++] = make_float4(keys[first_key_index + k].x,
1237 keys[first_key_index + k].y,
1238 keys[first_key_index + k].z,
1239 curve_radius[first_key_index + k]);
1240 }
1241
1242 const int last_key_index = first_key_index + curve.num_keys - 1;
1243 {
1244 vertex_data[vertex_index++] = make_float4(keys[last_key_index].x,
1245 keys[last_key_index].y,
1246 keys[last_key_index].z,
1247 curve_radius[last_key_index]);
1248 vertex_data[vertex_index++] = make_float4(keys[last_key_index].x,
1249 keys[last_key_index].y,
1250 keys[last_key_index].z,
1251 curve_radius[last_key_index]);
1252 }
1253 }
1254 }
1255 else
1256# endif
1257 {
1258 for (size_t curve_index = 0, i = 0; curve_index < hair->num_curves(); ++curve_index) {
1259 const Hair::Curve curve = hair->get_curve(curve_index);
1260
1261 for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
1262# if OPTIX_ABI_VERSION < 55
1263 if (hair->curve_shape == CURVE_THICK) {
1264 const array<float> &curve_radius = hair->get_curve_radius();
1265
1266 int k0 = curve.first_key + segment;
1267 int k1 = k0 + 1;
1268 int ka = max(k0 - 1, curve.first_key);
1269 int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
1270
1271 index_data[i] = i * 4;
1272 float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
1273
1274 const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
1275 const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
1276 const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
1277 const float4 pw = make_float4(
1278 curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
1279
1280 /* Convert Catmull-Rom data to B-spline. */
1281 static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
1282 static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
1283 static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
1284 static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
1285
1286 v[0] = make_float4(
1287 dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
1288 v[1] = make_float4(
1289 dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
1290 v[2] = make_float4(
1291 dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
1292 v[3] = make_float4(
1293 dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
1294 }
1295 else
1296# endif
1297 {
1299 curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
1300
1301 const size_t index = step * num_segments + i;
1302 aabb_data[index].minX = bounds.min.x;
1303 aabb_data[index].minY = bounds.min.y;
1304 aabb_data[index].minZ = bounds.min.z;
1305 aabb_data[index].maxX = bounds.max.x;
1306 aabb_data[index].maxY = bounds.max.y;
1307 aabb_data[index].maxZ = bounds.max.z;
1308 }
1309 }
1310 }
1311 }
1312 }
1313
1314 /* Upload AABB data to GPU. */
1315 aabb_data.copy_to_device();
1316 index_data.copy_to_device();
1317 vertex_data.copy_to_device();
1318
1319 vector<device_ptr> aabb_ptrs;
1320 aabb_ptrs.reserve(num_motion_steps);
1321 vector<device_ptr> width_ptrs;
1322 vector<device_ptr> vertex_ptrs;
1323 width_ptrs.reserve(num_motion_steps);
1324 vertex_ptrs.reserve(num_motion_steps);
1325 for (size_t step = 0; step < num_motion_steps; ++step) {
1326 aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
1327 const device_ptr base_ptr = vertex_data.device_pointer +
1328 step * num_vertices * sizeof(float4);
1329 width_ptrs.push_back(base_ptr + 3 * sizeof(float)); /* Offset by vertex size. */
1330 vertex_ptrs.push_back(base_ptr);
1331 }
1332
1333 /* Force a single any-hit call, so shadow record-all behavior works correctly. */
1334 unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1335 OptixBuildInput build_input = {};
1336 if (hair->curve_shape == CURVE_THICK) {
1337 build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
1338# if OPTIX_ABI_VERSION >= 55
1339 build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM;
1340# else
1341 build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
1342# endif
1343 build_input.curveArray.numPrimitives = num_segments;
1344 build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
1345 build_input.curveArray.numVertices = num_vertices;
1346 build_input.curveArray.vertexStrideInBytes = sizeof(float4);
1347 build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
1348 build_input.curveArray.widthStrideInBytes = sizeof(float4);
1349 build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
1350 build_input.curveArray.indexStrideInBytes = sizeof(int);
1351 build_input.curveArray.flag = build_flags;
1352 build_input.curveArray.primitiveIndexOffset = hair->curve_segment_offset;
1353 }
1354 else {
1355 /* Disable visibility test any-hit program, since it is already checked during
1356 * intersection. Those trace calls that require any-hit can force it with a ray flag. */
1357 build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
1358
1359 build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
1360 build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1361 build_input.customPrimitiveArray.numPrimitives = num_segments;
1362 build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
1363 build_input.customPrimitiveArray.flags = &build_flags;
1364 build_input.customPrimitiveArray.numSbtRecords = 1;
1365 build_input.customPrimitiveArray.primitiveIndexOffset = hair->curve_segment_offset;
1366 }
1367
1368 if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1369 progress.set_error("Failed to build OptiX acceleration structure");
1370 }
1371 }
1372 else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
1373 /* Build BLAS for triangle primitives. */
1374 Mesh *const mesh = static_cast<Mesh *const>(geom);
1375 if (mesh->num_triangles() == 0) {
1376 return;
1377 }
1378
1379 const size_t num_verts = mesh->get_verts().size();
1380
1381 size_t num_motion_steps = 1;
1382 Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
1383 if (pipeline_options.usesMotionBlur && mesh->get_use_motion_blur() && motion_keys) {
1384 num_motion_steps = mesh->get_motion_steps();
1385 }
1386
1387 device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
1388 index_data.alloc(mesh->get_triangles().size());
1389 memcpy(index_data.data(),
1390 mesh->get_triangles().data(),
1391 mesh->get_triangles().size() * sizeof(int));
1392 device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
1393 vertex_data.alloc(num_verts * num_motion_steps);
1394
1395 for (size_t step = 0; step < num_motion_steps; ++step) {
1396 const float3 *verts = mesh->get_verts().data();
1397
1398 size_t center_step = (num_motion_steps - 1) / 2;
1399 /* The center step for motion vertices is not stored in the attribute. */
1400 if (step != center_step) {
1401 verts = motion_keys->data_float3() + (step > center_step ? step - 1 : step) * num_verts;
1402 }
1403
1404 memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
1405 }
1406
1407 /* Upload triangle data to GPU. */
1408 index_data.copy_to_device();
1409 vertex_data.copy_to_device();
1410
1411 vector<device_ptr> vertex_ptrs;
1412 vertex_ptrs.reserve(num_motion_steps);
1413 for (size_t step = 0; step < num_motion_steps; ++step) {
1414 vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
1415 }
1416
1417 /* Force a single any-hit call, so shadow record-all behavior works correctly. */
1418 unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1419 OptixBuildInput build_input = {};
1420 build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
1421 build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
1422 build_input.triangleArray.numVertices = num_verts;
1423 build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
1424 build_input.triangleArray.vertexStrideInBytes = sizeof(float4);
1425 build_input.triangleArray.indexBuffer = index_data.device_pointer;
1426 build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
1427 build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
1428 build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
1429 build_input.triangleArray.flags = &build_flags;
1430 /* The SBT does not store per primitive data since Cycles already allocates separate
1431 * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
1432 * one and rely on that having the same meaning in this case. */
1433 build_input.triangleArray.numSbtRecords = 1;
1434 build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset;
1435
1436 if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1437 progress.set_error("Failed to build OptiX acceleration structure");
1438 }
1439 }
1440 else if (geom->geometry_type == Geometry::POINTCLOUD) {
1441 /* Build BLAS for points primitives. */
1442 PointCloud *const pointcloud = static_cast<PointCloud *const>(geom);
1443 const size_t num_points = pointcloud->num_points();
1444 if (num_points == 0) {
1445 return;
1446 }
1447
1448 size_t num_motion_steps = 1;
1449 Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
1450 if (pipeline_options.usesMotionBlur && pointcloud->get_use_motion_blur() && motion_points) {
1451 num_motion_steps = pointcloud->get_motion_steps();
1452 }
1453
1454 device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
1455 aabb_data.alloc(num_points * num_motion_steps);
1456
1457 /* Get AABBs for each motion step. */
1458 for (size_t step = 0; step < num_motion_steps; ++step) {
1459 /* The center step for motion vertices is not stored in the attribute. */
1460 size_t center_step = (num_motion_steps - 1) / 2;
1461
1462 if (step == center_step) {
1463 const float3 *points = pointcloud->get_points().data();
1464 const float *radius = pointcloud->get_radius().data();
1465
1466 for (size_t i = 0; i < num_points; ++i) {
1467 const PointCloud::Point point = pointcloud->get_point(i);
1469 point.bounds_grow(points, radius, bounds);
1470
1471 const size_t index = step * num_points + i;
1472 aabb_data[index].minX = bounds.min.x;
1473 aabb_data[index].minY = bounds.min.y;
1474 aabb_data[index].minZ = bounds.min.z;
1475 aabb_data[index].maxX = bounds.max.x;
1476 aabb_data[index].maxY = bounds.max.y;
1477 aabb_data[index].maxZ = bounds.max.z;
1478 }
1479 }
1480 else {
1481 size_t attr_offset = (step > center_step) ? step - 1 : step;
1482 const float4 *points = motion_points->data_float4() + attr_offset * num_points;
1483
1484 for (size_t i = 0; i < num_points; ++i) {
1485 const PointCloud::Point point = pointcloud->get_point(i);
1487 point.bounds_grow(points[i], bounds);
1488
1489 const size_t index = step * num_points + i;
1490 aabb_data[index].minX = bounds.min.x;
1491 aabb_data[index].minY = bounds.min.y;
1492 aabb_data[index].minZ = bounds.min.z;
1493 aabb_data[index].maxX = bounds.max.x;
1494 aabb_data[index].maxY = bounds.max.y;
1495 aabb_data[index].maxZ = bounds.max.z;
1496 }
1497 }
1498 }
1499
1500 /* Upload AABB data to GPU. */
1501 aabb_data.copy_to_device();
1502
1503 vector<device_ptr> aabb_ptrs;
1504 aabb_ptrs.reserve(num_motion_steps);
1505 for (size_t step = 0; step < num_motion_steps; ++step) {
1506 aabb_ptrs.push_back(aabb_data.device_pointer + step * num_points * sizeof(OptixAabb));
1507 }
1508
1509 /* Disable visibility test any-hit program, since it is already checked during
1510 * intersection. Those trace calls that require anyhit can force it with a ray flag.
1511 * For those, force a single any-hit call, so shadow record-all behavior works correctly. */
1512 unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT |
1513 OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1514 OptixBuildInput build_input = {};
1515 build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
1516# if OPTIX_ABI_VERSION < 23
1517 build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1518 build_input.aabbArray.numPrimitives = num_points;
1519 build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
1520 build_input.aabbArray.flags = &build_flags;
1521 build_input.aabbArray.numSbtRecords = 1;
1522 build_input.aabbArray.primitiveIndexOffset = pointcloud->prim_offset;
1523# else
1524 build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1525 build_input.customPrimitiveArray.numPrimitives = num_points;
1526 build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
1527 build_input.customPrimitiveArray.flags = &build_flags;
1528 build_input.customPrimitiveArray.numSbtRecords = 1;
1529 build_input.customPrimitiveArray.primitiveIndexOffset = pointcloud->prim_offset;
1530# endif
1531
1532 if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1533 progress.set_error("Failed to build OptiX acceleration structure");
1534 }
1535 }
1536 }
1537 else {
1538 unsigned int num_instances = 0;
1539 unsigned int max_num_instances = 0xFFFFFFFF;
1540
1541 bvh_optix->as_data->free();
1542 bvh_optix->traversable_handle = 0;
1543 bvh_optix->motion_transform_data->free();
1544
1545 optixDeviceContextGetProperty(context,
1546 OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
1547 &max_num_instances,
1548 sizeof(max_num_instances));
1549 /* Do not count first bit, which is used to distinguish instanced and non-instanced objects. */
1550 max_num_instances >>= 1;
1551 if (bvh->objects.size() > max_num_instances) {
1552 progress.set_error(
1553 "Failed to build OptiX acceleration structure because there are too many instances");
1554 return;
1555 }
1556
1557 /* Fill instance descriptions. */
1558 device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
1559 instances.alloc(bvh->objects.size());
1560
1561 /* Calculate total motion transform size and allocate memory for them. */
1562 size_t motion_transform_offset = 0;
1563 if (pipeline_options.usesMotionBlur) {
1564 size_t total_motion_transform_size = 0;
1565 for (Object *const ob : bvh->objects) {
1566 if (ob->is_traceable() && ob->use_motion()) {
1567 total_motion_transform_size = align_up(total_motion_transform_size,
1568 OPTIX_TRANSFORM_BYTE_ALIGNMENT);
1569 const size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2;
1570 total_motion_transform_size = total_motion_transform_size +
1571 sizeof(OptixSRTMotionTransform) +
1572 motion_keys * sizeof(OptixSRTData);
1573 }
1574 }
1575
1576 assert(bvh_optix->motion_transform_data->device == this);
1577 bvh_optix->motion_transform_data->alloc_to_device(total_motion_transform_size);
1578 }
1579
1580 for (Object *ob : bvh->objects) {
1581 /* Skip non-traceable objects. */
1582 if (!ob->is_traceable()) {
1583 continue;
1584 }
1585
1586 BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
1587 OptixTraversableHandle handle = blas->traversable_handle;
1588 if (handle == 0) {
1589 continue;
1590 }
1591
1592 OptixInstance &instance = instances[num_instances++];
1593 memset(&instance, 0, sizeof(instance));
1594
1595 /* Clear transform to identity matrix. */
1596 instance.transform[0] = 1.0f;
1597 instance.transform[5] = 1.0f;
1598 instance.transform[10] = 1.0f;
1599
1600 /* Set user instance ID to object index. */
1601 instance.instanceId = ob->get_device_index();
1602
1603 /* Add some of the object visibility bits to the mask.
1604 * __prim_visibility contains the combined visibility bits of all instances, so is not
1605 * reliable if they differ between instances. But the OptiX visibility mask can only contain
1606 * 8 bits, so have to trade-off here and select just a few important ones.
1607 */
1608 instance.visibilityMask = ob->visibility_for_tracing() & 0xFF;
1609
1610 /* Have to have at least one bit in the mask, or else instance would always be culled. */
1611 if (0 == instance.visibilityMask) {
1612 instance.visibilityMask = 0xFF;
1613 }
1614
1615 if (ob->get_geometry()->geometry_type == Geometry::HAIR &&
1616 static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK)
1617 {
1618 if (pipeline_options.usesMotionBlur && ob->get_geometry()->has_motion_blur()) {
1619 /* Select between motion blur and non-motion blur built-in intersection module. */
1620 instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
1621 }
1622 }
1623 else if (ob->get_geometry()->geometry_type == Geometry::POINTCLOUD) {
1624 /* Use the hit group that has an intersection program for point clouds. */
1625 instance.sbtOffset = PG_HITD_POINTCLOUD - PG_HITD;
1626
1627 /* Also skip point clouds in local trace calls. */
1628 instance.visibilityMask |= 4;
1629 }
1630
1631# if OPTIX_ABI_VERSION < 55
1632 /* Cannot disable any-hit program for thick curves, since it needs to filter out end-caps. */
1633 else
1634# endif
1635 {
1636 /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves,
1637 * since it needs to filter out end-caps there).
1638 *
1639 * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit
1640 * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT.
1641 */
1642 instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT;
1643 }
1644
1645 /* Insert motion traversable if object has motion. */
1646 if (pipeline_options.usesMotionBlur && ob->use_motion()) {
1647 size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2;
1648 size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
1649 motion_keys * sizeof(OptixSRTData);
1650
1651 const CUDAContextScope scope(this);
1652
1653 motion_transform_offset = align_up(motion_transform_offset,
1654 OPTIX_TRANSFORM_BYTE_ALIGNMENT);
1655 CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data->device_pointer +
1656 motion_transform_offset;
1657 motion_transform_offset += motion_transform_size;
1658
1659 /* Allocate host side memory for motion transform and fill it with transform data. */
1660 OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
1661 new uint8_t[motion_transform_size]);
1662 motion_transform.child = handle;
1663 motion_transform.motionOptions.numKeys = ob->get_motion().size();
1664 motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
1665 motion_transform.motionOptions.timeBegin = 0.0f;
1666 motion_transform.motionOptions.timeEnd = 1.0f;
1667
1668 OptixSRTData *const srt_data = motion_transform.srtData;
1669 array<DecomposedTransform> decomp(ob->get_motion().size());
1671 decomp.data(), ob->get_motion().data(), ob->get_motion().size());
1672
1673 for (size_t i = 0; i < ob->get_motion().size(); ++i) {
1674 /* Scale. */
1675 srt_data[i].sx = decomp[i].y.w; /* scale.x.x */
1676 srt_data[i].sy = decomp[i].z.w; /* scale.y.y */
1677 srt_data[i].sz = decomp[i].w.w; /* scale.z.z */
1678
1679 /* Shear. */
1680 srt_data[i].a = decomp[i].z.x; /* scale.x.y */
1681 srt_data[i].b = decomp[i].z.y; /* scale.x.z */
1682 srt_data[i].c = decomp[i].w.x; /* scale.y.z */
1683 assert(decomp[i].z.z == 0.0f); /* scale.y.x */
1684 assert(decomp[i].w.y == 0.0f); /* scale.z.x */
1685 assert(decomp[i].w.z == 0.0f); /* scale.z.y */
1686
1687 /* Pivot point. */
1688 srt_data[i].pvx = 0.0f;
1689 srt_data[i].pvy = 0.0f;
1690 srt_data[i].pvz = 0.0f;
1691
1692 /* Rotation. */
1693 srt_data[i].qx = decomp[i].x.x;
1694 srt_data[i].qy = decomp[i].x.y;
1695 srt_data[i].qz = decomp[i].x.z;
1696 srt_data[i].qw = decomp[i].x.w;
1697
1698 /* Translation. */
1699 srt_data[i].tx = decomp[i].y.x;
1700 srt_data[i].ty = decomp[i].y.y;
1701 srt_data[i].tz = decomp[i].y.z;
1702 }
1703
1704 /* Upload motion transform to GPU. */
1705 cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
1706 delete[] reinterpret_cast<uint8_t *>(&motion_transform);
1707
1708 /* Get traversable handle to motion transform. */
1709 optixConvertPointerToTraversableHandle(context,
1710 motion_transform_gpu,
1711 OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
1712 &instance.traversableHandle);
1713 }
1714 else {
1715 instance.traversableHandle = handle;
1716
1717 if (ob->get_geometry()->is_instanced()) {
1718 /* Set transform matrix. */
1719 memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
1720 }
1721 }
1722 }
1723
1724 /* Upload instance descriptions. */
1725 instances.resize(num_instances);
1726 instances.copy_to_device();
1727
1728 /* Build top-level acceleration structure (TLAS) */
1729 OptixBuildInput build_input = {};
1730 build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
1731 build_input.instanceArray.instances = instances.device_pointer;
1732 build_input.instanceArray.numInstances = num_instances;
1733
1734 if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
1735 progress.set_error("Failed to build OptiX acceleration structure");
1736 }
1737 tlas_handle = bvh_optix->traversable_handle;
1738 }
1739}
1740
1741void OptiXDevice::release_bvh(BVH *bvh)
1742{
1743 thread_scoped_lock lock(delayed_free_bvh_mutex);
1744 /* Do delayed free of BVH memory, since geometry holding BVH might be deleted
1745 * while GPU is still rendering. */
1746 BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
1747
1748 delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
1749 delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
1750 bvh_optix->traversable_handle = 0;
1751}
1752
1753void OptiXDevice::free_bvh_memory_delayed()
1754{
1755 thread_scoped_lock lock(delayed_free_bvh_mutex);
1756 delayed_free_bvh_memory.free_memory();
1757}
1758
/* Copy a named constant-memory buffer to the device: first into the CUDA
 * module's constant memory (base class), then mirrored into the OptiX launch
 * parameters so OptiX kernels see the same data. */
void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)
{
  /* Set constant memory for CUDA module. */
  CUDADevice::const_copy_to(name, host, size);

  if (strcmp(name, "data") == 0) {
    assert(size <= sizeof(KernelData));

    /* Update traversable handle (since it is different for each device on multi devices). */
    KernelData *const data = (KernelData *)host;
    *(OptixTraversableHandle *)&data->device_bvh = tlas_handle;

    update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
    return;
  }

  /* Update data storage pointers in launch parameters.
   * The macro expands to one name comparison per kernel data array listed in
   * "kernel/data_arrays.h" (plus the integrator state entry below), copying the
   * matching pointer into its slot of KernelParamsOptiX. */
# define KERNEL_DATA_ARRAY(data_type, data_name) \
  if (strcmp(name, #data_name) == 0) { \
    update_launch_params(offsetof(KernelParamsOptiX, data_name), host, size); \
    return; \
  }
  KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
# include "kernel/data_arrays.h"
# undef KERNEL_DATA_ARRAY
}
1785
/* Copy `data_size` bytes from host `data` into the device-side launch
 * parameters buffer at the given byte `offset`, with this device's CUDA
 * context made current for the duration of the copy. */
void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size)
{
  const CUDAContextScope scope(this);

  cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size));
}
1792
1794
1795#endif /* WITH_OPTIX */
unsigned int uint
ThreadMutex mutex
volatile int lock
ATTR_WARN_UNUSED_RESULT const BMVert * v
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition btDbvt.cpp:52
static btDbvtVolume bounds(btDbvtNode **leaves, int count)
Definition btDbvt.cpp:299
void refit(btStridingMeshInterface *triangles, const btVector3 &aabbMin, const btVector3 &aabbMax)
SIMD_FORCE_INLINE const btScalar & z() const
Return the z value.
Definition btQuadWord.h:117
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
Definition btQuadWord.h:119
Attribute * find(ustring name) const
float3 * data_float3()
float4 * data_float4()
bool top_level
Definition params.h:81
int bvh_type
Definition params.h:106
Definition bvh/bvh.h:66
vector< Geometry * > geometry
Definition bvh/bvh.h:69
BVHParams params
Definition bvh/bvh.h:68
vector< Object * > objects
Definition bvh/bvh.h:70
Type geometry_type
size_t prim_offset
AttributeSet attributes
Definition hair.h:14
size_t num_segments() const
Definition hair.h:131
CurveShapeType curve_shape
Definition hair.h:92
void set_substatus(const string &substatus_)
Definition progress.h:274
void set_error(const string &error_message_)
Definition progress.h:113
void alloc_to_device(size_t num, bool shrink_to_fit=true)
additional_info("compositor_sum_squared_difference_float_shared") .push_constant(Type output_img float dot(value.rgb, luminance_coefficients)") .define("LOAD(value)"
@ MEM_READ_ONLY
CCL_NAMESPACE_BEGIN struct Options options
#define KERNEL_DATA_ARRAY(type, name)
Definition data_arrays.h:6
DebugFlags & DebugFlags()
Definition debug.h:142
#define function_bind
#define CCL_NAMESPACE_END
ccl_device_forceinline float4 make_float4(const float x, const float y, const float z, const float w)
#define NULL
#define offsetof(t, d)
draw_view push_constant(Type::INT, "radiance_src") .push_constant(Type capture_info_buf storage_buf(1, Qualifier::READ, "ObjectBounds", "bounds_buf[]") .push_constant(Type draw_view int
static float verts[][3]
@ SHADER_TYPE_BUMP
@ SHADER_TYPE_SURFACE
@ SHADER_TYPE_VOLUME
@ SHADER_TYPE_DISPLACEMENT
#define KERNEL_FEATURE_OBJECT_MOTION
@ ATTR_STD_MOTION_VERTEX_POSITION
#define KERNEL_FEATURE_OSL
@ CURVE_THICK
#define KERNEL_FEATURE_SUBSURFACE
KernelData
#define KERNEL_FEATURE_HAIR_THICK
@ BVH_LAYOUT_OPTIX
#define KERNEL_FEATURE_PATH_TRACING
#define KERNEL_FEATURE_HAIR
#define KERNEL_FEATURE_NODE_RAYTRACE
#define KERNEL_FEATURE_BAKING
#define KERNEL_FEATURE_MNEE
#define KERNEL_FEATURE_POINTCLOUD
#define VLOG_INFO
Definition log.h:72
#define VLOG_IS_ON(severity)
Definition log.h:36
struct blender::compositor::@172::@174 task
T step(const T &edge, const T &value)
VecBase< float, 4 > float4
int BVHLayoutMask
Definition params.h:51
@ BVH_TYPE_STATIC
Definition params.h:41
size_t path_file_size(const string &path)
Definition path.cpp:556
bool path_is_directory(const string &path)
Definition path.cpp:584
string path_get(const string &sub)
Definition path.cpp:339
string path_join(const string &dir, const string &file)
Definition path.cpp:417
bool path_read_compressed_text(const string &path, string &text)
Definition path.cpp:754
static struct PyModuleDef module
Definition python.cpp:991
#define min(a, b)
Definition sort.c:32
unsigned short uint16_t
Definition stdint.h:79
unsigned char uint8_t
Definition stdint.h:78
unsigned __int64 uint64_t
Definition stdint.h:90
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
Definition string.cpp:23
Point get_point(int i) const
size_t num_points() const
void push(TaskRunFunction &&task)
Definition task.cpp:22
void wait_work(Summary *stats=NULL)
Definition task.cpp:28
std::unique_lock< std::mutex > thread_scoped_lock
Definition thread.h:30
CCL_NAMESPACE_BEGIN typedef std::mutex thread_mutex
Definition thread.h:29
void transform_motion_decompose(DecomposedTransform *decomp, const Transform *motion, size_t size)
float max
ccl_device_inline size_t align_up(size_t offset, size_t alignment)
Definition util/types.h:48
uint64_t device_ptr
Definition util/types.h:45