Blender V5.0
kernel/device/gpu/kernel.h
1/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5/* Common GPU kernels. */
6
10
11#include "kernel/sample/lcg.h"
12
13/* Include constant tables before entering Metal's context class scope (context_begin.h) */
14#include "kernel/tables.h"
15
16#ifdef __KERNEL_METAL__
17#  include "kernel/device/metal/context_begin.h"
18#elif defined(__KERNEL_ONEAPI__)
19#  include "kernel/device/oneapi/context_begin.h"
20#endif
21
23
27
41
42#include "kernel/bake/bake.h"
43
46
47#ifdef __KERNEL_METAL__
48#  include "kernel/device/metal/context_end.h"
49#elif defined(__KERNEL_ONEAPI__)
50#  include "kernel/device/oneapi/context_end.h"
51#endif
52
53#include "kernel/film/read.h"
54
55#if defined(__HIPRT__)
57#endif
58/* --------------------------------------------------------------------
59 * Integrator.
60 */
61
63 ccl_gpu_kernel_signature(integrator_reset, const int num_states)
64{
65 const int state = ccl_gpu_global_id_x();
66
67 if (state < num_states) {
68 INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
69 INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
70 }
71}
73
74ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
75 ccl_gpu_kernel_signature(integrator_init_from_camera,
76 ccl_global KernelWorkTile *tiles,
77 const int num_tiles,
78 ccl_global float *render_buffer,
79 const int max_tile_work_size)
80{
81 const int work_index = ccl_gpu_global_id_x();
82
83 if (work_index >= max_tile_work_size * num_tiles) {
84 return;
85 }
86
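  /* Each tile is given max_tile_work_size work items, so the flat work_index decomposes as
   * work_index = tile_index * max_tile_work_size + tile_work_index. */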
87 const int tile_index = work_index / max_tile_work_size;
88 const int tile_work_index = work_index - tile_index * max_tile_work_size;
89
90 const ccl_global KernelWorkTile *tile = &tiles[tile_index];
91
92 if (tile_work_index >= tile->work_size) {
93 return;
94 }
95
96 const int state = tile->path_index_offset + tile_work_index;
97
98 uint x, y, sample;
99 get_work_pixel(tile, tile_work_index, &x, &y, &sample);
100
101 ccl_gpu_kernel_call(
102 integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
103}
105
106ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
107 ccl_gpu_kernel_signature(integrator_init_from_bake,
108 ccl_global KernelWorkTile *tiles,
109 const int num_tiles,
110 ccl_global float *render_buffer,
111 const int max_tile_work_size)
112{
113 const int work_index = ccl_gpu_global_id_x();
114
115 if (work_index >= max_tile_work_size * num_tiles) {
116 return;
117 }
118
119 const int tile_index = work_index / max_tile_work_size;
120 const int tile_work_index = work_index - tile_index * max_tile_work_size;
121
122 const ccl_global KernelWorkTile *tile = &tiles[tile_index];
123
124 if (tile_work_index >= tile->work_size) {
125 return;
126 }
127
128 const int state = tile->path_index_offset + tile_work_index;
129
130 uint x, y, sample;
131 get_work_pixel(tile, tile_work_index, &x, &y, &sample);
132
133 ccl_gpu_kernel_call(
134 integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));
135}
137
138#if !defined(__HIPRT__)
139
140/* Intersection kernels need access to the kernel handler for specialization constants to work
141 * properly. */
142# ifdef __KERNEL_ONEAPI__
144# endif
145
150 const int work_size)
151{
152 const int global_index = ccl_gpu_global_id_x();
153
154 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
155 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
157 }
158}
160
163 const ccl_global int *path_index_array,
164 const int work_size)
165{
166 const int global_index = ccl_gpu_global_id_x();
167
168 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
169 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
171 }
172}
174
177 const ccl_global int *path_index_array,
178 const int work_size)
179{
180 const int global_index = ccl_gpu_global_id_x();
181
182 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
183 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
185 }
186}
188
191 const ccl_global int *path_index_array,
192 const int work_size)
193{
194# ifdef __VOLUME__
195 const int global_index = ccl_gpu_global_id_x();
196
197 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
198 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
200 }
201# endif
202}
204
207 const ccl_global int *path_index_array,
208 const int work_size)
209{
210 const int global_index = ccl_gpu_global_id_x();
211
212 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
213 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
215 }
216}
218
219# ifdef __KERNEL_ONEAPI__
221# endif
222
223#endif
224
227 const ccl_global int *path_index_array,
229 const int work_size)
230{
231 const int global_index = ccl_gpu_global_id_x();
232
233 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
234 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
236 }
237}
239
242 const ccl_global int *path_index_array,
244 const int work_size)
245{
246 const int global_index = ccl_gpu_global_id_x();
247
248 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
249 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
251 }
252}
254
257 const ccl_global int *path_index_array,
259 const int work_size)
260{
261 const int global_index = ccl_gpu_global_id_x();
262
263 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
264 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
266 }
267}
269
272 const ccl_global int *path_index_array,
274 const int work_size)
275{
276 const int global_index = ccl_gpu_global_id_x();
277
278 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
279 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
281 }
282}
284
285#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
286constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
287#endif
288
289#if !defined(__HIPRT__)
290
291/* Kernels using intersections need access to the kernel handler for specialization constants to
292 * work properly. */
293# ifdef __KERNEL_ONEAPI__
295# endif
296
299 const ccl_global int *path_index_array,
301 const int work_size)
302{
303 const int global_index = ccl_gpu_global_id_x();
304
305 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
306 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
307
308# if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
309 KernelGlobals kg = nullptr;
310 /* Workaround Ambient Occlusion and Bevel nodes not working with Metal.
311 * Dummy offset should not affect result, but somehow fixes bug! */
312 kg += __dummy_constant;
314# else
316# endif
317 }
318}
320
323 const ccl_global int *path_index_array,
325 const int work_size)
326{
327 const int global_index = ccl_gpu_global_id_x();
328
329 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
330 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
332 }
333}
335
336# ifdef __KERNEL_ONEAPI__
338# endif
339
340#endif
341
344 const ccl_global int *path_index_array,
346 const int work_size)
347{
348 const int global_index = ccl_gpu_global_id_x();
349
350 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
351 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
353 }
354}
356
359 const ccl_global int *path_index_array,
361 const int work_size)
362{
363 const int global_index = ccl_gpu_global_id_x();
364
365 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
366 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
368 }
369}
371
374 const ccl_global int *path_index_array,
376 const int work_size)
377{
378 const int global_index = ccl_gpu_global_id_x();
379
380 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
381 const int state = (path_index_array) ? path_index_array[global_index] : global_index;
383 }
384}
386
388 ccl_gpu_kernel_signature(integrator_queued_paths_array,
389 const int num_states,
390 ccl_global int *indices,
391 ccl_global int *num_indices,
392 const int kernel_index)
393{
394 ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
395 int kernel_index);
396 ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
397
398 gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
399}
401
403 ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
404 const int num_states,
405 ccl_global int *indices,
406 ccl_global int *num_indices,
407 const int kernel_index)
408{
409 ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
410 int kernel_index);
411 ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
412
413 gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
414}
416
418 ccl_gpu_kernel_signature(integrator_active_paths_array,
419 const int num_states,
420 ccl_global int *indices,
421 ccl_global int *num_indices)
422{
423 ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
424
425 gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
426}
428
430 ccl_gpu_kernel_signature(integrator_terminated_paths_array,
431 const int num_states,
432 ccl_global int *indices,
433 ccl_global int *num_indices,
434 const int indices_offset)
435{
436 ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
437
438 gpu_parallel_active_index_array(
439 num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
440}
442
444 ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
445 const int num_states,
446 ccl_global int *indices,
447 ccl_global int *num_indices,
448 const int indices_offset)
449{
450 ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
451
452 gpu_parallel_active_index_array(
453 num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
454}
456
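/* Build the work list for one shading kernel, grouped by shader_sort_key: states queued for
 * kernel_index contribute their sort key, all other states get the inactive key, and the
 * key_counter / key_prefix_sum buffers (see the prefix_sum kernel below) give each key its
 * slot range in the output indices array. */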
458 ccl_gpu_kernel_signature(integrator_sorted_paths_array,
459 const int num_states,
460 const int num_states_limit,
461 ccl_global int *indices,
462 ccl_global int *num_indices,
463 ccl_global int *key_counter,
464 ccl_global int *key_prefix_sum,
465 const int kernel_index)
466{
467 ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
468 INTEGRATOR_STATE(state, path, shader_sort_key) :
469 GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
470 int kernel_index);
471 ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
472
473 const uint state_index = ccl_gpu_global_id_x();
474 gpu_parallel_sorted_index_array(state_index,
475 num_states,
476 num_states_limit,
477 indices,
478 num_indices,
479 key_counter,
480 key_prefix_sum,
481 ccl_gpu_kernel_lambda_pass);
482}
484
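/* Two-pass partition sort used when __KERNEL_LOCAL_ATOMIC_SORT__ is available: the bucket pass
 * counts, per partition and per shader sort key, the states queued for kernel_index into
 * sort_partition_key_offsets, and the write pass below scatters the state indices into the
 * output array in key order. */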
485/* oneAPI version needs the local_mem accessor in the arguments. */
486#ifdef __KERNEL_ONEAPI__
488 ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
489 const int num_states,
490 const int partition_size,
491 const int num_states_limit,
492 ccl_global int *indices,
493 const int kernel_index,
494 sycl::local_accessor<int> &local_mem)
495#else
497 ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
498 const int num_states,
499 const int partition_size,
500 const int num_states_limit,
501 ccl_global int *indices,
502 const int kernel_index)
503#endif
504{
505#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
506 ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
507 kernel_integrator_state.path.queued_kernel;
508 ccl_global uint *d_shader_sort_key = (ccl_global uint *)
509 kernel_integrator_state.path.shader_sort_key;
510 ccl_global int *key_offsets = (ccl_global int *)
511 kernel_integrator_state.sort_partition_key_offsets;
512
513# ifdef __KERNEL_METAL__
514 int max_shaders = context.launch_params_metal.data.max_shaders;
515# endif
516
517# ifdef __KERNEL_ONEAPI__
518 /* The Metal backend doesn't have these particular ccl_gpu_* defines, and the shared kernel
519 * code uses the metal_* names, so define the metal_* equivalents here for oneAPI. */
520 int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
521 int metal_local_id = ccl_gpu_thread_idx_x;
522 int metal_local_size = ccl_gpu_block_dim_x;
523 int metal_grid_id = ccl_gpu_block_idx_x;
524 /* There is no difference here between different access decorations, as we are requesting
525 * a raw pointer immediately, so the simplest decoration option is used (no decoration). */
526 ccl_gpu_shared int *threadgroup_array =
527 local_mem.get_multi_ptr<sycl::access::decorated::no>().get();
528# endif
529
530 gpu_parallel_sort_bucket_pass(num_states,
531 partition_size,
532 max_shaders,
533 kernel_index,
534 d_queued_kernel,
535 d_shader_sort_key,
536 key_offsets,
537 (ccl_gpu_shared int *)threadgroup_array,
538 metal_local_id,
539 metal_local_size,
540 metal_grid_id);
541#endif
542}
544
545/* oneAPI version needs the local_mem accessor in the arguments. */
546#ifdef __KERNEL_ONEAPI__
548 ccl_gpu_kernel_signature(integrator_sort_write_pass,
549 const int num_states,
550 const int partition_size,
551 const int num_states_limit,
552 ccl_global int *indices,
553 const int kernel_index,
554 sycl::local_accessor<int> &local_mem)
555#else
557 ccl_gpu_kernel_signature(integrator_sort_write_pass,
558 const int num_states,
559 const int partition_size,
560 const int num_states_limit,
561 ccl_global int *indices,
562 const int kernel_index)
563#endif
564
565{
566#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
567 ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
568 kernel_integrator_state.path.queued_kernel;
569 ccl_global uint *d_shader_sort_key = (ccl_global uint *)
570 kernel_integrator_state.path.shader_sort_key;
571 ccl_global int *key_offsets = (ccl_global int *)
572 kernel_integrator_state.sort_partition_key_offsets;
573
574# ifdef __KERNEL_METAL__
575 int max_shaders = context.launch_params_metal.data.max_shaders;
576# endif
577
578# ifdef __KERNEL_ONEAPI__
579 /* The Metal backend doesn't have these particular ccl_gpu_* defines, and the shared kernel
580 * code uses the metal_* names, so define the metal_* equivalents here for oneAPI. */
581 int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
582 int metal_local_id = ccl_gpu_thread_idx_x;
583 int metal_local_size = ccl_gpu_block_dim_x;
584 int metal_grid_id = ccl_gpu_block_idx_x;
585 /* There is no difference here between different access decorations, as we are requesting
586 * a raw pointer immediately, so the simplest decoration option is used (no decoration). */
587 ccl_gpu_shared int *threadgroup_array =
588 local_mem.get_multi_ptr<sycl::access::decorated::no>().get();
589# endif
590
591 gpu_parallel_sort_write_pass(num_states,
592 partition_size,
593 max_shaders,
594 kernel_index,
595 num_states_limit,
596 indices,
597 d_queued_kernel,
598 d_shader_sort_key,
599 key_offsets,
600 (ccl_gpu_shared int *)threadgroup_array,
601 metal_local_id,
602 metal_local_size,
603 metal_grid_id);
604#endif
605}
607
609 ccl_gpu_kernel_signature(integrator_compact_paths_array,
610 const int num_states,
611 ccl_global int *indices,
612 ccl_global int *num_indices,
613 const int num_active_paths)
614{
615 ccl_gpu_kernel_lambda((state >= num_active_paths) &&
616 (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
617 int num_active_paths);
618 ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
619
620 gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
621}
623
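/* Compaction: integrator_compact_paths_array above lists active states that live at indices
 * >= num_active_paths, the terminated-paths array lists free slots below that bound, and the
 * kernel below moves each listed active state (from_state) into a free terminated slot
 * (to_state) so the active states end up contiguous at the start of the state array. */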
625 ccl_gpu_kernel_signature(integrator_compact_states,
626 const ccl_global int *active_terminated_states,
627 const int active_states_offset,
628 const int terminated_states_offset,
629 const int work_size)
630{
631 const int global_index = ccl_gpu_global_id_x();
632
633 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
634 const int from_state = active_terminated_states[active_states_offset + global_index];
635 const int to_state = active_terminated_states[terminated_states_offset + global_index];
636
637 ccl_gpu_kernel_call(integrator_state_move(nullptr, to_state, from_state));
638 }
639}
641
643 ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
644 const int num_states,
645 ccl_global int *indices,
646 ccl_global int *num_indices,
647 const int num_active_paths)
648{
649 ccl_gpu_kernel_lambda((state >= num_active_paths) &&
650 (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
651 int num_active_paths);
652 ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
653
654 gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
655}
657
659 ccl_gpu_kernel_signature(integrator_compact_shadow_states,
660 const ccl_global int *active_terminated_states,
661 const int active_states_offset,
662 const int terminated_states_offset,
663 const int work_size)
664{
665 const int global_index = ccl_gpu_global_id_x();
666
667 if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
668 const int from_state = active_terminated_states[active_states_offset + global_index];
669 const int to_state = active_terminated_states[terminated_states_offset + global_index];
670
671 ccl_gpu_kernel_call(integrator_shadow_state_move(nullptr, to_state, from_state));
672 }
673}
675
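/* Convert the per-key counters accumulated during path scheduling into running offsets; the
 * result is consumed as key_prefix_sum by integrator_sorted_paths_array above. */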
676ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(
677 prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, const int num_values)
678{
679 gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values);
680}
682
683/* --------------------------------------------------------------------
684 * Adaptive sampling.
685 */
686
687ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
688 ccl_gpu_kernel_signature(adaptive_sampling_convergence_check,
689 ccl_global float *render_buffer,
690 const int sx,
691 const int sy,
692 const int sw,
693 const int sh,
694 const float threshold,
695 const int reset,
696 const int offset,
697 const int stride,
698 ccl_global uint *num_active_pixels)
699{
700 const int work_index = ccl_gpu_global_id_x();
701 const int y = work_index / sw;
702 const int x = work_index - y * sw;
703
704 bool converged = true;
705
706 if (x < sw && y < sh) {
707 converged = ccl_gpu_kernel_call(film_adaptive_sampling_convergence_check(
708 nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
709 }
710
711 /* NOTE: All threads specified in the mask must execute the intrinsic. */
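  /* ccl_gpu_ballot() returns a mask with one bit per thread in the warp, set where the
   * predicate holds; popcount() of that mask counts the non-converged pixels in the warp, so a
   * single atomic add per warp (issued by lane 0) replaces one atomic per thread. */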
712 const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
713 const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
714 if (lane_id == 0) {
715 atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
716 }
717}
719
720ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
721 ccl_gpu_kernel_signature(adaptive_sampling_filter_x,
722 ccl_global float *render_buffer,
723 const int sx,
724 const int sy,
725 const int sw,
726 const int sh,
727 const int offset,
728 const int stride)
729{
730 const int y = ccl_gpu_global_id_x();
731
732 if (y < sh) {
733 ccl_gpu_kernel_call(
734 film_adaptive_sampling_filter_x(nullptr, render_buffer, sy + y, sx, sw, offset, stride));
735 }
736}
738
739ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
740 ccl_gpu_kernel_signature(adaptive_sampling_filter_y,
741 ccl_global float *render_buffer,
742 const int sx,
743 const int sy,
744 const int sw,
745 const int sh,
746 const int offset,
747 const int stride)
748{
749 const int x = ccl_gpu_global_id_x();
750
751 if (x < sw) {
752 ccl_gpu_kernel_call(
753 film_adaptive_sampling_filter_y(nullptr, render_buffer, sx + x, sy, sh, offset, stride));
754 }
755}
757
758/* --------------------------------------------------------------------
759 * Cryptomatte.
760 */
761
762ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
763 ccl_gpu_kernel_signature(cryptomatte_postprocess,
764 ccl_global float *render_buffer,
765 const int num_pixels)
766{
767 const int pixel_index = ccl_gpu_global_id_x();
768
769 if (pixel_index < num_pixels) {
770 ccl_gpu_kernel_call(film_cryptomatte_post(nullptr, render_buffer, pixel_index));
771 }
772}
774
775/* --------------------------------------------------------------------
776 * Film.
777 */
778
779ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba,
780 const int rgba_offset,
781 const int rgba_stride,
782 const int x,
783 const int y,
784 const half4 half_pixel)
785{
786 /* Work around HIP issue with half float display, see #92972. */
787#ifdef __KERNEL_HIP__
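  /* Same destination pixel as the half4 store in the #else branch, but addressed in scalar
   * half units and written one channel at a time. */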
788 ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4;
789 out[0] = half_pixel.x;
790 out[1] = half_pixel.y;
791 out[2] = half_pixel.z;
792 out[3] = half_pixel.w;
793#else
794 ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
795 *out = half_pixel;
796#endif
797}
798
799#ifdef __KERNEL_METAL__
800
801/* Fetch into a local variable on Metal - there is minimal overhead. Templating the
802 * film_get_pass_pixel_... functions works on MSL, but not on other compilers. */
803# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
804 float local_pixel[4]; \
805 film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \
806 if (input_channel_count >= 1) { \
807 pixel[0] = local_pixel[0]; \
808 } \
809 if (input_channel_count >= 2) { \
810 pixel[1] = local_pixel[1]; \
811 } \
812 if (input_channel_count >= 3) { \
813 pixel[2] = local_pixel[2]; \
814 } \
815 if (input_channel_count >= 4) { \
816 pixel[3] = local_pixel[3]; \
817 }
818
819#else
820
821# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
822 film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel);
823
824#endif
825
826#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
827 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
828 ccl_gpu_kernel_signature(film_convert_##variant, \
829 const KernelFilmConvert kfilm_convert, \
830 ccl_global float *pixels, \
831 ccl_global float *render_buffer, \
832 int num_pixels, \
833 int width, \
834 int offset, \
835 int stride, \
836 int channel_offset, \
837 int rgba_offset, \
838 int rgba_stride) \
839 { \
840 const int render_pixel_index = ccl_gpu_global_id_x(); \
841 if (render_pixel_index >= num_pixels) { \
842 return; \
843 } \
844\
845 const int x = render_pixel_index % width; \
846 const int y = render_pixel_index / width; \
847\
848 const uint64_t buffer_pixel_index = x + y * stride; \
849 ccl_global const float *buffer = render_buffer + offset + \
850 buffer_pixel_index * kfilm_convert.pass_stride; \
851\
852 ccl_global float *pixel = pixels + channel_offset + \
853 (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
854\
855 FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \
856 } \
857 ccl_gpu_kernel_postfix \
858\
859 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
860 ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
861 const KernelFilmConvert kfilm_convert, \
862 ccl_global uchar4 *rgba, \
863 ccl_global float *render_buffer, \
864 int num_pixels, \
865 int width, \
866 int offset, \
867 int stride, \
868 int rgba_offset, \
869 int rgba_stride) \
870 { \
871 const int render_pixel_index = ccl_gpu_global_id_x(); \
872 if (render_pixel_index >= num_pixels) { \
873 return; \
874 } \
875\
876 const int x = render_pixel_index % width; \
877 const int y = render_pixel_index / width; \
878\
879 const uint64_t buffer_pixel_index = x + y * stride; \
880 ccl_global const float *buffer = render_buffer + offset + \
881 buffer_pixel_index * kfilm_convert.pass_stride; \
882\
883 float pixel[4]; \
884 film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
885\
886 if (input_channel_count == 1) { \
887 pixel[1] = pixel[2] = pixel[0]; \
888 } \
889 if (input_channel_count <= 3) { \
890 pixel[3] = 1.0f; \
891 } \
892\
893 film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
894\
895 const half4 half_pixel = float4_to_half4_display( \
896 make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
897 kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \
898 } \
899 ccl_gpu_kernel_postfix
900
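/* Each KERNEL_FILM_CONVERT_VARIANT(variant, channels) instantiation below expands to two
 * kernels: film_convert_<variant>, which writes float pixels, and
 * film_convert_<variant>_half_rgba, which writes display half4 pixels. */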
901/* 1 channel inputs */
904KERNEL_FILM_CONVERT_VARIANT(volume_majorant, 1)
905KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)
907
908/* 3 channel inputs */
909KERNEL_FILM_CONVERT_VARIANT(light_path, 3)
912
913/* 4 channel inputs */
915KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4)
916KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4)
917KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)
920
921#undef KERNEL_FILM_CONVERT_VARIANT
922
923/* --------------------------------------------------------------------
924 * Shader evaluation.
925 */
926
927/* Displacement */
928
929ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
930 ccl_gpu_kernel_signature(shader_eval_displace,
931 const ccl_global KernelShaderEvalInput *input,
932 ccl_global float *output,
933 const int offset,
934 const int work_size)
935{
936 int i = ccl_gpu_global_id_x();
937 if (i < work_size) {
938 ccl_gpu_kernel_call(kernel_displace_evaluate(nullptr, input, output, offset + i));
939 }
940}
942
943/* Background */
944
945ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
946 ccl_gpu_kernel_signature(shader_eval_background,
947 const ccl_global KernelShaderEvalInput *input,
948 ccl_global float *output,
949 const int offset,
950 const int work_size)
951{
952 int i = ccl_gpu_global_id_x();
953 if (i < work_size) {
954 ccl_gpu_kernel_call(kernel_background_evaluate(nullptr, input, output, offset + i));
955 }
956}
958
959/* Curve Shadow Transparency */
960
961ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
962 ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency,
963 const ccl_global KernelShaderEvalInput *input,
964 ccl_global float *output,
965 const int offset,
966 const int work_size)
967{
968 int i = ccl_gpu_global_id_x();
969 if (i < work_size) {
970 ccl_gpu_kernel_call(
971 kernel_curve_shadow_transparency_evaluate(nullptr, input, output, offset + i));
972 }
973}
975
976/* Volume Density. */
977
978ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
979 ccl_gpu_kernel_signature(shader_eval_volume_density,
980 const ccl_global KernelShaderEvalInput *input,
981 ccl_global float *output,
982 const int offset,
983 const int work_size)
984{
985 int i = ccl_gpu_global_id_x();
986 if (i < work_size) {
987 ccl_gpu_kernel_call(kernel_volume_density_evaluate(nullptr, input, output, offset + i));
988 }
989}
991
992/* --------------------------------------------------------------------
993 * Denoising.
994 */
995
997 ccl_gpu_kernel_signature(filter_color_preprocess,
998 ccl_global float *render_buffer,
999 const int full_x,
1000 const int full_y,
1001 const int width,
1002 const int height,
1003 const int offset,
1004 const int stride,
1005 const int pass_stride,
1006 const int pass_denoised)
1007{
1008 const int work_index = ccl_gpu_global_id_x();
1009 const int y = work_index / width;
1010 const int x = work_index - y * width;
1011
1012 if (x >= width || y >= height) {
1013 return;
1014 }
1015
1016 const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
1017 ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
1018
1019 ccl_global float *color_out = buffer + pass_denoised;
1020 color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
1021 color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
1022 color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
1023}
1025
1027 ccl_gpu_kernel_signature(filter_guiding_preprocess,
1028 ccl_global float *guiding_buffer,
1029 const int guiding_pass_stride,
1030 const int guiding_pass_albedo,
1031 const int guiding_pass_normal,
1032 const int guiding_pass_flow,
1033 const ccl_global float *render_buffer,
1034 const int render_offset,
1035 const int render_stride,
1036 const int render_pass_stride,
1037 const int render_pass_sample_count,
1038 const int render_pass_denoising_albedo,
1039 const int render_pass_denoising_normal,
1040 const int render_pass_motion,
1041 const int full_x,
1042 const int full_y,
1043 const int width,
1044 const int height,
1045 const int num_samples)
1046{
1047 const int work_index = ccl_gpu_global_id_x();
1048 const int y = work_index / width;
1049 const int x = work_index - y * width;
1050
1051 if (x >= width || y >= height) {
1052 return;
1053 }
1054
1055 const uint64_t guiding_pixel_index = x + y * width;
1056 ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
1057
1058 const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
1059 const ccl_global float *buffer = render_buffer + render_pixel_index * render_pass_stride;
1060
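  /* The sample-count pass stores an integer count reinterpreted as float bits, hence
   * __float_as_uint(); guiding passes are accumulated sums, so scale them down to per-sample
   * averages for the denoiser. */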
1061 float pixel_scale;
1062 if (render_pass_sample_count == PASS_UNUSED) {
1063 pixel_scale = 1.0f / num_samples;
1064 }
1065 else {
1066 pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]);
1067 }
1068
1069 /* Albedo pass. */
1070 if (guiding_pass_albedo != PASS_UNUSED) {
1071 kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
1072
1073 const ccl_global float *albedo_in = buffer + render_pass_denoising_albedo;
1074 ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
1075
1076 albedo_out[0] = albedo_in[0] * pixel_scale;
1077 albedo_out[1] = albedo_in[1] * pixel_scale;
1078 albedo_out[2] = albedo_in[2] * pixel_scale;
1079 }
1080
1081 /* Normal pass. */
1082 if (guiding_pass_normal != PASS_UNUSED) {
1083 kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
1084
1085 const ccl_global float *normal_in = buffer + render_pass_denoising_normal;
1086 ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;
1087
1088 normal_out[0] = normal_in[0] * pixel_scale;
1089 normal_out[1] = normal_in[1] * pixel_scale;
1090 normal_out[2] = normal_in[2] * pixel_scale;
1091 }
1092
1093 /* Flow pass. */
1094 if (guiding_pass_flow != PASS_UNUSED) {
1095 kernel_assert(render_pass_motion != PASS_UNUSED);
1096
1097 const ccl_global float *motion_in = buffer + render_pass_motion;
1098 ccl_global float *flow_out = guiding_pixel + guiding_pass_flow;
1099
1100 flow_out[0] = -motion_in[0] * pixel_scale;
1101 flow_out[1] = -motion_in[1] * pixel_scale;
1102 }
1103}
1105
1107 ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo,
1108 ccl_global float *guiding_buffer,
1109 const int guiding_pass_stride,
1110 const int guiding_pass_albedo,
1111 const int width,
1112 const int height)
1113{
1114 kernel_assert(guiding_pass_albedo != PASS_UNUSED);
1115
1116 const int work_index = ccl_gpu_global_id_x();
1117 const int y = work_index / width;
1118 const int x = work_index - y * width;
1119
1120 if (x >= width || y >= height) {
1121 return;
1122 }
1123
1124 const uint64_t guiding_pixel_index = x + y * width;
1125 ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
1126
1127 ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
1128
1129 albedo_out[0] = 0.5f;
1130 albedo_out[1] = 0.5f;
1131 albedo_out[2] = 0.5f;
1132}
1134
1136 ccl_gpu_kernel_signature(filter_color_postprocess,
1137 ccl_global float *render_buffer,
1138 const int full_x,
1139 const int full_y,
1140 const int width,
1141 const int height,
1142 const int offset,
1143 const int stride,
1144 const int pass_stride,
1145 const int num_samples,
1146 const int pass_noisy,
1147 const int pass_denoised,
1148 const int pass_sample_count,
1149 const int num_components,
1150 const int use_compositing)
1151{
1152 const int work_index = ccl_gpu_global_id_x();
1153 const int y = work_index / width;
1154 const int x = work_index - y * width;
1155
1156 if (x >= width || y >= height) {
1157 return;
1158 }
1159
1160 const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
1161 ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
1162
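  /* Denoised values are per-sample averages; multiply by the per-pixel sample count (or by
   * num_samples when there is no sample-count pass) to restore the accumulated-sum convention
   * of the render buffer. */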
1163 float pixel_scale;
1164 if (pass_sample_count == PASS_UNUSED) {
1165 pixel_scale = num_samples;
1166 }
1167 else {
1168 pixel_scale = __float_as_uint(buffer[pass_sample_count]);
1169 }
1170
1171 ccl_global float *denoised_pixel = buffer + pass_denoised;
1172
1173 denoised_pixel[0] *= pixel_scale;
1174 denoised_pixel[1] *= pixel_scale;
1175 denoised_pixel[2] *= pixel_scale;
1176
1177 if (num_components == 3) {
1178 /* Pass without alpha channel. */
1179 }
1180 else if (!use_compositing) {
1181 /* Currently compositing passes are either 3-component (derived by dividing light passes)
1182 * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it
1183 * simplifies logic and avoids extra memory allocation. */
1184 const ccl_global float *noisy_pixel = buffer + pass_noisy;
1185 denoised_pixel[3] = noisy_pixel[3];
1186 }
1187 else {
1188 /* Assigning to zero since this is a default alpha value for 3-component passes, and it
1189 * is an opaque pixel for 4 component passes. */
1190 denoised_pixel[3] = 0;
1191 }
1192}
1194
1196 ccl_gpu_kernel_signature(filter_color_flip_y,
1197 ccl_global float *render_buffer,
1198 const int full_x,
1199 const int full_y,
1200 const int width,
1201 const int height,
1202 const int offset,
1203 const int stride,
1204 const int pass_stride,
1205 const int pass_denoised)
1206{
1207 const int work_index = ccl_gpu_global_id_x();
1208 const int y = work_index / width;
1209 const int x = work_index - y * width;
1210
1211 if (x >= width || y >= height / 2) {
1212 return;
1213 }
1214
1215 const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
1216 ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride + pass_denoised;
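  /* buffer points at row y of the denoised pass, so adding (height - 1 - y * 2) rows lands on
   * the mirror row (height - 1 - y); only rows in the top half are processed, so each pair of
   * rows is swapped exactly once. */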
1217 ccl_global float *buffer_flipped = buffer + (height - 1 - y * 2) * stride * pass_stride;
1218
1219 float3 temp;
1220 temp.x = buffer[0];
1221 temp.y = buffer[1];
1222 temp.z = buffer[2];
1223 buffer[0] = buffer_flipped[0];
1224 buffer[1] = buffer_flipped[1];
1225 buffer[2] = buffer_flipped[2];
1226 buffer_flipped[0] = temp.x;
1227 buffer_flipped[1] = temp.y;
1228 buffer_flipped[2] = temp.z;
1229}
1231
1232/* --------------------------------------------------------------------
1233 * Shadow catcher.
1234 */
1235
1237 ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits,
1238 const int num_states,
1239 ccl_global uint *num_possible_splits)
1240{
1241 const int state = ccl_gpu_global_id_x();
1242
1243 bool can_split = false;
1244
1245 if (state < num_states) {
1246 can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(state));
1247 }
1248
1249 /* NOTE: All threads specified in the mask must execute the intrinsic. */
1250 const auto can_split_mask = ccl_gpu_ballot(can_split);
1251 const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
1252 if (lane_id == 0) {
1253 atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
1254 }
1255}
1257
1258/* --------------------------------------------------------------------
1259 * Volume Scattering Probability Guiding.
1260 */
1261
1265 const int sx,
1266 const int sy,
1267 const int sw,
1268 const int sh,
1269 const int offset,
1270 const int stride)
1271{
1272 const int work_index = ccl_gpu_global_id_x();
1273 const int y = work_index / sw;
1274 const int x = work_index % sw;
1275
1276 if (y < sh) {
1277 ccl_gpu_kernel_call(volume_guiding_filter_x(
1278 nullptr, render_buffer, sy + y, sx + x, sx, sx + sw, offset, stride));
1279 }
1280}
1282
1286 const int sx,
1287 const int sy,
1288 const int sw,
1289 const int sh,
1290 const int offset,
1291 const int stride)
1292{
1293 const int x = ccl_gpu_global_id_x();
1294
1295 if (x < sw) {
1296 ccl_gpu_kernel_call(
1297 volume_guiding_filter_y(nullptr, render_buffer, sx + x, sy, sy + sh, offset, stride));
1298 }
1299}