Blender V4.3
kernel/device/gpu/kernel.h
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

/* Common GPU kernels. */

#include "kernel/device/gpu/parallel_active_index.h"
#include "kernel/device/gpu/parallel_prefix_sum.h"
#include "kernel/device/gpu/parallel_sorted_index.h"

#include "kernel/sample/lcg.h"

/* Include constant tables before entering Metal's context class scope (context_begin.h) */
#include "kernel/tables.h"

#ifdef __KERNEL_METAL__
#  include "kernel/device/metal/context_begin.h"
#elif defined(__KERNEL_ONEAPI__)
#  include "kernel/device/oneapi/context_begin.h"
#endif

#include "kernel/device/gpu/work_stealing.h"

#include "kernel/integrator/state.h"
#include "kernel/integrator/state_flow.h"
#include "kernel/integrator/state_util.h"

#include "kernel/integrator/init_from_bake.h"
#include "kernel/integrator/init_from_camera.h"
#include "kernel/integrator/intersect_closest.h"
#include "kernel/integrator/intersect_dedicated_light.h"
#include "kernel/integrator/intersect_shadow.h"
#include "kernel/integrator/intersect_subsurface.h"
#include "kernel/integrator/intersect_volume_stack.h"
#include "kernel/integrator/shade_background.h"
#include "kernel/integrator/shade_dedicated_light.h"
#include "kernel/integrator/shade_light.h"
#include "kernel/integrator/shade_shadow.h"
#include "kernel/integrator/shade_surface.h"
#include "kernel/integrator/shade_volume.h"

#include "kernel/bake/bake.h"

#include "kernel/film/adaptive_sampling.h"

#ifdef __KERNEL_METAL__
#  include "kernel/device/metal/context_end.h"
#elif defined(__KERNEL_ONEAPI__)
#  include "kernel/device/oneapi/context_end.h"
#endif

#include "kernel/film/read.h"

#if defined(__HIPRT__)
#  include "kernel/device/hiprt/hiprt_kernels.h"
#endif
/* --------------------------------------------------------------------
 * Integrator.
 */

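/* Clear the queued kernel flags of all path and shadow path states, so that
 * no kernels are scheduled for them. */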
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_reset, int num_states)
{
  const int state = ccl_gpu_global_id_x();

  if (state < num_states) {
    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
  }
}
ccl_gpu_kernel_postfix

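/* Generate camera rays: each thread handles one sample of one pixel from a
 * scheduled work tile and initializes the corresponding path state. */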
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_init_from_camera,
                             ccl_global KernelWorkTile *tiles,
                             const int num_tiles,
                             ccl_global float *render_buffer,
                             const int max_tile_work_size)
{
  const int work_index = ccl_gpu_global_id_x();

  if (work_index >= max_tile_work_size * num_tiles) {
    return;
  }

  const int tile_index = work_index / max_tile_work_size;
  const int tile_work_index = work_index - tile_index * max_tile_work_size;

  ccl_global const KernelWorkTile *tile = &tiles[tile_index];

  if (tile_work_index >= tile->work_size) {
    return;
  }

  const int state = tile->path_index_offset + tile_work_index;

  uint x, y, sample;
  ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));

  ccl_gpu_kernel_call(
      integrator_init_from_camera(NULL, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_init_from_bake,
                             ccl_global KernelWorkTile *tiles,
                             const int num_tiles,
                             ccl_global float *render_buffer,
                             const int max_tile_work_size)
{
  const int work_index = ccl_gpu_global_id_x();

  if (work_index >= max_tile_work_size * num_tiles) {
    return;
  }

  const int tile_index = work_index / max_tile_work_size;
  const int tile_work_index = work_index - tile_index * max_tile_work_size;

  ccl_global const KernelWorkTile *tile = &tiles[tile_index];

  if (tile_work_index >= tile->work_size) {
    return;
  }

  const int state = tile->path_index_offset + tile_work_index;

  uint x, y, sample;
  ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));

  ccl_gpu_kernel_call(
      integrator_init_from_bake(NULL, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel_postfix

#if !defined(__HIPRT__)

/* Intersection kernels need access to the kernel handler for specialization constants to work
 * properly. */
#  ifdef __KERNEL_ONEAPI__
#    include "kernel/device/oneapi/context_intersect_begin.h"
#  endif

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_intersect_closest,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_intersect_shadow,
                             ccl_global const int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_intersect_subsurface,
                             ccl_global const int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_intersect_volume_stack,
                             ccl_global const int *path_index_array,
                             const int work_size)
{
#  ifdef __VOLUME__
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state));
  }
#  endif
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_intersect_dedicated_light,
                             ccl_global const int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_intersect_dedicated_light(NULL, state));
  }
}
ccl_gpu_kernel_postfix

#  ifdef __KERNEL_ONEAPI__
#    include "kernel/device/oneapi/context_intersect_end.h"
#  endif

#endif

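/* Shading kernels: each thread looks up its state index, either directly or
 * through the optional path_index_array, and runs the matching shade function. */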
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_background,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_light,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_shadow,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_surface,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
#endif

#if !defined(__HIPRT__)

/* Kernels using intersections need access to the kernel handler for specialization constants to
 * work properly. */
#  ifdef __KERNEL_ONEAPI__
#    include "kernel/device/oneapi/context_intersect_begin.h"
#  endif

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;

#  if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
    KernelGlobals kg = NULL;
    /* Workaround Ambient Occlusion and Bevel nodes not working with Metal.
     * Dummy offset should not affect result, but somehow fixes bug! */
    kg += __dummy_constant;
    ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer));
#  else
    ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer));
#  endif
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_surface_mnee,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_surface_mnee(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

#  ifdef __KERNEL_ONEAPI__
#    include "kernel/device/oneapi/context_intersect_end.h"
#  endif

#endif

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_volume,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_dedicated_light,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_dedicated_light(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

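/* Build compact arrays of path state indices (queued, active, terminated),
 * which are used to schedule the next batch of kernels. */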
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_queued_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int kernel_index)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int kernel_index)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_active_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_terminated_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int indices_offset)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);

  gpu_parallel_active_index_array(
      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int indices_offset)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);

  gpu_parallel_active_index_array(
      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

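/* Build an array of path state indices sorted by shader sort key, so that
 * surface shading kernels run on more coherent batches of work. */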
ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_sorted_paths_array,
                             int num_states,
                             int num_states_limit,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             ccl_global int *key_counter,
                             ccl_global int *key_prefix_sum,
                             int kernel_index)
{
  ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
                            INTEGRATOR_STATE(state, path, shader_sort_key) :
                            GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

  const uint state_index = ccl_gpu_global_id_x();
  gpu_parallel_sorted_index_array(state_index,
                                  num_states,
                                  num_states_limit,
                                  indices,
                                  num_indices,
                                  key_counter,
                                  key_prefix_sum,
                                  ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

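/* Local-atomic sort of path states by shader key: the bucket pass counts
 * states per key within each partition and produces key offsets, and the
 * write pass then scatters the state indices into their sorted positions.
 * Only compiled when __KERNEL_LOCAL_ATOMIC_SORT__ is defined. */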
/* oneAPI version needs the local_mem accessor in the arguments. */
#ifdef __KERNEL_ONEAPI__
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
                             int num_states,
                             int partition_size,
                             int num_states_limit,
                             ccl_global int *indices,
                             int kernel_index,
                             sycl::local_accessor<int> &local_mem)
#else
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
                             int num_states,
                             int partition_size,
                             int num_states_limit,
                             ccl_global int *indices,
                             int kernel_index)
#endif
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
  ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
                                           kernel_integrator_state.path.queued_kernel;
  ccl_global uint *d_shader_sort_key = (ccl_global uint *)
                                           kernel_integrator_state.path.shader_sort_key;
  ccl_global int *key_offsets = (ccl_global int *)
                                    kernel_integrator_state.sort_partition_key_offsets;

#  ifdef __KERNEL_METAL__
  int max_shaders = context.launch_params_metal.data.max_shaders;
#  endif

#  ifdef __KERNEL_ONEAPI__
  /* The Metal backend doesn't have these particular ccl_gpu_* defines and the shared sort
   * kernel code is written in terms of metal_*, so define the metal_* equivalents here for
   * oneAPI to stay compatible with these kernels. */
  int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
  int metal_local_id = ccl_gpu_thread_idx_x;
  int metal_local_size = ccl_gpu_block_dim_x;
  int metal_grid_id = ccl_gpu_block_idx_x;
  /* There is no difference here between different access decorations, as we are requesting
   * a raw pointer immediately, so the simplest decoration option is used (no decoration). */
  ccl_gpu_shared int *threadgroup_array =
      local_mem.get_multi_ptr<sycl::access::decorated::no>().get();
#  endif

  gpu_parallel_sort_bucket_pass(num_states,
                                partition_size,
                                max_shaders,
                                kernel_index,
                                d_queued_kernel,
                                d_shader_sort_key,
                                key_offsets,
                                (ccl_gpu_shared int *)threadgroup_array,
                                metal_local_id,
                                metal_local_size,
                                metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix

/* oneAPI version needs the local_mem accessor in the arguments. */
#ifdef __KERNEL_ONEAPI__
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_sort_write_pass,
                             int num_states,
                             int partition_size,
                             int num_states_limit,
                             ccl_global int *indices,
                             int kernel_index,
                             sycl::local_accessor<int> &local_mem)
#else
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_sort_write_pass,
                             int num_states,
                             int partition_size,
                             int num_states_limit,
                             ccl_global int *indices,
                             int kernel_index)
#endif

{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
  ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
                                           kernel_integrator_state.path.queued_kernel;
  ccl_global uint *d_shader_sort_key = (ccl_global uint *)
                                           kernel_integrator_state.path.shader_sort_key;
  ccl_global int *key_offsets = (ccl_global int *)
                                    kernel_integrator_state.sort_partition_key_offsets;

#  ifdef __KERNEL_METAL__
  int max_shaders = context.launch_params_metal.data.max_shaders;
#  endif

#  ifdef __KERNEL_ONEAPI__
  /* The Metal backend doesn't have these particular ccl_gpu_* defines and the shared sort
   * kernel code is written in terms of metal_*, so define the metal_* equivalents here for
   * oneAPI to stay compatible with these kernels. */
  int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
  int metal_local_id = ccl_gpu_thread_idx_x;
  int metal_local_size = ccl_gpu_block_dim_x;
  int metal_grid_id = ccl_gpu_block_idx_x;
  /* There is no difference here between different access decorations, as we are requesting
   * a raw pointer immediately, so the simplest decoration option is used (no decoration). */
  ccl_gpu_shared int *threadgroup_array =
      local_mem.get_multi_ptr<sycl::access::decorated::no>().get();
#  endif

  gpu_parallel_sort_write_pass(num_states,
                               partition_size,
                               max_shaders,
                               kernel_index,
                               num_states_limit,
                               indices,
                               d_queued_kernel,
                               d_shader_sort_key,
                               key_offsets,
                               (ccl_gpu_shared int *)threadgroup_array,
                               metal_local_id,
                               metal_local_size,
                               metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix

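/* Path state compaction: gather active states that sit above num_active_paths
 * and move them into terminated slots below it, so that all active states end
 * up contiguous at the start of the state array. */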
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_compact_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int num_active_paths)
{
  ccl_gpu_kernel_lambda((state >= num_active_paths) &&
                            (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
                        int num_active_paths);
  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_compact_states,
                             ccl_global const int *active_terminated_states,
                             const int active_states_offset,
                             const int terminated_states_offset,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int from_state = active_terminated_states[active_states_offset + global_index];
    const int to_state = active_terminated_states[terminated_states_offset + global_index];

    ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int num_active_paths)
{
  ccl_gpu_kernel_lambda((state >= num_active_paths) &&
                            (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
                        int num_active_paths);
  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_compact_shadow_states,
                             ccl_global const int *active_terminated_states,
                             const int active_states_offset,
                             const int terminated_states_offset,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int from_state = active_terminated_states[active_states_offset + global_index];
    const int to_state = active_terminated_states[terminated_states_offset + global_index];

    ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state));
  }
}
ccl_gpu_kernel_postfix

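/* Prefix sum over the per-key counters; the result is consumed as
 * key_prefix_sum by integrator_sorted_paths_array above. */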
ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(
    prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values)
{
  gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values);
}
ccl_gpu_kernel_postfix

/* --------------------------------------------------------------------
 * Adaptive sampling.
 */

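/* Per-pixel convergence check. The number of still-active pixels is counted
 * with a warp ballot plus popcount so that only one lane per warp issues the
 * atomic add. */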
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(adaptive_sampling_convergence_check,
                             ccl_global float *render_buffer,
                             int sx,
                             int sy,
                             int sw,
                             int sh,
                             float threshold,
                             int reset,
                             int offset,
                             int stride,
                             ccl_global uint *num_active_pixels)
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / sw;
  const int x = work_index - y * sw;

  bool converged = true;

  if (x < sw && y < sh) {
    converged = ccl_gpu_kernel_call(film_adaptive_sampling_convergence_check(
        nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
  }

  /* NOTE: All threads specified in the mask must execute the intrinsic. */
  const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
  const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
  if (lane_id == 0) {
    atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(adaptive_sampling_filter_x,
                             ccl_global float *render_buffer,
                             int sx,
                             int sy,
                             int sw,
                             int sh,
                             int offset,
                             int stride)
{
  const int y = ccl_gpu_global_id_x();

  if (y < sh) {
    ccl_gpu_kernel_call(
        film_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(adaptive_sampling_filter_y,
                             ccl_global float *render_buffer,
                             int sx,
                             int sy,
                             int sw,
                             int sh,
                             int offset,
                             int stride)
{
  const int x = ccl_gpu_global_id_x();

  if (x < sw) {
    ccl_gpu_kernel_call(
        film_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
  }
}
ccl_gpu_kernel_postfix

/* --------------------------------------------------------------------
 * Cryptomatte.
 */

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(cryptomatte_postprocess,
                             ccl_global float *render_buffer,
                             int num_pixels)
{
  const int pixel_index = ccl_gpu_global_id_x();

  if (pixel_index < num_pixels) {
    ccl_gpu_kernel_call(film_cryptomatte_post(NULL, render_buffer, pixel_index));
  }
}
ccl_gpu_kernel_postfix

/* --------------------------------------------------------------------
 * Film.
 */

ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba,
                                                          const int rgba_offset,
                                                          const int rgba_stride,
                                                          const int x,
                                                          const int y,
                                                          const half4 half_pixel)
{
  /* Work around HIP issue with half float display, see #92972. */
#ifdef __KERNEL_HIP__
  ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4;
  out[0] = half_pixel.x;
  out[1] = half_pixel.y;
  out[2] = half_pixel.z;
  out[3] = half_pixel.w;
#else
  ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
  *out = half_pixel;
#endif
}

#ifdef __KERNEL_METAL__

/* Fetch into a local variable on Metal - there is minimal overhead. Templating the
 * film_get_pass_pixel_... functions works on MSL, but not on other compilers. */
#  define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
    float local_pixel[4]; \
    film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \
    if (input_channel_count >= 1) { \
      pixel[0] = local_pixel[0]; \
    } \
    if (input_channel_count >= 2) { \
      pixel[1] = local_pixel[1]; \
    } \
    if (input_channel_count >= 3) { \
      pixel[2] = local_pixel[2]; \
    } \
    if (input_channel_count >= 4) { \
      pixel[3] = local_pixel[3]; \
    }

#else

#  define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
    film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel);

#endif

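/* Each film convert variant expands to two kernels: one that writes float
 * pixels and one that converts to half4 for display (the *_half_rgba kernel). */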
#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
      ccl_gpu_kernel_signature(film_convert_##variant, \
                               const KernelFilmConvert kfilm_convert, \
                               ccl_global float *pixels, \
                               ccl_global float *render_buffer, \
                               int num_pixels, \
                               int width, \
                               int offset, \
                               int stride, \
                               int channel_offset, \
                               int rgba_offset, \
                               int rgba_stride) \
  { \
    const int render_pixel_index = ccl_gpu_global_id_x(); \
    if (render_pixel_index >= num_pixels) { \
      return; \
    } \
\
    const int x = render_pixel_index % width; \
    const int y = render_pixel_index / width; \
\
    const uint64_t buffer_pixel_index = x + y * stride; \
    ccl_global const float *buffer = render_buffer + offset + \
                                     buffer_pixel_index * kfilm_convert.pass_stride; \
\
    ccl_global float *pixel = pixels + channel_offset + \
                              (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
\
    FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \
  } \
  ccl_gpu_kernel_postfix \
\
  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
      ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
                               const KernelFilmConvert kfilm_convert, \
                               ccl_global uchar4 *rgba, \
                               ccl_global float *render_buffer, \
                               int num_pixels, \
                               int width, \
                               int offset, \
                               int stride, \
                               int rgba_offset, \
                               int rgba_stride) \
  { \
    const int render_pixel_index = ccl_gpu_global_id_x(); \
    if (render_pixel_index >= num_pixels) { \
      return; \
    } \
\
    const int x = render_pixel_index % width; \
    const int y = render_pixel_index / width; \
\
    const uint64_t buffer_pixel_index = x + y * stride; \
    ccl_global const float *buffer = render_buffer + offset + \
                                     buffer_pixel_index * kfilm_convert.pass_stride; \
\
    float pixel[4]; \
    film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
\
    if (input_channel_count == 1) { \
      pixel[1] = pixel[2] = pixel[0]; \
    } \
    if (input_channel_count <= 3) { \
      pixel[3] = 1.0f; \
    } \
\
    film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
\
    const half4 half_pixel = float4_to_half4_display( \
        make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
    kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \
  } \
  ccl_gpu_kernel_postfix

/* 1 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(depth, 1)
KERNEL_FILM_CONVERT_VARIANT(mist, 1)
KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)
KERNEL_FILM_CONVERT_VARIANT(float, 1)

/* 3 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(light_path, 3)
KERNEL_FILM_CONVERT_VARIANT(float3, 3)

/* 4 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(motion, 4)
KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4)
KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4)
KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)
KERNEL_FILM_CONVERT_VARIANT(combined, 4)
KERNEL_FILM_CONVERT_VARIANT(float4, 4)

#undef KERNEL_FILM_CONVERT_VARIANT

/* --------------------------------------------------------------------
 * Shader evaluation.
 */

/* Displacement */

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(shader_eval_displace,
                             ccl_global KernelShaderEvalInput *input,
                             ccl_global float *output,
                             const int offset,
                             const int work_size)
{
  int i = ccl_gpu_global_id_x();
  if (i < work_size) {
    ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i));
  }
}
ccl_gpu_kernel_postfix

/* Background */

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(shader_eval_background,
                             ccl_global KernelShaderEvalInput *input,
                             ccl_global float *output,
                             const int offset,
                             const int work_size)
{
  int i = ccl_gpu_global_id_x();
  if (i < work_size) {
    ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i));
  }
}
ccl_gpu_kernel_postfix

/* Curve Shadow Transparency */

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency,
                             ccl_global KernelShaderEvalInput *input,
                             ccl_global float *output,
                             const int offset,
                             const int work_size)
{
  int i = ccl_gpu_global_id_x();
  if (i < work_size) {
    ccl_gpu_kernel_call(
        kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i));
  }
}
ccl_gpu_kernel_postfix

/* --------------------------------------------------------------------
 * Denoising.
 */

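/* Clamp the noisy color pass to a fixed range before it is handed to the
 * denoiser. */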
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(filter_color_preprocess,
                             ccl_global float *render_buffer,
                             int full_x,
                             int full_y,
                             int width,
                             int height,
                             int offset,
                             int stride,
                             int pass_stride,
                             int pass_denoised)
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
  ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;

  ccl_global float *color_out = buffer + pass_denoised;
  color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
  color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
  color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
}
ccl_gpu_kernel_postfix

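/* Fill the denoiser guiding buffer: albedo and normal passes are scaled by the
 * per-pixel sample count, and the motion pass is negated to produce flow. */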
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(filter_guiding_preprocess,
                             ccl_global float *guiding_buffer,
                             int guiding_pass_stride,
                             int guiding_pass_albedo,
                             int guiding_pass_normal,
                             int guiding_pass_flow,
                             ccl_global const float *render_buffer,
                             int render_offset,
                             int render_stride,
                             int render_pass_stride,
                             int render_pass_sample_count,
                             int render_pass_denoising_albedo,
                             int render_pass_denoising_normal,
                             int render_pass_motion,
                             int full_x,
                             int full_y,
                             int width,
                             int height,
                             int num_samples)
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t guiding_pixel_index = x + y * width;
  ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;

  const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
  ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride;

  float pixel_scale;
  if (render_pass_sample_count == PASS_UNUSED) {
    pixel_scale = 1.0f / num_samples;
  }
  else {
    pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]);
  }

  /* Albedo pass. */
  if (guiding_pass_albedo != PASS_UNUSED) {
    kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);

    ccl_global const float *albedo_in = buffer + render_pass_denoising_albedo;
    ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;

    albedo_out[0] = albedo_in[0] * pixel_scale;
    albedo_out[1] = albedo_in[1] * pixel_scale;
    albedo_out[2] = albedo_in[2] * pixel_scale;
  }

  /* Normal pass. */
  if (guiding_pass_normal != PASS_UNUSED) {
    kernel_assert(render_pass_denoising_normal != PASS_UNUSED);

    ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
    ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;

    normal_out[0] = normal_in[0] * pixel_scale;
    normal_out[1] = normal_in[1] * pixel_scale;
    normal_out[2] = normal_in[2] * pixel_scale;
  }

  /* Flow pass. */
  if (guiding_pass_flow != PASS_UNUSED) {
    kernel_assert(render_pass_motion != PASS_UNUSED);

    ccl_global const float *motion_in = buffer + render_pass_motion;
    ccl_global float *flow_out = guiding_pixel + guiding_pass_flow;

    flow_out[0] = -motion_in[0] * pixel_scale;
    flow_out[1] = -motion_in[1] * pixel_scale;
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo,
                             ccl_global float *guiding_buffer,
                             int guiding_pass_stride,
                             int guiding_pass_albedo,
                             int width,
                             int height)
{
  kernel_assert(guiding_pass_albedo != PASS_UNUSED);

  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t guiding_pixel_index = x + y * width;
  ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;

  ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;

  albedo_out[0] = 0.5f;
  albedo_out[1] = 0.5f;
  albedo_out[2] = 0.5f;
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(filter_color_postprocess,
                             ccl_global float *render_buffer,
                             int full_x,
                             int full_y,
                             int width,
                             int height,
                             int offset,
                             int stride,
                             int pass_stride,
                             int num_samples,
                             int pass_noisy,
                             int pass_denoised,
                             int pass_sample_count,
                             int num_components,
                             int use_compositing)
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
  ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;

  float pixel_scale;
  if (pass_sample_count == PASS_UNUSED) {
    pixel_scale = num_samples;
  }
  else {
    pixel_scale = __float_as_uint(buffer[pass_sample_count]);
  }

  ccl_global float *denoised_pixel = buffer + pass_denoised;

  denoised_pixel[0] *= pixel_scale;
  denoised_pixel[1] *= pixel_scale;
  denoised_pixel[2] *= pixel_scale;

  if (num_components == 3) {
    /* Pass without alpha channel. */
  }
  else if (!use_compositing) {
    /* Currently compositing passes are either 3-component (derived by dividing light passes)
     * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it
     * simplifies logic and avoids extra memory allocation. */
    ccl_global const float *noisy_pixel = buffer + pass_noisy;
    denoised_pixel[3] = noisy_pixel[3];
  }
  else {
    /* Assigning to zero since this is a default alpha value for 3-component passes, and it
     * is an opaque pixel for 4 component passes. */
    denoised_pixel[3] = 0;
  }
}
ccl_gpu_kernel_postfix

/* --------------------------------------------------------------------
 * Shadow catcher.
 */

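/* Count how many active states could still split off a shadow catcher path,
 * using a warp ballot plus popcount so only one lane per warp issues the
 * atomic add. */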
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits,
                             int num_states,
                             ccl_global uint *num_possible_splits)
{
  const int state = ccl_gpu_global_id_x();

  bool can_split = false;

  if (state < num_states) {
    can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(NULL, state));
  }

  /* NOTE: All threads specified in the mask must execute the intrinsic. */
  const auto can_split_mask = ccl_gpu_ballot(can_split);
  const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
  if (lane_id == 0) {
    atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
  }
}
ccl_gpu_kernel_postfix