Blender V4.5
kernel/device/gpu/kernel.h
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

/* Common GPU kernels. */

#include "kernel/sample/lcg.h"

/* Include constant tables before entering Metal's context class scope (context_begin.h) */
#include "kernel/tables.h"

#ifdef __KERNEL_METAL__
#elif defined(__KERNEL_ONEAPI__)
#endif

#include "kernel/bake/bake.h"

#ifdef __KERNEL_METAL__
#elif defined(__KERNEL_ONEAPI__)
#endif

#include "kernel/film/read.h"

#if defined(__HIPRT__)
#endif
/* --------------------------------------------------------------------
 * Integrator.
 */

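/* Reset the queued-kernel flag of every path and shadow-path state so the
 * scheduler starts from a clean state array. */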
    ccl_gpu_kernel_signature(integrator_reset, const int num_states)
{
  const int state = ccl_gpu_global_id_x();

  if (state < num_states) {
    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
  }
}

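/* Initialize one path state per tile work item: map the flat work index to a
 * tile and an index within that tile, then derive the path state index from
 * the tile's path_index_offset. */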
                             const int num_tiles,
{
  const int work_index = ccl_gpu_global_id_x();

  if (work_index >= max_tile_work_size * num_tiles) {
    return;
  }

  const int tile_index = work_index / max_tile_work_size;
  const int tile_work_index = work_index - tile_index * max_tile_work_size;


  if (tile_work_index >= tile->work_size) {
    return;
  }

  const int state = tile->path_index_offset + tile_work_index;


}

                             const int num_tiles,
                             const int max_tile_work_size)
{
  const int work_index = ccl_gpu_global_id_x();

  if (work_index >= max_tile_work_size * num_tiles) {
    return;
  }

  const int tile_index = work_index / max_tile_work_size;
  const int tile_work_index = work_index - tile_index * max_tile_work_size;


  if (tile_work_index >= tile->work_size) {
    return;
  }

  const int state = tile->path_index_offset + tile_work_index;

  uint x, y, sample;

}

#if !defined(__HIPRT__)

/* Intersection kernels need access to the kernel handler for specialization constants to work
 * properly. */
# ifdef __KERNEL_ONEAPI__
# endif

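/* Each intersection kernel maps one GPU thread to one queued path state. The
 * state index comes from path_index_array when paths have been compacted or
 * sorted, and is the raw thread index otherwise. */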
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             const ccl_global int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             const ccl_global int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             const ccl_global int *path_index_array,
                             const int work_size)
{
# ifdef __VOLUME__
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
# endif
}

                             const ccl_global int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

# ifdef __KERNEL_ONEAPI__
# endif

#endif

                             const ccl_global int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             const ccl_global int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             const ccl_global int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             const ccl_global int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
#endif

#if !defined(__HIPRT__)

/* Kernels using intersections need access to the kernel handler for specialization constants to
 * work properly. */
# ifdef __KERNEL_ONEAPI__
# endif

                             const ccl_global int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;

# if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
    KernelGlobals kg = nullptr;
    /* Work around Ambient Occlusion and Bevel nodes not working with Metal.
     * The dummy offset should not affect the result, but somehow fixes the bug! */
    kg += __dummy_constant;
# else
# endif
  }
}

                             const ccl_global int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

# ifdef __KERNEL_ONEAPI__
# endif

#endif

                             const ccl_global int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             const ccl_global int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

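/* Build a compact array of the indices of all states whose queued kernel
 * matches kernel_index, using a parallel active-index scan over the states. */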
    ccl_gpu_kernel_signature(integrator_queued_paths_array,
                             const int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             const int kernel_index)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}

    ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
                             const int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             const int kernel_index)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}

    ccl_gpu_kernel_signature(integrator_active_paths_array,
                             const int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}

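/* Collect the indices of terminated states (no queued kernel). The
 * indices_offset lets these share one index buffer with the active-path
 * indices gathered above. */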
    ccl_gpu_kernel_signature(integrator_terminated_paths_array,
                             const int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             const int indices_offset)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);

  gpu_parallel_active_index_array(
      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}

    ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
                             const int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             const int indices_offset)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);

  gpu_parallel_active_index_array(
      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}

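/* Build an index array of the states queued for kernel_index, ordered by
 * shader_sort_key so that threads shading the same shader run next to each
 * other; states queued for other kernels are excluded from the output. */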
    ccl_gpu_kernel_signature(integrator_sorted_paths_array,
                             const int num_states,
                             const int num_states_limit,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             ccl_global int *key_counter,
                             ccl_global int *key_prefix_sum,
                             const int kernel_index)
{
  ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
                            INTEGRATOR_STATE(state, path, shader_sort_key) :
                            GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

  const uint state_index = ccl_gpu_global_id_x();
  gpu_parallel_sorted_index_array(state_index,
                                  num_states,
                                  num_states_limit,
                                  indices,
                                  num_indices,
                                  key_counter,
                                  key_prefix_sum,
                                  ccl_gpu_kernel_lambda_pass);
}

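/* The bucket and write passes below sort path states by shader key in two
 * steps (count per-partition buckets, then scatter the indices), using
 * threadgroup-local atomics. They are only compiled for backends that define
 * __KERNEL_LOCAL_ATOMIC_SORT__. */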
/* oneAPI version needs the local_mem accessor in the arguments. */
#ifdef __KERNEL_ONEAPI__
    ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
                             const int num_states,
                             const int partition_size,
                             const int num_states_limit,
                             ccl_global int *indices,
                             const int kernel_index,
                             sycl::local_accessor<int> &local_mem)
#else
    ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
                             const int num_states,
                             const int partition_size,
                             const int num_states_limit,
                             ccl_global int *indices,
                             const int kernel_index)
#endif
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
  ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
                                           kernel_integrator_state.path.queued_kernel;
  ccl_global uint *d_shader_sort_key = (ccl_global uint *)
                                           kernel_integrator_state.path.shader_sort_key;
  ccl_global int *key_offsets = (ccl_global int *)
                                    kernel_integrator_state.sort_partition_key_offsets;

# ifdef __KERNEL_METAL__
  int max_shaders = context.launch_params_metal.data.max_shaders;
# endif

# ifdef __KERNEL_ONEAPI__
  /* The Metal backend doesn't have these particular ccl_gpu_* defines, and the shared kernel
   * code uses the metal_* names, so define equivalents here to stay compatible with it. */
  int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
  int metal_local_id = ccl_gpu_thread_idx_x;
  int metal_local_size = ccl_gpu_block_dim_x;
  int metal_grid_id = ccl_gpu_block_idx_x;
  /* There is no difference here between different access decorations, as we are requesting
   * a raw pointer immediately, so the simplest decoration option is used (no decoration). */
  ccl_gpu_shared int *threadgroup_array =
      local_mem.get_multi_ptr<sycl::access::decorated::no>().get();
# endif

  gpu_parallel_sort_bucket_pass(num_states,
                                partition_size,
                                max_shaders,
                                kernel_index,
                                d_queued_kernel,
                                d_shader_sort_key,
                                key_offsets,
                                (ccl_gpu_shared int *)threadgroup_array,
                                metal_local_id,
                                metal_local_size,
                                metal_grid_id);
#endif
}

/* oneAPI version needs the local_mem accessor in the arguments. */
#ifdef __KERNEL_ONEAPI__
    ccl_gpu_kernel_signature(integrator_sort_write_pass,
                             const int num_states,
                             const int partition_size,
                             const int num_states_limit,
                             ccl_global int *indices,
                             const int kernel_index,
                             sycl::local_accessor<int> &local_mem)
#else
    ccl_gpu_kernel_signature(integrator_sort_write_pass,
                             const int num_states,
                             const int partition_size,
                             const int num_states_limit,
                             ccl_global int *indices,
                             const int kernel_index)
#endif

{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
  ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
                                           kernel_integrator_state.path.queued_kernel;
  ccl_global uint *d_shader_sort_key = (ccl_global uint *)
                                           kernel_integrator_state.path.shader_sort_key;
  ccl_global int *key_offsets = (ccl_global int *)
                                    kernel_integrator_state.sort_partition_key_offsets;

# ifdef __KERNEL_METAL__
  int max_shaders = context.launch_params_metal.data.max_shaders;
# endif

# ifdef __KERNEL_ONEAPI__
  /* The Metal backend doesn't have these particular ccl_gpu_* defines, and the shared kernel
   * code uses the metal_* names, so define equivalents here to stay compatible with it. */
  int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
  int metal_local_id = ccl_gpu_thread_idx_x;
  int metal_local_size = ccl_gpu_block_dim_x;
  int metal_grid_id = ccl_gpu_block_idx_x;
  /* There is no difference here between different access decorations, as we are requesting
   * a raw pointer immediately, so the simplest decoration option is used (no decoration). */
  ccl_gpu_shared int *threadgroup_array =
      local_mem.get_multi_ptr<sycl::access::decorated::no>().get();
# endif

  gpu_parallel_sort_write_pass(num_states,
                               partition_size,
                               max_shaders,
                               kernel_index,
                               num_states_limit,
                               indices,
                               d_queued_kernel,
                               d_shader_sort_key,
                               key_offsets,
                               (ccl_gpu_shared int *)threadgroup_array,
                               metal_local_id,
                               metal_local_size,
                               metal_grid_id);
#endif
}

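/* Path-state compaction: gather the indices of states that are still active
 * but stored beyond num_active_paths, so they can be moved into lower slots
 * freed by terminated paths. */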
    ccl_gpu_kernel_signature(integrator_compact_paths_array,
                             const int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             const int num_active_paths)
{
  ccl_gpu_kernel_lambda((state >= num_active_paths) &&
                            (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
                        int num_active_paths);
  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}

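/* Move path state data from source slots (listed at active_states_offset in
 * active_terminated_states) into destination slots freed by terminated paths
 * (listed at terminated_states_offset). */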
    ccl_gpu_kernel_signature(integrator_compact_states,
                             const ccl_global int *active_terminated_states,
                             const int active_states_offset,
                             const int terminated_states_offset,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int from_state = active_terminated_states[active_states_offset + global_index];
    const int to_state = active_terminated_states[terminated_states_offset + global_index];

    ccl_gpu_kernel_call(integrator_state_move(nullptr, to_state, from_state));
  }
}

    ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
                             const int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             const int num_active_paths)
{
  ccl_gpu_kernel_lambda((state >= num_active_paths) &&
                            (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
                        int num_active_paths);
  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}

    ccl_gpu_kernel_signature(integrator_compact_shadow_states,
                             const ccl_global int *active_terminated_states,
                             const int active_states_offset,
                             const int terminated_states_offset,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int from_state = active_terminated_states[active_states_offset + global_index];
    const int to_state = active_terminated_states[terminated_states_offset + global_index];

    ccl_gpu_kernel_call(integrator_shadow_state_move(nullptr, to_state, from_state));
  }
}

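/* Parallel prefix sum over the per-key counters, producing the offsets used
 * when writing the sorted path-index array. */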
    ccl_gpu_kernel_signature(
        prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, const int num_values)
{
  gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values);
}

/* --------------------------------------------------------------------
 * Adaptive sampling.
 */

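/* Check each pixel in the window against the adaptive sampling noise
 * threshold and count the still-active pixels with a warp ballot plus
 * popcount reduction. */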
                             const int sx,
                             const int sy,
                             const int sw,
                             const int sh,
                             const float threshold,
                             const int reset,
                             const int offset,
                             const int stride,
                             ccl_global uint *num_active_pixels)
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / sw;
  const int x = work_index - y * sw;

  bool converged = true;

  if (x < sw && y < sh) {
    converged = ccl_gpu_kernel_call(film_adaptive_sampling_convergence_check(
        nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
  }

  /* NOTE: All threads specified in the mask must execute the intrinsic. */
  const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
  const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
  if (lane_id == 0) {
    atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
  }
}

                             const int sx,
                             const int sy,
                             const int sw,
                             const int sh,
                             const int offset,
                             const int stride)
{
  const int y = ccl_gpu_global_id_x();

  if (y < sh) {
    ccl_gpu_kernel_call(
        film_adaptive_sampling_filter_x(nullptr, render_buffer, sy + y, sx, sw, offset, stride));
  }
}

                             const int sx,
                             const int sy,
                             const int sw,
                             const int sh,
                             const int offset,
                             const int stride)
{
  const int x = ccl_gpu_global_id_x();

  if (x < sw) {
    ccl_gpu_kernel_call(
        film_adaptive_sampling_filter_y(nullptr, render_buffer, sx + x, sy, sh, offset, stride));
  }
}

/* --------------------------------------------------------------------
 * Cryptomatte.
 */

                             const int num_pixels)
{
  const int pixel_index = ccl_gpu_global_id_x();

  if (pixel_index < num_pixels) {
    ccl_gpu_kernel_call(film_cryptomatte_post(nullptr, render_buffer, pixel_index));
  }
}

/* --------------------------------------------------------------------
 * Film.
 */

ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba,
                                                          const int rgba_offset,
                                                          const int rgba_stride,
                                                          const int x,
                                                          const int y,
                                                          const half4 half_pixel)
{
  /* Work around HIP issue with half float display, see #92972. */
#ifdef __KERNEL_HIP__
  ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4;
  out[0] = half_pixel.x;
  out[1] = half_pixel.y;
  out[2] = half_pixel.z;
  out[3] = half_pixel.w;
#else
  ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
  *out = half_pixel;
#endif
}

#ifdef __KERNEL_METAL__

/* Fetch into a local variable on Metal - there is minimal overhead. Templating the
 * film_get_pass_pixel_... functions works on MSL, but not on other compilers. */
# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
    float local_pixel[4]; \
    film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \
    if (input_channel_count >= 1) { \
      pixel[0] = local_pixel[0]; \
    } \
    if (input_channel_count >= 2) { \
      pixel[1] = local_pixel[1]; \
    } \
    if (input_channel_count >= 3) { \
      pixel[2] = local_pixel[2]; \
    } \
    if (input_channel_count >= 4) { \
      pixel[3] = local_pixel[3]; \
    }

#else

# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
    film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel);

#endif

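/* For each pass variant this macro emits two kernels: film_convert_<variant>,
 * which writes float pixels, and film_convert_<variant>_half_rgba, which
 * converts the pixel to half4 for display. */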
#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
      ccl_gpu_kernel_signature(film_convert_##variant, \
                               const KernelFilmConvert kfilm_convert, \
                               ccl_global float *pixels, \
                               ccl_global float *render_buffer, \
                               int num_pixels, \
                               int width, \
                               int offset, \
                               int stride, \
                               int channel_offset, \
                               int rgba_offset, \
                               int rgba_stride) \
  { \
    const int render_pixel_index = ccl_gpu_global_id_x(); \
    if (render_pixel_index >= num_pixels) { \
      return; \
    } \
\
    const int x = render_pixel_index % width; \
    const int y = render_pixel_index / width; \
\
    const uint64_t buffer_pixel_index = x + y * stride; \
    ccl_global const float *buffer = render_buffer + offset + \
                                     buffer_pixel_index * kfilm_convert.pass_stride; \
\
    ccl_global float *pixel = pixels + channel_offset + \
                              (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
\
    FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \
  } \
  ccl_gpu_kernel_postfix \
\
  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
      ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
                               const KernelFilmConvert kfilm_convert, \
                               ccl_global uchar4 *rgba, \
                               ccl_global float *render_buffer, \
                               int num_pixels, \
                               int width, \
                               int offset, \
                               int stride, \
                               int rgba_offset, \
                               int rgba_stride) \
  { \
    const int render_pixel_index = ccl_gpu_global_id_x(); \
    if (render_pixel_index >= num_pixels) { \
      return; \
    } \
\
    const int x = render_pixel_index % width; \
    const int y = render_pixel_index / width; \
\
    const uint64_t buffer_pixel_index = x + y * stride; \
    ccl_global const float *buffer = render_buffer + offset + \
                                     buffer_pixel_index * kfilm_convert.pass_stride; \
\
    float pixel[4]; \
    film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
\
    if (input_channel_count == 1) { \
      pixel[1] = pixel[2] = pixel[0]; \
    } \
    if (input_channel_count <= 3) { \
      pixel[3] = 1.0f; \
    } \
\
    film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
\
    const half4 half_pixel = float4_to_half4_display( \
        make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
    kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \
  } \
  ccl_gpu_kernel_postfix

/* 1 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)

/* 3 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(light_path, 3)

/* 4 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4)
KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4)
KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)

#undef KERNEL_FILM_CONVERT_VARIANT

/* --------------------------------------------------------------------
 * Shader evaluation.
 */

/* Displacement */

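/* Evaluate the displacement shader for a batch of input points, one thread
 * per element, writing results to output starting at offset. */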
                             ccl_global float *output,
                             const int offset,
                             const int work_size)
{
  int i = ccl_gpu_global_id_x();
  if (i < work_size) {
  }
}

/* Background */

                             ccl_global float *output,
                             const int offset,
                             const int work_size)
{
  int i = ccl_gpu_global_id_x();
  if (i < work_size) {
  }
}

/* Curve Shadow Transparency */

                             ccl_global float *output,
                             const int offset,
                             const int work_size)
{
  int i = ccl_gpu_global_id_x();
  if (i < work_size) {
  }
}

/* --------------------------------------------------------------------
 * Denoising.
 */

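/* Clamp the noisy color pass into a sane range before it is fed to the
 * denoiser. */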
    ccl_gpu_kernel_signature(filter_color_preprocess,
                             const int full_x,
                             const int full_y,
                             const int width,
                             const int height,
                             const int offset,
                             const int stride,
                             const int pass_stride,
                             const int pass_denoised)
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
  ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;

  ccl_global float *color_out = buffer + pass_denoised;
  color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
  color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
  color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
}

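/* Pack the denoising guiding passes (albedo, normal, motion flow) into the
 * guiding buffer, scaling the accumulated sums down to per-sample averages. */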
    ccl_gpu_kernel_signature(filter_guiding_preprocess,
                             ccl_global float *guiding_buffer,
                             const int guiding_pass_stride,
                             const int guiding_pass_albedo,
                             const int guiding_pass_normal,
                             const int guiding_pass_flow,
                             const ccl_global float *render_buffer,
                             const int render_offset,
                             const int render_stride,
                             const int render_pass_stride,
                             const int render_pass_sample_count,
                             const int render_pass_denoising_albedo,
                             const int render_pass_denoising_normal,
                             const int render_pass_motion,
                             const int full_x,
                             const int full_y,
                             const int width,
                             const int height,
                             const int num_samples)
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t guiding_pixel_index = x + y * width;
  ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;

  const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
  const ccl_global float *buffer = render_buffer + render_pixel_index * render_pass_stride;

  float pixel_scale;
  if (render_pass_sample_count == PASS_UNUSED) {
    pixel_scale = 1.0f / num_samples;
  }
  else {
    pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]);
  }

  /* Albedo pass. */
  if (guiding_pass_albedo != PASS_UNUSED) {
    kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);

    const ccl_global float *albedo_in = buffer + render_pass_denoising_albedo;
    ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;

    albedo_out[0] = albedo_in[0] * pixel_scale;
    albedo_out[1] = albedo_in[1] * pixel_scale;
    albedo_out[2] = albedo_in[2] * pixel_scale;
  }

  /* Normal pass. */
  if (guiding_pass_normal != PASS_UNUSED) {
    kernel_assert(render_pass_denoising_normal != PASS_UNUSED);

    const ccl_global float *normal_in = buffer + render_pass_denoising_normal;
    ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;

    normal_out[0] = normal_in[0] * pixel_scale;
    normal_out[1] = normal_in[1] * pixel_scale;
    normal_out[2] = normal_in[2] * pixel_scale;
  }

  /* Flow pass. */
  if (guiding_pass_flow != PASS_UNUSED) {
    kernel_assert(render_pass_motion != PASS_UNUSED);

    const ccl_global float *motion_in = buffer + render_pass_motion;
    ccl_global float *flow_out = guiding_pixel + guiding_pass_flow;

    flow_out[0] = -motion_in[0] * pixel_scale;
    flow_out[1] = -motion_in[1] * pixel_scale;
  }
}

    ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo,
                             ccl_global float *guiding_buffer,
                             const int guiding_pass_stride,
                             const int guiding_pass_albedo,
                             const int width,
                             const int height)
{
  kernel_assert(guiding_pass_albedo != PASS_UNUSED);

  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t guiding_pixel_index = x + y * width;
  ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;

  ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;

  albedo_out[0] = 0.5f;
  albedo_out[1] = 0.5f;
  albedo_out[2] = 0.5f;
}

    ccl_gpu_kernel_signature(filter_color_postprocess,
                             const int full_x,
                             const int full_y,
                             const int width,
                             const int height,
                             const int offset,
                             const int stride,
                             const int pass_stride,
                             const int num_samples,
                             const int pass_noisy,
                             const int pass_denoised,
                             const int pass_sample_count,
                             const int num_components,
                             const int use_compositing)
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
  ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;

  float pixel_scale;
  if (pass_sample_count == PASS_UNUSED) {
    pixel_scale = num_samples;
  }
  else {
    pixel_scale = __float_as_uint(buffer[pass_sample_count]);
  }

  ccl_global float *denoised_pixel = buffer + pass_denoised;

  denoised_pixel[0] *= pixel_scale;
  denoised_pixel[1] *= pixel_scale;
  denoised_pixel[2] *= pixel_scale;

  if (num_components == 3) {
    /* Pass without alpha channel. */
  }
  else if (!use_compositing) {
    /* Currently compositing passes are either 3-component (derived by dividing light passes)
     * or do not have transparency (shadow catcher). Rely on this implicitly, as it simplifies
     * the logic and avoids extra memory allocation. */
    const ccl_global float *noisy_pixel = buffer + pass_noisy;
    denoised_pixel[3] = noisy_pixel[3];
  }
  else {
    /* Assigning to zero since this is the default alpha value for 3-component passes, and it
     * is an opaque pixel for 4-component passes. */
    denoised_pixel[3] = 0;
  }
}

/* --------------------------------------------------------------------
 * Shadow catcher.
 */

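/* Count how many path states could still split off a shadow-catcher path,
 * reducing the per-warp results with ballot and popcount. */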
    ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits,
                             const int num_states,
                             ccl_global uint *num_possible_splits)
{
  const int state = ccl_gpu_global_id_x();

  bool can_split = false;

  if (state < num_states) {
    can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state));
  }

  /* NOTE: All threads specified in the mask must execute the intrinsic. */
  const auto can_split_mask = ccl_gpu_ballot(can_split);
  const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
  if (lane_id == 0) {
    atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
  }
}