16#ifdef __KERNEL_METAL__
18#elif defined(__KERNEL_ONEAPI__)
46#ifdef __KERNEL_METAL__
48#elif defined(__KERNEL_ONEAPI__)
137#if !defined(__HIPRT__)
141# ifdef __KERNEL_ONEAPI__
218# ifdef __KERNEL_ONEAPI__
284#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
288#if !defined(__HIPRT__)
292# ifdef __KERNEL_ONEAPI__
307# if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
311 kg += __dummy_constant;
335# ifdef __KERNEL_ONEAPI__
380 ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
395 ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
423 num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
437 num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
444 int num_states_limit,
455 ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
465 ccl_gpu_kernel_lambda_pass);
470#ifdef __KERNEL_ONEAPI__
475 int num_states_limit,
478 sycl::local_accessor<int> &local_mem)
484 int num_states_limit,
489#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
497# ifdef __KERNEL_METAL__
498 int max_shaders = context.launch_params_metal.data.max_shaders;
501# ifdef __KERNEL_ONEAPI__
504 int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
511 local_mem.get_multi_ptr<sycl::access::decorated::no>().
get();
530#ifdef __KERNEL_ONEAPI__
535 int num_states_limit,
538 sycl::local_accessor<int> &local_mem)
544 int num_states_limit,
550#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
558# ifdef __KERNEL_METAL__
559 int max_shaders = context.launch_params_metal.data.max_shaders;
562# ifdef __KERNEL_ONEAPI__
565 int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
572 local_mem.get_multi_ptr<sycl::access::decorated::no>().
get();
597 int num_active_paths)
601 int num_active_paths);
602 ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
610 ccl_global const int *active_terminated_states,
611 const int active_states_offset,
612 const int terminated_states_offset,
618 const int from_state = active_terminated_states[active_states_offset + global_index];
619 const int to_state = active_terminated_states[terminated_states_offset + global_index];
631 int num_active_paths)
635 int num_active_paths);
636 ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
644 ccl_global const int *active_terminated_states,
645 const int active_states_offset,
646 const int terminated_states_offset,
652 const int from_state = active_terminated_states[active_states_offset + global_index];
653 const int to_state = active_terminated_states[terminated_states_offset + global_index];
685 const int y = work_index / sw;
686 const int x = work_index - y * sw;
688 bool converged =
true;
690 if (x < sw && y < sh) {
753 if (pixel_index < num_pixels) {
764 const int rgba_offset,
765 const int rgba_stride,
768 const half4 half_pixel)
773 out[0] = half_pixel.
x;
774 out[1] = half_pixel.
y;
775 out[2] = half_pixel.
z;
776 out[3] = half_pixel.
w;
783#ifdef __KERNEL_METAL__
787# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
788 float local_pixel[4]; \
789 film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \
790 if (input_channel_count >= 1) { \
791 pixel[0] = local_pixel[0]; \
793 if (input_channel_count >= 2) { \
794 pixel[1] = local_pixel[1]; \
796 if (input_channel_count >= 3) { \
797 pixel[2] = local_pixel[2]; \
799 if (input_channel_count >= 4) { \
800 pixel[3] = local_pixel[3]; \
805# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
806 film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel);
810#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
811 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
812 ccl_gpu_kernel_signature(film_convert_##variant, \
813 const KernelFilmConvert kfilm_convert, \
814 ccl_global float *pixels, \
815 ccl_global float *render_buffer, \
820 int channel_offset, \
824 const int render_pixel_index = ccl_gpu_global_id_x(); \
825 if (render_pixel_index >= num_pixels) { \
829 const int x = render_pixel_index % width; \
830 const int y = render_pixel_index / width; \
832 const uint64_t buffer_pixel_index = x + y * stride; \
833 ccl_global const float *buffer = render_buffer + offset + \
834 buffer_pixel_index * kfilm_convert.pass_stride; \
836 ccl_global float *pixel = pixels + channel_offset + \
837 (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
839 FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \
841 ccl_gpu_kernel_postfix \
843 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
844 ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
845 const KernelFilmConvert kfilm_convert, \
846 ccl_global uchar4 *rgba, \
847 ccl_global float *render_buffer, \
855 const int render_pixel_index = ccl_gpu_global_id_x(); \
856 if (render_pixel_index >= num_pixels) { \
860 const int x = render_pixel_index % width; \
861 const int y = render_pixel_index / width; \
863 const uint64_t buffer_pixel_index = x + y * stride; \
864 ccl_global const float *buffer = render_buffer + offset + \
865 buffer_pixel_index * kfilm_convert.pass_stride; \
868 film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
870 if (input_channel_count == 1) { \
871 pixel[1] = pixel[2] = pixel[0]; \
873 if (input_channel_count <= 3) { \
877 film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
879 const half4 half_pixel = float4_to_half4_display( \
880 make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
881 kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \
883 ccl_gpu_kernel_postfix
903#undef KERNEL_FILM_CONVERT_VARIANT
975 const int y = work_index / width;
976 const int x = work_index - y * width;
978 if (x >= width || y >= height) {
982 const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
985 ccl_global float *color_out = buffer + pass_denoised;
986 color_out[0] =
clamp(color_out[0], 0.0f, 10000.0f);
987 color_out[1] =
clamp(color_out[1], 0.0f, 10000.0f);
988 color_out[2] =
clamp(color_out[2], 0.0f, 10000.0f);
995 int guiding_pass_stride,
996 int guiding_pass_albedo,
997 int guiding_pass_normal,
998 int guiding_pass_flow,
1002 int render_pass_stride,
1003 int render_pass_sample_count,
1004 int render_pass_denoising_albedo,
1005 int render_pass_denoising_normal,
1006 int render_pass_motion,
1014 const int y = work_index / width;
1015 const int x = work_index - y * width;
1017 if (x >= width || y >= height) {
1021 const uint64_t guiding_pixel_index = x + y * width;
1022 ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
1024 const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
1029 pixel_scale = 1.0f / num_samples;
1032 pixel_scale = 1.0f /
__float_as_uint(buffer[render_pass_sample_count]);
1039 ccl_global const float *aledo_in = buffer + render_pass_denoising_albedo;
1040 ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
1042 albedo_out[0] = aledo_in[0] * pixel_scale;
1043 albedo_out[1] = aledo_in[1] * pixel_scale;
1044 albedo_out[2] = aledo_in[2] * pixel_scale;
1051 ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
1052 ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;
1054 normal_out[0] = normal_in[0] * pixel_scale;
1055 normal_out[1] = normal_in[1] * pixel_scale;
1056 normal_out[2] = normal_in[2] * pixel_scale;
1063 ccl_global const float *motion_in = buffer + render_pass_motion;
1064 ccl_global float *flow_out = guiding_pixel + guiding_pass_flow;
1066 flow_out[0] = -motion_in[0] * pixel_scale;
1067 flow_out[1] = -motion_in[1] * pixel_scale;
1075 int guiding_pass_stride,
1076 int guiding_pass_albedo,
1083 const int y = work_index / width;
1084 const int x = work_index - y * width;
1086 if (x >= width || y >= height) {
1090 const uint64_t guiding_pixel_index = x + y * width;
1091 ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
1093 ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
1095 albedo_out[0] = 0.5f;
1096 albedo_out[1] = 0.5f;
1097 albedo_out[2] = 0.5f;
1114 int pass_sample_count,
1116 int use_compositing)
1119 const int y = work_index / width;
1120 const int x = work_index - y * width;
1122 if (x >= width || y >= height) {
1126 const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
1131 pixel_scale = num_samples;
1137 ccl_global float *denoised_pixel = buffer + pass_denoised;
1139 denoised_pixel[0] *= pixel_scale;
1140 denoised_pixel[1] *= pixel_scale;
1141 denoised_pixel[2] *= pixel_scale;
1143 if (num_components == 3) {
1146 else if (!use_compositing) {
1150 ccl_global const float *noisy_pixel = buffer + pass_noisy;
1151 denoised_pixel[3] = noisy_pixel[3];
1156 denoised_pixel[3] = 0;
1172 bool can_split =
false;
ATOMIC_INLINE uint32_t atomic_fetch_and_add_uint32(uint32_t *p, uint32_t x)
void reset()
clear internal cached data and reset random seed
ccl_device_inline void film_cryptomatte_post(KernelGlobals kg, ccl_global float *render_buffer, int pixel_index)
#define kernel_assert(cond)
const KernelGlobalsCPU *ccl_restrict KernelGlobals
#define ccl_gpu_block_dim_x
#define ccl_gpu_thread_idx_x
#define ccl_gpu_global_id_x()
#define ccl_gpu_warp_size
#define ccl_device_inline
#define ccl_gpu_ballot(predicate)
#define ccl_gpu_block_idx_x
#define kernel_integrator_state
draw_view push_constant(Type::INT, "radiance_src") .push_constant(Type capture_info_buf storage_buf(1, Qualifier::READ, "ObjectBounds", "bounds_buf[]") .push_constant(Type draw_view int
ccl_device bool integrator_init_from_bake(KernelGlobals kg, IntegratorState state, ccl_global const KernelWorkTile *ccl_restrict tile, ccl_global float *render_buffer, const int x, const int y, const int scheduled_sample)
ccl_device bool integrator_init_from_camera(KernelGlobals kg, IntegratorState state, ccl_global const KernelWorkTile *ccl_restrict tile, ccl_global float *render_buffer, const int x, const int y, const int scheduled_sample)
ccl_device void integrator_intersect_closest(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
CCL_NAMESPACE_BEGIN ccl_device void integrator_intersect_dedicated_light(KernelGlobals kg, IntegratorState state)
ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorShadowState state)
CCL_NAMESPACE_BEGIN ccl_device void integrator_intersect_subsurface(KernelGlobals kg, IntegratorState state)
ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorState state)
CCL_NAMESPACE_BEGIN ccl_device void kernel_displace_evaluate(KernelGlobals kg, ccl_global const KernelShaderEvalInput *input, ccl_global float *output, const int offset)
ccl_device void kernel_curve_shadow_transparency_evaluate(KernelGlobals kg, ccl_global const KernelShaderEvalInput *input, ccl_global float *output, const int offset)
ccl_device void kernel_background_evaluate(KernelGlobals kg, ccl_global const KernelShaderEvalInput *input, ccl_global float *output, const int offset)
#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
#define GPU_KERNEL_MAX_REGISTERS
#define GPU_KERNEL_BLOCK_NUM_THREADS
#define ccl_gpu_kernel_within_bounds(i, n)
#define ccl_gpu_kernel_call(x)
#define ccl_gpu_kernel_postfix
#define ccl_gpu_kernel_threads(block_num_threads)
#define ccl_gpu_kernel_lambda(func,...)
#define ccl_gpu_kernel_signature(name,...)
const int tile_work_index
ccl_gpu_kernel_postfix ccl_global KernelWorkTile const int num_tiles
ccl_gpu_kernel_postfix ccl_global const int * path_index_array
#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count)
ccl_gpu_kernel_postfix ccl_global KernelWorkTile * tiles
ccl_gpu_kernel_postfix ccl_global const int ccl_global float const int work_size
ccl_gpu_kernel_postfix ccl_global KernelWorkTile const int ccl_global float const int max_tile_work_size
ccl_global const KernelWorkTile * tile
ccl_gpu_kernel_postfix ccl_global KernelWorkTile const int ccl_global float * render_buffer
ccl_device bool film_adaptive_sampling_convergence_check(KernelGlobals kg, ccl_global float *render_buffer, int x, int y, float threshold, int reset, int offset, int stride)
ccl_device void film_adaptive_sampling_filter_x(KernelGlobals kg, ccl_global float *render_buffer, int y, int start_x, int width, int offset, int stride)
ccl_device void film_adaptive_sampling_filter_y(KernelGlobals kg, ccl_global float *render_buffer, int x, int start_y, int height, int offset, int stride)
void KERNEL_FUNCTION_FULL_NAME adaptive_sampling_filter_x(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int y, int start_x, int width, int offset, int stride)
void KERNEL_FUNCTION_FULL_NAME cryptomatte_postprocess(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int pixel_index)
void KERNEL_FUNCTION_FULL_NAME shader_eval_background(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, float *output, const int offset)
void KERNEL_FUNCTION_FULL_NAME shader_eval_displace(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, float *output, const int offset)
bool KERNEL_FUNCTION_FULL_NAME adaptive_sampling_convergence_check(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int y, float threshold, int reset, int offset, int stride)
void KERNEL_FUNCTION_FULL_NAME adaptive_sampling_filter_y(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int start_y, int height, int offset, int stride)
void KERNEL_FUNCTION_FULL_NAME shader_eval_curve_shadow_transparency(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, float *output, const int offset)
std::shared_ptr< const T > get(const GenericKey &key, FunctionRef< std::unique_ptr< T >()> compute_fn)
#define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op)
#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE
__device__ void gpu_parallel_prefix_sum(const int global_id, ccl_global int *counter, ccl_global int *prefix_sum, const int num_values)
#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY
#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE
#define GPU_PARALLEL_SORT_BLOCK_SIZE
__device__ void gpu_parallel_sorted_index_array(const uint state_index, const uint num_states, const int num_states_limit, ccl_global int *indices, ccl_global int *num_indices, ccl_global int *key_counter, ccl_global int *key_prefix_sum, GetKeyOp get_key_op)
ccl_device void integrator_shade_background(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
CCL_NAMESPACE_BEGIN ccl_device void integrator_shade_dedicated_light(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
ccl_device void integrator_shade_light(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
ccl_device void integrator_shade_shadow(KernelGlobals kg, IntegratorShadowState state, ccl_global float *ccl_restrict render_buffer)
ccl_device_forceinline void integrator_shade_surface(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
ccl_device_forceinline void integrator_shade_surface_mnee(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
ccl_device_forceinline void integrator_shade_surface_raytrace(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
CCL_NAMESPACE_BEGIN ccl_device void integrator_shade_volume(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
ccl_device_inline bool kernel_shadow_catcher_path_can_split(KernelGlobals kg, ConstIntegratorState state)
#define INTEGRATOR_STATE_WRITE(state, nested_struct, member)
#define INTEGRATOR_STATE(state, nested_struct, member)
unsigned __int64 uint64_t
ccl_device_inline uint popcount(uint x)
ccl_device_inline int clamp(int a, int mn, int mx)
CCL_NAMESPACE_BEGIN ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile, uint global_work_index, ccl_private uint *x, ccl_private uint *y, ccl_private uint *sample)