16#ifdef __KERNEL_METAL__
18#elif defined(__KERNEL_ONEAPI__)
47#ifdef __KERNEL_METAL__
49#elif defined(__KERNEL_ONEAPI__)
138#if !defined(__HIPRT__)
142# ifdef __KERNEL_ONEAPI__
219# ifdef __KERNEL_ONEAPI__
285#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
289#if !defined(__HIPRT__)
293# ifdef __KERNEL_ONEAPI__
308# if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
312 kg += __dummy_constant;
336# ifdef __KERNEL_ONEAPI__
392 const int kernel_index)
396 ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
407 const int kernel_index)
411 ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
434 const int indices_offset)
448 const int indices_offset)
460 const int num_states_limit,
465 const int kernel_index)
471 ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
481 ccl_gpu_kernel_lambda_pass);
486#ifdef __KERNEL_ONEAPI__
490 const int partition_size,
491 const int num_states_limit,
493 const int kernel_index,
494 sycl::local_accessor<int> &local_mem)
499 const int partition_size,
500 const int num_states_limit,
502 const int kernel_index)
505#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
513# ifdef __KERNEL_METAL__
514 int max_shaders =
context.launch_params_metal.data.max_shaders;
517# ifdef __KERNEL_ONEAPI__
520 int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
527 local_mem.get_multi_ptr<sycl::access::decorated::no>().
get();
546#ifdef __KERNEL_ONEAPI__
550 const int partition_size,
551 const int num_states_limit,
553 const int kernel_index,
554 sycl::local_accessor<int> &local_mem)
559 const int partition_size,
560 const int num_states_limit,
562 const int kernel_index)
566#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
574# ifdef __KERNEL_METAL__
575 int max_shaders =
context.launch_params_metal.data.max_shaders;
578# ifdef __KERNEL_ONEAPI__
581 int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
588 local_mem.get_multi_ptr<sycl::access::decorated::no>().
get();
613 const int num_active_paths)
617 int num_active_paths);
618 ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
626 const ccl_global int *active_terminated_states,
627 const int active_states_offset,
628 const int terminated_states_offset,
634 const int from_state = active_terminated_states[active_states_offset + global_index];
635 const int to_state = active_terminated_states[terminated_states_offset + global_index];
647 const int num_active_paths)
651 int num_active_paths);
652 ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
660 const ccl_global int *active_terminated_states,
661 const int active_states_offset,
662 const int terminated_states_offset,
668 const int from_state = active_terminated_states[active_states_offset + global_index];
669 const int to_state = active_terminated_states[terminated_states_offset + global_index];
694 const float threshold,
701 const int y = work_index / sw;
702 const int x = work_index -
y * sw;
704 bool converged =
true;
706 if (
x < sw &&
y < sh) {
765 const int num_pixels)
769 if (pixel_index < num_pixels) {
780 const int rgba_offset,
781 const int rgba_stride,
784 const half4 half_pixel)
789 out[0] = half_pixel.
x;
790 out[1] = half_pixel.
y;
791 out[2] = half_pixel.
z;
792 out[3] = half_pixel.
w;
799#ifdef __KERNEL_METAL__
803# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
804 float local_pixel[4]; \
805 film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \
806 if (input_channel_count >= 1) { \
807 pixel[0] = local_pixel[0]; \
809 if (input_channel_count >= 2) { \
810 pixel[1] = local_pixel[1]; \
812 if (input_channel_count >= 3) { \
813 pixel[2] = local_pixel[2]; \
815 if (input_channel_count >= 4) { \
816 pixel[3] = local_pixel[3]; \
821# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
822 film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel);
826#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
827 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
828 ccl_gpu_kernel_signature(film_convert_##variant, \
829 const KernelFilmConvert kfilm_convert, \
830 ccl_global float *pixels, \
831 ccl_global float *render_buffer, \
836 int channel_offset, \
840 const int render_pixel_index = ccl_gpu_global_id_x(); \
841 if (render_pixel_index >= num_pixels) { \
845 const int x = render_pixel_index % width; \
846 const int y = render_pixel_index / width; \
848 const uint64_t buffer_pixel_index = x + y * stride; \
849 ccl_global const float *buffer = render_buffer + offset + \
850 buffer_pixel_index * kfilm_convert.pass_stride; \
852 ccl_global float *pixel = pixels + channel_offset + \
853 (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
855 FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \
857 ccl_gpu_kernel_postfix \
859 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
860 ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
861 const KernelFilmConvert kfilm_convert, \
862 ccl_global uchar4 *rgba, \
863 ccl_global float *render_buffer, \
871 const int render_pixel_index = ccl_gpu_global_id_x(); \
872 if (render_pixel_index >= num_pixels) { \
876 const int x = render_pixel_index % width; \
877 const int y = render_pixel_index / width; \
879 const uint64_t buffer_pixel_index = x + y * stride; \
880 ccl_global const float *buffer = render_buffer + offset + \
881 buffer_pixel_index * kfilm_convert.pass_stride; \
884 film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
886 if (input_channel_count == 1) { \
887 pixel[1] = pixel[2] = pixel[0]; \
889 if (input_channel_count <= 3) { \
893 film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
895 const half4 half_pixel = float4_to_half4_display( \
896 make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
897 kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \
899 ccl_gpu_kernel_postfix
921#undef KERNEL_FILM_CONVERT_VARIANT
1005 const int pass_stride,
1006 const int pass_denoised)
1009 const int y = work_index / width;
1010 const int x = work_index -
y * width;
1012 if (
x >= width ||
y >= height) {
1016 const uint64_t render_pixel_index = offset + (
x + full_x) + (
y + full_y) * stride;
1019 ccl_global float *color_out = buffer + pass_denoised;
1020 color_out[0] =
clamp(color_out[0], 0.0f, 10000.0f);
1021 color_out[1] =
clamp(color_out[1], 0.0f, 10000.0f);
1022 color_out[2] =
clamp(color_out[2], 0.0f, 10000.0f);
1029 const int guiding_pass_stride,
1030 const int guiding_pass_albedo,
1031 const int guiding_pass_normal,
1032 const int guiding_pass_flow,
1034 const int render_offset,
1035 const int render_stride,
1036 const int render_pass_stride,
1037 const int render_pass_sample_count,
1038 const int render_pass_denoising_albedo,
1039 const int render_pass_denoising_normal,
1040 const int render_pass_motion,
1045 const int num_samples)
1048 const int y = work_index / width;
1049 const int x = work_index -
y * width;
1051 if (
x >= width ||
y >= height) {
1055 const uint64_t guiding_pixel_index =
x +
y * width;
1056 ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
1058 const uint64_t render_pixel_index = render_offset + (
x + full_x) + (
y + full_y) * render_stride;
1063 pixel_scale = 1.0f / num_samples;
1066 pixel_scale = 1.0f /
__float_as_uint(buffer[render_pass_sample_count]);
1073 const ccl_global float *albedo_in = buffer + render_pass_denoising_albedo;
1074 ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
1076 albedo_out[0] = albedo_in[0] * pixel_scale;
1077 albedo_out[1] = albedo_in[1] * pixel_scale;
1078 albedo_out[2] = albedo_in[2] * pixel_scale;
1085 const ccl_global float *normal_in = buffer + render_pass_denoising_normal;
1086 ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;
1088 normal_out[0] = normal_in[0] * pixel_scale;
1089 normal_out[1] = normal_in[1] * pixel_scale;
1090 normal_out[2] = normal_in[2] * pixel_scale;
1097 const ccl_global float *motion_in = buffer + render_pass_motion;
1098 ccl_global float *flow_out = guiding_pixel + guiding_pass_flow;
1100 flow_out[0] = -motion_in[0] * pixel_scale;
1101 flow_out[1] = -motion_in[1] * pixel_scale;
1109 const int guiding_pass_stride,
1110 const int guiding_pass_albedo,
1117 const int y = work_index / width;
1118 const int x = work_index -
y * width;
1120 if (
x >= width ||
y >= height) {
1124 const uint64_t guiding_pixel_index =
x +
y * width;
1125 ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
1127 ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
1129 albedo_out[0] = 0.5f;
1130 albedo_out[1] = 0.5f;
1131 albedo_out[2] = 0.5f;
1144 const int pass_stride,
1145 const int num_samples,
1146 const int pass_noisy,
1147 const int pass_denoised,
1148 const int pass_sample_count,
1149 const int num_components,
1150 const int use_compositing)
1153 const int y = work_index / width;
1154 const int x = work_index -
y * width;
1156 if (
x >= width ||
y >= height) {
1160 const uint64_t render_pixel_index = offset + (
x + full_x) + (
y + full_y) * stride;
1165 pixel_scale = num_samples;
1171 ccl_global float *denoised_pixel = buffer + pass_denoised;
1173 denoised_pixel[0] *= pixel_scale;
1174 denoised_pixel[1] *= pixel_scale;
1175 denoised_pixel[2] *= pixel_scale;
1177 if (num_components == 3) {
1180 else if (!use_compositing) {
1184 const ccl_global float *noisy_pixel = buffer + pass_noisy;
1185 denoised_pixel[3] = noisy_pixel[3];
1190 denoised_pixel[3] = 0;
1204 const int pass_stride,
1205 const int pass_denoised)
1208 const int y = work_index / width;
1209 const int x = work_index -
y * width;
1211 if (
x >= width ||
y >= height / 2) {
1215 const uint64_t render_pixel_index = offset + (
x + full_x) + (
y + full_y) * stride;
1217 ccl_global float *buffer_flipped = buffer + (height - 1 -
y * 2) * stride * pass_stride;
1223 buffer[0] = buffer_flipped[0];
1224 buffer[1] = buffer_flipped[1];
1225 buffer[2] = buffer_flipped[2];
1226 buffer_flipped[0] = temp.x;
1227 buffer_flipped[1] = temp.y;
1228 buffer_flipped[2] = temp.z;
1243 bool can_split =
false;
1273 const int y = work_index / sw;
1274 const int x = work_index % sw;
1278 nullptr,
render_buffer, sy +
y, sx +
x, sx, sx + sw, offset, stride));
ATOMIC_INLINE uint32_t atomic_fetch_and_add_uint32(uint32_t *p, uint32_t x)
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY
#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE
#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE
#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE
#define GPU_PARALLEL_SORT_BLOCK_SIZE
unsigned long long int uint64_t
void reset()
clear internal cached data and reset random seed
ccl_device_inline void film_cryptomatte_post(KernelGlobals kg, ccl_global float *render_buffer, const int pixel_index)
#define kernel_assert(cond)
const ThreadKernelGlobalsCPU * KernelGlobals
#define ccl_device_inline
#define ccl_gpu_block_dim_x
#define ccl_gpu_thread_idx_x
#define ccl_gpu_global_id_x()
#define ccl_gpu_warp_size
#define ccl_gpu_ballot(predicate)
#define ccl_gpu_block_idx_x
#define kernel_integrator_state
constexpr T clamp(T, U, U) RET
ccl_device bool integrator_init_from_bake(KernelGlobals kg, IntegratorState state, const ccl_global KernelWorkTile *ccl_restrict tile, ccl_global float *render_buffer, const int x, const int y, const int scheduled_sample)
ccl_device bool integrator_init_from_camera(KernelGlobals kg, IntegratorState state, const ccl_global KernelWorkTile *ccl_restrict tile, ccl_global float *render_buffer, const int x, const int y, const int scheduled_sample)
ccl_device void integrator_intersect_closest(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
CCL_NAMESPACE_BEGIN ccl_device void integrator_intersect_dedicated_light(KernelGlobals kg, IntegratorState state)
ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorShadowState state)
CCL_NAMESPACE_BEGIN ccl_device void integrator_intersect_subsurface(KernelGlobals kg, IntegratorState state)
ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorState state)
ccl_device void kernel_volume_density_evaluate(KernelGlobals kg, ccl_global const KernelShaderEvalInput *input, ccl_global float *output, const int offset)
ccl_device void kernel_curve_shadow_transparency_evaluate(KernelGlobals kg, const ccl_global KernelShaderEvalInput *input, ccl_global float *output, const int offset)
CCL_NAMESPACE_BEGIN ccl_device void kernel_displace_evaluate(KernelGlobals kg, const ccl_global KernelShaderEvalInput *input, ccl_global float *output, const int offset)
ccl_device void kernel_background_evaluate(KernelGlobals kg, const ccl_global KernelShaderEvalInput *input, ccl_global float *output, const int offset)
#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
#define GPU_KERNEL_MAX_REGISTERS
#define GPU_KERNEL_BLOCK_NUM_THREADS
#define ccl_gpu_kernel_within_bounds(i, n)
#define ccl_gpu_kernel_call(x)
#define ccl_gpu_kernel_postfix
#define ccl_gpu_kernel_threads(block_num_threads)
#define ccl_gpu_kernel_lambda(func,...)
#define ccl_gpu_kernel_signature(name,...)
const int tile_work_index
ccl_gpu_kernel_postfix ccl_global KernelWorkTile const int num_tiles
#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count)
ccl_gpu_kernel_postfix ccl_global KernelWorkTile * tiles
const ccl_global KernelWorkTile * tile
ccl_gpu_kernel_postfix const ccl_global int ccl_global float const int work_size
ccl_gpu_kernel_postfix ccl_global KernelWorkTile const int ccl_global float const int max_tile_work_size
ccl_gpu_kernel_postfix const ccl_global int * path_index_array
ccl_gpu_kernel_postfix ccl_global KernelWorkTile const int ccl_global float * render_buffer
ccl_device void film_adaptive_sampling_filter_y(KernelGlobals kg, ccl_global float *render_buffer, const int x, const int start_y, const int height, const int offset, const int stride)
ccl_device void film_adaptive_sampling_filter_x(KernelGlobals kg, ccl_global float *render_buffer, const int y, const int start_x, const int width, const int offset, const int stride)
ccl_device bool film_adaptive_sampling_convergence_check(KernelGlobals kg, ccl_global float *render_buffer, const int x, const int y, const float threshold, const int reset, const int offset, const int stride)
void KERNEL_FUNCTION_FULL_NAME volume_guiding_filter_x(const ThreadKernelGlobalsCPU *kg, ccl_global float *render_buffer, const int y, const int center_x, const int min_x, const int max_x, const int offset, int stride)
void KERNEL_FUNCTION_FULL_NAME volume_guiding_filter_y(const ThreadKernelGlobalsCPU *kg, ccl_global float *render_buffer, const int x, const int center_y, const int height, const int offset, int stride)
void KERNEL_FUNCTION_FULL_NAME shader_eval_background(const ThreadKernelGlobalsCPU *kg, const KernelShaderEvalInput *input, float *output, const int offset)
bool KERNEL_FUNCTION_FULL_NAME adaptive_sampling_convergence_check(const ThreadKernelGlobalsCPU *kg, ccl_global float *render_buffer, const int x, const int y, const float threshold, const int reset, const int offset, int stride)
void KERNEL_FUNCTION_FULL_NAME cryptomatte_postprocess(const ThreadKernelGlobalsCPU *kg, ccl_global float *render_buffer, int pixel_index)
void KERNEL_FUNCTION_FULL_NAME shader_eval_displace(const ThreadKernelGlobalsCPU *kg, const KernelShaderEvalInput *input, float *output, const int offset)
void KERNEL_FUNCTION_FULL_NAME adaptive_sampling_filter_x(const ThreadKernelGlobalsCPU *kg, ccl_global float *render_buffer, const int y, const int start_x, const int width, const int offset, int stride)
void KERNEL_FUNCTION_FULL_NAME adaptive_sampling_filter_y(const ThreadKernelGlobalsCPU *kg, ccl_global float *render_buffer, const int x, const int start_y, const int height, const int offset, int stride)
void KERNEL_FUNCTION_FULL_NAME shader_eval_curve_shadow_transparency(const ThreadKernelGlobalsCPU *kg, const KernelShaderEvalInput *input, float *output, const int offset)
void KERNEL_FUNCTION_FULL_NAME shader_eval_volume_density(const ThreadKernelGlobalsCPU *kg, const KernelShaderEvalInput *input, float *output, const int offset)
ccl_device_inline uint popcount(const uint x)
int context(const bContext *C, const char *member, bContextDataResult *result)
std::shared_ptr< const T > get(const GenericKey &key, FunctionRef< std::unique_ptr< T >()> compute_fn)
#define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op)
CCL_NAMESPACE_BEGIN __device__ void gpu_parallel_prefix_sum(const int global_id, ccl_global int *counter, ccl_global int *prefix_sum, const int num_values)
CCL_NAMESPACE_BEGIN __device__ void gpu_parallel_sorted_index_array(const uint state_index, const uint num_states, const int num_states_limit, ccl_global int *indices, ccl_global int *num_indices, ccl_global int *key_counter, ccl_global int *key_prefix_sum, GetKeyOp get_key_op)
ccl_device void integrator_shade_background(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
CCL_NAMESPACE_BEGIN ccl_device void integrator_shade_dedicated_light(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
ccl_device void integrator_shade_light(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
ccl_device void integrator_shade_shadow(KernelGlobals kg, IntegratorShadowState state, ccl_global float *ccl_restrict render_buffer)
ccl_device_forceinline void integrator_shade_surface(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
ccl_device_forceinline void integrator_shade_surface_mnee(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
ccl_device_forceinline void integrator_shade_surface_raytrace(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
ccl_device void integrator_shade_volume_ray_marching(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
ccl_device void integrator_shade_volume(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
ccl_device_inline bool kernel_shadow_catcher_path_can_split(ConstIntegratorState state)
#define INTEGRATOR_STATE_WRITE(state, nested_struct, member)
#define INTEGRATOR_STATE(state, nested_struct, member)
CCL_NAMESPACE_BEGIN ccl_device_inline void get_work_pixel(const ccl_global KernelWorkTile *tile, const uint global_work_index, ccl_private uint *x, ccl_private uint *y, ccl_private uint *sample)