Blender V4.3
device/metal/compat.h
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#pragma once
6
7#define __KERNEL_GPU__
8#define __KERNEL_METAL__
9#define CCL_NAMESPACE_BEGIN
10#define CCL_NAMESPACE_END
11
12#ifndef ATTR_FALLTHROUGH
13# define ATTR_FALLTHROUGH
14#endif
15
16#include <metal_atomic>
17#include <metal_pack>
18#include <metal_stdlib>
19#include <simd/simd.h>
20
21using namespace metal;
22
23#ifdef __METALRT__
24using namespace metal::raytracing;
25#endif
26
27#pragma clang diagnostic ignored "-Wunused-variable"
28#pragma clang diagnostic ignored "-Wsign-compare"
29#pragma clang diagnostic ignored "-Wuninitialized"
30#pragma clang diagnostic ignored "-Wc++17-extensions"
31#pragma clang diagnostic ignored "-Wmacro-redefined"
32
33/* Qualifiers */
34
35#define ccl_device
36#define ccl_device_inline ccl_device __attribute__((always_inline))
37#define ccl_device_forceinline ccl_device __attribute__((always_inline))
38#if defined(__KERNEL_METAL_APPLE__)
39# define ccl_device_noinline ccl_device
40#else
41# define ccl_device_noinline ccl_device __attribute__((noinline))
42#endif
43
44#define ccl_device_extern extern "C"
45#define ccl_device_noinline_cpu ccl_device
46#define ccl_device_inline_method ccl_device
47#define ccl_global device
48#define ccl_inline_constant static constant constexpr
49#define ccl_device_constant constant
50#define ccl_static_constexpr static constant constexpr
51#define ccl_constant constant
52#define ccl_gpu_shared threadgroup
53#define ccl_private thread
54#ifdef __METALRT__
55# define ccl_ray_data ray_data
56#else
57# define ccl_ray_data ccl_private
58#endif
59#define ccl_may_alias
60#define ccl_restrict __restrict
61#define ccl_loop_no_unroll
62#define ccl_align(n) alignas(n)
63#define ccl_optional_struct_init
64
65/* No assert supported for Metal */
66
67#define kernel_assert(cond)
68
69#define offsetof(t, d) __builtin_offsetof(t, d)
70
71#define ccl_gpu_global_id_x() metal_global_id
72#define ccl_gpu_warp_size simdgroup_size
73#define ccl_gpu_thread_idx_x simd_group_index
74#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1)
75
76#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
77#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup);
78
79// clang-format off
80
81/* kernel.h adapters */
82
83#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
84#define ccl_gpu_kernel_threads(block_num_threads)
85
86/* Convert a comma-separated list into a semicolon-separated list
87 * (so that we can generate a struct based on kernel entry-point parameters). */
88#define FN0()
89#define FN1(p1) p1;
90#define FN2(p1, p2) p1; p2;
91#define FN3(p1, p2, p3) p1; p2; p3;
92#define FN4(p1, p2, p3, p4) p1; p2; p3; p4;
93#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5;
94#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6;
95#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7;
96#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8;
97#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9;
98#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10;
99#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11;
100#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12;
101#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13;
102#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14;
103#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15;
104#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16;
105#define FN17(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17;
106#define FN18(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18;
107#define FN19(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19;
108#define FN20(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19; p20;
109#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, ...) p20
110#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN20, FN19, FN18, FN17, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)
111
112/* Generate a struct containing the entry-point parameters and a "run"
113 * method which can access them implicitly via this-> */
114
115#ifdef __METAL_GLOBAL_BUILTINS__
116
117#define ccl_gpu_kernel_signature(name, ...) \
118struct kernel_gpu_##name \
119{ \
120 PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
121 void run(thread MetalKernelContext& context, \
122 threadgroup atomic_int *threadgroup_array) ccl_global const; \
123}; \
124kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
125 constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
126 constant MetalAncillaries *_metal_ancillaries, \
127 threadgroup atomic_int *threadgroup_array[[ threadgroup(0) ]]) { \
128 MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
129 params_struct->run(context, threadgroup_array); \
130} \
131void kernel_gpu_##name::run(thread MetalKernelContext& context, \
132 threadgroup atomic_int *threadgroup_array) ccl_global const
133
134#else
135
136/* On macOS versions before 14.x, builtin constants (e.g. metal_global_id) must
137 * be accessed through attributed entry-point parameters. */
138
139#define ccl_gpu_kernel_signature(name, ...) \
140struct kernel_gpu_##name \
141{ \
142 PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
143 void run(thread MetalKernelContext& context, \
144 threadgroup atomic_int *threadgroup_array, \
145 const uint metal_global_id, \
146 const ushort metal_local_id, \
147 const ushort metal_local_size, \
148 const uint metal_grid_id, \
149 uint simdgroup_size, \
150 uint simd_lane_index, \
151 uint simd_group_index, \
152 uint num_simd_groups) ccl_global const; \
153}; \
154kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
155 constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
156 constant MetalAncillaries *_metal_ancillaries, \
157 threadgroup atomic_int *threadgroup_array[[ threadgroup(0) ]], \
158 const uint metal_global_id [[thread_position_in_grid]], \
159 const ushort metal_local_id [[thread_position_in_threadgroup]], \
160 const ushort metal_local_size [[threads_per_threadgroup]], \
161 const uint metal_grid_id [[threadgroup_position_in_grid]], \
162 uint simdgroup_size [[threads_per_simdgroup]], \
163 uint simd_lane_index [[thread_index_in_simdgroup]], \
164 uint simd_group_index [[simdgroup_index_in_threadgroup]], \
165 uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
166 MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
167 params_struct->run(context, threadgroup_array, metal_global_id, metal_local_id, metal_local_size, metal_grid_id, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
168} \
169void kernel_gpu_##name::run(thread MetalKernelContext& context, \
170 threadgroup atomic_int *threadgroup_array, \
171 const uint metal_global_id, \
172 const ushort metal_local_id, \
173 const ushort metal_local_size, \
174 const uint metal_grid_id, \
175 uint simdgroup_size, \
176 uint simd_lane_index, \
177 uint simd_group_index, \
178 uint num_simd_groups) ccl_global const
179
180#endif /* __METAL_GLOBAL_BUILTINS__ */
181
182#define ccl_gpu_kernel_postfix
183#define ccl_gpu_kernel_call(x) context.x
184#define ccl_gpu_kernel_within_bounds(i,n) true
185
186/* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state. */
187#define ccl_gpu_kernel_lambda(func, ...) \
188 struct KernelLambda \
189 { \
190 KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \
191 ccl_private MetalKernelContext &context; \
192 __VA_ARGS__; \
193 int operator()(const int state) const { return (func); } \
194 } ccl_gpu_kernel_lambda_pass(context)
195
196// clang-format on
197
198/* volumetric lambda functions - use function objects for lambda-like functionality */
199#define VOLUME_READ_LAMBDA(function_call) \
200 struct FnObjectRead { \
201 KernelGlobals kg; \
202 ccl_private MetalKernelContext *context; \
203 int state; \
204\
205 VolumeStack operator()(const int i) const \
206 { \
207 return context->function_call; \
208 } \
209 } volume_read_lambda_pass{kg, this, state};
210
211#define VOLUME_WRITE_LAMBDA(function_call) \
212 struct FnObjectWrite { \
213 KernelGlobals kg; \
214 ccl_private MetalKernelContext *context; \
215 int state; \
216\
217 void operator()(const int i, VolumeStack entry) const \
218 { \
219 context->function_call; \
220 } \
221 } volume_write_lambda_pass{kg, this, state};
222
223/* make_type definitions with Metal style element initializers */
224ccl_device_forceinline float2 make_float2(const float x, const float y)
225{
226 return float2(x, y);
227}
228
229ccl_device_forceinline float3 make_float3(const float x, const float y, const float z)
230{
231 return float3(x, y, z);
232}
233
235 const float y,
236 const float z,
237 const float w)
238{
239 return float4(x, y, z, w);
240}
241
242ccl_device_forceinline int2 make_int2(const int x, const int y)
243{
244 return int2(x, y);
245}
246
247ccl_device_forceinline int3 make_int3(const int x, const int y, const int z)
248{
249 return int3(x, y, z);
250}
251
252ccl_device_forceinline int4 make_int4(const int x, const int y, const int z, const int w)
253{
254 return int4(x, y, z, w);
255}
256
258{
259 return uint2(x, y);
260}
261
263{
264 return uint3(x, y, z);
265}
266
267ccl_device_forceinline uint4 make_uint4(const uint x, const uint y, const uint z, const uint w)
268{
269 return uint4(x, y, z, w);
270}
271
273 const uchar y,
274 const uchar z,
275 const uchar w)
276{
277 return uchar4(x, y, z, w);
278}
279
280/* Math functions */
281
282#define __uint_as_float(x) as_type<float>(x)
283#define __float_as_uint(x) as_type<uint>(x)
284#define __int_as_float(x) as_type<float>(x)
285#define __float_as_int(x) as_type<int>(x)
286#define __float2half(x) half(x)
287#define powf(x, y) pow(float(x), float(y))
288#define fabsf(x) fabs(float(x))
289#define copysignf(x, y) copysign(float(x), float(y))
290#define asinf(x) asin(float(x))
291#define acosf(x) acos(float(x))
292#define atanf(x) atan(float(x))
293#define floorf(x) floor(float(x))
294#define ceilf(x) ceil(float(x))
295#define hypotf(x, y) hypot(float(x), float(y))
296#define atan2f(x, y) atan2(float(x), float(y))
297#define fmaxf(x, y) fmax(float(x), float(y))
298#define fminf(x, y) fmin(float(x), float(y))
299#define fmodf(x, y) fmod(float(x), float(y))
300#define sinhf(x) sinh(float(x))
301#define coshf(x) cosh(float(x))
302#define tanhf(x) tanh(float(x))
303#define saturatef(x) saturate(float(x))
304
305/* Use native functions with possibly lower precision for performance,
306 * no issues found so far. */
307#define trigmode fast
308#define sinf(x) trigmode::sin(float(x))
309#define cosf(x) trigmode::cos(float(x))
310#define tanf(x) trigmode::tan(float(x))
311#define expf(x) trigmode::exp(float(x))
312#define sqrtf(x) trigmode::sqrt(float(x))
313#define logf(x) trigmode::log(float(x))
314
315#define NULL 0
316
317#define __device__
318
319#ifdef __METALRT__
320
321# if defined(__METALRT_MOTION__)
322# define METALRT_TAGS instancing, instance_motion, primitive_motion
323# define METALRT_BLAS_TAGS , primitive_motion
324# else
325# define METALRT_TAGS instancing
326# define METALRT_BLAS_TAGS
327# endif /* __METALRT_MOTION__ */
328
329typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
330typedef intersection_function_table<triangle_data, curve_data, METALRT_TAGS, extended_limits>
331 metalrt_ift_type;
332typedef metal::raytracing::intersector<triangle_data, curve_data, METALRT_TAGS, extended_limits>
333 metalrt_intersector_type;
334# if defined(__METALRT_MOTION__)
335typedef acceleration_structure<primitive_motion> metalrt_blas_as_type;
336typedef intersection_function_table<triangle_data, curve_data, primitive_motion, extended_limits>
337 metalrt_blas_ift_type;
338typedef metal::raytracing::
339 intersector<triangle_data, curve_data, primitive_motion, extended_limits>
340 metalrt_blas_intersector_type;
341# else
342typedef acceleration_structure<> metalrt_blas_as_type;
343typedef intersection_function_table<triangle_data, curve_data, extended_limits>
344 metalrt_blas_ift_type;
345typedef metal::raytracing::intersector<triangle_data, curve_data, extended_limits>
346 metalrt_blas_intersector_type;
347# endif
348
349#endif /* __METALRT__ */
350
351/* texture bindings and sampler setup */
352
354 device float *buf;
355};
356
358 texture2d<float, access::sample> tex;
359};
361 texture3d<float, access::sample> tex;
362};
363
364#ifdef __METALRT__
365struct MetalRTBlasWrapper {
366 metalrt_blas_as_type blas;
367};
368#endif
369
374
375#ifdef __METALRT__
376 metalrt_as_type accel_struct;
377 metalrt_ift_type ift_default;
378 metalrt_ift_type ift_shadow;
379 metalrt_ift_type ift_shadow_all;
380 metalrt_ift_type ift_volume;
381 metalrt_blas_ift_type ift_local;
382 metalrt_ift_type ift_local_mblur;
383 metalrt_blas_ift_type ift_local_single_hit;
384 metalrt_ift_type ift_local_single_hit_mblur;
385 constant MetalRTBlasWrapper *blas_accel_structs;
386#endif
387};
388
389#include "util/half.h"
390#include "util/types.h"
391
405
407 sampler(address::repeat, filter::nearest),
408 sampler(address::clamp_to_edge, filter::nearest),
409 sampler(address::clamp_to_zero, filter::nearest),
410 sampler(address::mirrored_repeat, filter::nearest),
411 sampler(address::repeat, filter::linear),
412 sampler(address::clamp_to_edge, filter::linear),
413 sampler(address::clamp_to_zero, filter::linear),
414 sampler(address::mirrored_repeat, filter::linear),
415};
416
417#ifdef __METAL_GLOBAL_BUILTINS__
418const uint metal_global_id [[thread_position_in_grid]];
419const ushort metal_local_id [[thread_position_in_threadgroup]];
420const ushort metal_local_size [[threads_per_threadgroup]];
421const uint metal_grid_id [[threadgroup_position_in_grid]];
422const uint simdgroup_size [[threads_per_simdgroup]];
423const uint simd_lane_index [[thread_index_in_simdgroup]];
424const uint simd_group_index [[simdgroup_index_in_threadgroup]];
425const uint num_simd_groups [[simdgroups_per_threadgroup]];
426#endif /* __METAL_GLOBAL_BUILTINS__ */
unsigned char uchar
unsigned short ushort
unsigned int uint
SIMD_FORCE_INLINE const btScalar & z() const
Return the z value.
Definition btQuadWord.h:117
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
Definition btQuadWord.h:119
local_group_size(16, 16) .push_constant(Type local_group_size(16, 16) .push_constant(Type input_tx sampler(1, ImageType::FLOAT_2D, "matte_tx") .image(0
ccl_device_forceinline float4 make_float4(const float x, const float y, const float z, const float w)
ccl_device_forceinline float3 make_float3(const float x, const float y, const float z)
ccl_device_forceinline uchar4 make_uchar4(const uchar x, const uchar y, const uchar z, const uchar w)
#define ccl_device_forceinline
ccl_device_forceinline uint3 make_uint3(const uint x, const uint y, const uint z)
constant constexpr array< sampler, SamplerCount > metal_samplers
ccl_device_forceinline float2 make_float2(const float x, const float y)
ccl_device_forceinline uint4 make_uint4(const uint x, const uint y, const uint z, const uint w)
ccl_device_forceinline int3 make_int3(const int x, const int y, const int z)
ccl_device_forceinline uint2 make_uint2(const uint x, const uint y)
ccl_device_forceinline int2 make_int2(const int x, const int y)
@ SamplerFilterLinear_AddressMirroredRepeat
@ SamplerFilterNearest_AddressRepeat
@ SamplerFilterLinear_AddressClampEdge
@ SamplerFilterNearest_AddressMirroredRepeat
@ SamplerFilterNearest_AddressClampZero
@ SamplerFilterLinear_AddressClampZero
@ SamplerFilterNearest_AddressClampEdge
@ SamplerFilterLinear_AddressRepeat
@ SamplerCount
ccl_device_forceinline int4 make_int4(const int x, const int y, const int z, const int w)
device Buffer1DParamsMetal * buffers
device Texture3DParamsMetal * textures_3d
device Texture2DParamsMetal * textures_2d
texture2d< float, access::sample > tex
texture3d< float, access::sample > tex