6#ifndef __UTIL_SIMD_TYPES_H__
7#define __UTIL_SIMD_TYPES_H__
20#if defined(FREE_WINDOWS64)
22#elif defined(_MSC_VER) && !defined(__KERNEL_NEON__)
24#elif (defined(__x86_64__) || defined(__i386__))
25# include <x86intrin.h>
26#elif defined(__KERNEL_NEON__)
27# define SSE2NEON_PRECISE_MINMAX 1
32#if defined(__x86_64__) || defined(_M_X64)
33# define SIMD_SET_FLUSH_TO_ZERO \
34 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
35 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
36#elif defined(__aarch64__) || defined(_M_ARM64)
39# if defined(_MM_SET_FLUSH_ZERO_MODE)
40# define SIMD_SET_FLUSH_TO_ZERO \
41 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
42 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
43# elif !defined(_M_ARM64)
44# define _MM_FLUSH_ZERO_ON 24
45# define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r"(__fpcr))
46# define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : : "ri"(__fpcr))
47# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
48# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
50# define _MM_FLUSH_ZERO_ON 24
51# define __get_fpcr(__fpcr) _ReadStatusReg(__fpcr)
52# define __set_fpcr(__fpcr) _WriteStatusReg(0x5A20, __fpcr)
53# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
54# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
57# define SIMD_SET_FLUSH_TO_ZERO
65extern const __m128 _mm_lookupmask_ps[16];
74static struct FalseTy {
103static struct NegInfTy {
106 return -std::numeric_limits<float>::infinity();
110 return std::numeric_limits<int>::min();
114static struct PosInfTy {
117 return std::numeric_limits<float>::infinity();
121 return std::numeric_limits<int>::max();
125static struct StepTy {
129#if (defined(__aarch64__) || defined(_M_ARM64)) && !defined(_MM_SET_FLUSH_ZERO_MODE)
133 __get_fpcr(old_fpcr);
134 new_fpcr = old_fpcr | (1ULL <<
flag);
135 __set_fpcr(new_fpcr);
136 __get_fpcr(old_fpcr);
137 return old_fpcr == new_fpcr;
142 __get_fpcr(cur_fpcr);
143 return (cur_fpcr & (1ULL <<
flag)) > 0 ? 1 : 0;
148#if defined(__KERNEL_NEON__)
149template<
class type,
int i0,
int i1,
int i2,
int i3> type shuffle_neon(
const type &a)
151 if (i0 == i1 && i0 == i2 && i0 == i3) {
152 return type(vdupq_laneq_s32(int32x4_t(a), i0));
154 static const uint8_t tbl[16] = {(i0 * 4) + 0,
171 return type(vqtbl1q_s8(int8x16_t(a), *(uint8x16_t *)tbl));
174template<
class type,
int i0,
int i1,
int i2,
int i3>
175type shuffle_neon(
const type &a,
const type &
b)
178 static const uint8_t tbl[16] = {(i0 * 4) + 0,
195 return type(vqtbl1q_s8(int8x16_t(
b), *(uint8x16_t *)tbl));
199 static const uint8_t tbl[16] = {(i0 * 4) + 0,
218 int8x16x2_t t = {int8x16_t(a), int8x16_t(
b)};
219 uint8x16_t idx = *(uint8x16_t *)tbl;
220 return type(vqtbl2q_s8(t, idx));
229#if defined(__BMI__) && defined(__GNUC__)
231# define _tzcnt_u32 __tzcnt_u32
234# define _tzcnt_u64 __tzcnt_u64
238#if defined(__LZCNT__)
239# define _lzcnt_u32 __lzcnt32
240# define _lzcnt_u64 __lzcnt64
243#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
247# if defined(__KERNEL_AVX2__)
248 return _tzcnt_u32(
v);
251 _BitScanForward(&r,
v);
259 _BitScanReverse(&r,
v);
266 _bittestandcomplement(&r, i);
272# if defined(__KERNEL_AVX2__)
273 return _tzcnt_u32(
v);
279# if defined(__KERNEL_64_BIT__)
283# if defined(__KERNEL_AVX2__)
284 return _tzcnt_u64(
v);
287 _BitScanForward64(&r,
v);
295 _BitScanReverse64(&r,
v);
302 _bittestandcomplement64((__int64 *)&r, i);
308# if defined(__KERNEL_AVX2__)
309# if defined(__KERNEL_64_BIT__)
310 return _tzcnt_u64(
v);
312 return _tzcnt_u32(
v);
321#elif (defined(__x86_64__) || defined(__i386__)) && defined(__KERNEL_SSE2__)
327 asm(
"bsf %1,%0" :
"=r"(r) :
"r"(
v));
334 asm(
"bsr %1,%0" :
"=r"(r) :
"r"(
v));
341 asm(
"btc %1,%0" :
"=r"(r) :
"r"(i),
"0"(
v) :
"flags");
345# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
346 !(defined(__ILP32__) && defined(__x86_64__))
350 asm(
"bsf %1,%0" :
"=r"(r) :
"r"(
v));
358 asm(
"bsr %1,%0" :
"=r"(r) :
"r"(
v));
365 asm(
"btc %1,%0" :
"=r"(r) :
"r"(i),
"0"(
v) :
"flags");
371# if defined(__KERNEL_AVX2__)
372 return _tzcnt_u32(
v);
378# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
379 !(defined(__ILP32__) && defined(__x86_64__))
382# if defined(__KERNEL_AVX2__)
383# if defined(__KERNEL_64_BIT__)
384 return _tzcnt_u64(
v);
386 return _tzcnt_u32(
v);
398 for (
int i = 0; i < 32; i++) {
408 for (
int i = 0; i < 32; i++) {
409 if (x & (1U << (31 - i))) {
424 for (
int i = 0; i < 64; i++) {
425 if (x & (1UL << i)) {
434 for (
int i = 0; i < 64; i++) {
435 if (x & (1UL << (63 - i))) {
452 while ((value & (1 << bit)) == 0) {
462 while ((value & (1 << bit)) == 0) {
472#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
473# undef _mm256_cvtss_f32
474# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
478#if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
479 defined(__KERNEL_SSE42__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
ATTR_WARN_UNUSED_RESULT const BMVert * v
local_group_size(16, 16) .push_constant(Type b
#define ccl_attr_maybe_unused
#define CCL_NAMESPACE_END
draw_view in_light_buf[] float
draw_view push_constant(Type::INT, "radiance_src") .push_constant(Type capture_info_buf storage_buf(1, Qualifier::READ, "ObjectBounds", "bounds_buf[]") .push_constant(Type draw_view int
T step(const T &edge, const T &value)
CCL_NAMESPACE_BEGIN __forceinline uint32_t __bsf(const uint32_t x)
__forceinline uint32_t __bsr(const uint32_t x)
__forceinline uint32_t __btc(const uint32_t x, const uint32_t bit)
__forceinline uint32_t bitscan(uint32_t value)
unsigned __int64 uint64_t