/* Platform/compiler dispatch for SIMD headers and flush-to-zero (FTZ/DAZ)
 * floating-point mode control.
 * NOTE(review): this chunk is an elided extraction -- several original lines
 * (#else/#endif branches, per-platform includes) are missing from view. */
#if defined(FREE_WINDOWS64)
#elif defined(_MSC_VER) && !defined(__KERNEL_NEON__)
#elif (defined(__x86_64__) || defined(__i386__))
/* GCC/Clang on x86: umbrella header pulls in all x86 intrinsics. */
# include <x86intrin.h>
#elif defined(__KERNEL_NEON__)
/* sse2neon option: make emulated min/max match SSE NaN semantics exactly. */
# define SSE2NEON_PRECISE_MINMAX 1

#if defined(__x86_64__) || defined(_M_X64)
/* x86-64: enable flush-to-zero and denormals-are-zero via MXCSR helpers. */
# define SIMD_SET_FLUSH_TO_ZERO \
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#elif defined(__aarch64__) || defined(_M_ARM64)
/* AArch64 with sse2neon available: reuse the MXCSR-style wrappers. */
# if defined(_MM_SET_FLUSH_ZERO_MODE)
#  define SIMD_SET_FLUSH_TO_ZERO \
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
# elif !defined(_M_ARM64)
/* GCC/Clang on AArch64 without sse2neon: poke the FPCR system register
 * directly via mrs/msr.  Bit 24 of FPCR is FZ (flush-to-zero). */
# define _MM_FLUSH_ZERO_ON 24
# define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r"(__fpcr))
# define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : : "ri"(__fpcr))
# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
/* MSVC on ARM64: system-register access goes through _ReadStatusReg /
 * _WriteStatusReg.  NOTE(review): the "#else" line for this branch is elided
 * from this extraction. */
# define _MM_FLUSH_ZERO_ON 24
# define __get_fpcr(__fpcr) _ReadStatusReg(__fpcr)
/* NOTE(review): __set_fpcr hard-codes register id 0x5A20 while __get_fpcr
 * forwards its argument as the register id -- asymmetric as written; the
 * argument/register-id mapping cannot be confirmed from this view. */
# define __set_fpcr(__fpcr) _WriteStatusReg(0x5A20, __fpcr)
# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
/* All remaining platforms: flush-to-zero control is a no-op. */
# define SIMD_SET_FLUSH_TO_ZERO
/* 16-entry table of 128-bit lane masks, indexed by a 4-bit mask value;
 * defined in a single translation unit elsewhere. */
extern const __m128 _mm_lookupmask_ps[16];

/* Constant "tag" types: each implicitly converts to the matching scalar
 * constant of the requested type (embree-style pseudo-constants).
 * NOTE(review): struct bodies and conversion-operator signatures are elided
 * in this extraction; only return statements remain visible. */
static struct FalseTy {

static struct NegInfTy {
  /* As float: negative infinity. */
  return -std::numeric_limits<float>::infinity();
  /* As int: smallest representable value. */
  return std::numeric_limits<int>::min();

static struct PosInfTy {
  /* As float: positive infinity. */
  return std::numeric_limits<float>::infinity();
  /* As int: largest representable value. */
  return std::numeric_limits<int>::max();

static struct StepTy {
/* AArch64 without sse2neon wrappers: helpers that set/query a bit of the
 * FPCR system register (used for flush-to-zero, bit 24).
 * NOTE(review): the function signatures are elided in this extraction; only
 * the bodies are visible (presumably set_fz(flag) / get_fz(flag)). */
#if (defined(__aarch64__) || defined(_M_ARM64)) && !defined(_MM_SET_FLUSH_ZERO_MODE)
  __get_fpcr(old_fpcr);
  /* Set the requested control bit on top of the current FPCR value. */
  new_fpcr = old_fpcr | (1ULL << flag);
  __set_fpcr(new_fpcr);
  /* Read back to check the write stuck (reserved bits may ignore writes). */
  __get_fpcr(old_fpcr);
  return old_fpcr == new_fpcr;

  /* Query: 1 if bit `flag` of FPCR is currently set, else 0. */
  __get_fpcr(cur_fpcr);
  return (cur_fpcr & (1ULL << flag)) > 0 ? 1 : 0;
#if defined(__KERNEL_NEON__)
/* shuffle_neon<type, i0, i1, i2, i3>(a): SSE-style compile-time 4-lane
 * shuffle implemented with NEON.  Lane j of the result takes lane ij of `a`.
 * NOTE(review): the 16-byte table initializers are truncated in this
 * extraction; only their first element is visible. */
template<class type, const int i0, const int i1, const int i2, const int i3>
type shuffle_neon(const type &a)
  /* All four indices equal: a plain lane-duplicate is the cheap path. */
  if (i0 == i1 && i0 == i2 && i0 == i3) {
    return type(vdupq_laneq_s32(int32x4_t(a), i0));
  /* General case: byte-level table lookup; lane i contributes source bytes
   * i*4 .. i*4+3 (initializer truncated in this view). */
  static const uint8_t tbl[16] = {(i0 * 4) + 0,
  return type(vqtbl1q_s8(int8x16_t(a), *(uint8x16_t *)tbl));

/* Two-operand variant, _mm_shuffle_ps-style: indices select lanes from the
 * pair (a, b).  NOTE(review): the branch structure and the condition that
 * separates the single-table and two-table cases are elided here. */
template<class type, const int i0, const int i1, const int i2, const int i3>
type shuffle_neon(const type &a, const type &b)
  /* Case using only `b`: single-table byte lookup. */
  static const uint8_t tbl[16] = {(i0 * 4) + 0,
  return type(vqtbl1q_s8(int8x16_t(b), *(uint8x16_t *)tbl));

  /* Mixed case: two-table lookup across the concatenation {a, b}. */
  static const uint8_t tbl[16] = {(i0 * 4) + 0,
  int8x16x2_t t = {int8x16_t(a), int8x16_t(b)};
  uint8x16_t idx = *(uint8x16_t *)tbl;
  return type(vqtbl2q_s8(t, idx));
/* GCC/Clang spell the BMI/LZCNT intrinsics with double underscores; alias
 * the MSVC-style names used by the bit-scan helpers below.
 * NOTE(review): intervening guard lines (likely #ifndef checks and #endif)
 * are elided in this extraction. */
#if defined(__BMI__) && defined(__GNUC__)
# define _tzcnt_u32 __tzcnt_u32
# define _tzcnt_u64 __tzcnt_u64

#if defined(__LZCNT__)
# define _lzcnt_u32 __lzcnt32
# define _lzcnt_u64 __lzcnt64
/* MSVC (not MinGW, not clang-cl) bit-scan helpers built on the _BitScan*
 * intrinsics.  NOTE(review): all function signatures (presumably
 * __bsf/__bsr/__btc and 64-bit variants, matching the asm versions later in
 * this file) plus declarations of `r` are elided; only bodies remain. */
#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
# if defined(__KERNEL_AVX2__)
  /* AVX2: hardware tzcnt (well-defined for 0, unlike bsf). */
  return _tzcnt_u32(v);
  /* Index of lowest set bit -> r. */
  _BitScanForward(&r, v);
  /* Index of highest set bit -> r. */
  _BitScanReverse(&r, v);
  /* Complement bit i of r. */
  _bittestandcomplement(&r, i);
# if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
# if defined(__KERNEL_64_BIT__)
# if defined(__KERNEL_AVX2__)
  return _tzcnt_u64(v);
  /* 64-bit variants of the three helpers above. */
  _BitScanForward64(&r, v);
  _BitScanReverse64(&r, v);
  _bittestandcomplement64((__int64 *)&r, i);
# if defined(__KERNEL_AVX2__)
# if defined(__KERNEL_64_BIT__)
  return _tzcnt_u64(v);
  return _tzcnt_u32(v);
/* GCC/Clang on x86: the same bit-scan helpers via inline asm.
 * NOTE(review): function signatures, declarations of `r`, and the
 * #else/#endif lines are elided in this extraction. */
#elif (defined(__x86_64__) || defined(__i386__)) && defined(__KERNEL_SSE2__)
  /* bsf: index of lowest set bit (result undefined for v == 0). */
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  /* bsr: index of highest set bit (result undefined for v == 0). */
  asm("bsr %1,%0" : "=r"(r) : "r"(v));
  /* btc: complement bit i of v; "0"(v) ties the input to output reg r. */
  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
/* 64-bit variants, only where a 64-bit GPR path exists (excludes x32). */
# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
    !(defined(__ILP32__) && defined(__x86_64__))
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  asm("bsr %1,%0" : "=r"(r) : "r"(v));
  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
/* AVX2 builds prefer tzcnt over bsf (zero-input safe). */
# if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
    !(defined(__ILP32__) && defined(__x86_64__))
# if defined(__KERNEL_AVX2__)
# if defined(__KERNEL_64_BIT__)
  return _tzcnt_u64(v);
  return _tzcnt_u32(v);
  /* Portable fallbacks: linear scans for lowest/highest set bit.
   * NOTE(review): function signatures, the loop bodies' return statements
   * and closing braces are elided in this extraction. */
  /* Lowest set bit of a 32-bit value, scanning up from bit 0. */
  for (int i = 0; i < 32; i++) {
  /* Highest set bit of a 32-bit value, scanning down from bit 31. */
  for (int i = 0; i < 32; i++) {
    if (x & (1U << (31 - i))) {
  /* Toggle one bit. */
  const uint32_t mask = 1U << bit;
  /* 64-bit variants of the two scans.
   * NOTE(review): 1UL is only 32 bits wide on LLP64 (Windows); presumably
   * this fallback is compiled on LP64 targets only -- verify. */
  for (int i = 0; i < 64; i++) {
    if (x & (1UL << i)) {
  for (int i = 0; i < 64; i++) {
    if (x & (1UL << (63 - i))) {
  /* bitscan via while-loop: walk up from bit 0 until a set bit is found.
   * Never terminates for value == 0 -- callers must pass non-zero input. */
  while ((value & (1 << bit)) == 0) {
  while ((value & (1 << bit)) == 0) {
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* Some toolchains lack or misdefine _mm256_cvtss_f32; (re)define it as the
 * SSE cast + scalar extract, which costs nothing register-wise. */
# undef _mm256_cvtss_f32
# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))

/* Guard for the SSE helper section that follows (any SSE2+ kernel flavor). */
#if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
    defined(__KERNEL_SSE42__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
ATTR_WARN_UNUSED_RESULT const BMVert * v
unsigned long long int uint64_t
#define ccl_attr_maybe_unused
#define CCL_NAMESPACE_END
#define assert(assertion)
VecBase< float, D > step(VecOp< float, D >, VecOp< float, D >) RET
ccl_device_inline float2 mask(const MaskType mask, const float2 a)
__forceinline uint32_t bitscan(const uint32_t value)
CCL_NAMESPACE_BEGIN __forceinline uint32_t __bsf(const uint32_t x)
__forceinline uint32_t __bsr(const uint32_t x)
__forceinline uint32_t __btc(const uint32_t x, const uint32_t bit)