simd.h
/* SPDX-FileCopyrightText: 2011-2013 Intel Corporation
 * SPDX-FileCopyrightText: 2014-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#pragma once

#include <cstdint>
#include <limits>

#include "util/defines.h"

/* SSE Intrinsics includes
 *
 * We assume the __KERNEL_SSEX__ flags have been defined at this point.
 *
 * MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
 * Since we can't avoid including <windows.h>, it is better to include only that. */
#if defined(FREE_WINDOWS64)
#  include "util/windows.h"
#elif defined(_MSC_VER) && !defined(__KERNEL_NEON__)
#  include <intrin.h>
#elif (defined(__x86_64__) || defined(__i386__))
#  include <x86intrin.h>
#elif defined(__KERNEL_NEON__)
#  define SSE2NEON_PRECISE_MINMAX 1
#  include <sse2neon.h>
#endif

/* Floating Point Control, for Embree. */
#if defined(__x86_64__) || defined(_M_X64)
#  define SIMD_SET_FLUSH_TO_ZERO \
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#elif defined(__aarch64__) || defined(_M_ARM64)
/* Getting/setting the denormals-to-zero mode was implemented in sse2neon v1.5.0.
 * Keep the compatibility code until the minimum library version is increased. */
#  if defined(_MM_SET_FLUSH_ZERO_MODE)
#    define SIMD_SET_FLUSH_TO_ZERO \
      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#  elif !defined(_M_ARM64)
#    define _MM_FLUSH_ZERO_ON 24
#    define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r"(__fpcr))
#    define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : : "ri"(__fpcr))
#    define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
#    define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
#  else
#    define _MM_FLUSH_ZERO_ON 24
/* MSVC ARM64: the FPCR is system register 0x5A20; _ReadStatusReg returns its
 * value, so the read macro must assign it to the given variable. */
#    define __get_fpcr(__fpcr) ((__fpcr) = _ReadStatusReg(0x5A20))
#    define __set_fpcr(__fpcr) _WriteStatusReg(0x5A20, __fpcr)
#    define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
#    define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
#  endif
#else
#  define SIMD_SET_FLUSH_TO_ZERO
#endif
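
/* Illustrative sketch (worker_thread_main is a hypothetical example, not part
 * of this header): flush-to-zero is meant to be enabled once per thread before
 * SIMD work starts:
 *
 *   static void worker_thread_main()
 *   {
 *     SIMD_SET_FLUSH_TO_ZERO
 *     // FTZ/DAZ now active: denormals are flushed to zero, avoiding the
 *     // heavy performance penalty of subnormal arithmetic.
 *   }
 */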

CCL_NAMESPACE_BEGIN

/* Data structures used by SSE classes. */
#ifdef __KERNEL_SSE2__

extern const __m128 _mm_lookupmask_ps[16];

static struct TrueTy {
  __forceinline operator bool() const
  {
    return true;
  }
} True ccl_attr_maybe_unused;

static struct FalseTy {
  __forceinline operator bool() const
  {
    return false;
  }
} False ccl_attr_maybe_unused;

static struct ZeroTy {
  __forceinline operator float() const
  {
    return 0;
  }
  __forceinline operator int() const
  {
    return 0;
  }
} zero ccl_attr_maybe_unused;

static struct OneTy {
  __forceinline operator float() const
  {
    return 1;
  }
  __forceinline operator int() const
  {
    return 1;
  }
} one ccl_attr_maybe_unused;

static struct NegInfTy {
  __forceinline operator float() const
  {
    return -std::numeric_limits<float>::infinity();
  }
  __forceinline operator int() const
  {
    return std::numeric_limits<int>::min();
  }
} neg_inf ccl_attr_maybe_unused;

static struct PosInfTy {
  __forceinline operator float() const
  {
    return std::numeric_limits<float>::infinity();
  }
  __forceinline operator int() const
  {
    return std::numeric_limits<int>::max();
  }
} pos_inf ccl_attr_maybe_unused;

static struct StepTy {
} step ccl_attr_maybe_unused;
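
/* Illustrative sketch (not part of the upstream header): these tag objects
 * convert implicitly to float or int, so they act as named constants:
 *
 *   const float lo = neg_inf;  // -inf via the float conversion
 *   const int hi = pos_inf;    // INT_MAX via the int conversion
 *   const float t = one;       // 1.0f
 */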
#endif /* __KERNEL_SSE2__ */

#if (defined(__aarch64__) || defined(_M_ARM64)) && !defined(_MM_SET_FLUSH_ZERO_MODE)
__forceinline int set_fz(const uint32_t flag)
{
  uint64_t old_fpcr;
  uint64_t new_fpcr;
  /* Read-modify-write the FPCR, then read back to check that the bit stuck. */
  __get_fpcr(old_fpcr);
  new_fpcr = old_fpcr | (1ULL << flag);
  __set_fpcr(new_fpcr);
  __get_fpcr(old_fpcr);
  return old_fpcr == new_fpcr;
}
__forceinline int get_fz(const uint32_t flag)
{
  uint64_t cur_fpcr;
  __get_fpcr(cur_fpcr);
  return (cur_fpcr & (1ULL << flag)) > 0 ? 1 : 0;
}
#endif

/* Utilities used by Neon. */
#if defined(__KERNEL_NEON__)
template<class type, const int i0, const int i1, const int i2, const int i3>
type shuffle_neon(const type &a)
{
  /* When all four lanes are the same, a single lane duplicate is cheaper than
   * a full table lookup. */
  if (i0 == i1 && i0 == i2 && i0 == i3) {
    return type(vdupq_laneq_s32(int32x4_t(a), i0));
  }
  /* Byte indices selecting 32-bit lanes i0..i3 of `a`. */
  static const uint8_t tbl[16] = {
      (i0 * 4) + 0, (i0 * 4) + 1, (i0 * 4) + 2, (i0 * 4) + 3,
      (i1 * 4) + 0, (i1 * 4) + 1, (i1 * 4) + 2, (i1 * 4) + 3,
      (i2 * 4) + 0, (i2 * 4) + 1, (i2 * 4) + 2, (i2 * 4) + 3,
      (i3 * 4) + 0, (i3 * 4) + 1, (i3 * 4) + 2, (i3 * 4) + 3};

  return type(vqtbl1q_s8(int8x16_t(a), *(uint8x16_t *)tbl));
}

template<class type, const int i0, const int i1, const int i2, const int i3>
type shuffle_neon(const type &a, const type &b)
{
  if (&a == &b) {
    /* Both operands alias: a one-register table lookup suffices. */
    static const uint8_t tbl[16] = {
        (i0 * 4) + 0, (i0 * 4) + 1, (i0 * 4) + 2, (i0 * 4) + 3,
        (i1 * 4) + 0, (i1 * 4) + 1, (i1 * 4) + 2, (i1 * 4) + 3,
        (i2 * 4) + 0, (i2 * 4) + 1, (i2 * 4) + 2, (i2 * 4) + 3,
        (i3 * 4) + 0, (i3 * 4) + 1, (i3 * 4) + 2, (i3 * 4) + 3};

    return type(vqtbl1q_s8(int8x16_t(b), *(uint8x16_t *)tbl));
  }
  else {
    /* i0/i1 select lanes of `a`, i2/i3 select lanes of `b` (offset by 16 bytes
     * into the two-register table). */
    static const uint8_t tbl[16] = {
        (i0 * 4) + 0, (i0 * 4) + 1, (i0 * 4) + 2, (i0 * 4) + 3,
        (i1 * 4) + 0, (i1 * 4) + 1, (i1 * 4) + 2, (i1 * 4) + 3,
        (i2 * 4) + 0 + 16, (i2 * 4) + 1 + 16, (i2 * 4) + 2 + 16, (i2 * 4) + 3 + 16,
        (i3 * 4) + 0 + 16, (i3 * 4) + 1 + 16, (i3 * 4) + 2 + 16, (i3 * 4) + 3 + 16};

    /* NOTE: This cannot all be put on a single line due to how MSVC ARM64
     * implements the function calls as several layers of macros. */
    int8x16x2_t t = {int8x16_t(a), int8x16_t(b)};
    uint8x16_t idx = *(uint8x16_t *)tbl;
    return type(vqtbl2q_s8(t, idx));
  }
}
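
/* Illustrative sketch (float4 is a hypothetical stand-in for any vector
 * wrapper convertible to/from the Neon register types): the template indices
 * pick 32-bit lanes, so reversing `v` and merging `a`/`b` look like:
 *
 *   float4 r = shuffle_neon<float4, 3, 2, 1, 0>(v);     // v.w v.z v.y v.x
 *   float4 m = shuffle_neon<float4, 0, 1, 0, 1>(a, b);  // a.x a.y b.x b.y
 */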
#endif /* __KERNEL_NEON__ */

/* Intrinsic Functions
 *
 * For fast bit operations. */

#if defined(__BMI__) && defined(__GNUC__)
#  ifndef _tzcnt_u32
#    define _tzcnt_u32 __tzcnt_u32
#  endif
#  ifndef _tzcnt_u64
#    define _tzcnt_u64 __tzcnt_u64
#  endif
#endif

#if defined(__LZCNT__)
#  define _lzcnt_u32 __lzcnt32
#  define _lzcnt_u64 __lzcnt64
#endif

#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
/* Intrinsic functions on Windows. */
__forceinline uint32_t __bsf(const uint32_t v)
{
#  if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#  else
  unsigned long r = 0;
  _BitScanForward(&r, v);
  return r;
#  endif
}

__forceinline uint32_t __bsr(const uint32_t v)
{
  unsigned long r = 0;
  _BitScanReverse(&r, v);
  return r;
}

__forceinline uint32_t __btc(const uint32_t v, const uint32_t i)
{
  long r = v;
  _bittestandcomplement(&r, i);
  return r;
}

__forceinline uint32_t bitscan(const uint32_t v)
{
#  if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#  else
  return __bsf(v);
#  endif
}

#  if defined(__KERNEL_64_BIT__)

__forceinline uint64_t __bsf(const uint64_t v)
{
#    if defined(__KERNEL_AVX2__)
  return _tzcnt_u64(v);
#    else
  unsigned long r = 0;
  _BitScanForward64(&r, v);
  return r;
#    endif
}

__forceinline uint64_t __bsr(const uint64_t v)
{
  unsigned long r = 0;
  _BitScanReverse64(&r, v);
  return r;
}

__forceinline uint64_t __btc(const uint64_t v, const uint64_t i)
{
  uint64_t r = v;
  _bittestandcomplement64((__int64 *)&r, i);
  return r;
}

__forceinline size_t bitscan(const size_t v)
{
#    if defined(__KERNEL_AVX2__)
#      if defined(__KERNEL_64_BIT__)
  return _tzcnt_u64(v);
#      else
  return _tzcnt_u32(v);
#      endif
#    else
  return __bsf(v);
#    endif
}

#  endif /* __KERNEL_64_BIT__ */

#elif (defined(__x86_64__) || defined(__i386__)) && defined(__KERNEL_SSE2__)
/* Intrinsic functions with x86 SSE. */

__forceinline uint32_t __bsf(const uint32_t v)
{
  uint32_t r = 0;
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline uint32_t __bsr(const uint32_t v)
{
  uint32_t r = 0;
  asm("bsr %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline uint32_t __btc(const uint32_t v, const uint32_t i)
{
  uint32_t r = 0;
  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

#  if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
      !(defined(__ILP32__) && defined(__x86_64__))
__forceinline uint64_t __bsf(const uint64_t v)
{
  uint64_t r = 0;
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  return r;
}
#  endif

__forceinline uint64_t __bsr(const uint64_t v)
{
  uint64_t r = 0;
  asm("bsr %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline uint64_t __btc(const uint64_t v, const uint64_t i)
{
  uint64_t r = 0;
  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline uint32_t bitscan(const uint32_t v)
{
#  if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#  else
  return __bsf(v);
#  endif
}

#  if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
      !(defined(__ILP32__) && defined(__x86_64__))
__forceinline size_t bitscan(const size_t v)
{
#    if defined(__KERNEL_AVX2__)
#      if defined(__KERNEL_64_BIT__)
  return _tzcnt_u64(v);
#      else
  return _tzcnt_u32(v);
#      endif
#    else
  return __bsf(v);
#    endif
}
#  endif

#else
/* Intrinsic functions fallback for arbitrary processor. */
__forceinline uint32_t __bsf(const uint32_t x)
{
  for (int i = 0; i < 32; i++) {
    if (x & (1U << i)) {
      return i;
    }
  }
  return 32;
}

__forceinline uint32_t __bsr(const uint32_t x)
{
  for (int i = 0; i < 32; i++) {
    if (x & (1U << (31 - i))) {
      return (31 - i);
    }
  }
  return 32;
}

__forceinline uint32_t __btc(const uint32_t x, const uint32_t bit)
{
  const uint32_t mask = 1U << bit;
  return x & (~mask);
}

__forceinline uint64_t __bsf(const uint64_t x)
{
  for (int i = 0; i < 64; i++) {
    if (x & (1ULL << i)) {
      return i;
    }
  }
  return 64;
}

__forceinline uint64_t __bsr(const uint64_t x)
{
  for (int i = 0; i < 64; i++) {
    if (x & (1ULL << (63 - i))) {
      return (63 - i);
    }
  }
  return 64;
}

__forceinline uint64_t __btc(const uint64_t x, const uint32_t bit)
{
  const uint64_t mask = 1ULL << bit;
  return x & (~mask);
}

__forceinline uint32_t bitscan(const uint32_t value)
{
  assert(value != 0);
  uint32_t bit = 0;
  while ((value & (1U << bit)) == 0) {
    ++bit;
  }
  return bit;
}

__forceinline uint64_t bitscan(const uint64_t value)
{
  assert(value != 0);
  uint64_t bit = 0;
  while ((value & (1ULL << bit)) == 0) {
    ++bit;
  }
  return bit;
}

#endif /* Intrinsics */
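
/* Illustrative sketch (not part of the upstream header): bitscan() returns the
 * index of the lowest set bit, which gives the usual loop over set bits:
 *
 *   uint32_t mask = 0x2C;  // bits 2, 3 and 5
 *   while (mask != 0) {
 *     const uint32_t bit = bitscan(mask);  // yields 2, then 3, then 5
 *     mask &= mask - 1;                    // clear the lowest set bit
 *   }
 */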

/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
 * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
#  undef _mm256_cvtss_f32
#  define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
#endif
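
/* Illustrative sketch (not part of the upstream header): extracting lane 0 of
 * an 8-wide AVX register as a scalar:
 *
 *   const __m256 v = _mm256_set1_ps(3.5f);
 *   const float x = _mm256_cvtss_f32(v);  // 3.5f
 */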

/* Quiet unused define warnings. */
#if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
    defined(__KERNEL_SSE42__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* do nothing */
#endif

CCL_NAMESPACE_END