/* SPDX-FileCopyrightText: 2011-2013 Intel Corporation
 * SPDX-FileCopyrightText: 2014-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#ifndef __UTIL_SIMD_TYPES_H__
#define __UTIL_SIMD_TYPES_H__
#include <cassert>
#include <limits>
#include <stdint.h>

#include "util/defines.h"

/* SSE Intrinsics includes
 *
 * We assume __KERNEL_SSEX__ flags to have been defined at this point.
 *
 * MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
 * Since we can't avoid including <windows.h>, better only include that */
#if defined(FREE_WINDOWS64)
#  include "util/windows.h"
#elif defined(_MSC_VER) && !defined(__KERNEL_NEON__)
#  include <intrin.h>
#elif (defined(__x86_64__) || defined(__i386__))
#  include <x86intrin.h>
#elif defined(__KERNEL_NEON__)
#  define SSE2NEON_PRECISE_MINMAX 1
#  include <sse2neon.h>
#endif

31/* Floating Point Control, for Embree. */
32#if defined(__x86_64__) || defined(_M_X64)
33# define SIMD_SET_FLUSH_TO_ZERO \
34 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
35 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
36#elif defined(__aarch64__) || defined(_M_ARM64)
37/* The get/set denormals to zero was implemented in sse2neon v1.5.0.
38 * Keep the compatibility code until the minimum library version is increased. */
39# if defined(_MM_SET_FLUSH_ZERO_MODE)
40# define SIMD_SET_FLUSH_TO_ZERO \
41 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
42 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
43# elif !defined(_M_ARM64)
44# define _MM_FLUSH_ZERO_ON 24
45# define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r"(__fpcr))
46# define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : : "ri"(__fpcr))
47# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
48# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
49# else
50# define _MM_FLUSH_ZERO_ON 24
51# define __get_fpcr(__fpcr) _ReadStatusReg(__fpcr)
52# define __set_fpcr(__fpcr) _WriteStatusReg(0x5A20, __fpcr)
53# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
54# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
55# endif
56#else
57# define SIMD_SET_FLUSH_TO_ZERO
58#endif
59
61
62/* Data structures used by SSE classes. */
63#ifdef __KERNEL_SSE2__
64
65extern const __m128 _mm_lookupmask_ps[16];
66
67static struct TrueTy {
68 __forceinline operator bool() const
69 {
70 return true;
71 }
73
74static struct FalseTy {
75 __forceinline operator bool() const
76 {
77 return false;
78 }
80
81static struct ZeroTy {
82 __forceinline operator float() const
83 {
84 return 0;
85 }
86 __forceinline operator int() const
87 {
88 return 0;
89 }
91
92static struct OneTy {
93 __forceinline operator float() const
94 {
95 return 1;
96 }
97 __forceinline operator int() const
98 {
99 return 1;
100 }
102
103static struct NegInfTy {
104 __forceinline operator float() const
105 {
106 return -std::numeric_limits<float>::infinity();
107 }
108 __forceinline operator int() const
109 {
110 return std::numeric_limits<int>::min();
111 }
112} neg_inf ccl_attr_maybe_unused;
113
114static struct PosInfTy {
115 __forceinline operator float() const
116 {
117 return std::numeric_limits<float>::infinity();
118 }
119 __forceinline operator int() const
120 {
121 return std::numeric_limits<int>::max();
122 }
124
125static struct StepTy {
127
128#endif
129#if (defined(__aarch64__) || defined(_M_ARM64)) && !defined(_MM_SET_FLUSH_ZERO_MODE)
130__forceinline int set_fz(uint32_t flag)
131{
132 uint64_t old_fpcr, new_fpcr;
133 __get_fpcr(old_fpcr);
134 new_fpcr = old_fpcr | (1ULL << flag);
135 __set_fpcr(new_fpcr);
136 __get_fpcr(old_fpcr);
137 return old_fpcr == new_fpcr;
138}
139__forceinline int get_fz(uint32_t flag)
140{
141 uint64_t cur_fpcr;
142 __get_fpcr(cur_fpcr);
143 return (cur_fpcr & (1ULL << flag)) > 0 ? 1 : 0;
144}
145#endif
146
147/* Utilities used by Neon */
148#if defined(__KERNEL_NEON__)
149template<class type, int i0, int i1, int i2, int i3> type shuffle_neon(const type &a)
150{
151 if (i0 == i1 && i0 == i2 && i0 == i3) {
152 return type(vdupq_laneq_s32(int32x4_t(a), i0));
153 }
154 static const uint8_t tbl[16] = {(i0 * 4) + 0,
155 (i0 * 4) + 1,
156 (i0 * 4) + 2,
157 (i0 * 4) + 3,
158 (i1 * 4) + 0,
159 (i1 * 4) + 1,
160 (i1 * 4) + 2,
161 (i1 * 4) + 3,
162 (i2 * 4) + 0,
163 (i2 * 4) + 1,
164 (i2 * 4) + 2,
165 (i2 * 4) + 3,
166 (i3 * 4) + 0,
167 (i3 * 4) + 1,
168 (i3 * 4) + 2,
169 (i3 * 4) + 3};
170
171 return type(vqtbl1q_s8(int8x16_t(a), *(uint8x16_t *)tbl));
172}
173
174template<class type, int i0, int i1, int i2, int i3>
175type shuffle_neon(const type &a, const type &b)
176{
177 if (&a == &b) {
178 static const uint8_t tbl[16] = {(i0 * 4) + 0,
179 (i0 * 4) + 1,
180 (i0 * 4) + 2,
181 (i0 * 4) + 3,
182 (i1 * 4) + 0,
183 (i1 * 4) + 1,
184 (i1 * 4) + 2,
185 (i1 * 4) + 3,
186 (i2 * 4) + 0,
187 (i2 * 4) + 1,
188 (i2 * 4) + 2,
189 (i2 * 4) + 3,
190 (i3 * 4) + 0,
191 (i3 * 4) + 1,
192 (i3 * 4) + 2,
193 (i3 * 4) + 3};
194
195 return type(vqtbl1q_s8(int8x16_t(b), *(uint8x16_t *)tbl));
196 }
197 else {
198
199 static const uint8_t tbl[16] = {(i0 * 4) + 0,
200 (i0 * 4) + 1,
201 (i0 * 4) + 2,
202 (i0 * 4) + 3,
203 (i1 * 4) + 0,
204 (i1 * 4) + 1,
205 (i1 * 4) + 2,
206 (i1 * 4) + 3,
207 (i2 * 4) + 0 + 16,
208 (i2 * 4) + 1 + 16,
209 (i2 * 4) + 2 + 16,
210 (i2 * 4) + 3 + 16,
211 (i3 * 4) + 0 + 16,
212 (i3 * 4) + 1 + 16,
213 (i3 * 4) + 2 + 16,
214 (i3 * 4) + 3 + 16};
215
216 /* NOTE: This cannot all be put in a single line due to how MSVC ARM64
217 * implements the function calls as several layers of macros. */
218 int8x16x2_t t = {int8x16_t(a), int8x16_t(b)};
219 uint8x16_t idx = *(uint8x16_t *)tbl;
220 return type(vqtbl2q_s8(t, idx));
221 }
222}
223#endif /* __KERNEL_NEON */
224
/* Intrinsics Functions
 *
 * For fast bit operations. */

/* Map GCC's double-underscore BMI/LZCNT builtins to the Intel names used below. */
#if defined(__BMI__) && defined(__GNUC__)
#  ifndef _tzcnt_u32
#    define _tzcnt_u32 __tzcnt_u32
#  endif
#  ifndef _tzcnt_u64
#    define _tzcnt_u64 __tzcnt_u64
#  endif
#endif

#if defined(__LZCNT__)
#  define _lzcnt_u32 __lzcnt32
#  define _lzcnt_u64 __lzcnt64
#endif

243#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
244/* Intrinsic functions on Windows. */
246{
247# if defined(__KERNEL_AVX2__)
248 return _tzcnt_u32(v);
249# else
250 unsigned long r = 0;
251 _BitScanForward(&r, v);
252 return r;
253# endif
254}
255
257{
258 unsigned long r = 0;
259 _BitScanReverse(&r, v);
260 return r;
261}
262
264{
265 long r = v;
266 _bittestandcomplement(&r, i);
267 return r;
268}
269
271{
272# if defined(__KERNEL_AVX2__)
273 return _tzcnt_u32(v);
274# else
275 return __bsf(v);
276# endif
277}
278
279# if defined(__KERNEL_64_BIT__)
280
282{
283# if defined(__KERNEL_AVX2__)
284 return _tzcnt_u64(v);
285# else
286 unsigned long r = 0;
287 _BitScanForward64(&r, v);
288 return r;
289# endif
290}
291
293{
294 unsigned long r = 0;
295 _BitScanReverse64(&r, v);
296 return r;
297}
298
300{
301 uint64_t r = v;
302 _bittestandcomplement64((__int64 *)&r, i);
303 return r;
304}
305
307{
308# if defined(__KERNEL_AVX2__)
309# if defined(__KERNEL_64_BIT__)
310 return _tzcnt_u64(v);
311# else
312 return _tzcnt_u32(v);
313# endif
314# else
315 return __bsf(v);
316# endif
317}
318
319# endif /* __KERNEL_64_BIT__ */
320
321#elif (defined(__x86_64__) || defined(__i386__)) && defined(__KERNEL_SSE2__)
322/* Intrinsic functions with x86 SSE. */
323
325{
326 uint32_t r = 0;
327 asm("bsf %1,%0" : "=r"(r) : "r"(v));
328 return r;
329}
330
332{
333 uint32_t r = 0;
334 asm("bsr %1,%0" : "=r"(r) : "r"(v));
335 return r;
336}
337
339{
340 uint32_t r = 0;
341 asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
342 return r;
343}
344
345# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
346 !(defined(__ILP32__) && defined(__x86_64__))
348{
349 uint64_t r = 0;
350 asm("bsf %1,%0" : "=r"(r) : "r"(v));
351 return r;
352}
353# endif
354
356{
357 uint64_t r = 0;
358 asm("bsr %1,%0" : "=r"(r) : "r"(v));
359 return r;
360}
361
363{
364 uint64_t r = 0;
365 asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
366 return r;
367}
368
370{
371# if defined(__KERNEL_AVX2__)
372 return _tzcnt_u32(v);
373# else
374 return __bsf(v);
375# endif
376}
377
378# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
379 !(defined(__ILP32__) && defined(__x86_64__))
381{
382# if defined(__KERNEL_AVX2__)
383# if defined(__KERNEL_64_BIT__)
384 return _tzcnt_u64(v);
385# else
386 return _tzcnt_u32(v);
387# endif
388# else
389 return __bsf(v);
390# endif
391}
392# endif
393
394#else
395/* Intrinsic functions fallback for arbitrary processor. */
397{
398 for (int i = 0; i < 32; i++) {
399 if (x & (1U << i)) {
400 return i;
401 }
402 }
403 return 32;
404}
405
407{
408 for (int i = 0; i < 32; i++) {
409 if (x & (1U << (31 - i))) {
410 return (31 - i);
411 }
412 }
413 return 32;
414}
415
417{
418 uint32_t mask = 1U << bit;
419 return x & (~mask);
420}
421
423{
424 for (int i = 0; i < 64; i++) {
425 if (x & (1UL << i)) {
426 return i;
427 }
428 }
429 return 64;
430}
431
433{
434 for (int i = 0; i < 64; i++) {
435 if (x & (1UL << (63 - i))) {
436 return (63 - i);
437 }
438 }
439 return 64;
440}
441
443{
444 uint64_t mask = 1UL << bit;
445 return x & (~mask);
446}
447
449{
450 assert(value != 0);
451 uint32_t bit = 0;
452 while ((value & (1 << bit)) == 0) {
453 ++bit;
454 }
455 return bit;
456}
457
459{
460 assert(value != 0);
461 uint64_t bit = 0;
462 while ((value & (1 << bit)) == 0) {
463 ++bit;
464 }
465 return bit;
466}
467
468#endif /* Intrinsics */
469
470/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
471 * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
472#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
473# undef _mm256_cvtss_f32
474# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
475#endif
476
477/* quiet unused define warnings */
478#if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
479 defined(__KERNEL_SSE42__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
480/* do nothing */
481#endif
482
484
485#endif /* __UTIL_SIMD_TYPES_H__ */
/* NOTE(review): Doxygen cross-reference residue removed here — symbol stubs
 * for __bsf/__bsr/__btc/bitscan, stdint typedefs, the ccl_attr_maybe_unused /
 * __forceinline / CCL_NAMESPACE_* macros from util/defines.h, and unrelated
 * symbols from other files. It was never part of the original header. */