Blender V4.3
math_float4.h
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2011-2013 Intel Corporation
2 * SPDX-FileCopyrightText: 2011-2022 Blender Foundation
3 *
4 * SPDX-License-Identifier: Apache-2.0 */
5
6#ifndef __UTIL_MATH_FLOAT4_H__
7#define __UTIL_MATH_FLOAT4_H__
8
9#ifndef __UTIL_MATH_H__
10# error "Do not include this file directly, include util/types.h instead."
11#endif
12
14
16{
17#ifdef __KERNEL_SSE__
18 return float4(_mm_setzero_ps());
19#else
20 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
21#endif
22}
23
25{
26 return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
27}
28
30{
31#ifdef __KERNEL_SSE__
32 return int4(_mm_castps_si128(a));
33#else
34 return make_int4(
36#endif
37}
38
39#if !defined(__KERNEL_METAL__)
40ccl_device_inline float4 operator-(const float4 &a)
41{
42# ifdef __KERNEL_SSE__
43 __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
44 return float4(_mm_xor_ps(a.m128, mask));
45# else
46 return make_float4(-a.x, -a.y, -a.z, -a.w);
47# endif
48}
49
50ccl_device_inline float4 operator*(const float4 a, const float4 b)
51{
52# ifdef __KERNEL_SSE__
53 return float4(_mm_mul_ps(a.m128, b.m128));
54# else
55 return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
56# endif
57}
58
59ccl_device_inline float4 operator*(const float4 a, float f)
60{
61# if defined(__KERNEL_SSE__)
62 return a * make_float4(f);
63# else
64 return make_float4(a.x * f, a.y * f, a.z * f, a.w * f);
65# endif
66}
67
68ccl_device_inline float4 operator*(float f, const float4 a)
69{
70 return a * f;
71}
72
73ccl_device_inline float4 operator/(const float4 a, float f)
74{
75 return a * (1.0f / f);
76}
77
78ccl_device_inline float4 operator/(const float4 a, const float4 b)
79{
80# ifdef __KERNEL_SSE__
81 return float4(_mm_div_ps(a.m128, b.m128));
82# else
83 return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
84# endif
85}
86
87ccl_device_inline float4 operator+(const float4 a, const float4 b)
88{
89# ifdef __KERNEL_SSE__
90 return float4(_mm_add_ps(a.m128, b.m128));
91# else
92 return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
93# endif
94}
95
96ccl_device_inline float4 operator+(const float4 a, const float f)
97{
98 return a + make_float4(f);
99}
100
101ccl_device_inline float4 operator-(const float4 a, const float4 b)
102{
103# ifdef __KERNEL_SSE__
104 return float4(_mm_sub_ps(a.m128, b.m128));
105# else
106 return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
107# endif
108}
109
110ccl_device_inline float4 operator-(const float4 a, const float f)
111{
112 return a - make_float4(f);
113}
114
115ccl_device_inline float4 operator+=(float4 &a, const float4 b)
116{
117 return a = a + b;
118}
119
120ccl_device_inline float4 operator-=(float4 &a, const float4 b)
121{
122 return a = a - b;
123}
124
125ccl_device_inline float4 operator*=(float4 &a, const float4 b)
126{
127 return a = a * b;
128}
129
130ccl_device_inline float4 operator*=(float4 &a, float f)
131{
132 return a = a * f;
133}
134
135ccl_device_inline float4 operator/=(float4 &a, const float4 b)
136{
137 return a = a / b;
138}
139
140ccl_device_inline float4 operator/=(float4 &a, float f)
141{
142 return a = a / f;
143}
144
145ccl_device_inline int4 operator<(const float4 a, const float4 b)
146{
147# ifdef __KERNEL_SSE__
148 return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
149# else
150 return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
151# endif
152}
153
154ccl_device_inline int4 operator>=(const float4 a, const float4 b)
155{
156# ifdef __KERNEL_SSE__
157 return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
158# else
159 return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
160# endif
161}
162
163ccl_device_inline int4 operator<=(const float4 a, const float4 b)
164{
165# ifdef __KERNEL_SSE__
166 return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
167# else
168 return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
169# endif
170}
171
172ccl_device_inline bool operator==(const float4 a, const float4 b)
173{
174# ifdef __KERNEL_SSE__
175 return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
176# else
177 return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
178# endif
179}
180
181ccl_device_inline const float4 operator^(const float4 a, const float4 b)
182{
183# ifdef __KERNEL_SSE__
184 return float4(_mm_xor_ps(a.m128, b.m128));
185# else
190# endif
191}
192
193ccl_device_inline float4 min(const float4 a, const float4 b)
194{
195# ifdef __KERNEL_SSE__
196 return float4(_mm_min_ps(a.m128, b.m128));
197# else
198 return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
199# endif
200}
201
202ccl_device_inline float4 max(const float4 a, const float4 b)
203{
204# ifdef __KERNEL_SSE__
205 return float4(_mm_max_ps(a.m128, b.m128));
206# else
207 return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
208# endif
209}
210
211ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx)
212{
213 return min(max(a, mn), mx);
214}
215#endif /* !__KERNEL_METAL__*/
216
217ccl_device_inline const float4 madd(const float4 a, const float4 b, const float4 c)
218{
219#ifdef __KERNEL_SSE__
220# ifdef __KERNEL_NEON__
221 return float4(vfmaq_f32(c, a, b));
222# elif defined(__KERNEL_AVX2__)
223 return float4(_mm_fmadd_ps(a, b, c));
224# else
225 return a * b + c;
226# endif
227#else
228 return a * b + c;
229#endif
230}
231
232ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c)
233{
234#ifdef __KERNEL_SSE__
235# ifdef __KERNEL_NEON__
236 return float4(vfmaq_f32(vnegq_f32(c), a, b));
237# elif defined(__KERNEL_AVX2__)
238 return float4(_mm_fmsub_ps(a, b, c));
239# else
240 return a * b - c;
241# endif
242#else
243 return a * b - c;
244#endif
245}
246
247#ifdef __KERNEL_SSE__
248template<size_t i0, size_t i1, size_t i2, size_t i3>
249__forceinline const float4 shuffle(const float4 b)
250{
251# ifdef __KERNEL_NEON__
252 return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128));
253# else
254 return float4(
255 _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))));
256# endif
257}
258
259template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a)
260{
261 return float4(_mm_movelh_ps(a, a));
262}
263
264template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a)
265{
266 return float4(_mm_movehl_ps(a, a));
267}
268
269# ifdef __KERNEL_SSE3__
270template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 b)
271{
272 return float4(_mm_moveldup_ps(b));
273}
274
275template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 b)
276{
277 return float4(_mm_movehdup_ps(b));
278}
279# endif /* __KERNEL_SSE3__ */
280
281template<size_t i0, size_t i1, size_t i2, size_t i3>
282__forceinline const float4 shuffle(const float4 a, const float4 b)
283{
284# ifdef __KERNEL_NEON__
285 return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b));
286# else
287 return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
288# endif
289}
290
291template<size_t i0> __forceinline const float4 shuffle(const float4 b)
292{
293 return shuffle<i0, i0, i0, i0>(b);
294}
295template<size_t i0> __forceinline const float4 shuffle(const float4 a, const float4 b)
296{
297# ifdef __KERNEL_NEON__
298 return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b));
299# else
300 return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0)));
301# endif
302}
303
304template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b)
305{
306 return float4(_mm_movelh_ps(a, b));
307}
308
309template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b)
310{
311 return float4(_mm_movehl_ps(b, a));
312}
313
314template<size_t i> __forceinline float extract(const float4 a)
315{
316 return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
317}
318template<> __forceinline float extract<0>(const float4 a)
319{
320 return _mm_cvtss_f32(a);
321}
322#endif
323
324ccl_device_inline float reduce_add(const float4 a)
325{
326#if defined(__KERNEL_SSE__)
327# if defined(__KERNEL_NEON__)
328 return vaddvq_f32(a);
329# elif defined(__KERNEL_SSE3__)
330 float4 h(_mm_hadd_ps(a.m128, a.m128));
331 return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
332# else
333 float4 h(shuffle<1, 0, 3, 2>(a) + a);
334 return _mm_cvtss_f32(shuffle<2, 3, 0, 1>(h) + h);
335# endif
336#else
337 return a.x + a.y + a.z + a.w;
338#endif
339}
340
341ccl_device_inline float reduce_min(const float4 a)
342{
343#if defined(__KERNEL_SSE__)
344# if defined(__KERNEL_NEON__)
345 return vminvq_f32(a);
346# else
347 float4 h = min(shuffle<1, 0, 3, 2>(a), a);
348 return _mm_cvtss_f32(min(shuffle<2, 3, 0, 1>(h), h));
349# endif
350#else
351 return min(min(a.x, a.y), min(a.z, a.w));
352#endif
353}
354
355ccl_device_inline float reduce_max(const float4 a)
356{
357#if defined(__KERNEL_SSE__)
358# if defined(__KERNEL_NEON__)
359 return vmaxvq_f32(a);
360# else
361 float4 h = max(shuffle<1, 0, 3, 2>(a), a);
362 return _mm_cvtss_f32(max(shuffle<2, 3, 0, 1>(h), h));
363# endif
364#else
365 return max(max(a.x, a.y), max(a.z, a.w));
366#endif
367}
368
369#if !defined(__KERNEL_METAL__)
370ccl_device_inline float dot(const float4 a, const float4 b)
371{
372# if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
373# if defined(__KERNEL_NEON__)
374 __m128 t = vmulq_f32(a, b);
375 return vaddvq_f32(t);
376# else
377 return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
378# endif
379# else
380 return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
381# endif
382}
383#endif /* !defined(__KERNEL_METAL__) */
384
385ccl_device_inline float len(const float4 a)
386{
387 return sqrtf(dot(a, a));
388}
389
390ccl_device_inline float len_squared(const float4 a)
391{
392 return dot(a, a);
393}
394
395ccl_device_inline float4 sqr(const float4 a)
396{
397 return a * a;
398}
399
400#if !defined(__KERNEL_METAL__)
401ccl_device_inline float distance(const float4 a, const float4 b)
402{
403 return len(a - b);
404}
405
406ccl_device_inline float4 rcp(const float4 a)
407{
408# ifdef __KERNEL_SSE__
409 /* Don't use _mm_rcp_ps due to poor precision. */
410 return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
411# else
412 return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
413# endif
414}
415
416ccl_device_inline float4 sqrt(const float4 a)
417{
418# ifdef __KERNEL_SSE__
419 return float4(_mm_sqrt_ps(a.m128));
420# else
421 return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
422# endif
423}
424
425ccl_device_inline float4 cross(const float4 a, const float4 b)
426{
427# ifdef __KERNEL_SSE__
428 return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
429 (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
430# else
431 return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
432# endif
433}
434
435ccl_device_inline bool is_zero(const float4 a)
436{
437# ifdef __KERNEL_SSE__
438 return a == zero_float4();
439# else
440 return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
441# endif
442}
443
444ccl_device_inline float average(const float4 a)
445{
446 return reduce_add(a) * 0.25f;
447}
448
449ccl_device_inline float4 normalize(const float4 a)
450{
451 return a / len(a);
452}
453
454ccl_device_inline float4 safe_normalize(const float4 a)
455{
456 float t = len(a);
457 return (t != 0.0f) ? a / t : a;
458}
459
460ccl_device_inline float4 fabs(const float4 a)
461{
462# if defined(__KERNEL_SSE__)
463# if defined(__KERNEL_NEON__)
464 return float4(vabsq_f32(a));
465# else
466 return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
467# endif
468# else
469 return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
470# endif
471}
472
473ccl_device_inline float4 fmod(const float4 a, const float b)
474{
475 return make_float4(fmodf(a.x, b), fmodf(a.y, b), fmodf(a.z, b), fmodf(a.w, b));
476}
477
478ccl_device_inline float4 floor(const float4 a)
479{
480# ifdef __KERNEL_SSE__
481# if defined(__KERNEL_NEON__)
482 return float4(vrndmq_f32(a));
483# else
484 return float4(_mm_floor_ps(a));
485# endif
486# else
487 return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
488# endif
489}
490
491ccl_device_inline float4 floorfrac(const float4 x, ccl_private int4 *i)
492{
493# ifdef __KERNEL_SSE__
494 const float4 f = floor(x);
495 *i = int4(_mm_cvttps_epi32(f.m128));
496 return x - f;
497# else
498 float4 r;
499 r.x = floorfrac(x.x, &i->x);
500 r.y = floorfrac(x.y, &i->y);
501 r.z = floorfrac(x.z, &i->z);
502 r.w = floorfrac(x.w, &i->w);
503 return r;
504# endif
505}
506
507ccl_device_inline float4 mix(const float4 a, const float4 b, float t)
508{
509 return a + t * (b - a);
510}
511
512ccl_device_inline float4 mix(const float4 a, const float4 b, const float4 t)
513{
514 return a + t * (b - a);
515}
516
517ccl_device_inline float4 saturate(const float4 a)
518{
519 return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
520}
521
522ccl_device_inline float4 exp(float4 v)
523{
524 return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z));
525}
526
527ccl_device_inline float4 log(float4 v)
528{
529 return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z));
530}
531
532#endif /* !__KERNEL_METAL__*/
533
534ccl_device_inline bool isequal(const float4 a, const float4 b)
535{
536#if defined(__KERNEL_METAL__)
537 return all(a == b);
538#else
539 return a == b;
540#endif
541}
542
543#ifndef __KERNEL_GPU__
544ccl_device_inline float4 select(const int4 mask, const float4 a, const float4 b)
545{
546# ifdef __KERNEL_SSE__
547# ifdef __KERNEL_SSE42__
548 return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
549# else
550 return float4(
551 _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), a), _mm_andnot_ps(_mm_castsi128_ps(mask), b)));
552# endif
553# else
554 return make_float4(
555 (mask.x) ? a.x : b.x, (mask.y) ? a.y : b.y, (mask.z) ? a.z : b.z, (mask.w) ? a.w : b.w);
556# endif
557}
558
559ccl_device_inline float4 mask(const int4 mask, const float4 a)
560{
561 /* Replace elements of x with zero where mask isn't set. */
562 return select(mask, a, zero_float4());
563}
564
566{
567# ifdef __KERNEL_SSE__
568 return float4(_mm_loadu_ps(v));
569# else
570 return make_float4(v[0], v[1], v[2], v[3]);
571# endif
572}
573
574#endif /* !__KERNEL_GPU__ */
575
576ccl_device_inline float4 safe_divide(const float4 a, const float b)
577{
578 return (b != 0.0f) ? a / b : zero_float4();
579}
580
581ccl_device_inline float4 safe_divide(const float4 a, const float4 b)
582{
583 return make_float4((b.x != 0.0f) ? a.x / b.x : 0.0f,
584 (b.y != 0.0f) ? a.y / b.y : 0.0f,
585 (b.z != 0.0f) ? a.z / b.z : 0.0f,
586 (b.w != 0.0f) ? a.w / b.w : 0.0f);
587}
588
590{
591 return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z) && isfinite_safe(v.w);
592}
593
595{
596 if (!isfinite_safe(v.x))
597 v.x = 0.0f;
598 if (!isfinite_safe(v.y))
599 v.y = 0.0f;
600 if (!isfinite_safe(v.z))
601 v.z = 0.0f;
602 if (!isfinite_safe(v.w))
603 v.w = 0.0f;
604 return v;
605}
606
607/* Consistent name for this would be pow, but HIP compiler crashes in name mangling. */
608ccl_device_inline float4 power(float4 v, float e)
609{
610 return make_float4(powf(v.x, e), powf(v.y, e), powf(v.z, e), powf(v.w, e));
611}
612
614
615#endif /* __UTIL_MATH_FLOAT4_H__ */
#define saturate(a)
__forceinline float extract(const int4 &b)
Definition binning.cpp:27
ATTR_WARN_UNUSED_RESULT const BMVert const BMEdge * e
ATTR_WARN_UNUSED_RESULT const BMVert * v
SIMD_FORCE_INLINE btVector3 & normalize()
Normalize this vector x^2 + y^2 + z^2 = 1.
Definition btVector3.h:303
local_group_size(16, 16) .push_constant(Type b
#define logf(x)
#define expf(x)
#define ccl_private
#define ccl_device_inline
#define powf(x, y)
#define CCL_NAMESPACE_END
ccl_device_forceinline float4 make_float4(const float x, const float y, const float z, const float w)
#define saturatef(x)
#define fmodf(x, y)
#define floorf(x)
#define __float_as_int(x)
#define fabsf(x)
#define __float_as_uint(x)
#define sqrtf(x)
ccl_device_forceinline int4 make_int4(const int x, const int y, const int z, const int w)
#define __uint_as_float(x)
#define __forceinline
int len
#define mix(a, b, c)
Definition hash.h:36
ccl_device_inline float4 safe_normalize(const float4 a)
ccl_device_inline float4 floorfrac(const float4 x, ccl_private int4 *i)
ccl_device_inline float4 operator+(const float4 a, const float4 b)
Definition math_float4.h:87
ccl_device_inline bool operator==(const float4 a, const float4 b)
ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c)
ccl_device_inline float4 operator*(const float4 a, const float4 b)
Definition math_float4.h:50
ccl_device_inline float4 one_float4()
Definition math_float4.h:24
ccl_device_inline float4 cross(const float4 a, const float4 b)
ccl_device_inline float reduce_add(const float4 a)
ccl_device_inline int4 operator>=(const float4 a, const float4 b)
ccl_device_inline float4 floor(const float4 a)
ccl_device_inline float4 operator/(const float4 a, float f)
Definition math_float4.h:73
ccl_device_inline const float4 operator^(const float4 a, const float4 b)
ccl_device_inline int4 operator<=(const float4 a, const float4 b)
ccl_device_inline float4 operator/=(float4 &a, const float4 b)
ccl_device_inline float4 operator-(const float4 &a)
Definition math_float4.h:40
ccl_device_inline float4 mask(const int4 mask, const float4 a)
ccl_device_inline float4 operator-=(float4 &a, const float4 b)
ccl_device_inline float4 fmod(const float4 a, const float b)
ccl_device_inline bool isfinite_safe(float4 v)
ccl_device_inline float average(const float4 a)
ccl_device_inline float4 power(float4 v, float e)
ccl_device_inline float4 operator+=(float4 &a, const float4 b)
ccl_device_inline int4 cast(const float4 a)
Definition math_float4.h:29
ccl_device_inline float4 select(const int4 mask, const float4 a, const float4 b)
ccl_device_inline float4 operator*=(float4 &a, const float4 b)
ccl_device_inline float4 load_float4(ccl_private const float *v)
ccl_device_inline float dot(const float4 a, const float4 b)
ccl_device_inline float distance(const float4 a, const float4 b)
ccl_device_inline float4 sqrt(const float4 a)
ccl_device_inline const float4 madd(const float4 a, const float4 b, const float4 c)
ccl_device_inline bool isequal(const float4 a, const float4 b)
ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx)
ccl_device_inline float4 sqr(const float4 a)
ccl_device_inline float reduce_max(const float4 a)
ccl_device_inline float4 ensure_finite(float4 v)
ccl_device_inline bool is_zero(const float4 a)
ccl_device_inline float4 exp(float4 v)
ccl_device_inline float4 fabs(const float4 a)
CCL_NAMESPACE_BEGIN ccl_device_inline float4 zero_float4()
Definition math_float4.h:15
ccl_device_inline float4 rcp(const float4 a)
ccl_device_inline float4 safe_divide(const float4 a, const float b)
ccl_device_inline float reduce_min(const float4 a)
ccl_device_inline float4 log(float4 v)
ccl_device_inline int4 operator<(const float4 a, const float4 b)
ccl_device_inline float len_squared(const float4 a)
VecBase< float, 4 > float4
#define min(a, b)
Definition sort.c:32
float max