Blender V5.0
math_float4.h
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2011-2013 Intel Corporation
2 * SPDX-FileCopyrightText: 2011-2022 Blender Foundation
3 *
4 * SPDX-License-Identifier: Apache-2.0 */
5
6#pragma once
7
8#include "util/math_base.h"
9#include "util/types_float4.h"
10
12
14{
15#ifdef __KERNEL_SSE__
16 return float4(_mm_setzero_ps());
17#else
18 return make_float4(0.0f);
19#endif
20}
21
26
31
33{
34#ifdef __KERNEL_SSE__
35 return int4(_mm_castps_si128(a));
36#else
37 return make_int4(
39#endif
40}
41
42#if !defined(__KERNEL_METAL__)
44{
45# ifdef __KERNEL_SSE__
46 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
47 return float4(_mm_xor_ps(a.m128, mask));
48# else
49 return make_float4(-a.x, -a.y, -a.z, -a.w);
50# endif
51}
52
54{
55# ifdef __KERNEL_SSE__
56 return float4(_mm_mul_ps(a.m128, b.m128));
57# else
58 return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
59# endif
60}
61
63{
64# if defined(__KERNEL_SSE__)
65 return a * make_float4(f);
66# else
67 return make_float4(a.x * f, a.y * f, a.z * f, a.w * f);
68# endif
69}
70
72{
73 return a * f;
74}
75
77{
78 return a * (1.0f / f);
79}
80
82{
83# ifdef __KERNEL_SSE__
84 return float4(_mm_div_ps(a.m128, b.m128));
85# else
86 return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
87# endif
88}
89
91{
92# ifdef __KERNEL_SSE__
93 return float4(_mm_add_ps(a.m128, b.m128));
94# else
95 return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
96# endif
97}
98
100{
101 return a + make_float4(f);
102}
103
105{
106# ifdef __KERNEL_SSE__
107 return float4(_mm_sub_ps(a.m128, b.m128));
108# else
109 return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
110# endif
111}
112
114{
115 return a - make_float4(f);
116}
117
119{
120 return a = a + b;
121}
122
124{
125 return a = a - b;
126}
127
129{
130 return a = a * b;
131}
132
134{
135 return a = a * f;
136}
137
139{
140 return a = a / b;
141}
142
144{
145 return a = a / f;
146}
147
149{
150# ifdef __KERNEL_SSE__
151 return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
152# else
153 return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
154# endif
155}
156
158{
159# ifdef __KERNEL_SSE__
160 return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
161# else
162 return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
163# endif
164}
165
167{
168# ifdef __KERNEL_SSE__
169 return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
170# else
171 return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
172# endif
173}
174
176{
177# ifdef __KERNEL_SSE__
178 return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
179# else
180 return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
181# endif
182}
183
185{
186# ifdef __KERNEL_SSE__
187 return float4(_mm_xor_ps(a.m128, b.m128));
188# else
193# endif
194}
195
197{
198# ifdef __KERNEL_SSE__
199 return float4(_mm_min_ps(a.m128, b.m128));
200# else
201 return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
202# endif
203}
204
206{
207# ifdef __KERNEL_SSE__
208 return float4(_mm_max_ps(a.m128, b.m128));
209# else
210 return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
211# endif
212}
213
214ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx)
215{
216 return min(max(a, mn), mx);
217}
218#endif /* !__KERNEL_METAL__*/
219
221{
222#ifdef __KERNEL_SSE__
223# ifdef __KERNEL_NEON__
224 return float4(vfmaq_f32(c, a, b));
225# elif defined(__KERNEL_AVX2__)
226 return float4(_mm_fmadd_ps(a, b, c));
227# else
228 return a * b + c;
229# endif
230#else
231 return a * b + c;
232#endif
233}
234
236{
237#ifdef __KERNEL_SSE__
238# ifdef __KERNEL_NEON__
239 return float4(vfmaq_f32(vnegq_f32(c), a, b));
240# elif defined(__KERNEL_AVX2__)
241 return float4(_mm_fmsub_ps(a, b, c));
242# else
243 return a * b - c;
244# endif
245#else
246 return a * b - c;
247#endif
248}
249
#ifdef __KERNEL_SSE__
/* Lane shuffles for SIMD builds.
 *
 * shuffle<i0, i1, i2, i3>(a) returns (a[i0], a[i1], a[i2], a[i3]).
 * Note the reversed argument order inside _MM_SHUFFLE: it packs indices from
 * the highest lane down, so _MM_SHUFFLE(i3, i2, i1, i0) selects lane i0 for
 * the lowest result component. */
template<size_t i0, const size_t i1, const size_t i2, const size_t i3>
__forceinline float4 shuffle(const float4 a)
{
# ifdef __KERNEL_NEON__
  return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a.m128));
# else
  /* Route through the integer shuffle, which takes an immediate control and
   * needs only one operand. */
  return float4(
      _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0))));
# endif
}

/* Specializations that map to cheaper dedicated instructions than the
 * generic shuffle above. */
template<> __forceinline float4 shuffle<0, 1, 0, 1>(const float4 a)
{
  return float4(_mm_movelh_ps(a, a));
}

template<> __forceinline float4 shuffle<2, 3, 2, 3>(const float4 a)
{
  return float4(_mm_movehl_ps(a, a));
}

# ifdef __KERNEL_SSE3__
/* SSE3 duplicate instructions: (a0, a0, a2, a2) and (a1, a1, a3, a3). */
template<> __forceinline float4 shuffle<0, 0, 2, 2>(const float4 a)
{
  return float4(_mm_moveldup_ps(a));
}

template<> __forceinline float4 shuffle<1, 1, 3, 3>(const float4 a)
{
  return float4(_mm_movehdup_ps(a));
}
# endif /* __KERNEL_SSE3__ */

/* Two-operand shuffle: returns (a[i0], a[i1], b[i2], b[i3]) — the low half
 * comes from `a`, the high half from `b`. */
template<size_t i0, const size_t i1, const size_t i2, const size_t i3>
__forceinline float4 shuffle(const float4 a, const float4 b)
{
# ifdef __KERNEL_NEON__
  return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b));
# else
  return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}

/* Broadcast lane i0 of `a` across all four result lanes. */
template<size_t i0> __forceinline float4 shuffle(const float4 a)
{
  return shuffle<i0, i0, i0, i0>(a);
}
/* Two-operand single-index form: returns (a[i0], a[i0], b[i0], b[i0]). */
template<size_t i0> __forceinline float4 shuffle(const float4 a, const float4 b)
{
# ifdef __KERNEL_NEON__
  return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b));
# else
  return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0)));
# endif
}

template<> __forceinline float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b)
{
  return float4(_mm_movelh_ps(a, b));
}

template<> __forceinline float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b)
{
  /* movehl_ps places the high half of its SECOND operand in the low half of
   * the result, hence the swapped (b, a) order. */
  return float4(_mm_movehl_ps(b, a));
}

/* Extract lane `i` of `a` as a scalar float. */
template<size_t i> __forceinline float extract(const float4 a)
{
  return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
}
/* Lane 0 is already the low scalar — no shuffle needed. */
template<> __forceinline float extract<0>(const float4 a)
{
  return _mm_cvtss_f32(a);
}
#endif
326
328{
329#if defined(__KERNEL_SSE__)
330# if defined(__KERNEL_NEON__)
331 return vaddvq_f32(a);
332# elif defined(__KERNEL_SSE3__)
333 float4 h(_mm_hadd_ps(a.m128, a.m128));
334 return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
335# else
336 float4 h(shuffle<1, 0, 3, 2>(a) + a);
337 return _mm_cvtss_f32(shuffle<2, 3, 0, 1>(h) + h);
338# endif
339#else
340 return a.x + a.y + a.z + a.w;
341#endif
342}
343
345{
346#if defined(__KERNEL_SSE__)
347# if defined(__KERNEL_NEON__)
348 return vminvq_f32(a);
349# else
350 float4 h = min(shuffle<1, 0, 3, 2>(a), a);
351 return _mm_cvtss_f32(min(shuffle<2, 3, 0, 1>(h), h));
352# endif
353#else
354 return min(min(a.x, a.y), min(a.z, a.w));
355#endif
356}
357
359{
360#if defined(__KERNEL_SSE__)
361# if defined(__KERNEL_NEON__)
362 return vmaxvq_f32(a);
363# else
364 float4 h = max(shuffle<1, 0, 3, 2>(a), a);
365 return _mm_cvtss_f32(max(shuffle<2, 3, 0, 1>(h), h));
366# endif
367#else
368 return max(max(a.x, a.y), max(a.z, a.w));
369#endif
370}
371
372#if !defined(__KERNEL_METAL__)
/* 4D dot product of `a` and `b` (all four components participate). */
ccl_device_inline float dot(const float4 a, const float4 b)
{
# if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
#  if defined(__KERNEL_NEON__)
  /* Multiply lanes, then horizontal add across the vector. */
  const __m128 t = vmulq_f32(a, b);
  return vaddvq_f32(t);
#  else
  /* SSE4.1 dot product: mask 0xFF multiplies all four lanes and broadcasts
   * the sum to every lane; take the low scalar. */
  return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
#  endif
# else
  /* Scalar fallback; pairwise grouping — presumably to mirror the SIMD
   * summation order (TODO confirm). */
  return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
# endif
}
386#endif /* !defined(__KERNEL_METAL__) */
387
389{
390 return sqrtf(dot(a, a));
391}
392
394{
395 return dot(a, a);
396}
397
399{
400 return a * a;
401}
402
403#if !defined(__KERNEL_METAL__)
405{
406 return len(a - b);
407}
408
410{
411# ifdef __KERNEL_SSE__
412 return float4(_mm_sqrt_ps(a.m128));
413# else
414 return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
415# endif
416}
417
419{
420# ifdef __KERNEL_SSE__
421 return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
422 (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
423# else
424 return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
425# endif
426}
427
429{
430# ifdef __KERNEL_SSE__
431 return a == zero_float4();
432# else
433 return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
434# endif
435}
436
438{
439 return reduce_add(a) * 0.25f;
440}
441
443{
444 return a / len(a);
445}
446
448{
449 const float t = len(a);
450 return (t != 0.0f) ? a / t : a;
451}
452
454{
455# if defined(__KERNEL_SSE__)
456# if defined(__KERNEL_NEON__)
457 return float4(vabsq_f32(a));
458# else
459 return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
460# endif
461# else
462 return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
463# endif
464}
465
/* The floating-point remainder of the division operation `a / b` calculated by this function is
 * exactly the value `a - iquot * b`, where `iquot` is `a / b` with its fractional part truncated.
 *
 * The returned value has the same sign as `a` and is less than `b` in magnitude. */
ccl_device_inline float4 fmod(const float4 a, const float b)
{
# if defined(__KERNEL_NEON__)
  /* Use native Neon instructions.
   * The logic is the same as the SSE code below, but on Apple M2 Ultra this seems to be faster.
   * Possibly due to some runtime checks in _mm_round_ps which do not get properly inlined. */
  const float32x4_t iquot = vrndq_f32(a / b);
  return float4(vsubq_f32(a, vmulq_f32(iquot, vdupq_n_f32(b))));
# elif defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
  /* Truncate the quotient towards zero, then subtract the whole multiples of `b`. */
  const __m128 iquot = _mm_round_ps(a / b, _MM_FROUND_TRUNC);
  return float4(_mm_sub_ps(a, _mm_mul_ps(iquot, _mm_set1_ps(b))));
# else
  /* Scalar fallback: per-component C library fmodf. */
  return make_float4(fmodf(a.x, b), fmodf(a.y, b), fmodf(a.z, b), fmodf(a.w, b));
# endif
}
485
487{
488# ifdef __KERNEL_SSE__
489# if defined(__KERNEL_NEON__)
490 return float4(vrndmq_f32(a));
491# else
492 return float4(_mm_floor_ps(a));
493# endif
494# else
495 return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
496# endif
497}
498
500{
501# ifdef __KERNEL_SSE__
502 const float4 f = floor(x);
503 *i = int4(_mm_cvttps_epi32(f.m128));
504 return x - f;
505# else
506 float4 r;
507 r.x = floorfrac(x.x, &i->x);
508 r.y = floorfrac(x.y, &i->y);
509 r.z = floorfrac(x.z, &i->z);
510 r.w = floorfrac(x.w, &i->w);
511 return r;
512# endif
513}
514
515ccl_device_inline float4 mix(const float4 a, const float4 b, const float t)
516{
517 return a + t * (b - a);
518}
519
521{
522 return a + t * (b - a);
523}
524
529
531{
532 return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z));
533}
534
536{
537 return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z));
538}
539
540#endif /* !__KERNEL_METAL__*/
541
543{
544#if defined(__KERNEL_METAL__)
545 return all(a == b);
546#else
547 return a == b;
548#endif
549}
550
/* Per-component select: returns `a` where `mask` is set, `b` elsewhere. */
template<class MaskType>
ccl_device_inline float4 select(const MaskType mask, const float4 a, const float4 b)
{
#if defined(__KERNEL_METAL__)
  /* Metal's select() argument order is (false_value, true_value, condition). */
  return metal::select(b, a, bool4(mask));
#elif defined(__KERNEL_SSE__)
# ifdef __KERNEL_SSE42__
  /* blendv picks from the second operand where the mask sign bit is set. */
  return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
# else
  /* Emulated blend: (mask & a) | (~mask & b). NOTE(review): this assumes each
   * mask lane is all-ones or all-zeros, as produced by the comparison
   * operators in this file — partial-bit masks would mix bits from a and b. */
  return float4(
      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), a), _mm_andnot_ps(_mm_castsi128_ps(mask), b)));
# endif
#else
  /* Scalar fallback: any nonzero mask component selects from `a`. */
  return make_float4(
      (mask.x) ? a.x : b.x, (mask.y) ? a.y : b.y, (mask.z) ? a.z : b.z, (mask.w) ? a.w : b.w);
#endif
}
568
569template<class MaskType> ccl_device_inline float4 mask(const MaskType mask, const float4 a)
570{
571 /* Replace elements of x with zero where mask isn't set. */
572 return select(mask, a, zero_float4());
573}
574
575#ifndef __KERNEL_GPU__
576
578{
579# ifdef __KERNEL_SSE__
580 return float4(_mm_loadu_ps(v));
581# else
582 return make_float4(v[0], v[1], v[2], v[3]);
583# endif
584}
585
586#endif /* !__KERNEL_GPU__ */
587
589{
590 return (b != 0.0f) ? a / b : zero_float4();
591}
592
594{
595 return make_float4((b.x != 0.0f) ? a.x / b.x : 0.0f,
596 (b.y != 0.0f) ? a.y / b.y : 0.0f,
597 (b.z != 0.0f) ? a.z / b.z : 0.0f,
598 (b.w != 0.0f) ? a.w / b.w : 0.0f);
599}
600
602{
603 return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z) && isfinite_safe(v.w);
604}
605
607{
608 float4 r = v;
609 if (!isfinite_safe(r.x)) {
610 r.x = 0.0f;
611 }
612 if (!isfinite_safe(r.y)) {
613 r.y = 0.0f;
614 }
615 if (!isfinite_safe(r.z)) {
616 r.z = 0.0f;
617 }
618 if (!isfinite_safe(r.w)) {
619 r.w = 0.0f;
620 }
621 return r;
622}
623
624/* Consistent name for this would be pow, but HIP compiler crashes in name mangling. */
626{
627 return make_float4(powf(v.x, e), powf(v.y, e), powf(v.z, e), powf(v.w, e));
628}
629
631{
632 return a + t * (b - a);
633}
634
635#if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__)
636/* Int/Float conversion */
638{
639# ifdef __KERNEL_SSE__
640 return int4(_mm_castps_si128(f.m128));
641# else
642 return make_int4(
644# endif
645}
646
648{
649# ifdef __KERNEL_SSE__
650 return float4(_mm_castsi128_ps(i.m128));
651# else
652 return make_float4(
654# endif
655}
656#endif /* !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__) */
657
659{
660 r[0] = val.x;
661 r[1] = val.y;
662 r[2] = val.z;
663 r[3] = val.w;
664}
665
__forceinline float extract(const int4 &b)
Definition binning.cpp:27
ATTR_WARN_UNUSED_RESULT const BMVert const BMEdge * e
ATTR_WARN_UNUSED_RESULT const BMVert * v
#define ccl_private
#define ccl_device_inline
#define ccl_device_template_spec
#define __forceinline
#define logf(x)
#define expf(x)
#define powf(x, y)
#define CCL_NAMESPACE_END
#define saturatef(x)
#define fmodf(x, y)
#define __int_as_float(x)
#define __float_as_int(x)
#define __float_as_uint(x)
ccl_device_forceinline int4 make_int4(const int x, const int y, const int z, const int w)
#define __uint_as_float(x)
#define cast
#define log
#define exp
VecBase< float, D > normalize(VecOp< float, D >) RET
#define select(A, B, C)
#define floor
bool all(VecOp< bool, D >) RET
#define sqrt
VecBase< float, 3 > cross(VecOp< float, 3 >, VecOp< float, 3 >) RET
VecBase< bool, 4 > bool4
VecBase< float, 4 > float4
VecBase< int, 4 > int4
ccl_device_inline float2 mask(const MaskType mask, const float2 a)
ccl_device_inline float4 safe_normalize(const float4 a)
ccl_device_inline float4 floorfrac(const float4 x, ccl_private int4 *i)
ccl_device_inline float4 operator+(const float4 a, const float4 b)
Definition math_float4.h:90
ccl_device_inline bool isfinite_safe(const float4 v)
ccl_device_inline bool operator==(const float4 a, const float4 b)
ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c)
ccl_device_inline int4 __float4_as_int4(const float4 f)
ccl_device_template_spec float4 make_zero()
Definition math_float4.h:27
ccl_device_inline float4 operator*(const float4 a, const float4 b)
Definition math_float4.h:53
ccl_device_inline float4 one_float4()
Definition math_float4.h:22
ccl_device_inline float reduce_add(const float4 a)
ccl_device_inline int4 operator>=(const float4 a, const float4 b)
ccl_device_inline int4 operator<=(const float4 a, const float4 b)
ccl_device_inline float4 operator/=(float4 &a, const float4 b)
ccl_device_inline float4 operator-(const float4 &a)
Definition math_float4.h:43
ccl_device_inline float4 operator-=(float4 &a, const float4 b)
ccl_device_inline float4 fmod(const float4 a, const float b)
ccl_device_inline float average(const float4 a)
ccl_device_inline float4 operator+=(float4 &a, const float4 b)
ccl_device_inline float4 operator^(const float4 a, const float4 b)
ccl_device_inline float4 operator*=(float4 &a, const float4 b)
ccl_device_inline float4 __int4_as_float4(const int4 i)
ccl_device_inline float dot(const float4 a, const float4 b)
ccl_device_inline float distance(const float4 a, const float4 b)
ccl_device_inline bool isequal(const float4 a, const float4 b)
ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx)
ccl_device_inline float reduce_max(const float4 a)
ccl_device_inline bool is_zero(const float4 a)
ccl_device_inline float4 fabs(const float4 a)
ccl_device_inline float4 ensure_finite(const float4 v)
ccl_device_inline float4 load_float4(const ccl_private float *v)
ccl_device_inline float4 operator/(const float4 a, const float f)
Definition math_float4.h:76
ccl_device_inline void copy_v4_v4(ccl_private float *r, const float4 val)
ccl_device_inline float4 mask(const MaskType mask, const float4 a)
CCL_NAMESPACE_BEGIN ccl_device_inline float4 zero_float4()
Definition math_float4.h:13
ccl_device_inline float4 power(const float4 v, const float e)
ccl_device_inline float4 safe_divide(const float4 a, const float b)
ccl_device_inline float reduce_min(const float4 a)
ccl_device_inline float4 madd(const float4 a, const float4 b, const float4 c)
ccl_device_inline int4 operator<(const float4 a, const float4 b)
ccl_device_inline float4 interp(float4 a, float4 b, float t)
ccl_device_inline float len_squared(const float4 a)
#define mix
#define sqr
#define floorf
#define fabsf
#define sqrtf
#define make_float4
#define saturate(a)
Definition smaa.cc:315
#define min(a, b)
Definition sort.cc:36
float y
Definition sky_math.h:225
float z
Definition sky_math.h:225
float x
Definition sky_math.h:225
float w
Definition sky_math.h:225
i
Definition text_draw.cc:230
max
Definition text_draw.cc:251
uint len