6#ifndef __UTIL_MATH_FLOAT4_H__
7#define __UTIL_MATH_FLOAT4_H__
10# error "Do not include this file directly, include util/types.h instead."
18 return float4(_mm_setzero_ps());
32 return int4(_mm_castps_si128(a));
39#if !defined(__KERNEL_METAL__)
43 __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
44 return float4(_mm_xor_ps(a.m128, mask));
53 return float4(_mm_mul_ps(a.m128,
b.m128));
61# if defined(__KERNEL_SSE__)
64 return make_float4(a.x * f, a.y * f, a.z * f, a.w * f);
75 return a * (1.0f / f);
81 return float4(_mm_div_ps(a.m128,
b.m128));
90 return float4(_mm_add_ps(a.m128,
b.m128));
103# ifdef __KERNEL_SSE__
104 return float4(_mm_sub_ps(a.m128,
b.m128));
147# ifdef __KERNEL_SSE__
148 return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128,
b.m128)));
150 return make_int4(a.x <
b.x, a.y <
b.y, a.z <
b.z, a.w <
b.w);
156# ifdef __KERNEL_SSE__
157 return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128,
b.m128)));
159 return make_int4(a.x >=
b.x, a.y >=
b.y, a.z >=
b.z, a.w >=
b.w);
165# ifdef __KERNEL_SSE__
166 return int4(_mm_castps_si128(_mm_cmple_ps(a.m128,
b.m128)));
168 return make_int4(a.x <=
b.x, a.y <=
b.y, a.z <=
b.z, a.w <=
b.w);
174# ifdef __KERNEL_SSE__
175 return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128,
b.m128)) & 15) == 15;
177 return (a.x ==
b.x && a.y ==
b.y && a.z ==
b.z && a.w ==
b.w);
183# ifdef __KERNEL_SSE__
184 return float4(_mm_xor_ps(a.m128,
b.m128));
195# ifdef __KERNEL_SSE__
196 return float4(_mm_min_ps(a.m128,
b.m128));
204# ifdef __KERNEL_SSE__
205 return float4(_mm_max_ps(a.m128,
b.m128));
213 return min(
max(a, mn), mx);
220# ifdef __KERNEL_NEON__
221 return float4(vfmaq_f32(c, a,
b));
222# elif defined(__KERNEL_AVX2__)
223 return float4(_mm_fmadd_ps(a,
b, c));
235# ifdef __KERNEL_NEON__
236 return float4(vfmaq_f32(vnegq_f32(c), a,
b));
237# elif defined(__KERNEL_AVX2__)
238 return float4(_mm_fmsub_ps(a,
b, c));
248template<
size_t i0,
size_t i1,
size_t i2,
size_t i3>
251# ifdef __KERNEL_NEON__
252 return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(
b.m128));
255 _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(
b), _MM_SHUFFLE(i3, i2, i1, i0))));
261 return float4(_mm_movelh_ps(a, a));
266 return float4(_mm_movehl_ps(a, a));
269# ifdef __KERNEL_SSE3__
272 return float4(_mm_moveldup_ps(
b));
277 return float4(_mm_movehdup_ps(
b));
281template<
size_t i0,
size_t i1,
size_t i2,
size_t i3>
284# ifdef __KERNEL_NEON__
285 return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a,
b));
287 return float4(_mm_shuffle_ps(a,
b, _MM_SHUFFLE(i3, i2, i1, i0)));
293 return shuffle<i0, i0, i0, i0>(
b);
297# ifdef __KERNEL_NEON__
298 return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a,
b));
300 return float4(_mm_shuffle_ps(a,
b, _MM_SHUFFLE(i0, i0, i0, i0)));
306 return float4(_mm_movelh_ps(a,
b));
311 return float4(_mm_movehl_ps(
b, a));
316 return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
320 return _mm_cvtss_f32(a);
326#if defined(__KERNEL_SSE__)
327# if defined(__KERNEL_NEON__)
328 return vaddvq_f32(a);
329# elif defined(__KERNEL_SSE3__)
330 float4 h(_mm_hadd_ps(a.m128, a.m128));
331 return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
333 float4 h(shuffle<1, 0, 3, 2>(a) + a);
334 return _mm_cvtss_f32(shuffle<2, 3, 0, 1>(h) + h);
337 return a.x + a.y + a.z + a.w;
343#if defined(__KERNEL_SSE__)
344# if defined(__KERNEL_NEON__)
345 return vminvq_f32(a);
347 float4 h =
min(shuffle<1, 0, 3, 2>(a), a);
348 return _mm_cvtss_f32(
min(shuffle<2, 3, 0, 1>(h), h));
351 return min(
min(a.x, a.y),
min(a.z, a.w));
357#if defined(__KERNEL_SSE__)
358# if defined(__KERNEL_NEON__)
359 return vmaxvq_f32(a);
361 float4 h =
max(shuffle<1, 0, 3, 2>(a), a);
362 return _mm_cvtss_f32(
max(shuffle<2, 3, 0, 1>(h), h));
365 return max(
max(a.x, a.y), max(a.z, a.w));
369#if !defined(__KERNEL_METAL__)
372# if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
373# if defined(__KERNEL_NEON__)
374 __m128 t = vmulq_f32(a,
b);
375 return vaddvq_f32(t);
377 return _mm_cvtss_f32(_mm_dp_ps(a,
b, 0xFF));
380 return (a.x *
b.x + a.y *
b.y) + (a.z *
b.z + a.w *
b.w);
400#if !defined(__KERNEL_METAL__)
408# ifdef __KERNEL_SSE__
410 return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
412 return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
418# ifdef __KERNEL_SSE__
419 return float4(_mm_sqrt_ps(a.m128));
427# ifdef __KERNEL_SSE__
428 return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(
b)) -
429 (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(
b));
431 return make_float4(a.y *
b.z - a.z *
b.y, a.z *
b.x - a.x *
b.z, a.x *
b.y - a.y *
b.x, 0.0f);
437# ifdef __KERNEL_SSE__
440 return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
457 return (t != 0.0f) ? a / t : a;
462# if defined(__KERNEL_SSE__)
463# if defined(__KERNEL_NEON__)
464 return float4(vabsq_f32(a));
466 return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
480# ifdef __KERNEL_SSE__
481# if defined(__KERNEL_NEON__)
482 return float4(vrndmq_f32(a));
484 return float4(_mm_floor_ps(a));
493# ifdef __KERNEL_SSE__
494 const float4 f =
floor(x);
495 *i =
int4(_mm_cvttps_epi32(f.m128));
509 return a + t * (
b - a);
514 return a + t * (
b - a);
536#if defined(__KERNEL_METAL__)
543#ifndef __KERNEL_GPU__
546# ifdef __KERNEL_SSE__
547# ifdef __KERNEL_SSE42__
548 return float4(_mm_blendv_ps(
b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
551 _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), a), _mm_andnot_ps(_mm_castsi128_ps(mask),
b)));
555 (mask.x) ? a.x :
b.x, (mask.y) ? a.y :
b.y, (mask.z) ? a.z :
b.z, (mask.w) ? a.w :
b.w);
567# ifdef __KERNEL_SSE__
568 return float4(_mm_loadu_ps(
v));
584 (
b.y != 0.0f) ? a.y /
b.y : 0.0f,
585 (
b.z != 0.0f) ? a.z /
b.z : 0.0f,
586 (
b.w != 0.0f) ? a.w /
b.w : 0.0f);
__forceinline float extract(const int4 &b)
ATTR_WARN_UNUSED_RESULT const BMVert const BMEdge * e
ATTR_WARN_UNUSED_RESULT const BMVert * v
SIMD_FORCE_INLINE btVector3 & normalize()
Normalize this vector x^2 + y^2 + z^2 = 1.
local_group_size(16, 16) .push_constant(Type b
#define ccl_device_inline
#define CCL_NAMESPACE_END
ccl_device_inline float4 safe_normalize(const float4 a)
ccl_device_inline float4 floorfrac(const float4 x, ccl_private int4 *i)
ccl_device_inline float4 operator+(const float4 a, const float4 b)
ccl_device_inline bool operator==(const float4 a, const float4 b)
ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c)
ccl_device_inline float4 operator*(const float4 a, const float4 b)
ccl_device_inline float4 one_float4()
ccl_device_inline float4 cross(const float4 a, const float4 b)
ccl_device_inline float reduce_add(const float4 a)
ccl_device_inline int4 operator>=(const float4 a, const float4 b)
ccl_device_inline float4 floor(const float4 a)
ccl_device_inline float4 operator/(const float4 a, float f)
ccl_device_inline const float4 operator^(const float4 a, const float4 b)
ccl_device_inline int4 operator<=(const float4 a, const float4 b)
ccl_device_inline float4 operator/=(float4 &a, const float4 b)
ccl_device_inline float4 operator-(const float4 &a)
ccl_device_inline float4 mask(const int4 mask, const float4 a)
ccl_device_inline float4 operator-=(float4 &a, const float4 b)
ccl_device_inline float4 fmod(const float4 a, const float b)
ccl_device_inline bool isfinite_safe(float4 v)
ccl_device_inline float average(const float4 a)
ccl_device_inline float4 power(float4 v, float e)
ccl_device_inline float4 operator+=(float4 &a, const float4 b)
ccl_device_inline int4 cast(const float4 a)
ccl_device_inline float4 select(const int4 mask, const float4 a, const float4 b)
ccl_device_inline float4 operator*=(float4 &a, const float4 b)
ccl_device_inline float4 load_float4(ccl_private const float *v)
ccl_device_inline float dot(const float4 a, const float4 b)
ccl_device_inline float distance(const float4 a, const float4 b)
ccl_device_inline float4 sqrt(const float4 a)
ccl_device_inline const float4 madd(const float4 a, const float4 b, const float4 c)
ccl_device_inline bool isequal(const float4 a, const float4 b)
ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx)
ccl_device_inline float4 sqr(const float4 a)
ccl_device_inline float reduce_max(const float4 a)
ccl_device_inline float4 ensure_finite(float4 v)
ccl_device_inline bool is_zero(const float4 a)
ccl_device_inline float4 exp(float4 v)
ccl_device_inline float4 fabs(const float4 a)
CCL_NAMESPACE_BEGIN ccl_device_inline float4 zero_float4()
ccl_device_inline float4 rcp(const float4 a)
ccl_device_inline float4 safe_divide(const float4 a, const float b)
ccl_device_inline float reduce_min(const float4 a)
ccl_device_inline float4 log(float4 v)
ccl_device_inline int4 operator<(const float4 a, const float4 b)
ccl_device_inline float len_squared(const float4 a)
VecBase< float, 4 > float4