16 return float4(_mm_setzero_ps());
35 return int4(_mm_castps_si128(a));
42#if !defined(__KERNEL_METAL__)
46 const __m128
mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
56 return float4(_mm_mul_ps(a.m128,
b.m128));
64# if defined(__KERNEL_SSE__)
78 return a * (1.0f / f);
84 return float4(_mm_div_ps(a.m128,
b.m128));
93 return float4(_mm_add_ps(a.m128,
b.m128));
106# ifdef __KERNEL_SSE__
107 return float4(_mm_sub_ps(a.m128,
b.m128));
150# ifdef __KERNEL_SSE__
151 return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128,
b.m128)));
159# ifdef __KERNEL_SSE__
160 return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128,
b.m128)));
168# ifdef __KERNEL_SSE__
169 return int4(_mm_castps_si128(_mm_cmple_ps(a.m128,
b.m128)));
177# ifdef __KERNEL_SSE__
178 return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128,
b.m128)) & 15) == 15;
180 return (a.
x ==
b.x && a.
y ==
b.y && a.
z ==
b.z && a.
w ==
b.w);
186# ifdef __KERNEL_SSE__
187 return float4(_mm_xor_ps(a.m128,
b.m128));
198# ifdef __KERNEL_SSE__
199 return float4(_mm_min_ps(a.m128,
b.m128));
207# ifdef __KERNEL_SSE__
208 return float4(_mm_max_ps(a.m128,
b.m128));
216 return min(
max(a, mn), mx);
223# ifdef __KERNEL_NEON__
224 return float4(vfmaq_f32(c, a,
b));
225# elif defined(__KERNEL_AVX2__)
226 return float4(_mm_fmadd_ps(a,
b, c));
238# ifdef __KERNEL_NEON__
239 return float4(vfmaq_f32(vnegq_f32(c), a,
b));
240# elif defined(__KERNEL_AVX2__)
241 return float4(_mm_fmsub_ps(a,
b, c));
251template<
size_t i0, const
size_t i1, const
size_t i2, const
size_t i3>
254# ifdef __KERNEL_NEON__
255 return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a.m128));
258 _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0))));
264 return float4(_mm_movelh_ps(a, a));
269 return float4(_mm_movehl_ps(a, a));
272# ifdef __KERNEL_SSE3__
275 return float4(_mm_moveldup_ps(a));
280 return float4(_mm_movehdup_ps(a));
284template<
size_t i0, const
size_t i1, const
size_t i2, const
size_t i3>
287# ifdef __KERNEL_NEON__
288 return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a,
b));
290 return float4(_mm_shuffle_ps(a,
b, _MM_SHUFFLE(i3, i2, i1, i0)));
296 return shuffle<i0, i0, i0, i0>(a);
300# ifdef __KERNEL_NEON__
301 return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a,
b));
303 return float4(_mm_shuffle_ps(a,
b, _MM_SHUFFLE(i0, i0, i0, i0)));
309 return float4(_mm_movelh_ps(a,
b));
314 return float4(_mm_movehl_ps(
b, a));
319 return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
323 return _mm_cvtss_f32(a);
329#if defined(__KERNEL_SSE__)
330# if defined(__KERNEL_NEON__)
331 return vaddvq_f32(a);
332# elif defined(__KERNEL_SSE3__)
333 float4 h(_mm_hadd_ps(a.m128, a.m128));
334 return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
336 float4 h(shuffle<1, 0, 3, 2>(a) + a);
337 return _mm_cvtss_f32(shuffle<2, 3, 0, 1>(h) + h);
340 return a.
x + a.
y + a.
z + a.
w;
346#if defined(__KERNEL_SSE__)
347# if defined(__KERNEL_NEON__)
348 return vminvq_f32(a);
350 float4 h =
min(shuffle<1, 0, 3, 2>(a), a);
351 return _mm_cvtss_f32(
min(shuffle<2, 3, 0, 1>(h), h));
360#if defined(__KERNEL_SSE__)
361# if defined(__KERNEL_NEON__)
362 return vmaxvq_f32(a);
364 float4 h =
max(shuffle<1, 0, 3, 2>(a), a);
365 return _mm_cvtss_f32(
max(shuffle<2, 3, 0, 1>(h), h));
372#if !defined(__KERNEL_METAL__)
375# if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
376# if defined(__KERNEL_NEON__)
377 const __m128 t = vmulq_f32(a,
b);
378 return vaddvq_f32(t);
380 return _mm_cvtss_f32(_mm_dp_ps(a,
b, 0xFF));
383 return (a.
x *
b.x + a.
y *
b.y) + (a.
z *
b.z + a.
w *
b.w);
403#if !defined(__KERNEL_METAL__)
411# ifdef __KERNEL_SSE__
412 return float4(_mm_sqrt_ps(a.m128));
420# ifdef __KERNEL_SSE__
421 return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(
b)) -
422 (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(
b));
430# ifdef __KERNEL_SSE__
433 return (a.
x == 0.0f && a.
y == 0.0f && a.
z == 0.0f && a.
w == 0.0f);
449 const float t =
len(a);
450 return (t != 0.0f) ? a / t : a;
455# if defined(__KERNEL_SSE__)
456# if defined(__KERNEL_NEON__)
457 return float4(vabsq_f32(a));
459 return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
472# if defined(__KERNEL_NEON__)
476 const float32x4_t iquot = vrndq_f32(a /
b);
477 return float4(vsubq_f32(a, vmulq_f32(iquot, vdupq_n_f32(
b))));
478# elif defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
479 const __m128 iquot = _mm_round_ps(a /
b, _MM_FROUND_TRUNC);
480 return float4(_mm_sub_ps(a, _mm_mul_ps(iquot, _mm_set1_ps(
b))));
488# ifdef __KERNEL_SSE__
489# if defined(__KERNEL_NEON__)
490 return float4(vrndmq_f32(a));
492 return float4(_mm_floor_ps(a));
501# ifdef __KERNEL_SSE__
503 *
i =
int4(_mm_cvttps_epi32(f.m128));
517 return a + t * (
b - a);
522 return a + t * (
b - a);
544#if defined(__KERNEL_METAL__)
551template<
class MaskType>
554#if defined(__KERNEL_METAL__)
556#elif defined(__KERNEL_SSE__)
557# ifdef __KERNEL_SSE42__
558 return float4(_mm_blendv_ps(
b.m128, a.m128, _mm_castsi128_ps(
mask.m128)));
561 _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(
mask), a), _mm_andnot_ps(_mm_castsi128_ps(
mask),
b)));
575#ifndef __KERNEL_GPU__
579# ifdef __KERNEL_SSE__
580 return float4(_mm_loadu_ps(
v));
596 (
b.y != 0.0f) ? a.
y /
b.y : 0.0f,
597 (
b.z != 0.0f) ? a.
z /
b.z : 0.0f,
598 (
b.w != 0.0f) ? a.
w /
b.w : 0.0f);
632 return a + t * (
b - a);
635#if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__)
639# ifdef __KERNEL_SSE__
640 return int4(_mm_castps_si128(f.m128));
649# ifdef __KERNEL_SSE__
650 return float4(_mm_castsi128_ps(
i.m128));
__forceinline float extract(const int4 &b)
ATTR_WARN_UNUSED_RESULT const BMVert const BMEdge * e
ATTR_WARN_UNUSED_RESULT const BMVert * v
#define ccl_device_inline
#define ccl_device_template_spec
#define CCL_NAMESPACE_END
VecBase< float, D > normalize(VecOp< float, D >) RET
bool all(VecOp< bool, D >) RET
VecBase< float, 3 > cross(VecOp< float, 3 >, VecOp< float, 3 >) RET
VecBase< float, 4 > float4
ccl_device_inline float2 mask(const MaskType mask, const float2 a)
ccl_device_inline float4 safe_normalize(const float4 a)
ccl_device_inline float4 floorfrac(const float4 x, ccl_private int4 *i)
ccl_device_inline float4 operator+(const float4 a, const float4 b)
ccl_device_inline bool isfinite_safe(const float4 v)
ccl_device_inline bool operator==(const float4 a, const float4 b)
ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c)
ccl_device_inline int4 __float4_as_int4(const float4 f)
ccl_device_template_spec float4 make_zero()
ccl_device_inline float4 operator*(const float4 a, const float4 b)
ccl_device_inline float4 one_float4()
ccl_device_inline float reduce_add(const float4 a)
ccl_device_inline int4 operator>=(const float4 a, const float4 b)
ccl_device_inline int4 operator<=(const float4 a, const float4 b)
ccl_device_inline float4 operator/=(float4 &a, const float4 b)
ccl_device_inline float4 operator-(const float4 &a)
ccl_device_inline float4 operator-=(float4 &a, const float4 b)
ccl_device_inline float4 fmod(const float4 a, const float b)
ccl_device_inline float average(const float4 a)
ccl_device_inline float4 operator+=(float4 &a, const float4 b)
ccl_device_inline float4 operator^(const float4 a, const float4 b)
ccl_device_inline float4 operator*=(float4 &a, const float4 b)
ccl_device_inline float4 __int4_as_float4(const int4 i)
ccl_device_inline float dot(const float4 a, const float4 b)
ccl_device_inline float distance(const float4 a, const float4 b)
ccl_device_inline bool isequal(const float4 a, const float4 b)
ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx)
ccl_device_inline float reduce_max(const float4 a)
ccl_device_inline bool is_zero(const float4 a)
ccl_device_inline float4 fabs(const float4 a)
ccl_device_inline float4 ensure_finite(const float4 v)
ccl_device_inline float4 load_float4(const ccl_private float *v)
ccl_device_inline float4 operator/(const float4 a, const float f)
ccl_device_inline void copy_v4_v4(ccl_private float *r, const float4 val)
ccl_device_inline float4 mask(const MaskType mask, const float4 a)
CCL_NAMESPACE_BEGIN ccl_device_inline float4 zero_float4()
ccl_device_inline float4 power(const float4 v, const float e)
ccl_device_inline float4 safe_divide(const float4 a, const float b)
ccl_device_inline float reduce_min(const float4 a)
ccl_device_inline float4 madd(const float4 a, const float4 b, const float4 c)
ccl_device_inline int4 operator<(const float4 a, const float4 b)
ccl_device_inline float4 interp(float4 a, float4 b, float t)
ccl_device_inline float len_squared(const float4 a)