6#ifndef __UTIL_MATH_INT8_H__
7#define __UTIL_MATH_INT8_H__
10# error "Do not include this file directly, include util/types.h instead."
19 return vint8(_mm256_add_epi32(a.m256,
b.m256));
22 a.a +
b.a, a.b +
b.b, a.c +
b.c, a.d +
b.d, a.e +
b.e, a.f +
b.f, a.g +
b.g, a.h +
b.h);
34 return vint8(_mm256_sub_epi32(a.m256,
b.m256));
37 a.a -
b.a, a.b -
b.b, a.c -
b.c, a.d -
b.d, a.e -
b.e, a.f -
b.f, a.g -
b.g, a.h -
b.h);
49 return vint8(_mm256_srai_epi32(a.m256, i));
52 a.a >> i, a.b >> i, a.c >> i, a.d >> i, a.e >> i, a.f >> i, a.g >> i, a.h >> i);
59 return vint8(_mm256_slli_epi32(a.m256, i));
62 a.a << i, a.b << i, a.c << i, a.d << i, a.e << i, a.f << i, a.g << i, a.h << i);
69 return vint8(_mm256_cmpgt_epi32(
b.m256, a.m256));
72 a.a <
b.a, a.b <
b.b, a.c <
b.c, a.d <
b.d, a.e <
b.e, a.f <
b.f, a.g <
b.g, a.h <
b.h);
84 return vint8(_mm256_cmpeq_epi32(a.m256,
b.m256));
104# ifdef __KERNEL_AVX__
106 _mm256_xor_si256(_mm256_set1_epi32(0xffffffff), _mm256_cmpgt_epi32(
b.m256, a.m256)));
126# ifdef __KERNEL_AVX__
127 return vint8(_mm256_and_si256(a.m256,
b.m256));
130 a.a &
b.a, a.b &
b.b, a.c &
b.c, a.d &
b.d, a.e &
b.e, a.f &
b.f, a.g &
b.g, a.h &
b.h);
136# ifdef __KERNEL_AVX__
137 return vint8(_mm256_or_si256(a.m256,
b.m256));
140 a.a |
b.a, a.b |
b.b, a.c |
b.c, a.d |
b.d, a.e |
b.e, a.f |
b.f, a.g |
b.g, a.h |
b.h);
146# ifdef __KERNEL_AVX__
147 return vint8(_mm256_xor_si256(a.m256,
b.m256));
150 a.a ^
b.a, a.b ^
b.b, a.c ^
b.c, a.d ^
b.d, a.e ^
b.e, a.f ^
b.f, a.g ^
b.g, a.h ^
b.h);
220# ifdef __KERNEL_AVX__
223 return vint8(_mm256_srli_epi32(a.m256,
b));
229# if defined(__KERNEL_AVX__) && defined(__KERNEL_AVX41__)
230 return vint8(_mm256_min_epi32(a.m256,
b.m256));
245# if defined(__KERNEL_AVX__) && defined(__KERNEL_AVX41__)
246 return vint8(_mm256_max_epi32(a.m256,
b.m256));
261 return min(
max(a, mn), mx);
266# ifdef __KERNEL_AVX__
267 return vint8(_mm256_castps_si256(_mm256_blendv_ps(
268 _mm256_castsi256_ps(
b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask))));
271 (mask.b) ? a.b :
b.b,
272 (mask.c) ? a.c :
b.c,
273 (mask.d) ? a.d :
b.d,
274 (mask.e) ? a.e :
b.e,
275 (mask.f) ? a.f :
b.f,
276 (mask.g) ? a.g :
b.g,
277 (mask.h) ? a.h :
b.h);
283# ifdef __KERNEL_AVX__
284 return vint8(_mm256_loadu_si256((__m256i *)
v));
294 return vfloat8(_mm256_castsi256_ps(a));
311 _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i))));
316 return vint8(_mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0)));
319template<
size_t i0,
size_t i1>
322 return vint8(_mm256_permute2f128_si256(a,
b, (i1 << 4) | (i0 << 0)));
325template<
size_t i0,
size_t i1,
size_t i2,
size_t i3>
329 _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0))));
332template<
size_t i0,
size_t i1,
size_t i2,
size_t i3>
335 return vint8(_mm256_castps_si256(_mm256_shuffle_ps(
336 _mm256_castsi256_ps(a), _mm256_castsi256_ps(
b), _MM_SHUFFLE(i3, i2, i1, i0))));
339template<>
__forceinline const vint8 shuffle<0, 0, 2, 2>(
const vint8
b)
341 return vint8(_mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(
b))));
343template<>
__forceinline const vint8 shuffle<1, 1, 3, 3>(
const vint8
b)
345 return vint8(_mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(
b))));
347template<>
__forceinline const vint8 shuffle<0, 1, 0, 1>(
const vint8
b)
349 return vint8(_mm256_castps_si256(
350 _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(
b))))));
ATTR_WARN_UNUSED_RESULT const BMVert * v
local_group_size(16, 16) .push_constant(Type b
#define ccl_device_forceinline
#define ccl_device_inline
#define CCL_NAMESPACE_END
ccl_device_inline vint8 operator==(const vint8 a, const vint8 b)
ccl_device_inline vint8 operator<<(const vint8 a, int i)
ccl_device_inline vint8 & operator&=(vint8 &a, const vint8 b)
ccl_device_inline vint8 operator-(const vint8 a, const vint8 b)
ccl_device_inline vint8 operator^(const vint8 a, const vint8 b)
ccl_device_inline vint8 clamp(const vint8 a, const vint8 mn, const vint8 mx)
ccl_device_inline vint8 operator>=(const vint8 a, const vint8 b)
ccl_device_inline vint8 operator-=(vint8 &a, const vint8 b)
ccl_device_inline vint8 operator>>(const vint8 a, int i)
ccl_device_inline vint8 load_vint8(const int *v)
ccl_device_inline vint8 operator|(const vint8 a, const vint8 b)
ccl_device_inline vint8 operator&(const vint8 a, const vint8 b)
ccl_device_inline vint8 & operator>>=(vint8 &a, const int32_t b)
ccl_device_inline vint8 operator<(const vint8 a, const vint8 b)
ccl_device_inline vint8 & operator^=(vint8 &a, const vint8 b)
CCL_NAMESPACE_BEGIN ccl_device_inline vint8 operator+(const vint8 a, const vint8 b)
ccl_device_inline vfloat8 cast(const vint8 a)
ccl_device_inline vint8 select(const vint8 mask, const vint8 a, const vint8 b)
ccl_device_inline vint8 & operator<<=(vint8 &a, const int32_t b)
ccl_device_inline vint8 operator+=(vint8 &a, const vint8 b)
ccl_device_inline vint8 & operator|=(vint8 &a, const vint8 b)
ccl_device_inline vfloat8 make_vfloat8(float f)
ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h)