#include <cstddef>
#include <cstdint>
#include <cstring>

#if defined(__ARM_NEON)
#  define USE_HARDWARE_FP16_NEON
#  include <arm_neon.h>
#endif

#if (defined(__x86_64__) || defined(_M_X64))
/* Compiler-support checks from the original configuration are elided here. */
#  define USE_HARDWARE_FP16_F16C
#  include <immintrin.h>
#endif

/* SSE2 software fallback for x86 builds where the F16C path is not used.
 * The exact guard is an assumption; SSE2 itself is always available on
 * x86-64. */
#if (defined(__x86_64__) || defined(_M_X64)) && !defined(USE_HARDWARE_FP16_F16C)
#  define USE_SSE2_FP16
#endif
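/* Half (FP16) <-> float (FP32) conversion utilities with three code paths:
 * ARM NEON and x86 F16C use hardware conversion instructions, while the
 * SSE2 path below implements round-to-nearest-even conversion with bit
 * manipulation, including Inf/NaN and subnormal handling. */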
uint16_t float_to_half(float v)
{
#if defined(USE_HARDWARE_FP16_NEON)
  /* Hardware path. vcvt_f16_f32 rounds according to the current FP rounding
   * mode (round-to-nearest-even by default), matching the software path. */
  float16x4_t h4 = vcvt_f16_f32(vdupq_n_f32(v));
  float16_t h = vget_lane_f16(h4, 0);
  uint16_t r;
  memcpy(&r, &h, sizeof(r)); /* Bit-copy; avoids strict-aliasing issues. */
  return r;
#else
  /* Software path, based on float_to_half_fast3_rtne, public domain code
   * by Fabian Giesen: https://gist.github.com/rygorous/2156668 */
  union FP32 {
    uint32_t u;
    float f;
  };
  FP32 f32infty = {255 << 23};      /* FP32 infinity bit pattern. */
  FP32 f16max = {(127 + 16) << 23}; /* Smallest FP32 that overflows to FP16 infinity. */
  FP32 denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23};
  uint32_t sign_mask = 0x80000000u;
  uint16_t o = 0;

  FP32 f;
  f.f = v;
  uint32_t sign = f.u & sign_mask;
  f.u ^= sign; /* Work on the absolute value; the sign is merged back at the end. */
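  /* Why denorm_magic works: its bit pattern has exponent field 126, i.e.
   * the value 0.5f, whose mantissa ulp is 2^-24, exactly the FP16 subnormal
   * quantum. Adding it to a small input aligns the FP16 subnormal mantissa
   * in the low float mantissa bits, with the FPU rounding to nearest even
   * for free. */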
  if (f.u >= f16max.u) {
    /* Result is Inf or NaN (all exponent bits set). */
    o = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; /* NaN becomes qNaN, Inf stays Inf. */
  }
  else {
    /* (De)normalized number or zero. */
    if (f.u < (113 << 23)) {
      /* Below 2^-14, the smallest normal FP16: the result is an FP16
       * subnormal or zero. */
      f.f += denorm_magic.f;
      /* One integer subtract of the bias later, the result is done. */
      o = uint16_t(f.u - denorm_magic.u);
    }
    else {
      uint32_t mant_odd = (f.u >> 13) & 1; /* Is the FP16 mantissa LSB odd? */
      /* Update the exponent and apply rounding bias part 1. */
      f.u += (uint32_t(15 - 127) << 23) + 0xfff;
      /* Rounding bias part 2: rounds halfway cases to even. */
      f.u += mant_odd;
      o = uint16_t(f.u >> 13); /* Take the bits. */
    }
  }
  return o | uint16_t(sign >> 16);
#endif
}
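/* Worked example for the normal path above: v = 1.0f has bits 0x3F800000
 * and an even mantissa LSB. Adding (uint32_t(15 - 127) << 23) + 0xfff
 * gives 0x07800FFF, and shifting right by 13 yields 0x3C00, which is
 * exactly 1.0 in FP16. */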
float half_to_float(uint16_t v)
{
#if defined(USE_HARDWARE_FP16_NEON)
  uint16x4_t v4 = vdup_n_u16(v);
  float16x4_t h4 = vreinterpret_f16_u16(v4);
  float32x4_t f4 = vcvt_f32_f16(h4);
  return vgetq_lane_f32(f4, 0);
#else
  /* Software path, based on half_to_float_fast4, public domain code by
   * Fabian Giesen: https://gist.github.com/rygorous/2144712 */
  union FP32 {
    uint32_t u;
    float f;
  };
  constexpr FP32 magic = {113 << 23};
  constexpr uint32_t shifted_exp = 0x7c00 << 13; /* FP16 exponent mask, after the shift. */

  FP32 o;
  o.u = (v & 0x7fff) << 13;         /* Move exponent/mantissa bits into place. */
  uint32_t exp = shifted_exp & o.u; /* Just the exponent. */
  o.u += (127 - 15) << 23;          /* Rebias the exponent. */
  if (exp == shifted_exp) { /* Inf/NaN? */
    o.u += (128 - 16) << 23; /* Extra exponent adjust. */
  }
  else if (exp == 0) { /* Zero or subnormal? */
    o.u += 1 << 23;    /* Extra exponent adjust. */
    o.f -= magic.f;    /* Renormalize. */
  }
  o.u |= (v & 0x8000) << 16; /* Merge the sign bit back in. */
  return o.f;
#endif
}
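/* Round-trip example: half_to_float(0x3C00) returns 1.0f, and
 * float_to_half(1.0f) returns 0x3C00. float_to_half(65536.0f) returns
 * 0x7C00 (infinity), since the largest finite FP16 value is 65504. */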
#if defined(USE_SSE2_FP16)

/* 4-wide SSE2 conversions, based on the same public domain code by
 * Fabian Giesen as the scalar paths above. */
static inline __m128i F32_to_F16_4x(const __m128 &f)
{
  const __m128 mask_sign = _mm_set1_ps(-0.0f); /* Sign bit only. */
  /* All FP32 values >= this overflow to FP16 infinity. */
  const __m128i c_f16max = _mm_set1_epi32((127 + 16) << 23);
  const __m128i c_nanbit = _mm_set1_epi32(0x200);    /* FP16 quiet-NaN bit. */
  const __m128i c_nanlobits = _mm_set1_epi32(0x1ff); /* Low NaN payload bits. */
  const __m128i c_infty_as_fp16 = _mm_set1_epi32(0x7c00);
  /* Smallest FP32 value that maps to a normalized FP16. */
  const __m128i c_min_normal = _mm_set1_epi32((127 - 14) << 23);
  const __m128i c_subnorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
  /* Exponent rebias combined with the mantissa rounding bias. */
  const __m128i c_normal_bias = _mm_set1_epi32(0xfff - ((127 - 15) << 23));
  __m128 justsign = _mm_and_ps(f, mask_sign);
  __m128 absf = _mm_andnot_ps(mask_sign, f); /* Absolute value. */
  __m128i absf_int = _mm_castps_si128(absf); /* The cast is free. */
  __m128 b_isnan = _mm_cmpunord_ps(absf, absf);              /* NaN lanes? */
  __m128i b_isregular = _mm_cmpgt_epi32(c_f16max, absf_int); /* Finite and in range? */
  __m128i nan_payload = _mm_and_si128(_mm_srli_epi32(absf_int, 13),
                                      c_nanlobits);          /* Keep NaN payload bits. */
  __m128i nan_quiet = _mm_or_si128(nan_payload, c_nanbit);   /* Force quiet NaN. */
  __m128i nanfinal = _mm_and_si128(_mm_castps_si128(b_isnan), nan_quiet);
  __m128i inf_or_nan = _mm_or_si128(nanfinal, c_infty_as_fp16); /* Result for special lanes. */
  __m128i b_issub = _mm_cmpgt_epi32(c_min_normal, absf_int); /* Lanes that become subnormal? */

  /* "Result is subnormal" path. */
  __m128 subnorm1 = _mm_add_ps(absf,
                               _mm_castsi128_ps(c_subnorm_magic)); /* Align and round via FP add. */
  __m128i subnorm2 = _mm_sub_epi32(_mm_castps_si128(subnorm1),
                                   c_subnorm_magic); /* Subtract the bias back out. */
  /* "Result is normal" path. */
  __m128i mantoddbit = _mm_slli_epi32(absf_int, 31 - 13); /* Shift the FP16 mantissa LSB into the sign bit. */
  __m128i mantodd = _mm_srai_epi32(mantoddbit, 31);       /* -1 if that LSB is odd, else 0. */

  __m128i round1 = _mm_add_epi32(absf_int, c_normal_bias);
  __m128i round2 = _mm_sub_epi32(round1, mantodd); /* Adds 1 more for odd mantissas. */
  __m128i normal = _mm_srli_epi32(round2, 13);     /* Shift the rounded result into place. */
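  /* The bias above mirrors the scalar path: 0xfff is one less than half of
   * the 13-bit ulp being discarded (0x1000), so an exact halfway case only
   * rounds up when the extra 1 for an odd mantissa LSB is added; together
   * this implements round-to-nearest-even. */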
  /* Select between the subnormal and normal results. */
  __m128i nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub),
                                    _mm_andnot_si128(b_issub, normal));

  /* Merge in the Inf/NaN results. */
  __m128i joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular),
                                _mm_andnot_si128(b_isregular, inf_or_nan));

  /* Arithmetic shift keeps the narrowing pack in the array loop sign-correct. */
  __m128i sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
  __m128i result = _mm_or_si128(joined, sign_shift);
  return result;
}
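/* SSE2 predates blend instructions (SSE4.1's _mm_blendv_epi8), so lane
 * selection above uses the classic (mask & a) | (~mask & b) idiom built
 * from _mm_and_si128 / _mm_andnot_si128 / _mm_or_si128. */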
static inline __m128 F16_to_F32_4x(const __m128i &h)
{
  const __m128i mask_nosign = _mm_set1_epi32(0x7fff);
  /* Bit pattern of 2^112: the multiply rebiases the exponent from FP16 to
   * FP32 and renormalizes subnormal inputs for free. */
  const __m128 magic_mult = _mm_castsi128_ps(_mm_set1_epi32((254 - 15) << 23));
  const __m128i was_infnan = _mm_set1_epi32(0x7bff); /* Largest non-special FP16 pattern. */
  const __m128 exp_infnan = _mm_castsi128_ps(_mm_set1_epi32(255 << 23)); /* FP32 Inf/NaN exponent. */
  const __m128i was_nan = _mm_set1_epi32(0x7c00);
  const __m128i nan_quiet = _mm_set1_epi32(1 << 22); /* FP32 quiet-NaN bit. */
  __m128i expmant = _mm_and_si128(mask_nosign, h); /* Exponent and mantissa. */
  __m128i justsign = _mm_xor_si128(h, expmant);    /* Sign bit only. */
  __m128i shifted = _mm_slli_epi32(expmant, 13);   /* Move into FP32 position. */
  __m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), magic_mult); /* Exponent rebias. */
  __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant, was_infnan); /* Inf or NaN input? */
  __m128i sign = _mm_slli_epi32(justsign, 16); /* Sign into FP32 position. */
  __m128 infnanexp = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), exp_infnan); /* Force FP32 Inf/NaN exponent. */
  __m128i b_wasnan = _mm_cmpgt_epi32(expmant, was_nan); /* NaN specifically? */
  __m128i nanquiet = _mm_and_si128(b_wasnan, nan_quiet); /* Keep NaNs quiet. */
  __m128 infnandone = _mm_or_ps(infnanexp, _mm_castsi128_ps(nanquiet));
214 __m128 result = _mm_or_ps(
scaled, sign_inf);
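/* Worked example for F16_to_F32_4x: input 0x3C00 (FP16 1.0) becomes
 * 0x07800000 after the shift, which is the float 2^-112; multiplying by
 * magic_mult (2^112) yields exactly 1.0f. */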
void float_to_half_array(const float *src, uint16_t *dst, size_t length)
{
  size_t i = 0;
#if defined(USE_HARDWARE_FP16_F16C)
  /* 8-wide loop using the hardware F16C conversion. */
  for (; i + 7 < length; i += 8) {
    __m256 src8 = _mm256_loadu_ps(src);
    __m128i h8 = _mm256_cvtps_ph(src8, _MM_FROUND_TO_NEAREST_INT);
    _mm_storeu_si128((__m128i *)dst, h8);
    src += 8;
    dst += 8;
  }
#elif defined(USE_SSE2_FP16)
  /* 4-wide loop using the SSE2 software conversion. */
  for (; i + 3 < length; i += 4) {
    __m128 src4 = _mm_loadu_ps(src);
    __m128i h4 = F32_to_F16_4x(src4);
    /* Narrow four 32-bit lanes to 16 bits; the arithmetic sign shift in
     * F32_to_F16_4x makes the signed saturation of the pack exact here. */
    __m128i h4_packed = _mm_packs_epi32(h4, h4);
    _mm_storeu_si64(dst, h4_packed);
    src += 4;
    dst += 4;
  }
#elif defined(USE_HARDWARE_FP16_NEON)
  /* 4-wide loop using the hardware NEON conversion. */
  for (; i + 3 < length; i += 4) {
    float32x4_t src4 = vld1q_f32(src);
    float16x4_t h4 = vcvt_f16_f32(src4);
    vst1_f16((float16_t *)dst, h4);
    src += 4;
    dst += 4;
  }
#endif
  /* Scalar tail for any remaining elements. */
  for (; i < length; i++) {
    *dst++ = float_to_half(*src++);
  }
}
void half_to_float_array(const uint16_t *src, float *dst, size_t length)
{
  size_t i = 0;
#if defined(USE_HARDWARE_FP16_F16C)
  /* 8-wide loop using the hardware F16C conversion. */
  for (; i + 7 < length; i += 8) {
    __m128i src8 = _mm_loadu_si128((const __m128i *)src);
    __m256 f8 = _mm256_cvtph_ps(src8);
    _mm256_storeu_ps(dst, f8);
    src += 8;
    dst += 8;
  }
#elif defined(USE_SSE2_FP16)
  /* 4-wide loop using the SSE2 software conversion. */
  for (; i + 3 < length; i += 4) {
    __m128i src4 = _mm_loadu_si64(src);
    /* Widen each 16-bit value to a 32-bit lane. The duplicated upper copy
     * is harmless: F16_to_F32_4x masks or shifts it away. */
    src4 = _mm_unpacklo_epi16(src4, src4);
    __m128 f4 = F16_to_F32_4x(src4);
    _mm_storeu_ps(dst, f4);
    src += 4;
    dst += 4;
  }
#elif defined(USE_HARDWARE_FP16_NEON)
  /* 4-wide loop using the hardware NEON conversion. */
  for (; i + 3 < length; i += 4) {
    float16x4_t src4 = vld1_f16((const float16_t *)src);
    float32x4_t f4 = vcvt_f32_f16(src4);
    vst1q_f32(dst, f4);
    src += 4;
    dst += 4;
  }
#endif
  /* Scalar tail for any remaining elements. */
  for (; i < length; i++) {
    *dst++ = half_to_float(*src++);
  }
}
#ifdef USE_HARDWARE_FP16_NEON
#  undef USE_HARDWARE_FP16_NEON
#endif
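/* Usage sketch (hypothetical caller, not part of this file):
 *
 *   float src[5] = {0.0f, 1.0f, -2.5f, 65504.0f, 1e-5f};
 *   uint16_t packed[5];
 *   float_to_half_array(src, packed, 5);   // packed[1] == 0x3C00
 *
 *   float back[5];
 *   half_to_float_array(packed, back, 5);  // back[1] == 1.0f
 *
 * 65504 is the largest finite FP16 value; 1e-5f lands in the FP16
 * subnormal range and round-trips with reduced precision. */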