#if defined(USE_HARDWARE_FP16_F16C)
  /* x86 F16C path: convert 8 floats at once with the hardware cvtps2ph
   * instruction, then post-process the 8 resulting half values so that
   * non-finite results are flushed: +/-Inf becomes the max finite half
   * (sign | 0x7bff, i.e. +/-65504) and NaN becomes a signed zero. */
  __m256 src8 = _mm256_loadu_ps(src);
  __m128i h8 = _mm256_cvtps_ph(src8, _MM_FROUND_TO_NEAREST_INT);
  /* Half layout: 1 sign bit, 5 exponent bits (0x7c00), 10 mantissa bits
   * (0x03ff). Exponent all-ones marks Inf (mantissa == 0) or NaN
   * (mantissa != 0). */
  const __m128i exp_mask = _mm_set1_epi16(0x7c00u);
  __m128i exp_all_ones = _mm_cmpeq_epi16(_mm_and_si128(h8, exp_mask), exp_mask);
  const __m128i mant_mask = _mm_set1_epi16(0x03ffu);
  const __m128i zero = _mm_setzero_si128();
  __m128i mant_is_zero = _mm_cmpeq_epi16(_mm_and_si128(h8, mant_mask), zero);
  __m128i is_inf = _mm_and_si128(exp_all_ones, mant_is_zero);
  /* cmpeq(zero, zero) yields all-ones; used to build ~mant_is_zero via andnot. */
  const __m128i all_ones = _mm_cmpeq_epi16(zero, zero);
  __m128i is_nan = _mm_and_si128(exp_all_ones, _mm_andnot_si128(mant_is_zero, all_ones));
  /* Keep the original sign in both replacement values. */
  const __m128i sign_mask = _mm_set1_epi16(0x8000u);
  __m128i signbits = _mm_and_si128(h8, sign_mask);
  __m128i inf_res = _mm_or_si128(signbits, _mm_set1_epi16(0x7bffu));  /* +/- max finite half. */
  __m128i nan_res = signbits;                                         /* +/- zero. */
  /* blendv selects per byte; the is_inf/is_nan masks are uniform per
   * 16-bit lane, so this selects whole half values. */
  h8 = _mm_blendv_epi8(h8, inf_res, is_inf);
  h8 = _mm_blendv_epi8(h8, nan_res, is_nan);
  /* Store 8 halves (16 bytes). */
  _mm_storeu_si128((__m128i *)dst, h8);

#elif defined(USE_SSE2_FP16)
  /* SSE software-assisted path: convert 4 floats via the project helper
   * F32_to_F16_4x, which returns one half value per 32-bit lane.
   * NOTE(review): the upper 16 bits of each lane are preserved across the
   * 16-bit mask operations below and restored before _mm_packs_epi32 —
   * presumably F32_to_F16_4x sign-extends each half into its 32-bit lane
   * so the signed saturation in packs is a no-op; confirm against the
   * helper's definition. */
  __m128 src4 = _mm_loadu_ps(src);
  __m128i h4 = F32_to_F16_4x(src4);
  /* Save the upper halves of the 32-bit lanes; the epi16 compares/blends
   * below also touch those lanes, so they are restored afterwards. */
  __m128i hi_part = _mm_and_si128(h4, _mm_set1_epi32(0xffff0000u));
  /* Classify each half (low 16 bits of every lane): exponent all-ones is
   * Inf (mantissa == 0) or NaN (mantissa != 0). */
  const __m128i exp_mask = _mm_set1_epi16(0x7c00u);
  __m128i exp_all_ones = _mm_cmpeq_epi16(_mm_and_si128(h4, exp_mask), exp_mask);
  const __m128i mant_mask = _mm_set1_epi16(0x03ffu);
  const __m128i zero = _mm_setzero_si128();
  __m128i mant_is_zero = _mm_cmpeq_epi16(_mm_and_si128(h4, mant_mask), zero);
  __m128i is_inf = _mm_and_si128(exp_all_ones, mant_is_zero);
  const __m128i all_ones = _mm_cmpeq_epi16(zero, zero);
  __m128i is_nan = _mm_and_si128(exp_all_ones, _mm_andnot_si128(mant_is_zero, all_ones));
  /* Replacements keep the original sign: Inf -> max finite half, NaN -> signed zero. */
  const __m128i sign_mask = _mm_set1_epi16(0x8000u);
  __m128i signbits = _mm_and_si128(h4, sign_mask);
  __m128i inf_res = _mm_or_si128(signbits, _mm_set1_epi16(0x7bffu));
  __m128i nan_res = signbits;
  h4 = _mm_blendv_epi8(h4, inf_res, is_inf);
  h4 = _mm_blendv_epi8(h4, nan_res, is_nan);
  /* Drop whatever the blends wrote into the upper lane halves and restore
   * the saved originals so the signed pack below behaves as intended. */
  h4 = _mm_and_si128(h4, _mm_set1_epi32(0xffff));
  h4 = _mm_or_si128(h4, hi_part);
  /* Narrow the four 32-bit lanes to 16 bits and store 4 halves (8 bytes). */
  __m128i h4_packed = _mm_packs_epi32(h4, h4);
  _mm_storeu_si64(dst, h4_packed);

#elif defined(USE_HARDWARE_FP16_NEON)
  /* ARM NEON path: hardware convert 4 floats to 4 halves, then flush
   * non-finite results the same way as the x86 paths. */
  float32x4_t src4 = vld1q_f32(src);
  float16x4_t h4 = vcvt_f16_f32(src4);
  /* Reinterpret as u16 to inspect/patch the half bit patterns. */
  uint16x4_t hu4 = vreinterpret_u16_f16(h4);
  /* Exponent all-ones: Inf when mantissa == 0, NaN otherwise. */
  const uint16x4_t exp_mask = vdup_n_u16(0x7c00u);
  uint16x4_t exp_all_ones = vceq_u16(vand_u16(hu4, exp_mask), exp_mask);
  const uint16x4_t mant_mask = vdup_n_u16(0x03ffu);
  const uint16x4_t zero = vdup_n_u16(0);
  uint16x4_t mant_is_zero = vceq_u16(vand_u16(hu4, mant_mask), zero);
  uint16x4_t is_inf = vand_u16(exp_all_ones, mant_is_zero);
  uint16x4_t is_nan = vand_u16(exp_all_ones, vmvn_u16(mant_is_zero));
  /* Replacements keep the original sign: Inf -> max finite half (0x7bff),
   * NaN -> signed zero. */
  const uint16x4_t sign_mask = vdup_n_u16(0x8000u);
  uint16x4_t signbits = vand_u16(hu4, sign_mask);
  uint16x4_t inf_res = vorr_u16(signbits, vdup_n_u16(0x7bffu));
  uint16x4_t nan_res = signbits;
  /* vbsl: select inf_res/nan_res where the corresponding mask bits are set. */
  hu4 = vbsl_u16(is_inf, inf_res, hu4);
  hu4 = vbsl_u16(is_nan, nan_res, hu4);
  h4 = vreinterpret_f16_u16(hu4);
  /* Store 4 halves (8 bytes). */
  vst1_f16((float16_t *)dst, h4);
static int magic(const Tex *tex, const float texvec[3], TexResult *texres)