Blender V4.3
math_fast.h
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2004 NVIDIA Corporation
2 * SPDX-FileCopyrightText: 2008-2014 Larry Gritz
3 * SPDX-FileCopyrightText: 2009-2014 Sony Pictures Imageworks Inc., et al.
4 *
5 * SPDX-License-Identifier: BSD-3-Clause
6 *
7 * Adapted from OpenImageIO
8 * Copyright 2008-2014 Larry Gritz and the other authors and contributors.
9 * All Rights Reserved.
10 *
11 * A few bits here are based upon code from NVIDIA that was also released
12 * under the same modified BSD license, and marked as:
13 * `Copyright 2004 NVIDIA Corporation. All Rights Reserved.`
14 *
15 * Some parts of this file were first open-sourced in Open Shading Language,
16 * then later moved here. The original copyright notice was:
17 * `Copyright (c) 2009-2014 Sony Pictures Imageworks Inc., et al.`
18 *
19 * Many of the math functions were copied from or inspired by other
20 * public domain sources or open source packages with compatible licenses.
21 * The individual functions give references were applicable.
22 */
23
24#ifndef __UTIL_FAST_MATH__
25#define __UTIL_FAST_MATH__
26
28
29ccl_device_inline float madd(const float a, const float b, const float c)
30{
31 /* NOTE: In the future we may want to explicitly ask for a fused
32 * multiply-add in a specialized version for float.
33 *
34 * NOTE: GCC/ICC will turn this (for float) into a FMA unless
35 * explicitly asked not to, clang seems to leave the code alone.
36 */
37 return a * b + c;
38}
39
40ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
41{
42 return a * b + c;
43}
44
45/*
46 * FAST & APPROXIMATE MATH
47 *
48 * The functions named "fast_*" provide a set of replacements to `libm` that
49 * are much faster at the expense of some accuracy and robust handling of
50 * extreme values. One design goal for these approximation was to avoid
51 * branches as much as possible and operate on single precision values only
52 * so that SIMD versions should be straightforward ports We also try to
53 * implement "safe" semantics (ie: clamp to valid range where possible)
54 * natively since wrapping these inline calls in another layer would be
55 * wasteful.
56 *
57 * Some functions are fast_safe_*, which is both a faster approximation as
58 * well as clamped input domain to ensure no NaN, Inf, or divide by zero.
59 */
60
61/* Round to nearest integer, returning as an int. */
63{
64 /* used by sin/cos/tan range reduction. */
65#ifdef __KERNEL_SSE42__
66 /* Single `roundps` instruction on SSE4.1+ for gcc/clang but not MSVC 19.35:
67 * float_to_int(rintf(x)); so we use the equivalent intrinsics. */
68 __m128 vec = _mm_set_ss(x);
69 vec = _mm_round_ss(vec, vec, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
70 return _mm_cvtss_si32(vec);
71#else
72 /* emulate rounding by adding/subtracting 0.5. */
73 return float_to_int(x + copysignf(0.5f, x));
74#endif
75}
76
77ccl_device float fast_sinf(float x)
78{
79 /* Very accurate argument reduction from SLEEF,
80 * starts failing around x=262000
81 *
82 * Results on: [-2pi,2pi].
83 *
84 * Examined 2173837240 values of sin: 0.00662760244 avg ULP diff, 2 max ULP,
85 * 1.19209e-07 max error
86 */
87 int q = fast_rint(x * M_1_PI_F);
88 float qf = (float)q;
89 x = madd(qf, -0.78515625f * 4, x);
90 x = madd(qf, -0.00024187564849853515625f * 4, x);
91 x = madd(qf, -3.7747668102383613586e-08f * 4, x);
92 x = madd(qf, -1.2816720341285448015e-12f * 4, x);
93 x = M_PI_2_F - (M_PI_2_F - x); /* Crush denormals */
94 float s = x * x;
95 if ((q & 1) != 0)
96 x = -x;
97 /* This polynomial approximation has very low error on [-pi/2,+pi/2]
98 * 1.19209e-07 max error in total over [-2pi,+2pi]. */
99 float u = 2.6083159809786593541503e-06f;
100 u = madd(u, s, -0.0001981069071916863322258f);
101 u = madd(u, s, +0.00833307858556509017944336f);
102 u = madd(u, s, -0.166666597127914428710938f);
103 u = madd(s, u * x, x);
104 /* For large x, the argument reduction can fail and the polynomial can be
105 * evaluated with arguments outside the valid internal. Just clamp the bad
106 * values away (setting to 0.0f means no branches need to be generated). */
107 if (fabsf(u) > 1.0f) {
108 u = 0.0f;
109 }
110 return u;
111}
112
113ccl_device float fast_cosf(float x)
114{
115 /* Same argument reduction as fast_sinf(). */
116 int q = fast_rint(x * M_1_PI_F);
117 float qf = (float)q;
118 x = madd(qf, -0.78515625f * 4, x);
119 x = madd(qf, -0.00024187564849853515625f * 4, x);
120 x = madd(qf, -3.7747668102383613586e-08f * 4, x);
121 x = madd(qf, -1.2816720341285448015e-12f * 4, x);
122 x = M_PI_2_F - (M_PI_2_F - x); /* Crush denormals. */
123 float s = x * x;
124 /* Polynomial from SLEEF's sincosf, max error is
125 * 4.33127e-07 over [-2pi,2pi] (98% of values are "exact"). */
126 float u = -2.71811842367242206819355e-07f;
127 u = madd(u, s, +2.47990446951007470488548e-05f);
128 u = madd(u, s, -0.00138888787478208541870117f);
129 u = madd(u, s, +0.0416666641831398010253906f);
130 u = madd(u, s, -0.5f);
131 u = madd(u, s, +1.0f);
132 if ((q & 1) != 0) {
133 u = -u;
134 }
135 if (fabsf(u) > 1.0f) {
136 u = 0.0f;
137 }
138 return u;
139}
140
141ccl_device void fast_sincosf(float x, ccl_private float *sine, ccl_private float *cosine)
142{
143 /* Same argument reduction as fast_sin. */
144 int q = fast_rint(x * M_1_PI_F);
145 float qf = (float)q;
146 x = madd(qf, -0.78515625f * 4, x);
147 x = madd(qf, -0.00024187564849853515625f * 4, x);
148 x = madd(qf, -3.7747668102383613586e-08f * 4, x);
149 x = madd(qf, -1.2816720341285448015e-12f * 4, x);
150 x = M_PI_2_F - (M_PI_2_F - x); // crush denormals
151 float s = x * x;
152 /* NOTE: same exact polynomials as fast_sinf() and fast_cosf() above. */
153 if ((q & 1) != 0) {
154 x = -x;
155 }
156 float su = 2.6083159809786593541503e-06f;
157 su = madd(su, s, -0.0001981069071916863322258f);
158 su = madd(su, s, +0.00833307858556509017944336f);
159 su = madd(su, s, -0.166666597127914428710938f);
160 su = madd(s, su * x, x);
161 float cu = -2.71811842367242206819355e-07f;
162 cu = madd(cu, s, +2.47990446951007470488548e-05f);
163 cu = madd(cu, s, -0.00138888787478208541870117f);
164 cu = madd(cu, s, +0.0416666641831398010253906f);
165 cu = madd(cu, s, -0.5f);
166 cu = madd(cu, s, +1.0f);
167 if ((q & 1) != 0) {
168 cu = -cu;
169 }
170 if (fabsf(su) > 1.0f) {
171 su = 0.0f;
172 }
173 if (fabsf(cu) > 1.0f) {
174 cu = 0.0f;
175 }
176 *sine = su;
177 *cosine = cu;
178}
179
180/* NOTE: this approximation is only valid on [-8192.0,+8192.0], it starts
181 * becoming really poor outside of this range because the reciprocal amplifies
182 * errors.
183 */
184ccl_device float fast_tanf(float x)
185{
186 /* Derived from SLEEF implementation.
187 *
188 * Note that we cannot apply the "denormal crush" trick everywhere because
189 * we sometimes need to take the reciprocal of the polynomial
190 */
191 int q = fast_rint(x * 2.0f * M_1_PI_F);
192 float qf = (float)q;
193 x = madd(qf, -0.78515625f * 2, x);
194 x = madd(qf, -0.00024187564849853515625f * 2, x);
195 x = madd(qf, -3.7747668102383613586e-08f * 2, x);
196 x = madd(qf, -1.2816720341285448015e-12f * 2, x);
197 if ((q & 1) == 0) {
198 /* Crush denormals (only if we aren't inverting the result later). */
199 x = M_PI_4_F - (M_PI_4_F - x);
200 }
201 float s = x * x;
202 float u = 0.00927245803177356719970703f;
203 u = madd(u, s, 0.00331984995864331722259521f);
204 u = madd(u, s, 0.0242998078465461730957031f);
205 u = madd(u, s, 0.0534495301544666290283203f);
206 u = madd(u, s, 0.133383005857467651367188f);
207 u = madd(u, s, 0.333331853151321411132812f);
208 u = madd(s, u * x, x);
209 if ((q & 1) != 0) {
210 u = -1.0f / u;
211 }
212 return u;
213}
214
215/* Fast, approximate sin(x*M_PI) with maximum absolute error of 0.000918954611.
216 *
217 * Adapted from http://devmaster.net/posts/9648/fast-and-accurate-sine-cosine#comment-76773
218 */
220{
221 /* Fast trick to strip the integral part off, so our domain is [-1, 1]. */
222 const float z = x - ((x + 25165824.0f) - 25165824.0f);
223 const float y = z - z * fabsf(z);
224 const float Q = 3.10396624f;
225 const float P = 3.584135056f; /* P = 16-4*Q */
226 return y * (Q + P * fabsf(y));
227
228 /* The original article used inferior constants for Q and P and
229 * so had max error 1.091e-3.
230 *
231 * The optimal value for Q was determined by exhaustive search, minimizing
232 * the absolute numerical error relative to float(std::sin(double(phi*M_PI)))
233 * over the interval [0,2] (which is where most of the invocations happen).
234 *
235 * The basic idea of this approximation starts with the coarse approximation:
236 * sin(pi*x) ~= f(x) = 4 * (x - x * abs(x))
237 *
238 * This approximation always _over_ estimates the target. On the other hand,
239 * the curve:
240 * sin(pi*x) ~= f(x) * abs(f(x)) / 4
241 *
242 * always lies _under_ the target. Thus we can simply numerically search for
243 * the optimal constant to LERP these curves into a more precise
244 * approximation.
245 *
246 * After folding the constants together and simplifying the resulting math,
247 * we end up with the compact implementation above.
248 *
249 * NOTE: this function actually computes sin(x * pi) which avoids one or two
250 * mults in many cases and guarantees exact values at integer periods.
251 */
252}
253
254/* Fast approximate cos(x*M_PI) with ~0.1% absolute error. */
256{
257 return fast_sinpif(x + 0.5f);
258}
259
260ccl_device float fast_acosf(float x)
261{
262 const float f = fabsf(x);
263 /* clamp and crush denormals. */
264 const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
265 /* Based on http://www.pouet.net/topic.php?which=9132&page=2
266 * 85% accurate (ULP 0)
267 * Examined 2130706434 values of acos:
268 * 15.2000597 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // without "denormal crush"
269 * Examined 2130706434 values of acos:
270 * 15.2007108 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // with "denormal crush"
271 */
272 const float a = sqrtf(1.0f - m) *
273 (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
274 return x < 0 ? M_PI_F - a : a;
275}
276
277ccl_device float fast_asinf(float x)
278{
279 /* Based on acosf approximation above.
280 * Max error is 4.51133e-05 (ULPS are higher because we are consistently off
281 * by a little amount). */
282 const float f = fabsf(x);
283 /* Clamp and crush denormals. */
284 const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
285 const float a = M_PI_2_F -
286 sqrtf(1.0f - m) * (1.5707963267f +
287 m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
288 return copysignf(a, x);
289}
290
291ccl_device float fast_atanf(float x)
292{
293 const float a = fabsf(x);
294 const float k = a > 1.0f ? 1 / a : a;
295 const float s = 1.0f - (1.0f - k); /* Crush denormals. */
296 const float t = s * s;
297 /* http://mathforum.org/library/drmath/view/62672.html
298 * Examined 4278190080 values of atan:
299 * 2.36864877 avg ULP diff, 302 max ULP, 6.55651e-06 max error // (with denormals)
300 * Examined 4278190080 values of atan:
301 * 171160502 avg ULP diff, 855638016 max ULP, 6.55651e-06 max error // (crush denormals)
302 */
303 float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
304 if (a > 1.0f) {
305 r = M_PI_2_F - r;
306 }
307 return copysignf(r, x);
308}
309
310ccl_device float fast_atan2f(float y, float x)
311{
312 /* Based on atan approximation above.
313 *
314 * The special cases around 0 and infinity were tested explicitly.
315 *
316 * The only case not handled correctly is x=NaN,y=0 which returns 0 instead
317 * of nan.
318 */
319 const float a = fabsf(x);
320 const float b = fabsf(y);
321
322 const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b / a));
323 const float s = 1.0f - (1.0f - k); /* Crush denormals */
324 const float t = s * s;
325
326 float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
327
328 if (b > a) {
329 /* Account for arg reduction. */
330 r = M_PI_2_F - r;
331 }
332 /* Test sign bit of x. */
333 if (__float_as_uint(x) & 0x80000000u) {
334 r = M_PI_F - r;
335 }
336 return copysignf(r, y);
337}
338
339/* Same as precise_angle, but using fast_atan2f. Still much better that acos(dot(a, b)). */
341{
342 return 2.0f * fast_atan2f(len(a - b), len(a + b));
343}
344
345/* Based on:
346 *
347 * https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
348 */
349ccl_device float fast_log2f(float x)
350{
351 /* NOTE: clamp to avoid special cases and make result "safe" from large
352 * negative values/NAN's. */
353 x = clamp(x, FLT_MIN, FLT_MAX);
354 unsigned bits = __float_as_uint(x);
355 int exponent = (int)(bits >> 23) - 127;
356 float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
357 /* Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]:
358 * 0.0797524457 avg ULP diff, 3713596 max ULP, 7.62939e-06 max error.
359 * ULP histogram:
360 * 0 = 97.46%
361 * 1 = 2.29%
362 * 2 = 0.11%
363 */
364 float f2 = f * f;
365 float f4 = f2 * f2;
366 float hi = madd(f, -0.00931049621349f, 0.05206469089414f);
367 float lo = madd(f, 0.47868480909345f, -0.72116591947498f);
368 hi = madd(f, hi, -0.13753123777116f);
369 hi = madd(f, hi, 0.24187369696082f);
370 hi = madd(f, hi, -0.34730547155299f);
371 lo = madd(f, lo, 1.442689881667200f);
372 return ((f4 * hi) + (f * lo)) + exponent;
373}
374
376{
377 /* Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]:
378 * 0.313865375 avg ULP diff, 5148137 max ULP, 7.62939e-06 max error.
379 */
380 return fast_log2f(x) * M_LN2_F;
381}
382
384{
385 /* Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]:
386 * 0.631237033 avg ULP diff, 4471615 max ULP, 3.8147e-06 max error.
387 */
388 return fast_log2f(x) * M_LN2_F / M_LN10_F;
389}
390
391ccl_device float fast_logb(float x)
392{
393 /* Don't bother with denormals. */
394 x = fabsf(x);
395 x = clamp(x, FLT_MIN, FLT_MAX);
396 unsigned bits = __float_as_uint(x);
397 return (float)((int)(bits >> 23) - 127);
398}
399
400ccl_device float fast_exp2f(float x)
401{
402 /* Clamp to safe range for final addition. */
403 x = clamp(x, -126.0f, 126.0f);
404 /* Range reduction. */
405 int m = (int)x;
406 x -= m;
407 x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ULPS!). */
408 /* 5th degree polynomial generated with sollya
409 * Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ULP diff,
410 * 232 max ULP.
411 *
412 * ULP histogram:
413 * 0 = 87.81%
414 * 1 = 4.18%
415 */
416 float r = 1.33336498402e-3f;
417 r = madd(x, r, 9.810352697968e-3f);
418 r = madd(x, r, 5.551834031939e-2f);
419 r = madd(x, r, 0.2401793301105f);
420 r = madd(x, r, 0.693144857883f);
421 r = madd(x, r, 1.0f);
422 /* Multiply by 2 ^ m by adding in the exponent. */
423 /* NOTE: left-shift of negative number is undefined behavior. */
424 return __uint_as_float(__float_as_uint(r) + ((unsigned)m << 23));
425}
426
428{
429 /* Examined 2237485550 values of exp on [-87.3300018,87.3300018]:
430 * 2.6666452 avg ULP diff, 230 max ULP.
431 */
432 return fast_exp2f(x / M_LN2_F);
433}
434
435#if !defined(__KERNEL_GPU__) && !defined(_MSC_VER)
436/* MSVC seems to have a code-gen bug here in at least SSE41/AVX, see
437 * #78047 and #78869 for details. Just disable for now, it only makes
438 * a small difference in denoising performance. */
439ccl_device float4 fast_exp2f4(float4 x)
440{
441 const float4 one = make_float4(1.0f);
442 const float4 limit = make_float4(126.0f);
443 x = clamp(x, -limit, limit);
444 int4 m = make_int4(x);
445 x = one - (one - (x - make_float4(m)));
446 float4 r = make_float4(1.33336498402e-3f);
447 r = madd4(x, r, make_float4(9.810352697968e-3f));
448 r = madd4(x, r, make_float4(5.551834031939e-2f));
449 r = madd4(x, r, make_float4(0.2401793301105f));
450 r = madd4(x, r, make_float4(0.693144857883f));
451 r = madd4(x, r, make_float4(1.0f));
452 return __int4_as_float4(__float4_as_int4(r) + (m << 23));
453}
454
456{
457 return fast_exp2f4(x / M_LN2_F);
458}
459#else
461{
462 return make_float4(fast_expf(x.x), fast_expf(x.y), fast_expf(x.z), fast_expf(x.w));
463}
464#endif
465
467{
468 /* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
469 * 2.71732409 avg ULP diff, 232 max ULP.
470 */
471 return fast_exp2f(x * M_LN10_F / M_LN2_F);
472}
473
475{
476 if (fabsf(x) < 1e-5f) {
477 x = 1.0f - (1.0f - x); /* Crush denormals. */
478 return madd(0.5f, x * x, x);
479 }
480 else {
481 return fast_expf(x) - 1.0f;
482 }
483}
484
485ccl_device float fast_sinhf(float x)
486{
487 float a = fabsf(x);
488 if (a > 1.0f) {
489 /* Examined 53389559 values of sinh on [1,87.3300018]:
490 * 33.6886442 avg ULP diff, 178 max ULP. */
491 float e = fast_expf(a);
492 return copysignf(0.5f * e - 0.5f / e, x);
493 }
494 else {
495 a = 1.0f - (1.0f - a); /* Crush denorms. */
496 float a2 = a * a;
497 /* Degree 7 polynomial generated with sollya. */
498 /* Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error. */
499 float r = 2.03945513931e-4f;
500 r = madd(r, a2, 8.32990277558e-3f);
501 r = madd(r, a2, 0.1666673421859f);
502 r = madd(r * a, a2, a);
503 return copysignf(r, x);
504 }
505}
506
508{
509 /* Examined 2237485550 values of cosh on [-87.3300018,87.3300018]:
510 * 1.78256726 avg ULP diff, 178 max ULP.
511 */
512 float e = fast_expf(fabsf(x));
513 return 0.5f * e + 0.5f / e;
514}
515
517{
518 /* Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]:
519 * 3.12924e-06 max error.
520 */
521 /* NOTE: ULP error is high because of sub-optimal handling around the origin. */
522 float e = fast_expf(2.0f * fabsf(x));
523 return copysignf(1.0f - 2.0f / (1.0f + e), x);
524}
525
526ccl_device float fast_safe_powf(float x, float y)
527{
528 if (y == 0)
529 return 1.0f; /* x^1=1 */
530 if (x == 0)
531 return 0.0f; /* 0^y=0 */
532 float sign = 1.0f;
533 if (x < 0.0f) {
534 /* if x is negative, only deal with integer powers
535 * powf returns NaN for non-integers, we will return 0 instead.
536 */
537 int ybits = __float_as_int(y) & 0x7fffffff;
538 if (ybits >= 0x4b800000) {
539 // always even int, keep positive
540 }
541 else if (ybits >= 0x3f800000) {
542 /* Bigger than 1, check. */
543 int k = (ybits >> 23) - 127; /* Get exponent. */
544 int j = ybits >> (23 - k); /* Shift out possible fractional bits. */
545 if ((j << (23 - k)) == ybits) { /* rebuild number and check for a match. */
546 /* +1 for even, -1 for odd. */
547 sign = __int_as_float(0x3f800000 | (j << 31));
548 }
549 else {
550 /* Not an integer. */
551 return 0.0f;
552 }
553 }
554 else {
555 /* Not an integer. */
556 return 0.0f;
557 }
558 }
559 return sign * fast_exp2f(y * fast_log2f(fabsf(x)));
560}
561
562/* TODO(sergey): Check speed with our erf functions implementation from
563 * bsdf_microfacet.h.
564 */
565
567{
568 /* Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error. */
569 /* Abramowitz and Stegun, 7.1.28. */
570 const float a1 = 0.0705230784f;
571 const float a2 = 0.0422820123f;
572 const float a3 = 0.0092705272f;
573 const float a4 = 0.0001520143f;
574 const float a5 = 0.0002765672f;
575 const float a6 = 0.0000430638f;
576 const float a = fabsf(x);
577 if (a >= 12.3f) {
578 return copysignf(1.0f, x);
579 }
580 const float b = 1.0f - (1.0f - a); /* Crush denormals. */
581 const float r = madd(
582 madd(madd(madd(madd(madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
583 const float s = r * r; /* ^2 */
584 const float t = s * s; /* ^4 */
585 const float u = t * t; /* ^8 */
586 const float v = u * u; /* ^16 */
587 return copysignf(1.0f - 1.0f / v, x);
588}
589
591{
592 /* Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error.
593 *
594 * ULP histogram:
595 *
596 * 0 = 80.30%
597 */
598 return 1.0f - fast_erff(x);
599}
600
602{
603 /* From: Approximating the `erfinv` function by Mike Giles. */
604 /* To avoid trouble at the limit, clamp input to 1-epsilon. */
605 float a = fabsf(x);
606 if (a > 0.99999994f) {
607 a = 0.99999994f;
608 }
609 float w = -fast_logf((1.0f - a) * (1.0f + a)), p;
610 if (w < 5.0f) {
611 w = w - 2.5f;
612 p = 2.81022636e-08f;
613 p = madd(p, w, 3.43273939e-07f);
614 p = madd(p, w, -3.5233877e-06f);
615 p = madd(p, w, -4.39150654e-06f);
616 p = madd(p, w, 0.00021858087f);
617 p = madd(p, w, -0.00125372503f);
618 p = madd(p, w, -0.00417768164f);
619 p = madd(p, w, 0.246640727f);
620 p = madd(p, w, 1.50140941f);
621 }
622 else {
623 w = sqrtf(w) - 3.0f;
624 p = -0.000200214257f;
625 p = madd(p, w, 0.000100950558f);
626 p = madd(p, w, 0.00134934322f);
627 p = madd(p, w, -0.00367342844f);
628 p = madd(p, w, 0.00573950773f);
629 p = madd(p, w, -0.0076224613f);
630 p = madd(p, w, 0.00943887047f);
631 p = madd(p, w, 1.00167406f);
632 p = madd(p, w, 2.83297682f);
633 }
634 return p * x;
635}
636
637/* Fast inverse cube root for positive x, with two Newton iterations to improve accuracy. */
639{
640 util_assert(x >= 0.0f);
641
642 /* Constant is roughly `cbrt(2^127)`, but tweaked a bit to balance the error across the entire
643 * range. The exact value is not critical. */
644 float y = __int_as_float(0x54a24242 - __float_as_int(x) / 3);
645 y = (2.0f / 3) * y + 1 / (3 * y * y * x);
646 y = (2.0f / 3) * y + 1 / (3 * y * y * x);
647 return y;
648}
649
651
652#endif /* __UTIL_FAST_MATH__ */
ATTR_WARN_UNUSED_RESULT const BMVert const BMEdge * e
ATTR_WARN_UNUSED_RESULT const BMVert * v
SIMD_FORCE_INLINE const btScalar & z() const
Return the z value.
Definition btQuadWord.h:117
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
Definition btQuadWord.h:119
local_group_size(16, 16) .push_constant(Type b
#define util_assert(statement)
Definition defines.h:98
#define ccl_device
#define ccl_private
#define ccl_device_inline
#define CCL_NAMESPACE_END
ccl_device_forceinline float4 make_float4(const float x, const float y, const float z, const float w)
#define copysignf(x, y)
#define __int_as_float(x)
#define __float_as_int(x)
#define fabsf(x)
#define __float_as_uint(x)
#define sqrtf(x)
ccl_device_forceinline int4 make_int4(const int x, const int y, const int z, const int w)
#define __uint_as_float(x)
int len
draw_view in_light_buf[] float
draw_view push_constant(Type::INT, "radiance_src") .push_constant(Type capture_info_buf storage_buf(1, Qualifier::READ, "ObjectBounds", "bounds_buf[]") .push_constant(Type draw_view int
ccl_device_inline float fast_inv_cbrtf(float x)
Definition math_fast.h:638
ccl_device float fast_exp2f(float x)
Definition math_fast.h:400
ccl_device_inline float fast_cospif(float x)
Definition math_fast.h:255
ccl_device_inline float fast_log10(float x)
Definition math_fast.h:383
CCL_NAMESPACE_BEGIN ccl_device_inline float madd(const float a, const float b, const float c)
Definition math_fast.h:29
ccl_device_inline int fast_rint(float x)
Definition math_fast.h:62
ccl_device_inline float vector_angle(float3 a, float3 b)
Definition math_fast.h:340
ccl_device_inline float fast_logf(float x)
Definition math_fast.h:375
ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
Definition math_fast.h:40
ccl_device void fast_sincosf(float x, ccl_private float *sine, ccl_private float *cosine)
Definition math_fast.h:141
ccl_device float4 fast_exp2f4(float4 x)
Definition math_fast.h:439
ccl_device_inline float fast_erfcf(float x)
Definition math_fast.h:590
ccl_device float fast_acosf(float x)
Definition math_fast.h:260
ccl_device float fast_sinpif(float x)
Definition math_fast.h:219
ccl_device_inline float fast_erff(float x)
Definition math_fast.h:566
ccl_device float fast_asinf(float x)
Definition math_fast.h:277
ccl_device float fast_logb(float x)
Definition math_fast.h:391
ccl_device float fast_tanf(float x)
Definition math_fast.h:184
ccl_device_inline float fast_coshf(float x)
Definition math_fast.h:507
ccl_device float fast_atan2f(float y, float x)
Definition math_fast.h:310
ccl_device float fast_atanf(float x)
Definition math_fast.h:291
ccl_device_inline float fast_tanhf(float x)
Definition math_fast.h:516
ccl_device_inline float fast_expm1f(float x)
Definition math_fast.h:474
ccl_device_inline float fast_ierff(float x)
Definition math_fast.h:601
ccl_device_inline float fast_exp10(float x)
Definition math_fast.h:466
ccl_device float fast_sinf(float x)
Definition math_fast.h:77
ccl_device float fast_cosf(float x)
Definition math_fast.h:113
ccl_device_inline float fast_expf(float x)
Definition math_fast.h:427
ccl_device float fast_safe_powf(float x, float y)
Definition math_fast.h:526
ccl_device float fast_log2f(float x)
Definition math_fast.h:349
ccl_device_inline float4 fast_expf4(float4 x)
Definition math_fast.h:455
ccl_device float fast_sinhf(float x)
Definition math_fast.h:485
#define M_PI_F
Definition mikk_util.hh:15
VecBase< float, 4 > float4
#define M_PI_2_F
Definition sky_float3.h:20
#define FLT_MAX
Definition stdcycles.h:14
ccl_device_inline int4 __float4_as_int4(float4 f)
Definition util/math.h:296
ccl_device_inline int float_to_int(float f)
Definition util/math.h:424
ccl_device_inline float4 __int4_as_float4(int4 i)
Definition util/math.h:306
ccl_device_inline int clamp(int a, int mn, int mx)
Definition util/math.h:379