Blender V4.3
math_matrix.h
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#ifndef __UTIL_MATH_MATRIX_H__
6#define __UTIL_MATH_MATRIX_H__
7
9
10#define MAT(A, size, row, col) A[(row) * (size) + (col)]
11
12/* Variants that use a constant stride on GPUs. */
13#ifdef __KERNEL_GPU__
14# define MATS(A, n, r, c, s) A[((r) * (n) + (c)) * (s)]
15/* Element access when only the lower-triangular elements are stored. */
16# define MATHS(A, r, c, s) A[((r) * ((r) + 1) / 2 + (c)) * (s)]
17# define VECS(V, i, s) V[(i) * (s)]
18#else
19# define MATS(A, n, r, c, s) MAT(A, n, r, c)
20# define MATHS(A, r, c, s) A[(r) * ((r) + 1) / 2 + (c)]
21# define VECS(V, i, s) V[i]
22#endif
23
24/* Zeroing helpers. */
25
27{
28 for (int i = 0; i < n; i++) {
29 v[i] = 0.0f;
30 }
31}
32
34{
35 for (int row = 0; row < n; row++) {
36 for (int col = 0; col <= row; col++) {
37 MAT(A, n, row, col) = 0.0f;
38 }
39 }
40}
41
42/* Elementary vector operations. */
43
45 ccl_private const float *ccl_restrict b,
46 int n)
47{
48 for (int i = 0; i < n; i++) {
49 a[i] += b[i];
50 }
51}
52
54 ccl_private const float *ccl_restrict b,
55 int n)
56{
57 for (int i = 0; i < n; i++) {
58 a[i] *= b[i];
59 }
60}
61
63 ccl_private const float *ccl_restrict b,
64 int astride,
65 int n)
66{
67 for (int i = 0; i < n; i++) {
68 a[i * astride] *= b[i];
69 }
70}
71
73{
74 for (int i = 0; i < n; i++) {
75 a[i] *= b;
76 }
77}
78
80 ccl_private const float *ccl_restrict b,
81 int n)
82{
83 for (int i = 0; i < n; i++) {
84 a[i] = max(a[i], b[i]);
85 }
86}
87
89{
90 for (int i = 0; i < n; i++) {
91 v[i] += w * x[i];
92 }
93}
94
96 ccl_global float3 *v, int n, ccl_private float *x, float3 w, int stride)
97{
98 for (int i = 0; i < n; i++) {
99 ccl_global float *elem = (ccl_global float *)(v + i * stride);
100 atomic_add_and_fetch_float(elem + 0, w.x * x[i]);
101 atomic_add_and_fetch_float(elem + 1, w.y * x[i]);
102 atomic_add_and_fetch_float(elem + 2, w.z * x[i]);
103 }
104}
105
106/* Elementary matrix operations.
107 * NOTE: TriMatrix refers to a square matrix that is symmetric,
108 * and therefore its upper-triangular part isn't stored. */
109
111 int n,
112 float val,
113 int stride)
114{
115 for (int row = 0; row < n; row++) {
116 MATHS(A, row, row, stride) += val;
117 }
118}
119
120/* Add Gramian matrix of v to A.
121 * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
123 int n,
124 ccl_private const float *ccl_restrict v,
125 float weight)
126{
127 for (int row = 0; row < n; row++) {
128 for (int col = 0; col <= row; col++) {
129 MAT(A, n, row, col) += v[row] * v[col] * weight;
130 }
131 }
132}
133
134/* Add Gramian matrix of v to A.
135 * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
137 ccl_global float *A, int n, ccl_private const float *ccl_restrict v, float weight, int stride)
138{
139 for (int row = 0; row < n; row++) {
140 for (int col = 0; col <= row; col++) {
141 atomic_add_and_fetch_float(&MATHS(A, row, col, stride), v[row] * v[col] * weight);
142 }
143 }
144}
145
147 int n,
148 ccl_private const float *ccl_restrict v,
149 float weight)
150{
151 for (int row = 0; row < n; row++) {
152 for (int col = 0; col <= row; col++) {
153 atomic_add_and_fetch_float(&MATHS(A, row, col, 1), v[row] * v[col] * weight);
154 }
155 }
156}
157
158/* Transpose matrix A in place. */
159ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride)
160{
161 for (int i = 0; i < n; i++) {
162 for (int j = 0; j < i; j++) {
163 float temp = MATS(A, n, i, j, stride);
164 MATS(A, n, i, j, stride) = MATS(A, n, j, i, stride);
165 MATS(A, n, j, i, stride) = temp;
166 }
167 }
168}
169
170/* Solvers for matrix problems */
171
172/* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A
173 * into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L.
174 * Also, only the lower triangular part of A is ever accessed. */
175ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
176{
177 for (int row = 0; row < n; row++) {
178 for (int col = 0; col <= row; col++) {
179 float sum_col = MATHS(A, row, col, stride);
180 for (int k = 0; k < col; k++) {
181 sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride);
182 }
183 if (row == col) {
184 sum_col = sqrtf(max(sum_col, 0.0f));
185 }
186 else {
187 sum_col /= MATHS(A, col, col, stride);
188 }
189 MATHS(A, row, col, stride) = sum_col;
190 }
191 }
192}
193
194/* Solve A*S=y for S given A and y,
195 * where A is symmetrical positive-semi-definite and both inputs are destroyed in the process.
196 *
197 * We can apply Cholesky decomposition to find a lower triangular L so that L*Lt = A.
198 * With that we get (L*Lt)*S = L*(Lt*S) = L*b = y, defining b as Lt*S.
199 * Since L is lower triangular, finding b is relatively easy since y is known.
200 * Then, the remaining problem is Lt*S = b, which again can be solved easily.
201 *
202 * This is useful for solving the normal equation S=inv(Xt*W*X)*Xt*W*y, since Xt*W*X is
203 * symmetrical positive-semidefinite by construction,
204 * so we can just use this function with A=Xt*W*X and y=Xt*W*y. */
207 int n,
208 int stride)
209{
210 /* Since the first entry of the design row is always 1, the upper-left element of XtWX is a good
211 * heuristic for the amount of pixels considered (with weighting),
212 * therefore the amount of correction is scaled based on it. */
213 math_trimatrix_add_diagonal(A, n, 3e-7f * A[0], stride); /* Improve the numerical stability. */
214 math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */
215
216 /* Use forward substitution to solve L*b = y, replacing y by b. */
217 for (int row = 0; row < n; row++) {
218 float3 sum = VECS(y, row, stride);
219 for (int col = 0; col < row; col++)
220 sum -= MATHS(A, row, col, stride) * VECS(y, col, stride);
221 VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
222 }
223
224 /* Use backward substitution to solve Lt*S = b, replacing b by S. */
225 for (int row = n - 1; row >= 0; row--) {
226 float3 sum = VECS(y, row, stride);
227 for (int col = row + 1; col < n; col++)
228 sum -= MATHS(A, col, row, stride) * VECS(y, col, stride);
229 VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
230 }
231}
232
233/* Perform the Jacobi Eigenvalue Method on matrix A.
234 * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever
235 * accessed. The algorithm overwrites the contents of A.
236 *
237 * After returning, A will be overwritten with D, which is (almost) diagonal,
238 * and V will contain the eigenvectors of the original A in its rows (!),
239 * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A.
240 */
242 ccl_global float *V,
243 int n,
244 int v_stride)
245{
246 const float singular_epsilon = 1e-9f;
247
248 for (int row = 0; row < n; row++) {
249 for (int col = 0; col < n; col++) {
250 MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f;
251 }
252 }
253
254 for (int sweep = 0; sweep < 8; sweep++) {
255 float off_diagonal = 0.0f;
256 for (int row = 1; row < n; row++) {
257 for (int col = 0; col < row; col++) {
258 off_diagonal += fabsf(MAT(A, n, row, col));
259 }
260 }
261 if (off_diagonal < 1e-7f) {
262 /* The matrix has nearly reached diagonal form.
263 * Since the eigenvalues are only used to determine truncation, their exact values aren't
264 * required - a relative error of a few ULPs won't matter at all. */
265 break;
266 }
267
268 /* Set the threshold for the small element rotation skip in the first sweep:
269 * Skip all elements that are less than a tenth of the average off-diagonal element. */
270 float threshold = 0.2f * off_diagonal / (n * n);
271
272 for (int row = 1; row < n; row++) {
273 for (int col = 0; col < row; col++) {
274 /* Perform a Jacobi rotation on this element that reduces it to zero. */
275 float element = MAT(A, n, row, col);
276 float abs_element = fabsf(element);
277
278 /* If we're in a later sweep and the element already is very small,
279 * just set it to zero and skip the rotation. */
280 if (sweep > 3 && abs_element <= singular_epsilon * fabsf(MAT(A, n, row, row)) &&
281 abs_element <= singular_epsilon * fabsf(MAT(A, n, col, col)))
282 {
283 MAT(A, n, row, col) = 0.0f;
284 continue;
285 }
286
287 if (element == 0.0f) {
288 continue;
289 }
290
291 /* If we're in one of the first sweeps and the element is smaller than the threshold,
292 * skip it. */
293 if (sweep < 3 && (abs_element < threshold)) {
294 continue;
295 }
296
297 /* Determine rotation: The rotation is characterized by its angle phi - or,
298 * in the actual implementation, sin(phi) and cos(phi).
299 * To find those, we first compute their ratio - that might be unstable if the angle
300 * approaches 90 degrees, so there's a fallback for that case.
301 * Then, we compute sin(phi) and cos(phi) themselves. */
302 float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col);
303 float ratio;
304 if (abs_element > singular_epsilon * fabsf(singular_diff)) {
305 float cot_2phi = 0.5f * singular_diff / element;
306 ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi * cot_2phi));
307 if (cot_2phi < 0.0f) {
308 ratio = -ratio; /* Copy sign. */
309 }
310 }
311 else {
312 ratio = element / singular_diff;
313 }
314
315 float c = 1.0f / sqrtf(1.0f + ratio * ratio);
316 float s = ratio * c;
317 /* To improve numerical stability by avoiding cancellation, the update equations are
318 * reformulized to use sin(phi) and tan(phi/2) instead. */
319 float tan_phi_2 = s / (1.0f + c);
320
321 /* Update the singular values in the diagonal. */
322 float singular_delta = ratio * element;
323 MAT(A, n, row, row) += singular_delta;
324 MAT(A, n, col, col) -= singular_delta;
325
326 /* Set the element itself to zero. */
327 MAT(A, n, row, col) = 0.0f;
328
329 /* Perform the actual rotations on the matrices. */
330#define ROT(M, r1, c1, r2, c2, stride) \
331 { \
332 float M1 = MATS(M, n, r1, c1, stride); \
333 float M2 = MATS(M, n, r2, c2, stride); \
334 MATS(M, n, r1, c1, stride) -= s * (M2 + tan_phi_2 * M1); \
335 MATS(M, n, r2, c2, stride) += s * (M1 - tan_phi_2 * M2); \
336 }
337
338 /* Split into three parts to ensure correct accesses since we only store the
339 * lower-triangular part of A. */
340 for (int i = 0; i < col; i++)
341 ROT(A, col, i, row, i, 1);
342 for (int i = col + 1; i < row; i++)
343 ROT(A, i, col, row, i, 1);
344 for (int i = row + 1; i < n; i++)
345 ROT(A, i, col, i, row, 1);
346
347 for (int i = 0; i < n; i++)
348 ROT(V, col, i, row, i, v_stride);
349#undef ROT
350 }
351 }
352 }
353
354 /* Sort eigenvalues and the associated eigenvectors. */
355 for (int i = 0; i < n - 1; i++) {
356 float v = MAT(A, n, i, i);
357 int k = i;
358 for (int j = i; j < n; j++) {
359 if (MAT(A, n, j, j) >= v) {
360 v = MAT(A, n, j, j);
361 k = j;
362 }
363 }
364 if (k != i) {
365 /* Swap eigenvalues. */
366 MAT(A, n, k, k) = MAT(A, n, i, i);
367 MAT(A, n, i, i) = v;
368 /* Swap eigenvectors. */
369 for (int j = 0; j < n; j++) {
370 float v = MATS(V, n, i, j, v_stride);
371 MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride);
372 MATS(V, n, k, j, v_stride) = v;
373 }
374 }
375 }
376}
377
378#ifdef __KERNEL_SSE3__
379ccl_device_inline void math_vector_zero_sse(float4 *A, int n)
380{
381 for (int i = 0; i < n; i++) {
382 A[i] = zero_float4();
383 }
384}
385
386ccl_device_inline void math_matrix_zero_sse(float4 *A, int n)
387{
388 for (int row = 0; row < n; row++) {
389 for (int col = 0; col <= row; col++) {
390 MAT(A, n, row, col) = zero_float4();
391 }
392 }
393}
394
395/* Add Gramian matrix of v to A.
396 * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
397ccl_device_inline void math_matrix_add_gramian_sse(float4 *A,
398 int n,
399 const float4 *ccl_restrict v,
400 float4 weight)
401{
402 for (int row = 0; row < n; row++) {
403 for (int col = 0; col <= row; col++) {
404 MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight;
405 }
406 }
407}
408
409ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a)
410{
411 for (int i = 0; i < n; i++) {
412 V[i] += a[i];
413 }
414}
415
416ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a)
417{
418 for (int i = 0; i < n; i++) {
419 V[i] *= a[i];
420 }
421}
422
423ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n)
424{
425 for (int i = 0; i < n; i++) {
426 a[i] = max(a[i], b[i]);
427 }
428}
429
430ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B)
431{
432 for (int row = 0; row < n; row++) {
433 for (int col = 0; col <= row; col++) {
434 MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
435 }
436 }
437}
438#endif
439
440#undef MAT
441
443
444#endif /* __UTIL_MATH_MATRIX_H__ */
#define atomic_add_and_fetch_float(p, x)
Definition atomic.h:13
ATTR_WARN_UNUSED_RESULT const void * element
ATTR_WARN_UNUSED_RESULT const BMVert * v
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
Definition btQuadWord.h:119
static T sum(const btAlignedObjectArray< T > &items)
local_group_size(16, 16) .push_constant(Type b
#define ccl_restrict
#define ccl_device
#define ccl_private
#define ccl_device_inline
#define ccl_global
#define CCL_NAMESPACE_END
#define fabsf(x)
#define sqrtf(x)
uint col
ccl_device_inline float reduce_add(const float2 a)
CCL_NAMESPACE_BEGIN ccl_device_inline float4 zero_float4()
Definition math_float4.h:15
#define MAT(A, size, row, col)
Definition math_matrix.h:10
ccl_device_inline void math_vec3_add(ccl_private float3 *v, int n, ccl_private float *x, float3 w)
Definition math_matrix.h:88
ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride)
ccl_device_inline void math_vector_add(ccl_private float *a, ccl_private const float *ccl_restrict b, int n)
Definition math_matrix.h:44
ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, ccl_private float *x, float3 w, int stride)
Definition math_matrix.h:95
ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride)
ccl_device_inline void math_matrix_zero(ccl_private float *A, int n)
Definition math_matrix.h:33
ccl_device_inline void math_vector_mul(ccl_private float *a, ccl_private const float *ccl_restrict b, int n)
Definition math_matrix.h:53
#define MATHS(A, r, c, s)
Definition math_matrix.h:20
ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A, int n, ccl_private const float *ccl_restrict v, float weight, int stride)
#define ROT(M, r1, c1, r2, c2, stride)
ccl_device_inline void math_vector_max(ccl_private float *a, ccl_private const float *ccl_restrict b, int n)
Definition math_matrix.h:79
ccl_device_inline void math_vector_scale(ccl_private float *a, float b, int n)
Definition math_matrix.h:72
ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride)
#define MATS(A, n, r, c, s)
Definition math_matrix.h:19
ccl_device void math_matrix_jacobi_eigendecomposition(ccl_private float *A, ccl_global float *V, int n, int v_stride)
#define VECS(V, i, s)
Definition math_matrix.h:21
ccl_device_inline void math_vector_mul_strided(ccl_global float *a, ccl_private const float *ccl_restrict b, int astride, int n)
Definition math_matrix.h:62
ccl_device_inline void math_matrix_add_gramian(ccl_private float *A, int n, ccl_private const float *ccl_restrict v, float weight)
ccl_device_inline void math_trimatrix_add_gramian(ccl_global float *A, int n, ccl_private const float *ccl_restrict v, float weight)
ccl_device_inline void math_vector_zero(ccl_private float *v, int n)
Definition math_matrix.h:26
#define B
float max
CCL_NAMESPACE_BEGIN struct Window V