Blender V4.3
math_interp.cc
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2024 Blender Authors
2 *
3 * SPDX-License-Identifier: GPL-2.0-or-later */
4
9#include <cmath>
10#include <cstring>
11
12#include "BLI_math_base.h"
13#include "BLI_math_base.hh"
14#include "BLI_math_interp.hh"
15#include "BLI_math_vector.h"
17#include "BLI_simd.hh"
18
19#include "BLI_strict_flags.h" /* Keep last. */
20
21namespace blender::math {
22
/* Cubic filter kernels supported by the bicubic samplers below. */
enum class eCubicFilter {
  /* Cubic B-Spline (Mitchell-Netravali with B=1, C=0): soft, all weights non-negative. */
  BSpline,
  /* Mitchell-Netravali with B=1/3, C=1/3: sharper, kernel has negative lobes,
   * so results need clamping (see bicubic_interpolation). */
  Mitchell,
};
27
/* Calculate cubic filter coefficients, for samples at -1,0,+1,+2.
 * f is 0..1 offset from texel center in pixel space.
 *
 * Returns weights (w0, w1, w2, w3) for the four taps; the weights sum to 1.
 * B-Spline weights are all non-negative; Mitchell weights can go negative
 * (negative lobes), which is why callers clamp the blended result. */
template<enum eCubicFilter filter> static float4 cubic_filter_coefficients(float f)
{
  float f2 = f * f;
  float f3 = f2 * f;

  if constexpr (filter == eCubicFilter::BSpline) {
    /* Cubic B-Spline (Mitchell-Netravali filter with B=1, C=0 parameters). */
    float w3 = f3 * (1.0f / 6.0f);
    float w0 = -w3 + f2 * 0.5f - f * 0.5f + 1.0f / 6.0f;
    float w1 = f3 * 0.5f - f2 * 1.0f + 2.0f / 3.0f;
    /* Weights are normalized: derive the last from the other three. */
    float w2 = 1.0f - w0 - w1 - w3;
    return float4(w0, w1, w2, w3);
  }
  else if constexpr (filter == eCubicFilter::Mitchell) {
    /* Cubic Mitchell-Netravali filter with B=1/3, C=1/3 parameters. */
    float w0 = -7.0f / 18.0f * f3 + 5.0f / 6.0f * f2 - 0.5f * f + 1.0f / 18.0f;
    float w1 = 7.0f / 6.0f * f3 - 2.0f * f2 + 8.0f / 9.0f;
    float w2 = -7.0f / 6.0f * f3 + 3.0f / 2.0f * f2 + 0.5f * f + 1.0f / 18.0f;
    float w3 = 7.0f / 18.0f * f3 - 1.0f / 3.0f * f2;
    return float4(w0, w1, w2, w3);
  }
  /* NOTE(review): no final `else` — relies on the template only being
   * instantiated with the two filters handled above. */
}
52
53#if BLI_HAVE_SSE4
54template<eCubicFilter filter>
55BLI_INLINE void bicubic_interpolation_uchar_simd(
56 const uchar *src_buffer, uchar *output, int width, int height, float u, float v)
57{
58 __m128 uv = _mm_set_ps(0, 0, v, u);
59 __m128 uv_floor = _mm_floor_ps(uv);
60 __m128i i_uv = _mm_cvttps_epi32(uv_floor);
61
62 /* Sample area entirely outside image?
63 * We check if any of (iu+1, iv+1, width, height) < (0, 0, iu+1, iv+1). */
64 __m128i i_uv_1 = _mm_add_epi32(i_uv, _mm_set_epi32(0, 0, 1, 1));
65 __m128i cmp_a = _mm_or_si128(i_uv_1, _mm_set_epi32(height, width, 0, 0));
66 __m128i cmp_b = _mm_shuffle_epi32(i_uv_1, _MM_SHUFFLE(1, 0, 3, 2));
67 __m128i invalid = _mm_cmplt_epi32(cmp_a, cmp_b);
68 if (_mm_movemask_ps(_mm_castsi128_ps(invalid)) != 0) {
69 memset(output, 0, 4);
70 return;
71 }
72
73 __m128 frac_uv = _mm_sub_ps(uv, uv_floor);
74
75 /* Calculate pixel weights. */
76 float4 wx = cubic_filter_coefficients<filter>(_mm_cvtss_f32(frac_uv));
78 _mm_cvtss_f32(_mm_shuffle_ps(frac_uv, frac_uv, 1)));
79
80 /* Read 4x4 source pixels and blend them. */
81 __m128 out = _mm_setzero_ps();
82 int iu = _mm_cvtsi128_si32(i_uv);
83 int iv = _mm_cvtsi128_si32(_mm_shuffle_epi32(i_uv, 1));
84
85 for (int n = 0; n < 4; n++) {
86 int y1 = iv + n - 1;
87 CLAMP(y1, 0, height - 1);
88 for (int m = 0; m < 4; m++) {
89
90 int x1 = iu + m - 1;
91 CLAMP(x1, 0, width - 1);
92 float w = wx[m] * wy[n];
93
94 const uchar *data = src_buffer + (width * y1 + x1) * 4;
95 /* Load 4 bytes and expand into 4-lane SIMD. */
96 __m128i sample_i = _mm_castps_si128(_mm_load_ss((const float *)data));
97 sample_i = _mm_unpacklo_epi8(sample_i, _mm_setzero_si128());
98 sample_i = _mm_unpacklo_epi16(sample_i, _mm_setzero_si128());
99
100 /* Accumulate into out with weight. */
101 out = _mm_add_ps(out, _mm_mul_ps(_mm_cvtepi32_ps(sample_i), _mm_set1_ps(w)));
102 }
103 }
104
105 /* Pack and write to destination: pack to 16 bit signed, then to 8 bit
106 * unsigned, then write resulting 32-bit value. This will clamp
107 * out of range values too. */
108 out = _mm_add_ps(out, _mm_set1_ps(0.5f));
109 __m128i rgba32 = _mm_cvttps_epi32(out);
110 __m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
111 __m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
112 _mm_store_ss((float *)output, _mm_castsi128_ps(rgba8));
113}
114#endif /* BLI_HAVE_SSE4 */
115
116template<typename T, eCubicFilter filter>
118 const T *src_buffer, T *output, int width, int height, int components, float u, float v)
119{
120 BLI_assert(src_buffer && output);
121
122#if BLI_HAVE_SSE4
123 if constexpr (std::is_same_v<T, uchar>) {
124 if (components == 4) {
125 bicubic_interpolation_uchar_simd<filter>(src_buffer, output, width, height, u, v);
126 return;
127 }
128 }
129#endif
130
131 int iu = int(floor(u));
132 int iv = int(floor(v));
133
134 /* Sample area entirely outside image? */
135 if (iu + 1 < 0 || iu > width - 1 || iv + 1 < 0 || iv > height - 1) {
136 memset(output, 0, size_t(components) * sizeof(T));
137 return;
138 }
139
140 float frac_u = u - float(iu);
141 float frac_v = v - float(iv);
142
143 float4 out{0.0f};
144
145 /* Calculate pixel weights. */
148
149 /* Read 4x4 source pixels and blend them. */
150 for (int n = 0; n < 4; n++) {
151 int y1 = iv + n - 1;
152 CLAMP(y1, 0, height - 1);
153 for (int m = 0; m < 4; m++) {
154
155 int x1 = iu + m - 1;
156 CLAMP(x1, 0, width - 1);
157 float w = wx[m] * wy[n];
158
159 const T *data = src_buffer + (width * y1 + x1) * components;
160
161 if (components == 1) {
162 out[0] += data[0] * w;
163 }
164 else if (components == 3) {
165 out[0] += data[0] * w;
166 out[1] += data[1] * w;
167 out[2] += data[2] * w;
168 }
169 else {
170 out[0] += data[0] * w;
171 out[1] += data[1] * w;
172 out[2] += data[2] * w;
173 out[3] += data[3] * w;
174 }
175 }
176 }
177
178 /* Mitchell filter has negative lobes; prevent output from going out of range. */
179 if constexpr (filter == eCubicFilter::Mitchell) {
180 for (int i = 0; i < components; i++) {
181 out[i] = math::max(out[i], 0.0f);
182 if constexpr (std::is_same_v<T, uchar>) {
183 out[i] = math::min(out[i], 255.0f);
184 }
185 }
186 }
187
188 /* Write result. */
189 if constexpr (std::is_same_v<T, float>) {
190 if (components == 1) {
191 output[0] = out[0];
192 }
193 else if (components == 3) {
194 copy_v3_v3(output, out);
195 }
196 else {
197 copy_v4_v4(output, out);
198 }
199 }
200 else {
201 if (components == 1) {
202 output[0] = uchar(out[0] + 0.5f);
203 }
204 else if (components == 3) {
205 output[0] = uchar(out[0] + 0.5f);
206 output[1] = uchar(out[1] + 0.5f);
207 output[2] = uchar(out[2] + 0.5f);
208 }
209 else {
210 output[0] = uchar(out[0] + 0.5f);
211 output[1] = uchar(out[1] + 0.5f);
212 output[2] = uchar(out[2] + 0.5f);
213 output[3] = uchar(out[3] + 0.5f);
214 }
215 }
216}
217
/* Bilinear sampling of a float image at (u, v).
 *
 * Template parameter `border`: when true, samples outside the image blend
 * towards transparent black; when false, coordinates clamp to the edge
 * pixels. `wrap_x` / `wrap_y` wrap coordinates around the image (repeat).
 * Writes `components` (1, 3 or 4) floats into `output`. */
template<bool border>
BLI_INLINE void bilinear_fl_impl(const float *buffer,
                                 float *output,
                                 int width,
                                 int height,
                                 int components,
                                 float u,
                                 float v,
                                 bool wrap_x = false,
                                 bool wrap_y = false)
{
  BLI_assert(buffer && output);
  float a, b;
  float a_b, ma_b, a_mb, ma_mb;
  int y1, y2, x1, x2;

  /* Bring coordinates into [0, size) when wrapping is requested. */
  if (wrap_x) {
    u = floored_fmod(u, float(width));
  }
  if (wrap_y) {
    v = floored_fmod(v, float(height));
  }

  float uf = floorf(u);
  float vf = floorf(v);

  x1 = int(uf);
  x2 = x1 + 1;
  y1 = int(vf);
  y2 = y1 + 1;

  const float *row1, *row2, *row3, *row4;
  const float empty[4] = {0.0f, 0.0f, 0.0f, 0.0f};

  /* Check if +1 samples need wrapping, or we don't do wrapping then if
   * we are sampling completely outside the image. */
  if (wrap_x) {
    if (x2 >= width) {
      x2 = 0;
    }
  }
  else if (border && (x2 < 0 || x1 >= width)) {
    copy_vn_fl(output, components, 0.0f);
    return;
  }
  if (wrap_y) {
    if (y2 >= height) {
      y2 = 0;
    }
  }
  else if (border && (y2 < 0 || y1 >= height)) {
    copy_vn_fl(output, components, 0.0f);
    return;
  }

  /* Sample locations. */
  if constexpr (border) {
    /* Individual out-of-range taps read from `empty` (transparent black). */
    row1 = (x1 < 0 || y1 < 0) ? empty : buffer + (int64_t(width) * y1 + x1) * components;
    row2 = (x1 < 0 || y2 > height - 1) ? empty : buffer + (int64_t(width) * y2 + x1) * components;
    row3 = (x2 > width - 1 || y1 < 0) ? empty : buffer + (int64_t(width) * y1 + x2) * components;
    row4 = (x2 > width - 1 || y2 > height - 1) ? empty :
                                                 buffer + (int64_t(width) * y2 + x2) * components;
  }
  else {
    /* Clamp taps to image edges. */
    x1 = blender::math::clamp(x1, 0, width - 1);
    x2 = blender::math::clamp(x2, 0, width - 1);
    y1 = blender::math::clamp(y1, 0, height - 1);
    y2 = blender::math::clamp(y2, 0, height - 1);
    row1 = buffer + (int64_t(width) * y1 + x1) * components;
    row2 = buffer + (int64_t(width) * y2 + x1) * components;
    row3 = buffer + (int64_t(width) * y1 + x2) * components;
    row4 = buffer + (int64_t(width) * y2 + x2) * components;
  }

  /* Bilinear weights from the fractional parts. */
  a = u - uf;
  b = v - vf;
  a_b = a * b;
  ma_b = (1.0f - a) * b;
  a_mb = a * (1.0f - b);
  ma_mb = (1.0f - a) * (1.0f - b);

  if (components == 1) {
    output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
  }
  else if (components == 3) {
    output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
    output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
    output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
  }
  else {
#if BLI_HAVE_SSE2
    /* SIMD path for the common 4-channel case. */
    __m128 rgba1 = _mm_loadu_ps(row1);
    __m128 rgba2 = _mm_loadu_ps(row2);
    __m128 rgba3 = _mm_loadu_ps(row3);
    __m128 rgba4 = _mm_loadu_ps(row4);
    rgba1 = _mm_mul_ps(_mm_set1_ps(ma_mb), rgba1);
    rgba2 = _mm_mul_ps(_mm_set1_ps(ma_b), rgba2);
    rgba3 = _mm_mul_ps(_mm_set1_ps(a_mb), rgba3);
    rgba4 = _mm_mul_ps(_mm_set1_ps(a_b), rgba4);
    __m128 rgba13 = _mm_add_ps(rgba1, rgba3);
    __m128 rgba24 = _mm_add_ps(rgba2, rgba4);
    __m128 rgba = _mm_add_ps(rgba13, rgba24);
    _mm_storeu_ps(output, rgba);
#else
    output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
    output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
    output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
    output[3] = ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3];
#endif
  }
}
329
/* Bilinear sampling of a 4-channel byte image at (u, v), returning RGBA.
 *
 * Template parameter `border`: when true, samples outside the image blend
 * towards transparent black; when false, coordinates clamp to edge pixels. */
template<bool border>
BLI_INLINE uchar4 bilinear_byte_impl(const uchar *buffer, int width, int height, float u, float v)
{
  BLI_assert(buffer);
  uchar4 res;

#if BLI_HAVE_SSE4
  __m128 uvuv = _mm_set_ps(v, u, v, u);
  __m128 uvuv_floor = _mm_floor_ps(uvuv);

  /* x1, y1, x2, y2 */
  __m128i xy12 = _mm_add_epi32(_mm_cvttps_epi32(uvuv_floor), _mm_set_epi32(1, 1, 0, 0));
  /* Check whether any of the coordinates are outside of the image. */
  __m128i size_minus_1 = _mm_sub_epi32(_mm_set_epi32(height, width, height, width),
                                       _mm_set1_epi32(1));

  /* Samples 1,2,3,4 will be in this order: x1y1, x1y2, x2y1, x2y2. */
  __m128i x1234, y1234, invalid_1234;

  if constexpr (border) {
    /* Blend black colors for samples right outside the image: figure out
     * which of the 4 samples were outside, set their coordinates to zero
     * and later on put black color into their place. */
    __m128i too_lo_xy12 = _mm_cmplt_epi32(xy12, _mm_setzero_si128());
    __m128i too_hi_xy12 = _mm_cmplt_epi32(size_minus_1, xy12);
    __m128i invalid_xy12 = _mm_or_si128(too_lo_xy12, too_hi_xy12);

    /* Samples 1,2,3,4 are in this order: x1y1, x1y2, x2y1, x2y2 */
    x1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(2, 2, 0, 0));
    y1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(3, 1, 3, 1));
    invalid_1234 = _mm_or_si128(_mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(2, 2, 0, 0)),
                                _mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(3, 1, 3, 1)));
    /* Set x & y to zero for invalid samples. */
    x1234 = _mm_andnot_si128(invalid_1234, x1234);
    y1234 = _mm_andnot_si128(invalid_1234, y1234);
  }
  else {
    /* Clamp samples to image edges. */
    __m128i xy12_clamped = _mm_max_epi32(xy12, _mm_setzero_si128());
    xy12_clamped = _mm_min_epi32(xy12_clamped, size_minus_1);
    x1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(2, 2, 0, 0));
    y1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(3, 1, 3, 1));
  }

  /* Read the four sample values. Do address calculations in C, since SSE
   * before 4.1 makes it very cumbersome to do full integer multiplies. */
  int xcoord[4];
  int ycoord[4];
  _mm_storeu_ps((float *)xcoord, _mm_castsi128_ps(x1234));
  _mm_storeu_ps((float *)ycoord, _mm_castsi128_ps(y1234));
  /* Each RGBA byte pixel is read as one 32-bit int. */
  int sample1 = ((const int *)buffer)[ycoord[0] * int64_t(width) + xcoord[0]];
  int sample2 = ((const int *)buffer)[ycoord[1] * int64_t(width) + xcoord[1]];
  int sample3 = ((const int *)buffer)[ycoord[2] * int64_t(width) + xcoord[2]];
  int sample4 = ((const int *)buffer)[ycoord[3] * int64_t(width) + xcoord[3]];
  __m128i samples1234 = _mm_set_epi32(sample4, sample3, sample2, sample1);
  if constexpr (border) {
    /* Set samples to black for the ones that were actually invalid. */
    samples1234 = _mm_andnot_si128(invalid_1234, samples1234);
  }

  /* Expand samples from packed 8-bit RGBA to full floats:
   * spread to 16 bit values. */
  __m128i rgba16_12 = _mm_unpacklo_epi8(samples1234, _mm_setzero_si128());
  __m128i rgba16_34 = _mm_unpackhi_epi8(samples1234, _mm_setzero_si128());
  /* Spread to 32 bit values and convert to float. */
  __m128 rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_12, _mm_setzero_si128()));
  __m128 rgba2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_12, _mm_setzero_si128()));
  __m128 rgba3 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_34, _mm_setzero_si128()));
  __m128 rgba4 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_34, _mm_setzero_si128()));

  /* Calculate interpolation factors: (1-a)*(1-b), (1-a)*b, a*(1-b), a*b */
  __m128 abab = _mm_sub_ps(uvuv, uvuv_floor);
  __m128 m_abab = _mm_sub_ps(_mm_set1_ps(1.0f), abab);
  __m128 ab_mab = _mm_shuffle_ps(abab, m_abab, _MM_SHUFFLE(3, 2, 1, 0));
  __m128 factors = _mm_mul_ps(_mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(0, 0, 2, 2)),
                              _mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(1, 3, 1, 3)));

  /* Blend the samples. */
  rgba1 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(0, 0, 0, 0)), rgba1);
  rgba2 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(1, 1, 1, 1)), rgba2);
  rgba3 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(2, 2, 2, 2)), rgba3);
  rgba4 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(3, 3, 3, 3)), rgba4);
  __m128 rgba13 = _mm_add_ps(rgba1, rgba3);
  __m128 rgba24 = _mm_add_ps(rgba2, rgba4);
  __m128 rgba = _mm_add_ps(rgba13, rgba24);
  rgba = _mm_add_ps(rgba, _mm_set1_ps(0.5f)); /* Round to nearest. */
  /* Pack and write to destination: pack to 16 bit signed, then to 8 bit
   * unsigned, then write resulting 32-bit value. */
  __m128i rgba32 = _mm_cvttps_epi32(rgba);
  __m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
  __m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
  _mm_store_ss((float *)&res, _mm_castsi128_ps(rgba8));

#else

  float uf = floorf(u);
  float vf = floorf(v);

  int x1 = int(uf);
  int x2 = x1 + 1;
  int y1 = int(vf);
  int y2 = y1 + 1;

  /* Completely outside of the image in bordered mode? */
  if (border && (x2 < 0 || x1 >= width || y2 < 0 || y1 >= height)) {
    return uchar4(0);
  }

  /* Sample locations. */
  const uchar *row1, *row2, *row3, *row4;
  uchar empty[4] = {0, 0, 0, 0};
  if constexpr (border) {
    /* Individual out-of-range taps read from `empty` (transparent black). */
    row1 = (x1 < 0 || y1 < 0) ? empty : buffer + (int64_t(width) * y1 + x1) * 4;
    row2 = (x1 < 0 || y2 > height - 1) ? empty : buffer + (int64_t(width) * y2 + x1) * 4;
    row3 = (x2 > width - 1 || y1 < 0) ? empty : buffer + (int64_t(width) * y1 + x2) * 4;
    row4 = (x2 > width - 1 || y2 > height - 1) ? empty : buffer + (int64_t(width) * y2 + x2) * 4;
  }
  else {
    x1 = blender::math::clamp(x1, 0, width - 1);
    x2 = blender::math::clamp(x2, 0, width - 1);
    y1 = blender::math::clamp(y1, 0, height - 1);
    y2 = blender::math::clamp(y2, 0, height - 1);
    row1 = buffer + (int64_t(width) * y1 + x1) * 4;
    row2 = buffer + (int64_t(width) * y2 + x1) * 4;
    row3 = buffer + (int64_t(width) * y1 + x2) * 4;
    row4 = buffer + (int64_t(width) * y2 + x2) * 4;
  }

  float a = u - uf;
  float b = v - vf;
  float a_b = a * b;
  float ma_b = (1.0f - a) * b;
  float a_mb = a * (1.0f - b);
  float ma_mb = (1.0f - a) * (1.0f - b);

  /* Blend the four taps per channel, rounding to nearest byte. */
  res.x = uchar(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] + 0.5f);
  res.y = uchar(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] + 0.5f);
  res.z = uchar(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] + 0.5f);
  res.w = uchar(ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3] + 0.5f);
#endif

  return res;
}
473
475 const uchar *buffer, int width, int height, float u, float v)
476{
477 return bilinear_byte_impl<true>(buffer, width, height, u, v);
478}
479
480uchar4 interpolate_bilinear_byte(const uchar *buffer, int width, int height, float u, float v)
481{
482 return bilinear_byte_impl<false>(buffer, width, height, u, v);
483}
484
485float4 interpolate_bilinear_border_fl(const float *buffer, int width, int height, float u, float v)
486{
487 float4 res;
488 bilinear_fl_impl<true>(buffer, res, width, height, 4, u, v);
489 return res;
490}
491
493 const float *buffer, float *output, int width, int height, int components, float u, float v)
494{
495 bilinear_fl_impl<true>(buffer, output, width, height, components, u, v);
496}
497
498float4 interpolate_bilinear_fl(const float *buffer, int width, int height, float u, float v)
499{
500 float4 res;
501 bilinear_fl_impl<false>(buffer, res, width, height, 4, u, v);
502 return res;
503}
504
506 const float *buffer, float *output, int width, int height, int components, float u, float v)
507{
508 bilinear_fl_impl<false>(buffer, output, width, height, components, u, v);
509}
510
511void interpolate_bilinear_wrap_fl(const float *buffer,
512 float *output,
513 int width,
514 int height,
515 int components,
516 float u,
517 float v,
518 bool wrap_x,
519 bool wrap_y)
520{
521 bilinear_fl_impl<false>(buffer, output, width, height, components, u, v, wrap_x, wrap_y);
522}
523
524uchar4 interpolate_bilinear_wrap_byte(const uchar *buffer, int width, int height, float u, float v)
525{
526 u = floored_fmod(u, float(width));
527 v = floored_fmod(v, float(height));
528 float uf = floorf(u);
529 float vf = floorf(v);
530
531 int x1 = int(uf);
532 int x2 = x1 + 1;
533 int y1 = int(vf);
534 int y2 = y1 + 1;
535
536 /* Wrap interpolation pixels if needed. */
537 BLI_assert(x1 >= 0 && x1 < width && y1 >= 0 && y1 < height);
538 if (x2 >= width) {
539 x2 = 0;
540 }
541 if (y2 >= height) {
542 y2 = 0;
543 }
544
545 float a = u - uf;
546 float b = v - vf;
547 float a_b = a * b;
548 float ma_b = (1.0f - a) * b;
549 float a_mb = a * (1.0f - b);
550 float ma_mb = (1.0f - a) * (1.0f - b);
551
552 /* Blend samples. */
553 const uchar *row1 = buffer + (int64_t(width) * y1 + x1) * 4;
554 const uchar *row2 = buffer + (int64_t(width) * y2 + x1) * 4;
555 const uchar *row3 = buffer + (int64_t(width) * y1 + x2) * 4;
556 const uchar *row4 = buffer + (int64_t(width) * y2 + x2) * 4;
557
558 uchar4 res;
559 res.x = uchar(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] + 0.5f);
560 res.y = uchar(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] + 0.5f);
561 res.z = uchar(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] + 0.5f);
562 res.w = uchar(ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3] + 0.5f);
563 return res;
564}
565
566float4 interpolate_bilinear_wrap_fl(const float *buffer, int width, int height, float u, float v)
567{
568 float4 res;
569 bilinear_fl_impl<false>(buffer, res, width, height, 4, u, v, true, true);
570 return res;
571}
572
573uchar4 interpolate_cubic_bspline_byte(const uchar *buffer, int width, int height, float u, float v)
574{
575 uchar4 res;
576 bicubic_interpolation<uchar, eCubicFilter::BSpline>(buffer, res, width, height, 4, u, v);
577 return res;
578}
579
580float4 interpolate_cubic_bspline_fl(const float *buffer, int width, int height, float u, float v)
581{
582 float4 res;
583 bicubic_interpolation<float, eCubicFilter::BSpline>(buffer, res, width, height, 4, u, v);
584 return res;
585}
586
588 const float *buffer, float *output, int width, int height, int components, float u, float v)
589{
591 buffer, output, width, height, components, u, v);
592}
593
595 const uchar *buffer, int width, int height, float u, float v)
596{
597 uchar4 res;
598 bicubic_interpolation<uchar, eCubicFilter::Mitchell>(buffer, res, width, height, 4, u, v);
599 return res;
600}
601
602float4 interpolate_cubic_mitchell_fl(const float *buffer, int width, int height, float u, float v)
603{
604 float4 res;
605 bicubic_interpolation<float, eCubicFilter::Mitchell>(buffer, res, width, height, 4, u, v);
606 return res;
607}
608
610 const float *buffer, float *output, int width, int height, int components, float u, float v)
611{
613 buffer, output, width, height, components, u, v);
614}
615
616} // namespace blender::math
617
618/**************************************************************************
619 * Filtering method based on
620 * "Creating raster omnimax images from multiple perspective views
621 * using the elliptical weighted average filter"
622 * by Ned Greene and Paul S. Heckbert (1986)
623 ***************************************************************************/
624
/* Table of `(exp(ar) - exp(a)) / (1 - exp(a))` for `r` in range [0, 1] and `a = -2`.
 * used instead of actual gaussian,
 * otherwise at high texture magnifications circular artifacts are visible.
 *
 * Indexed by the scaled squared elliptical distance Q in BLI_ewa_filter;
 * 256 entries decreasing monotonically from 1.0 at the center to 0.0. */
#define EWA_MAXIDX 255
const float EWA_WTS[EWA_MAXIDX + 1] = {
    1.0f, 0.990965f, 0.982f, 0.973105f, 0.96428f, 0.955524f, 0.946836f,
    0.938216f, 0.929664f, 0.921178f, 0.912759f, 0.904405f, 0.896117f, 0.887893f,
    0.879734f, 0.871638f, 0.863605f, 0.855636f, 0.847728f, 0.839883f, 0.832098f,
    0.824375f, 0.816712f, 0.809108f, 0.801564f, 0.794079f, 0.786653f, 0.779284f,
    0.771974f, 0.76472f, 0.757523f, 0.750382f, 0.743297f, 0.736267f, 0.729292f,
    0.722372f, 0.715505f, 0.708693f, 0.701933f, 0.695227f, 0.688572f, 0.68197f,
    0.67542f, 0.66892f, 0.662471f, 0.656073f, 0.649725f, 0.643426f, 0.637176f,
    0.630976f, 0.624824f, 0.618719f, 0.612663f, 0.606654f, 0.600691f, 0.594776f,
    0.588906f, 0.583083f, 0.577305f, 0.571572f, 0.565883f, 0.56024f, 0.55464f,
    0.549084f, 0.543572f, 0.538102f, 0.532676f, 0.527291f, 0.521949f, 0.516649f,
    0.511389f, 0.506171f, 0.500994f, 0.495857f, 0.490761f, 0.485704f, 0.480687f,
    0.475709f, 0.470769f, 0.465869f, 0.461006f, 0.456182f, 0.451395f, 0.446646f,
    0.441934f, 0.437258f, 0.432619f, 0.428017f, 0.42345f, 0.418919f, 0.414424f,
    0.409963f, 0.405538f, 0.401147f, 0.39679f, 0.392467f, 0.388178f, 0.383923f,
    0.379701f, 0.375511f, 0.371355f, 0.367231f, 0.363139f, 0.359079f, 0.355051f,
    0.351055f, 0.347089f, 0.343155f, 0.339251f, 0.335378f, 0.331535f, 0.327722f,
    0.323939f, 0.320186f, 0.316461f, 0.312766f, 0.3091f, 0.305462f, 0.301853f,
    0.298272f, 0.294719f, 0.291194f, 0.287696f, 0.284226f, 0.280782f, 0.277366f,
    0.273976f, 0.270613f, 0.267276f, 0.263965f, 0.26068f, 0.257421f, 0.254187f,
    0.250979f, 0.247795f, 0.244636f, 0.241502f, 0.238393f, 0.235308f, 0.232246f,
    0.229209f, 0.226196f, 0.223206f, 0.220239f, 0.217296f, 0.214375f, 0.211478f,
    0.208603f, 0.20575f, 0.20292f, 0.200112f, 0.197326f, 0.194562f, 0.191819f,
    0.189097f, 0.186397f, 0.183718f, 0.18106f, 0.178423f, 0.175806f, 0.17321f,
    0.170634f, 0.168078f, 0.165542f, 0.163026f, 0.16053f, 0.158053f, 0.155595f,
    0.153157f, 0.150738f, 0.148337f, 0.145955f, 0.143592f, 0.141248f, 0.138921f,
    0.136613f, 0.134323f, 0.132051f, 0.129797f, 0.12756f, 0.125341f, 0.123139f,
    0.120954f, 0.118786f, 0.116635f, 0.114501f, 0.112384f, 0.110283f, 0.108199f,
    0.106131f, 0.104079f, 0.102043f, 0.100023f, 0.0980186f, 0.09603f, 0.094057f,
    0.0920994f, 0.0901571f, 0.08823f, 0.0863179f, 0.0844208f, 0.0825384f, 0.0806708f,
    0.0788178f, 0.0769792f, 0.0751551f, 0.0733451f, 0.0715493f, 0.0697676f, 0.0679997f,
    0.0662457f, 0.0645054f, 0.0627786f, 0.0610654f, 0.0593655f, 0.0576789f, 0.0560055f,
    0.0543452f, 0.0526979f, 0.0510634f, 0.0494416f, 0.0478326f, 0.0462361f, 0.0446521f,
    0.0430805f, 0.0415211f, 0.039974f, 0.0384389f, 0.0369158f, 0.0354046f, 0.0339052f,
    0.0324175f, 0.0309415f, 0.029477f, 0.0280239f, 0.0265822f, 0.0251517f, 0.0237324f,
    0.0223242f, 0.020927f, 0.0195408f, 0.0181653f, 0.0168006f, 0.0154466f, 0.0141031f,
    0.0127701f, 0.0114476f, 0.0101354f, 0.00883339f, 0.00754159f, 0.00625989f, 0.00498819f,
    0.00372644f, 0.00247454f, 0.00123242f, 0.0f,
};
668
/* Convert squared ellipse radii (a2, b2) and rotation angle `th` into the
 * implicit conic coefficients of `A*x^2 + B*x*y + C*y^2 = F`. */
static void radangle2imp(float a2, float b2, float th, float *A, float *B, float *C, float *F)
{
  const float cos_th = cosf(th);
  const float cos2 = cos_th * cos_th;   /* cos(th)^2 */
  const float sin2 = 1.0f - cos2;       /* sin(th)^2 */
  *A = a2 * sin2 + b2 * cos2;
  *B = (b2 - a2) * sinf(2.0f * th);
  *C = a2 * cos2 + b2 * sin2;
  *F = a2 * b2;
}
679
/* Convert implicit conic coefficients of an ellipse `A*x^2 + B*x*y + C*y^2 = F`
 * back into major/minor radii `a`/`b`, the major-axis angle `th`, and the
 * eccentricity `ecc = a / b`. Degenerate inputs (tiny F, or non-positive
 * denominators) yield a zero minor radius and a huge eccentricity instead of
 * dividing by zero. */
void BLI_ewa_imp2radangle(
    float A, float B, float C, float F, float *a, float *b, float *th, float *ecc)
{
  /* NOTE: all tests here are done to make sure possible overflows are hopefully minimized. */

  if (F <= 1e-5f) { /* use arbitrary major radius, zero minor, infinite eccentricity */
    *a = sqrtf(A > C ? A : C);
    *b = 0.0f;
    *ecc = 1e10f;
    *th = 0.5f * (atan2f(B, A - C) + float(M_PI));
  }
  else {
    const float AmC = A - C, ApC = A + C, F2 = F * 2.0f;
    const float r = sqrtf(AmC * AmC + B * B);
    float d = ApC - r;
    /* Guard against a non-positive denominator (nearly degenerate conic). */
    *a = (d <= 0.0f) ? sqrtf(A > C ? A : C) : sqrtf(F2 / d);
    d = ApC + r;
    if (d <= 0.0f) {
      *b = 0.0f;
      *ecc = 1e10f;
    }
    else {
      *b = sqrtf(F2 / d);
      *ecc = *a / *b;
    }
    /* Increase theta by `0.5 * pi` (angle of major axis). */
    *th = 0.5f * (atan2f(B, AmC) + float(M_PI));
  }
}
709
/* Sample an image with the Elliptical Weighted Average filter.
 *
 * `uv` is the sample position and `du`/`dv` the screen-space derivatives of
 * the UV coordinates, all in normalized 0..1 texture space. Pixels are
 * fetched through `read_pixel_cb(userdata, x, y, out)` and the filtered RGBA
 * color is written to `result`. When `use_alpha` is false the result alpha is
 * forced to 1. `intpol` selects a slightly larger minimum filter radius (see
 * comment below). */
void BLI_ewa_filter(const int width,
                    const int height,
                    const bool intpol,
                    const bool use_alpha,
                    const float uv[2],
                    const float du[2],
                    const float dv[2],
                    ewa_filter_read_pixel_cb read_pixel_cb,
                    void *userdata,
                    float result[4])
{
  /* Scaling `dxt` / `dyt` by full resolution can cause overflow because of huge A/B/C and esp.
   * F values, scaling by aspect ratio alone does the opposite, so try something in between
   * instead. */
  const float ff2 = float(width), ff = sqrtf(ff2), q = float(height) / ff;
  const float Ux = du[0] * ff, Vx = du[1] * q, Uy = dv[0] * ff, Vy = dv[1] * q;
  /* Implicit conic coefficients of the filter ellipse. */
  float A = Vx * Vx + Vy * Vy;
  float B = -2.0f * (Ux * Vx + Uy * Vy);
  float C = Ux * Ux + Uy * Uy;
  float F = A * C - B * B * 0.25f;
  float a, b, th, ecc, a2, b2, ue, ve, U0, V0, DDQ, U, ac1, ac2, BU, d;
  int u, v, u1, u2, v1, v2;

  /* The so-called 'high' quality EWA method simply adds a constant of 1 to both A & C,
   * so the ellipse always covers at least some texels. But since the filter is now always larger,
   * it also means that everywhere else it's also more blurry then ideally should be the case.
   * So instead here the ellipse radii are modified instead whenever either is too low.
   * Use a different radius based on interpolation switch,
   * just enough to anti-alias when interpolation is off,
   * and slightly larger to make result a bit smoother than bilinear interpolation when
   * interpolation is on (minimum values: `const float rmin = intpol ? 1.0f : 0.5f;`) */
  const float rmin = (intpol ? 1.5625f : 0.765625f) / ff2;
  BLI_ewa_imp2radangle(A, B, C, F, &a, &b, &th, &ecc);
  if ((b2 = b * b) < rmin) {
    if ((a2 = a * a) < rmin) {
      UNUSED_VARS(a2, b2);
      /* Both radii too small: fall back to a circle of the minimum radius. */
      B = 0.0f;
      A = C = rmin;
      F = A * C;
    }
    else {
      /* Only the minor radius too small: widen it and rebuild the conic. */
      b2 = rmin;
      radangle2imp(a2, b2, th, &A, &B, &C, &F);
    }
  }

  /* Bounding-box extents of the ellipse in texel space. */
  ue = ff * sqrtf(C);
  ve = ff * sqrtf(A);
  /* Scale the conic so Q maps directly into the EWA_WTS index range. */
  d = float(EWA_MAXIDX + 1) / (F * ff2);
  A *= d;
  B *= d;
  C *= d;

  U0 = uv[0] * float(width);
  V0 = uv[1] * float(height);
  u1 = int(floorf(U0 - ue));
  u2 = int(ceilf(U0 + ue));
  v1 = int(floorf(V0 - ve));
  v2 = int(ceilf(V0 + ve));

  /* sane clamping to avoid unnecessarily huge loops */
  /* NOTE: if eccentricity gets clamped (see above),
   * the ue/ve limits can also be lowered accordingly
   */
  if (U0 - float(u1) > EWA_MAXIDX) {
    u1 = int(U0) - EWA_MAXIDX;
  }
  if (float(u2) - U0 > EWA_MAXIDX) {
    u2 = int(U0) + EWA_MAXIDX;
  }
  if (V0 - float(v1) > EWA_MAXIDX) {
    v1 = int(V0) - EWA_MAXIDX;
  }
  if (float(v2) - V0 > EWA_MAXIDX) {
    v2 = int(V0) + EWA_MAXIDX;
  }

  /* Early output check for cases the whole region is outside of the buffer. */
  if ((u2 < 0 || u1 >= width) || (v2 < 0 || v1 >= height)) {
    zero_v4(result);
    return;
  }

  U0 -= 0.5f;
  V0 -= 0.5f;
  /* Incremental evaluation of the quadric Q(u, v): DDQ is the constant
   * second difference along u, DQ the running first difference. */
  DDQ = 2.0f * A;
  U = float(u1) - U0;
  ac1 = A * (2.0f * U + 1.0f);
  ac2 = A * U * U;
  BU = B * U;

  d = 0.0f;
  zero_v4(result);
  for (v = v1; v <= v2; v++) {
    const float V = float(v) - V0;
    float DQ = ac1 + B * V;
    float Q = (C * V + BU) * V + ac2;
    for (u = u1; u <= u2; u++) {
      /* Only texels inside the ellipse (Q below table range) contribute. */
      if (Q < float(EWA_MAXIDX + 1)) {
        float tc[4];
        const float wt = EWA_WTS[(Q < 0.0f) ? 0 : uint(Q)];
        read_pixel_cb(userdata, u, v, tc);
        madd_v3_v3fl(result, tc, wt);
        result[3] += use_alpha ? tc[3] * wt : 0.0f;
        d += wt;
      }
      Q += DQ;
      DQ += DDQ;
    }
  }

  /* `d` should hopefully never be zero anymore. */
  d = 1.0f / d;
  mul_v3_fl(result, d);
  /* Clipping can be ignored if alpha used, `texr->trgba[3]` already includes filtered edge. */
  result[3] = use_alpha ? result[3] * d : 1.0f;
}
#define BLI_assert(a)
Definition BLI_assert.h:50
#define BLI_INLINE
MINLINE float floored_fmod(float f, float n)
#define M_PI
void(*)(void *userdata, int x, int y, float result[4]) ewa_filter_read_pixel_cb
MINLINE void copy_v4_v4(float r[4], const float a[4])
MINLINE void madd_v3_v3fl(float r[3], const float a[3], float f)
MINLINE void mul_v3_fl(float r[3], float f)
MINLINE void copy_v3_v3(float r[3], const float a[3])
void copy_vn_fl(float *array_tar, int size, float val)
MINLINE void zero_v4(float r[4])
unsigned char uchar
unsigned int uint
#define CLAMP(a, b, c)
#define UNUSED_VARS(...)
#define C
Definition RandGen.cpp:29
#define U
ATTR_WARN_UNUSED_RESULT const BMVert * v2
ATTR_WARN_UNUSED_RESULT const BMVert * v
#define A
unsigned int U
Definition btGjkEpa3.h:78
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
Definition btQuadWord.h:119
local_group_size(16, 16) .push_constant(Type b
#define sinf(x)
#define cosf(x)
#define atan2f(x, y)
#define ceilf(x)
#define floorf(x)
#define sqrtf(x)
draw_view in_light_buf[] float
draw_view push_constant(Type::INT, "radiance_src") .push_constant(Type capture_info_buf storage_buf(1, Qualifier::READ, "ObjectBounds", "bounds_buf[]") .push_constant(Type draw_view int
#define EWA_MAXIDX
static void radangle2imp(float a2, float b2, float th, float *A, float *B, float *C, float *F)
const float EWA_WTS[EWA_MAXIDX+1]
void BLI_ewa_filter(const int width, const int height, const bool intpol, const bool use_alpha, const float uv[2], const float du[2], const float dv[2], ewa_filter_read_pixel_cb read_pixel_cb, void *userdata, float result[4])
void BLI_ewa_imp2radangle(float A, float B, float C, float F, float *a, float *b, float *th, float *ecc)
#define B
uchar4 interpolate_cubic_mitchell_byte(const uchar *buffer, int width, int height, float u, float v)
static void bicubic_interpolation(const T *src_buffer, T *output, int width, int height, int components, float u, float v)
uchar4 interpolate_bilinear_wrap_byte(const uchar *buffer, int width, int height, float u, float v)
static float4 cubic_filter_coefficients(float f)
BLI_INLINE void bilinear_fl_impl(const float *buffer, float *output, int width, int height, int components, float u, float v, bool wrap_x=false, bool wrap_y=false)
uchar4 interpolate_bilinear_byte(const uchar *buffer, int width, int height, float u, float v)
BLI_INLINE uchar4 bilinear_byte_impl(const uchar *buffer, int width, int height, float u, float v)
T clamp(const T &a, const T &min, const T &max)
T floor(const T &a)
float4 interpolate_bilinear_border_fl(const float *buffer, int width, int height, float u, float v)
float4 interpolate_bilinear_wrap_fl(const float *buffer, int width, int height, float u, float v)
float4 interpolate_cubic_bspline_fl(const float *buffer, int width, int height, float u, float v)
T min(const T &a, const T &b)
float4 interpolate_bilinear_fl(const float *buffer, int width, int height, float u, float v)
float4 interpolate_cubic_mitchell_fl(const float *buffer, int width, int height, float u, float v)
T max(const T &a, const T &b)
uchar4 interpolate_cubic_bspline_byte(const uchar *buffer, int width, int height, float u, float v)
uchar4 interpolate_bilinear_border_byte(const uchar *buffer, int width, int height, float u, float v)
blender::VecBase< uint8_t, 4 > uchar4
VecBase< float, 4 > float4
__int64 int64_t
Definition stdint.h:89
CCL_NAMESPACE_BEGIN struct Window V