// Blender 4.3 — libmv "brute_region_tracker.cc".
// Copyright (c) 2011 libmv authors.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

#include "libmv/tracking/brute_region_tracker.h"

#ifdef __SSE2__
#  include <emmintrin.h>
#endif

#include <climits>
#include <cmath>
#include <cstdlib>

#include "libmv/base/aligned_malloc.h"
#include "libmv/image/convolve.h"
#include "libmv/image/correlation.h"
#include "libmv/image/image.h"
#include "libmv/image/sample.h"
#include "libmv/logging/logging.h"
34namespace libmv {
35namespace {
36
37bool RegionIsInBounds(const FloatImage& image1,
38 double x,
39 double y,
40 int half_window_size) {
41 // Check the minimum coordinates.
42 int min_x = floor(x) - half_window_size - 1;
43 int min_y = floor(y) - half_window_size - 1;
44 if (min_x < 0.0 || min_y < 0.0) {
45 return false;
46 }
47
48 // Check the maximum coordinates.
49 int max_x = ceil(x) + half_window_size + 1;
50 int max_y = ceil(y) + half_window_size + 1;
51 if (max_x > image1.cols() || max_y > image1.rows()) {
52 return false;
53 }
54
55 // Ok, we're good.
56 return true;
57}
58
59#ifdef __SSE2__
60
// Compute the sum of absolute differences between the arrays "a" and "b".
// The array "a" is assumed to be 16-byte aligned, while "b" is not. The
// result is returned as the first and third elements of __m128i if
// interpreted as a 4-element 32-bit integer array. The SAD is the sum of the
// elements.
//
// The function requires size % 16 valid extra elements at the end of both "a"
// and "b", since the SSE load instructions will pull in memory past the end
// of the arrays if their size is not a multiple of 16.
inline static __m128i SumOfAbsoluteDifferencesContiguousSSE(
    const unsigned char* a,  // aligned
    const unsigned char* b,  // not aligned
    unsigned int size,
    __m128i sad) {
  // Do the bulk of the work as 16-way integer operations; _mm_sad_epu8
  // produces two 16-bit partial sums (lanes 0 and 2 of the 32-bit view),
  // which accumulate into |sad|.
  for (unsigned int j = 0; j < size / 16; j++) {
    sad = _mm_add_epi32(sad,
                        _mm_sad_epu8(_mm_load_si128((__m128i*)(a + 16 * j)),
                                     _mm_loadu_si128((__m128i*)(b + 16 * j))));
  }
  // Handle the trailing end.
  // TODO(keir): Benchmark to verify that the below SSE is a win compared to a
  // hand-rolled loop. It's not clear that the hand rolled loop would be slower
  // than the potential cache miss when loading the immediate table below.
  //
  // An alternative to this version is to take a packet of all 1's then do a
  // 128-bit shift. The issue is that the shift instruction needs an immediate
  // amount rather than a variable amount, so the branch instruction here must
  // remain. See _mm_srli_si128 and _mm_slli_si128.
  unsigned int remainder = size % 16u;
  if (remainder) {
    unsigned int j = size / 16;
    __m128i a_trail = _mm_load_si128((__m128i*)(a + 16 * j));
    __m128i b_trail = _mm_loadu_si128((__m128i*)(b + 16 * j));
    __m128i mask;
    // Build a byte mask with 0xff in the first |remainder| lanes so bytes
    // loaded past the logical end contribute zero to the SAD below.
    switch (remainder) {
# define X 0xff
      case 1:
        mask = _mm_setr_epi8(X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        break;
      case 2:
        mask = _mm_setr_epi8(X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        break;
      case 3:
        mask = _mm_setr_epi8(X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        break;
      case 4:
        mask = _mm_setr_epi8(X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        break;
      case 5:
        mask = _mm_setr_epi8(X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        break;
      case 6:
        mask = _mm_setr_epi8(X, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        break;
      case 7:
        mask = _mm_setr_epi8(X, X, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        break;
      case 8:
        mask = _mm_setr_epi8(X, X, X, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0);
        break;
      case 9:
        mask = _mm_setr_epi8(X, X, X, X, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0);
        break;
      case 10:
        mask = _mm_setr_epi8(X, X, X, X, X, X, X, X, X, X, 0, 0, 0, 0, 0, 0);
        break;
      case 11:
        mask = _mm_setr_epi8(X, X, X, X, X, X, X, X, X, X, X, 0, 0, 0, 0, 0);
        break;
      case 12:
        mask = _mm_setr_epi8(X, X, X, X, X, X, X, X, X, X, X, X, 0, 0, 0, 0);
        break;
      case 13:
        mask = _mm_setr_epi8(X, X, X, X, X, X, X, X, X, X, X, X, X, 0, 0, 0);
        break;
      case 14:
        mask = _mm_setr_epi8(X, X, X, X, X, X, X, X, X, X, X, X, X, X, 0, 0);
        break;
      case 15:
        mask = _mm_setr_epi8(X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, 0);
        break;
      // To silence compiler warning.
      default: mask = _mm_setzero_si128(); break;
# undef X
    }
    sad = _mm_add_epi32(sad,
                        _mm_sad_epu8(_mm_and_si128(mask, a_trail),
                                     _mm_and_si128(mask, b_trail)));
  }
  return sad;
}
153#endif
154
155// Computes the sum of absolute differences between pattern and image. Pattern
156// must be 16-byte aligned, and the stride must be a multiple of 16. The image
157// does pointer does not have to be aligned.
158int SumOfAbsoluteDifferencesContiguousImage(const unsigned char* pattern,
159 unsigned int pattern_width,
160 unsigned int pattern_height,
161 unsigned int pattern_stride,
162 const unsigned char* image,
163 unsigned int image_stride) {
164#ifdef __SSE2__
165 // TODO(keir): Add interleaved accumulation, where accumulation is done into
166 // two or more SSE registers that then get combined at the end. This reduces
167 // instruction dependency; in Eigen's squared norm code, splitting the
168 // accumulation produces a ~2x speedup. It's not clear it will help here,
169 // where the number of SSE instructions in the inner loop is smaller.
170 __m128i sad = _mm_setzero_si128();
171 for (int r = 0; r < pattern_height; ++r) {
172 sad = SumOfAbsoluteDifferencesContiguousSSE(&pattern[pattern_stride * r],
173 &image[image_stride * r],
174 pattern_width,
175 sad);
176 }
177 return _mm_cvtsi128_si32(
178 _mm_add_epi32(sad, _mm_shuffle_epi32(sad, _MM_SHUFFLE(3, 0, 1, 2))));
179#else
180 int sad = 0;
181 for (int r = 0; r < pattern_height; ++r) {
182 for (int c = 0; c < pattern_width; ++c) {
183 sad += abs(pattern[pattern_stride * r + c] - image[image_stride * r + c]);
184 }
185 }
186 return sad;
187#endif
188}
189
190// Sample a region of size width, height centered at x,y in image, converting
191// from float to byte in the process. Samples from the first channel. Puts
192// result into *pattern.
193void SampleRectangularPattern(const FloatImage& image,
194 double x,
195 double y,
196 int width,
197 int height,
198 int pattern_stride,
199 unsigned char* pattern) {
200 // There are two cases for width and height: even or odd. If it's odd, then
201 // the bounds [-width / 2, width / 2] works as expected. However, for even,
202 // this results in one extra access past the end. So use < instead of <= in
203 // the loops below, but increase the end limit by one in the odd case.
204 int end_width = (width / 2) + (width % 2);
205 int end_height = (height / 2) + (height % 2);
206 for (int r = -height / 2; r < end_height; ++r) {
207 for (int c = -width / 2; c < end_width; ++c) {
208 pattern[pattern_stride * (r + height / 2) + c + width / 2] =
209 SampleLinear(image, y + r, x + c, 0) * 255.0;
210 }
211 }
212}
213
// Returns x rounded up to the nearest multiple of alignment.
inline int PadToAlignment(int x, int alignment) {
  const int remainder = x % alignment;
  return remainder == 0 ? x : x + (alignment - remainder);
}
221
222// Sample a region centered at x,y in image with size extending by half_width
223// from x. Samples from the first channel. The resulting array is placed in
224// *pattern, and the stride, which will be a multiple of 16 if SSE is enabled,
225// is returned in *pattern_stride.
226//
227// NOTE: Caller must free *pattern with aligned_malloc() from above.
228void SampleSquarePattern(const FloatImage& image,
229 double x,
230 double y,
231 int half_width,
232 unsigned char** pattern,
233 int* pattern_stride) {
234 int width = 2 * half_width + 1;
235 // Allocate an aligned block with padding on the end so each row of the
236 // pattern starts on a 16-byte boundary.
237 *pattern_stride = PadToAlignment(width, 16);
238 int pattern_size_bytes = *pattern_stride * width;
239 *pattern =
240 static_cast<unsigned char*>(aligned_malloc(pattern_size_bytes, 16));
241 SampleRectangularPattern(
242 image, x, y, width, width, *pattern_stride, *pattern);
243}
244
245// NOTE: Caller must free *image with aligned_malloc() from above.
246void FloatArrayToByteArrayWithPadding(const FloatImage& float_image,
247 unsigned char** image,
248 int* image_stride) {
249 // Allocate enough so that accessing 16 elements past the end is fine.
250 *image_stride = float_image.Width() + 16;
251 *image = static_cast<unsigned char*>(
252 aligned_malloc(*image_stride * float_image.Height(), 16));
253 for (int i = 0; i < float_image.Height(); ++i) {
254 for (int j = 0; j < float_image.Width(); ++j) {
255 (*image)[*image_stride * i + j] =
256 static_cast<unsigned char>(255.0 * float_image(i, j, 0));
257 }
258 }
259}
260
261} // namespace
262
263// TODO(keir): Compare the "sharpness" of the peak around the best pixel. It's
264// probably worth plotting a few examples to see what the histogram of SAD
265// values for every hypothesis looks like.
266//
267// TODO(keir): Priority queue for multiple hypothesis.
269 const FloatImage& image2,
270 double x1,
271 double y1,
272 double* x2,
273 double* y2) const {
274 if (!RegionIsInBounds(image1, x1, y1, half_window_size)) {
275 LG << "Fell out of image1's window with x1=" << x1 << ", y1=" << y1
276 << ", hw=" << half_window_size << ".";
277 return false;
278 }
279 int pattern_width = 2 * half_window_size + 1;
280
281 Array3Df image_and_gradient1;
282 Array3Df image_and_gradient2;
283 BlurredImageAndDerivativesChannels(image1, 0.9, &image_and_gradient1);
284 BlurredImageAndDerivativesChannels(image2, 0.9, &image_and_gradient2);
285
286 // Sample the pattern to get it aligned to an image grid.
287 unsigned char* pattern;
288 int pattern_stride;
289 SampleSquarePattern(
290 image_and_gradient1, x1, y1, half_window_size, &pattern, &pattern_stride);
291
292 // Convert the search area directly to bytes without sampling.
293 unsigned char* search_area;
294 int search_area_stride;
295 FloatArrayToByteArrayWithPadding(
296 image_and_gradient2, &search_area, &search_area_stride);
297
298 // Try all possible locations inside the search area. Yes, everywhere.
299 int best_i = -1, best_j = -1, best_sad = INT_MAX;
300 for (int i = 0; i < image2.Height() - pattern_width; ++i) {
301 for (int j = 0; j < image2.Width() - pattern_width; ++j) {
302 int sad = SumOfAbsoluteDifferencesContiguousImage(
303 pattern,
304 pattern_width,
305 pattern_width,
306 pattern_stride,
307 search_area + search_area_stride * i + j,
308 search_area_stride);
309 if (sad < best_sad) {
310 best_i = i;
311 best_j = j;
312 best_sad = sad;
313 }
314 }
315 }
316
317 CHECK_NE(best_i, -1);
318 CHECK_NE(best_j, -1);
319
320 aligned_free(pattern);
321 aligned_free(search_area);
322
323 if (best_sad == INT_MAX) {
324 LG << "Hit INT_MAX in SAD; failing.";
325 return false;
326 }
327
328 *x2 = best_j + half_window_size;
329 *y2 = best_i + half_window_size;
330
331 // Calculate the shift done by the fine tracker.
332 double dx2 = *x2 - x1;
333 double dy2 = *y2 - y1;
334 double fine_shift = sqrt(dx2 * dx2 + dy2 * dy2);
335 LG << "Brute shift: dx=" << dx2 << " dy=" << dy2 << ", d=" << fine_shift;
336
337 if (minimum_correlation <= 0) {
338 // No correlation checking requested; nothing else to do.
339 LG << "No correlation checking; returning success. best_sad: " << best_sad;
340 return true;
341 }
342
343 Array3Df image_and_gradient1_sampled, image_and_gradient2_sampled;
344 SamplePattern(image_and_gradient1,
345 x1,
346 y1,
348 3,
349 &image_and_gradient1_sampled);
350 SamplePattern(image_and_gradient2,
351 *x2,
352 *y2,
354 3,
355 &image_and_gradient2_sampled);
356
357 // Compute the Pearson product-moment correlation coefficient to check
358 // for sanity.
359 double correlation = PearsonProductMomentCorrelation(
360 image_and_gradient1_sampled, image_and_gradient2_sampled);
361
362 LG << "Final correlation: " << correlation;
363
364 if (correlation < minimum_correlation) {
365 LG << "Correlation " << correlation << " greater than "
366 << minimum_correlation << "; bailing.";
367 return false;
368 }
369 return true;
370}
371
372} // namespace libmv
sqrt(x)+1/max(0
#define X
3D array (row, column, channel).
Definition array_nd.h:332
int Height() const
Definition array_nd.h:345
int Width() const
Definition array_nd.h:346
local_group_size(16, 16) .push_constant(Type b
#define LG
#define CHECK_NE(a, b)
Definition log.h:45
ccl_device_inline float2 floor(const float2 a)
ccl_device_inline float3 ceil(const float3 a)
ccl_device_inline float4 mask(const int4 mask, const float4 a)
double PearsonProductMomentCorrelation(const FloatImage &image_and_gradient1_sampled, const FloatImage &image_and_gradient2_sampled)
Definition correlation.h:29
static bool RegionIsInBounds(const FloatImage &image1, double x, double y, int half_window_size)
Array3Df FloatImage
T SampleLinear(const Array3D< T > &image, float y, float x, int v=0)
Linear interpolation.
void BlurredImageAndDerivativesChannels(const Array3Df &in, double sigma, Array3Df *blurred_and_gradxy)
Definition convolve.cc:233
void SamplePattern(const FloatImage &image, double x, double y, int half_width, int channels, FloatImage *sampled)
void aligned_free(void *ptr)
void * aligned_malloc(int size, int alignment)
virtual bool Track(const FloatImage &image1, const FloatImage &image2, double x1, double y1, double *x2, double *y2) const
ccl_device_inline int abs(int x)
Definition util/math.h:120