24# include <emmintrin.h>
40 int half_window_size) {
42 int min_x =
floor(x) - half_window_size - 1;
43 int min_y =
floor(y) - half_window_size - 1;
44 if (min_x < 0.0 || min_y < 0.0) {
49 int max_x =
ceil(x) + half_window_size + 1;
50 int max_y =
ceil(y) + half_window_size + 1;
51 if (max_x > image1.cols() || max_y > image1.rows()) {
70inline static __m128i SumOfAbsoluteDifferencesContiguousSSE(
71 const unsigned char* a,
72 const unsigned char*
b,
76 for (
unsigned int j = 0; j < size / 16; j++) {
77 sad = _mm_add_epi32(sad,
78 _mm_sad_epu8(_mm_load_si128((__m128i*)(a + 16 * j)),
79 _mm_loadu_si128((__m128i*)(
b + 16 * j))));
90 unsigned int remainder = size % 16u;
92 unsigned int j = size / 16;
93 __m128i a_trail = _mm_load_si128((__m128i*)(a + 16 * j));
94 __m128i b_trail = _mm_loadu_si128((__m128i*)(
b + 16 * j));
99 mask = _mm_setr_epi8(
X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
102 mask = _mm_setr_epi8(
X,
X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
105 mask = _mm_setr_epi8(
X,
X,
X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
108 mask = _mm_setr_epi8(
X,
X,
X,
X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
111 mask = _mm_setr_epi8(
X,
X,
X,
X,
X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
114 mask = _mm_setr_epi8(
X,
X,
X,
X,
X,
X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
117 mask = _mm_setr_epi8(
X,
X,
X,
X,
X,
X,
X, 0, 0, 0, 0, 0, 0, 0, 0, 0);
120 mask = _mm_setr_epi8(
X,
X,
X,
X,
X,
X,
X,
X, 0, 0, 0, 0, 0, 0, 0, 0);
123 mask = _mm_setr_epi8(
X,
X,
X,
X,
X,
X,
X,
X,
X, 0, 0, 0, 0, 0, 0, 0);
126 mask = _mm_setr_epi8(
X,
X,
X,
X,
X,
X,
X,
X,
X,
X, 0, 0, 0, 0, 0, 0);
129 mask = _mm_setr_epi8(
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X, 0, 0, 0, 0, 0);
132 mask = _mm_setr_epi8(
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X, 0, 0, 0, 0);
135 mask = _mm_setr_epi8(
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X, 0, 0, 0);
138 mask = _mm_setr_epi8(
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X, 0, 0);
141 mask = _mm_setr_epi8(
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X, 0);
144 default: mask = _mm_setzero_si128();
break;
147 sad = _mm_add_epi32(sad,
148 _mm_sad_epu8(_mm_and_si128(mask, a_trail),
149 _mm_and_si128(mask, b_trail)));
158int SumOfAbsoluteDifferencesContiguousImage(
const unsigned char* pattern,
159 unsigned int pattern_width,
160 unsigned int pattern_height,
161 unsigned int pattern_stride,
162 const unsigned char* image,
163 unsigned int image_stride) {
170 __m128i sad = _mm_setzero_si128();
171 for (
int r = 0; r < pattern_height; ++r) {
172 sad = SumOfAbsoluteDifferencesContiguousSSE(&pattern[pattern_stride * r],
173 &image[image_stride * r],
177 return _mm_cvtsi128_si32(
178 _mm_add_epi32(sad, _mm_shuffle_epi32(sad, _MM_SHUFFLE(3, 0, 1, 2))));
181 for (
int r = 0; r < pattern_height; ++r) {
182 for (
int c = 0; c < pattern_width; ++c) {
183 sad +=
abs(pattern[pattern_stride * r + c] - image[image_stride * r + c]);
193void SampleRectangularPattern(
const FloatImage& image,
199 unsigned char* pattern) {
204 int end_width = (width / 2) + (width % 2);
205 int end_height = (height / 2) + (height % 2);
206 for (
int r = -height / 2; r < end_height; ++r) {
207 for (
int c = -width / 2; c < end_width; ++c) {
208 pattern[pattern_stride * (r + height / 2) + c + width / 2] =
215inline int PadToAlignment(
int x,
int alignment) {
216 if (x % alignment != 0) {
217 x += alignment - (x % alignment);
228void SampleSquarePattern(
const FloatImage& image,
232 unsigned char** pattern,
233 int* pattern_stride) {
234 int width = 2 * half_width + 1;
237 *pattern_stride = PadToAlignment(width, 16);
238 int pattern_size_bytes = *pattern_stride * width;
240 static_cast<unsigned char*
>(
aligned_malloc(pattern_size_bytes, 16));
241 SampleRectangularPattern(
242 image, x, y, width, width, *pattern_stride, *pattern);
246void FloatArrayToByteArrayWithPadding(
const FloatImage& float_image,
247 unsigned char** image,
250 *image_stride = float_image.Width() + 16;
251 *image =
static_cast<unsigned char*
>(
253 for (
int i = 0; i < float_image.Height(); ++i) {
254 for (
int j = 0; j < float_image.Width(); ++j) {
255 (*image)[*image_stride * i + j] =
256 static_cast<unsigned char>(255.0 * float_image(i, j, 0));
275 LG <<
"Fell out of image1's window with x1=" << x1 <<
", y1=" << y1
287 unsigned char* pattern;
293 unsigned char* search_area;
294 int search_area_stride;
295 FloatArrayToByteArrayWithPadding(
296 image_and_gradient2, &search_area, &search_area_stride);
299 int best_i = -1, best_j = -1, best_sad = INT_MAX;
300 for (
int i = 0; i < image2.
Height() - pattern_width; ++i) {
301 for (
int j = 0; j < image2.
Width() - pattern_width; ++j) {
302 int sad = SumOfAbsoluteDifferencesContiguousImage(
307 search_area + search_area_stride * i + j,
309 if (sad < best_sad) {
323 if (best_sad == INT_MAX) {
324 LG <<
"Hit INT_MAX in SAD; failing.";
332 double dx2 = *x2 - x1;
333 double dy2 = *y2 - y1;
334 double fine_shift =
sqrt(dx2 * dx2 + dy2 * dy2);
335 LG <<
"Brute shift: dx=" << dx2 <<
" dy=" << dy2 <<
", d=" << fine_shift;
339 LG <<
"No correlation checking; returning success. best_sad: " << best_sad;
343 Array3Df image_and_gradient1_sampled, image_and_gradient2_sampled;
349 &image_and_gradient1_sampled);
355 &image_and_gradient2_sampled);
360 image_and_gradient1_sampled, image_and_gradient2_sampled);
362 LG <<
"Final correlation: " << correlation;
365 LG <<
"Correlation " << correlation <<
" greater than "
3D array (row, column, channel).
local_group_size(16, 16) .push_constant(Type b
ccl_device_inline float2 floor(const float2 a)
ccl_device_inline float3 ceil(const float3 a)
ccl_device_inline float4 mask(const int4 mask, const float4 a)
double PearsonProductMomentCorrelation(const FloatImage &image_and_gradient1_sampled, const FloatImage &image_and_gradient2_sampled)
static bool RegionIsInBounds(const FloatImage &image1, double x, double y, int half_window_size)
T SampleLinear(const Array3D< T > &image, float y, float x, int v=0)
Linear interpolation.
void BlurredImageAndDerivativesChannels(const Array3Df &in, double sigma, Array3Df *blurred_and_gradxy)
void SamplePattern(const FloatImage &image, double x, double y, int half_width, int channels, FloatImage *sampled)
void aligned_free(void *ptr)
void * aligned_malloc(int size, int alignment)
virtual bool Track(const FloatImage &image1, const FloatImage &image2, double x1, double y1, double *x2, double *y2) const
double minimum_correlation
ccl_device_inline int abs(int x)