/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <math.h>

#include <arm_neon.h>

#include "av1/encoder/rdopt.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"

// Process horizontal and vertical correlations in a 4x4 block of pixels.
// We actually use the 4x4 pixels to calculate correlations corresponding to
// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
// moving the window along/down by 3 pixels at a time.
//
// Accumulates into the four caller-owned int32x4_t accumulators:
//   xy_sum_32 - products of horizontally adjacent pixels
//   xz_sum_32 - products of vertically adjacent pixels
//   x_sum_32  - plain sum of the top-left 3x3 pixels
//   x2_sum_32 - sum of squares of the top-left 3x3 pixels
static inline void horver_correlation_4x4(const int16_t *diff, int stride,
                                          int32x4_t *xy_sum_32,
                                          int32x4_t *xz_sum_32,
                                          int32x4_t *x_sum_32,
                                          int32x4_t *x2_sum_32) {
  // Pixels in this 4x4   [ a b c d ]
  // are referred to as:  [ e f g h ]
  //                      [ i j k l ]
  //                      [ m n o p ]

  // For each row, "_sli" is the row shifted one int16 lane up (a 16-bit left
  // shift of the 64-bit vector), i.e. [ a b c d ] -> [ 0 a b c ].  Lane 0 is
  // zero, so multiplying a row by its "_sli" copy yields only the three
  // adjacent-pixel products, matching the 3x3 window described above.
  const int16x4_t pixelsa_2_lo = vld1_s16(diff + (0 * stride));
  const int16x4_t pixelsa_2_sli =
      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_2_lo), 16));
  const int16x4_t pixelsb_2_lo = vld1_s16(diff + (1 * stride));
  const int16x4_t pixelsb_2_sli =
      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_2_lo), 16));
  const int16x4_t pixelsa_1_lo = vld1_s16(diff + (2 * stride));
  const int16x4_t pixelsa_1_sli =
      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_1_lo), 16));
  const int16x4_t pixelsb_1_lo = vld1_s16(diff + (3 * stride));
  const int16x4_t pixelsb_1_sli =
      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_1_lo), 16));

  const int16x8_t slli_a = vcombine_s16(pixelsa_1_sli, pixelsa_2_sli);

  // Horizontal correlations: each row times its one-lane-shifted self.
  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_1_lo, pixelsa_1_sli);
  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_2_lo, pixelsa_2_sli);
  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsb_2_lo, pixelsb_2_sli);

  // Vertical correlations: shifted row times the shifted row below it.
  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_1_sli);
  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_2_sli, pixelsb_2_sli);
  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_2_sli);

  // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k
  // (sum up every element in slli_a and swap_b)
  *x_sum_32 = vpadalq_s16(*x_sum_32, slli_a);
  *x_sum_32 = vaddw_s16(*x_sum_32, pixelsb_2_sli);

  // Also sum their squares
  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_1_sli, pixelsa_1_sli);
  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_2_sli, pixelsa_2_sli);
  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsb_2_sli, pixelsb_2_sli);
}

// Compute the horizontal (*hcorr) and vertical (*vcorr) first-order
// correlation coefficients of the (width x height) residual block `diff`
// (row stride `stride`).  Both outputs are clamped to [0, 1]; 1.0 is
// returned when a variance term is non-positive.
void av1_get_horver_correlation_full_neon(const int16_t *diff, int stride,
                                          int width, int height, float *hcorr,
                                          float *vcorr) {
  // The following notation is used:
  // x - current pixel
  // y - right neighbour pixel
  // z - below neighbour pixel
  // w - down-right neighbour pixel
  int64_t xy_sum = 0, xz_sum = 0;
  int64_t x_sum = 0, x2_sum = 0;
  int32x4_t zero = vdupq_n_s32(0);
  int64x2_t v_x_sum = vreinterpretq_s64_s32(zero);
  int64x2_t v_xy_sum = vreinterpretq_s64_s32(zero);
  int64x2_t v_xz_sum = vreinterpretq_s64_s32(zero);
  int64x2_t v_x2_sum = vreinterpretq_s64_s32(zero);
  // Process horizontal and vertical correlations through the body in 4x4
  // blocks.  This excludes the final row and column and possibly one extra
  // column depending how 3 divides into width and height

  for (int i = 0; i <= height - 4; i += 3) {
    // Fresh 32-bit accumulators per row strip; widened to 64 bits below so
    // they cannot overflow across a large block.
    int32x4_t xy_sum_32 = zero;
    int32x4_t xz_sum_32 = zero;
    int32x4_t x_sum_32 = zero;
    int32x4_t x2_sum_32 = zero;
    for (int j = 0; j <= width - 4; j += 3) {
      horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
                             &xz_sum_32, &x_sum_32, &x2_sum_32);
    }
    v_xy_sum = vpadalq_s32(v_xy_sum, xy_sum_32);
    v_xz_sum = vpadalq_s32(v_xz_sum, xz_sum_32);
    v_x_sum = vpadalq_s32(v_x_sum, x_sum_32);
    v_x2_sum = vpadalq_s32(v_x2_sum, x2_sum_32);
  }
#if AOM_ARCH_AARCH64
  xy_sum = vaddvq_s64(v_xy_sum);
  xz_sum = vaddvq_s64(v_xz_sum);
  x2_sum = vaddvq_s64(v_x2_sum);
  x_sum = vaddvq_s64(v_x_sum);
#else
  // AArch32 has no across-vector add; reduce by pairwise 64-bit addition.
  xy_sum = vget_lane_s64(
      vadd_s64(vget_low_s64(v_xy_sum), vget_high_s64(v_xy_sum)), 0);
  xz_sum = vget_lane_s64(
      vadd_s64(vget_low_s64(v_xz_sum), vget_high_s64(v_xz_sum)), 0);
  x2_sum = vget_lane_s64(
      vadd_s64(vget_low_s64(v_x2_sum), vget_high_s64(v_x2_sum)), 0);
  x_sum =
      vget_lane_s64(vadd_s64(vget_low_s64(v_x_sum), vget_high_s64(v_x_sum)), 0);
#endif
  // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
  int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;

  // Do we have 2 rows remaining or just the one?  Note that width and height
  // are powers of 2, so each modulo 3 must be 1 or 2.
  if (height % 3 == 1) {  // Just horiz corrs on the final row
    const int16_t x0 = diff[(height - 1) * stride];
    x_sum += x0;
    x_finalrow += x0;
    x2_sum += x0 * x0;
    x2_finalrow += x0 * x0;
    if (width >= 8) {
      int32x4_t v_y_sum = zero;
      int32x4_t v_y2_sum = zero;
      int32x4_t v_xy_sum_a = zero;
      int k = width - 1;
      int j = 0;
      // Full 8-wide iterations; the tail (<= 8 remaining pairs) is handled
      // below with zero-padded vectors.
      while ((k - 8) > 0) {
        const int16x8_t v_x = vld1q_s16(&diff[(height - 1) * stride + j]);
        const int16x8_t v_y = vld1q_s16(&diff[(height - 1) * stride + j + 1]);
        const int16x4_t v_x_lo = vget_low_s16(v_x);
        const int16x4_t v_x_hi = vget_high_s16(v_x);
        const int16x4_t v_y_lo = vget_low_s16(v_y);
        const int16x4_t v_y_hi = vget_high_s16(v_y);
        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
        v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
        v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
        v_y_sum = vpadalq_s16(v_y_sum, v_y);
        k -= 8;
        j += 8;
      }

      // Tail: v_x is v_l with its last lane zeroed (double vextq_s16 trick),
      // v_y is v_l shifted one lane down with a zero shifted in, so the
      // padding lanes contribute nothing to the sums.
      const int16x8_t v_l = vld1q_s16(&diff[(height - 1) * stride] + j);
      const int16x8_t v_x =
          vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7),
                    vreinterpretq_s16_s32(zero), 1);
      const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1);
      const int16x4_t v_x_lo = vget_low_s16(v_x);
      const int16x4_t v_x_hi = vget_high_s16(v_x);
      const int16x4_t v_y_lo = vget_low_s16(v_y);
      const int16x4_t v_y_hi = vget_high_s16(v_y);
      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
      v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
      v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
      const int32x4_t v_y_sum_a = vpadalq_s16(v_y_sum, v_y);
      const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
#if AOM_ARCH_AARCH64
      const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
      xy_sum += vaddvq_s64(v_xy_sum2);
      const int32_t y = vaddvq_s32(v_y_sum_a);
      const int64_t y2 = vaddvq_s64(v_y2_sum_a);
#else
      xy_sum += vget_lane_s64(
          vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0);
      const int64x2_t v_y_a = vpaddlq_s32(v_y_sum_a);
      const int64_t y =
          vget_lane_s64(vadd_s64(vget_low_s64(v_y_a), vget_high_s64(v_y_a)), 0);
      const int64x2_t v_y2_sum_b = vpaddlq_s32(v_y2_sum);
      int64_t y2 = vget_lane_s64(
          vadd_s64(vget_low_s64(v_y2_sum_b), vget_high_s64(v_y2_sum_b)), 0);
#endif
      x_sum += y;
      x2_sum += y2;
      x_finalrow += y;
      x2_finalrow += y2;
    } else {
      // Narrow blocks: plain scalar loop over the final row.
      for (int j = 0; j < width - 1; ++j) {
        const int16_t x = diff[(height - 1) * stride + j];
        const int16_t y = diff[(height - 1) * stride + j + 1];
        xy_sum += x * y;
        x_sum += y;
        x2_sum += y * y;
        x_finalrow += y;
        x2_finalrow += y * y;
      }
    }
  } else {  // Two rows remaining to do
    const int16_t x0 = diff[(height - 2) * stride];
    const int16_t z0 = diff[(height - 1) * stride];
    x_sum += x0 + z0;
    x2_sum += x0 * x0 + z0 * z0;
    x_finalrow += z0;
    x2_finalrow += z0 * z0;
    if (width >= 8) {
      int32x4_t v_y2_sum = zero;
      int32x4_t v_w2_sum = zero;
      int32x4_t v_xy_sum_a = zero;
      int32x4_t v_xz_sum_a = zero;
      int32x4_t v_x_sum_a = zero;
      int32x4_t v_w_sum = zero;
      int k = width - 1;
      int j = 0;
      while ((k - 8) > 0) {
        const int16x8_t v_x = vld1q_s16(&diff[(height - 2) * stride + j]);
        const int16x8_t v_y = vld1q_s16(&diff[(height - 2) * stride + j + 1]);
        const int16x8_t v_z = vld1q_s16(&diff[(height - 1) * stride + j]);
        const int16x8_t v_w = vld1q_s16(&diff[(height - 1) * stride + j + 1]);

        const int16x4_t v_x_lo = vget_low_s16(v_x);
        const int16x4_t v_y_lo = vget_low_s16(v_y);
        const int16x4_t v_z_lo = vget_low_s16(v_z);
        const int16x4_t v_w_lo = vget_low_s16(v_w);
        const int16x4_t v_x_hi = vget_high_s16(v_x);
        const int16x4_t v_y_hi = vget_high_s16(v_y);
        const int16x4_t v_z_hi = vget_high_s16(v_z);
        const int16x4_t v_w_hi = vget_high_s16(v_w);

        // Horizontal correlations for both remaining rows.
        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);

        // Vertical correlations for the penultimate row only.
        v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
        v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);

        v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
        v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
        v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
        v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);

        v_w_sum = vpadalq_s16(v_w_sum, v_w);
        v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
        v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);

        k -= 8;
        j += 8;
      }
      // Tail, zero-padded exactly as in the one-row case above.
      const int16x8_t v_l = vld1q_s16(&diff[(height - 2) * stride] + j);
      const int16x8_t v_x =
          vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7),
                    vreinterpretq_s16_s32(zero), 1);
      const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1);
      const int16x8_t v_l_2 = vld1q_s16(&diff[(height - 1) * stride] + j);
      const int16x8_t v_z =
          vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l_2, 7),
                    vreinterpretq_s16_s32(zero), 1);
      const int16x8_t v_w = vextq_s16(v_l_2, vreinterpretq_s16_s32(zero), 1);

      const int16x4_t v_x_lo = vget_low_s16(v_x);
      const int16x4_t v_y_lo = vget_low_s16(v_y);
      const int16x4_t v_z_lo = vget_low_s16(v_z);
      const int16x4_t v_w_lo = vget_low_s16(v_w);
      const int16x4_t v_x_hi = vget_high_s16(v_x);
      const int16x4_t v_y_hi = vget_high_s16(v_y);
      const int16x4_t v_z_hi = vget_high_s16(v_z);
      const int16x4_t v_w_hi = vget_high_s16(v_w);

      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);

      v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
      v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);

      v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
      v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
      v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
      v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);

      v_w_sum = vpadalq_s16(v_w_sum, v_w);
      v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
      v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);

#if AOM_ARCH_AARCH64
      xy_sum += vaddvq_s64(vpaddlq_s32(v_xy_sum_a));
      xz_sum += vaddvq_s64(vpaddlq_s32(v_xz_sum_a));
      x_sum += vaddvq_s32(v_x_sum_a);
      x_finalrow += vaddvq_s32(v_w_sum);
      int64_t y2 = vaddvq_s64(vpaddlq_s32(v_y2_sum));
      int64_t w2 = vaddvq_s64(vpaddlq_s32(v_w2_sum));
#else
      const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
      xy_sum += vget_lane_s64(
          vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0);
      const int64x2_t v_xz_sum2 = vpaddlq_s32(v_xz_sum_a);
      xz_sum += vget_lane_s64(
          vadd_s64(vget_low_s64(v_xz_sum2), vget_high_s64(v_xz_sum2)), 0);
      const int64x2_t v_x_sum2 = vpaddlq_s32(v_x_sum_a);
      x_sum += vget_lane_s64(
          vadd_s64(vget_low_s64(v_x_sum2), vget_high_s64(v_x_sum2)), 0);
      const int64x2_t v_w_sum_a = vpaddlq_s32(v_w_sum);
      x_finalrow += vget_lane_s64(
          vadd_s64(vget_low_s64(v_w_sum_a), vget_high_s64(v_w_sum_a)), 0);
      const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
      int64_t y2 = vget_lane_s64(
          vadd_s64(vget_low_s64(v_y2_sum_a), vget_high_s64(v_y2_sum_a)), 0);
      const int64x2_t v_w2_sum_a = vpaddlq_s32(v_w2_sum);
      int64_t w2 = vget_lane_s64(
          vadd_s64(vget_low_s64(v_w2_sum_a), vget_high_s64(v_w2_sum_a)), 0);
#endif
      x2_sum += y2 + w2;
      x2_finalrow += w2;
    } else {
      for (int j = 0; j < width - 1; ++j) {
        const int16_t x = diff[(height - 2) * stride + j];
        const int16_t y = diff[(height - 2) * stride + j + 1];
        const int16_t z = diff[(height - 1) * stride + j];
        const int16_t w = diff[(height - 1) * stride + j + 1];

        // Horizontal and vertical correlations for the penultimate row:
        xy_sum += x * y;
        xz_sum += x * z;

        // Now just horizontal correlations for the final row:
        xy_sum += z * w;

        x_sum += y + w;
        x2_sum += y * y + w * w;
        x_finalrow += w;
        x2_finalrow += w * w;
      }
    }
  }

  // Do we have 2 columns remaining or just the one?
  if (width % 3 == 1) {  // Just vert corrs on the final col
    const int16_t x0 = diff[width - 1];
    x_sum += x0;
    x_finalcol += x0;
    x2_sum += x0 * x0;
    x2_finalcol += x0 * x0;
    for (int i = 0; i < height - 1; ++i) {
      const int16_t x = diff[i * stride + width - 1];
      const int16_t z = diff[(i + 1) * stride + width - 1];
      xz_sum += x * z;
      x_finalcol += z;
      x2_finalcol += z * z;
      // So the bottom-right elements don't get counted twice:
      if (i < height - (height % 3 == 1 ? 2 : 3)) {
        x_sum += z;
        x2_sum += z * z;
      }
    }
  } else {  // Two cols remaining
    const int16_t x0 = diff[width - 2];
    const int16_t y0 = diff[width - 1];
    x_sum += x0 + y0;
    x2_sum += x0 * x0 + y0 * y0;
    x_finalcol += y0;
    x2_finalcol += y0 * y0;
    for (int i = 0; i < height - 1; ++i) {
      const int16_t x = diff[i * stride + width - 2];
      const int16_t y = diff[i * stride + width - 1];
      const int16_t z = diff[(i + 1) * stride + width - 2];
      const int16_t w = diff[(i + 1) * stride + width - 1];

      // Horizontal and vertical correlations for the penultimate col:
      // Skip these on the last iteration of this loop if we also had two
      // rows remaining, otherwise the final horizontal and vertical
      // correlation get erroneously processed twice
      if (i < height - 2 || height % 3 == 1) {
        xy_sum += x * y;
        xz_sum += x * z;
      }

      x_finalcol += w;
      x2_finalcol += w * w;
      // So the bottom-right elements don't get counted twice:
      if (i < height - (height % 3 == 1 ? 2 : 3)) {
        x_sum += z + w;
        x2_sum += z * z + w * w;
      }

      // Now just vertical correlations for the final column:
      xz_sum += y * w;
    }
  }

  // Calculate the simple sums and squared-sums
  int64_t x_firstrow = 0, x_firstcol = 0;
  int64_t x2_firstrow = 0, x2_firstcol = 0;

  if (width >= 8) {
    int32x4_t v_x_firstrow = zero;
    int32x4_t v_x2_firstrow = zero;
    for (int j = 0; j < width; j += 8) {
      const int16x8_t v_diff = vld1q_s16(diff + j);
      const int16x4_t v_diff_lo = vget_low_s16(v_diff);
      const int16x4_t v_diff_hi = vget_high_s16(v_diff);
      v_x_firstrow = vpadalq_s16(v_x_firstrow, v_diff);
      v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_lo, v_diff_lo);
      v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_hi, v_diff_hi);
    }
#if AOM_ARCH_AARCH64
    x_firstrow += vaddvq_s32(v_x_firstrow);
    x2_firstrow += vaddvq_s32(v_x2_firstrow);
#else
    const int64x2_t v_x_firstrow_64 = vpaddlq_s32(v_x_firstrow);
    x_firstrow += vget_lane_s64(
        vadd_s64(vget_low_s64(v_x_firstrow_64), vget_high_s64(v_x_firstrow_64)),
        0);
    const int64x2_t v_x2_firstrow_64 = vpaddlq_s32(v_x2_firstrow);
    x2_firstrow += vget_lane_s64(vadd_s64(vget_low_s64(v_x2_firstrow_64),
                                          vget_high_s64(v_x2_firstrow_64)),
                                 0);
#endif
  } else {
    for (int j = 0; j < width; ++j) {
      x_firstrow += diff[j];
      x2_firstrow += diff[j] * diff[j];
    }
  }
  for (int i = 0; i < height; ++i) {
    x_firstcol += diff[i * stride];
    x2_firstcol += diff[i * stride] * diff[i * stride];
  }

  // Derive the per-direction sums by excluding the row/column that has no
  // right (resp. below) neighbour from the full-block totals.
  int64_t xhor_sum = x_sum - x_finalcol;
  int64_t xver_sum = x_sum - x_finalrow;
  int64_t y_sum = x_sum - x_firstcol;
  int64_t z_sum = x_sum - x_firstrow;
  int64_t x2hor_sum = x2_sum - x2_finalcol;
  int64_t x2ver_sum = x2_sum - x2_finalrow;
  int64_t y2_sum = x2_sum - x2_firstcol;
  int64_t z2_sum = x2_sum - x2_firstrow;

  const float num_hor = (float)(height * (width - 1));
  const float num_ver = (float)((height - 1) * width);

  // Pearson correlation numerators/denominators (variance * n form).
  const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
  const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;

  const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
  const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;

  const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
  const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;

  if (xhor_var_n > 0 && y_var_n > 0) {
    *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
    *hcorr = *hcorr < 0 ? 0 : *hcorr;
  } else {
    *hcorr = 1.0;
  }
  if (xver_var_n > 0 && z_var_n > 0) {
    *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
    *vcorr = *vcorr < 0 ? 0 : *vcorr;
  } else {
    *vcorr = 1.0;
  }
}