/*
 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
#include <immintrin.h>  // AVX2

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"  // VPXMIN()

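// Horizontally reduce the four 8x32-bit accumulators in sums[/*4*/] to four
// 32-bit totals and store them to sad_array[0..3], one SAD per reference.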
static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
                                          uint32_t sad_array[4]) {
  const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
  const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
  const __m256i t2 = _mm256_hadd_epi32(t0, t1);
  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
                                    _mm256_extractf128_si256(t2, 1));
  _mm_storeu_si128((__m128i *)sad_array, sum);
}

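// Accumulate `height` rows of 16-bit absolute differences between a 64-wide
// src block and each of the four refs[] into sums_16[0..3]. Every 16-bit lane
// picks up four pixels per row, so callers keep `height` small (2 rows per
// pass below) to avoid overflowing the lanes. The refs[] pointers are
// advanced in place; the caller advances src.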
static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
                                               const uint16_t *src,
                                               int src_stride,
                                               uint16_t *refs[4],
                                               int ref_stride, int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    int x;

    for (x = 0; x < 4; ++x) {
      __m256i r[4];
      r[0] = _mm256_loadu_si256((const __m256i *)refs[x]);
      r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16));
      r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32));
      r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48));

      // absolute differences between each ref[] and src
      r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0));
      r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1));
      r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2));
      r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3));

      // sum every abs diff
      sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1]));
      sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3]));
    }

    src += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }
}

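// Compute the 4D SAD for a 64xN block: accumulate two rows per pass in 16-bit
// lanes, widen each pass into the 32-bit sums_32[] accumulators, and reduce
// to the four final SADs.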
static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2(
    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
    int ref_stride, uint32_t sad_array[4], int n) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];
  __m256i sums_32[4];
  int i;

  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
  sums_32[0] = _mm256_setzero_si256();
  sums_32[1] = _mm256_setzero_si256();
  sums_32[2] = _mm256_setzero_si256();
  sums_32[3] = _mm256_setzero_si256();

  for (i = 0; i < (n / 2); ++i) {
    sums_16[0] = _mm256_setzero_si256();
    sums_16[1] = _mm256_setzero_si256();
    sums_16[2] = _mm256_setzero_si256();
    sums_16[3] = _mm256_setzero_si256();

    highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2);

    /* The 16-bit lanes can only accumulate a couple of rows safely, so widen
     * the current sums_16 and add it into the 32-bit sums_32 accumulators. */
    sums_32[0] = _mm256_add_epi32(
        sums_32[0],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
    sums_32[1] = _mm256_add_epi32(
        sums_32[1],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
    sums_32[2] = _mm256_add_epi32(
        sums_32[2],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
    sums_32[3] = _mm256_add_epi32(
        sums_32[3],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));

    src += src_stride << 1;  // advance past the 2 rows just processed
  }
  calc_final_4(sums_32, sad_array);
}

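// Instantiates vpx_highbd_sad64x<n>x4d_avx2() for block height n.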
#define HIGHBD_SAD64XNX4D(n) \
  void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \
                                      const uint8_t *const ref_array[4], \
                                      int ref_stride, uint32_t sad_array[4]) { \
    highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
                           n); \
  }

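// The _skip_ variants approximate the SAD from every other row: run the
// regular kernel with doubled strides and half the height, then double the
// result. The 32- and 16-wide skip macros below follow the same pattern.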
#define HIGHBD_SADSKIP64XNx4D(n) \
  void vpx_highbd_sad_skip_64x##n##x4d_avx2( \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) { \
    highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
                           sad_array, n / 2); \
    sad_array[0] <<= 1; \
    sad_array[1] <<= 1; \
    sad_array[2] <<= 1; \
    sad_array[3] <<= 1; \
  }

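// Accumulate `height` rows of 16-bit absolute differences between a 32-wide
// src block and each of the four refs[] into sums_16[0..3]. Every 16-bit lane
// picks up two pixels per row, so callers limit each pass to 8 rows. The
// refs[] pointers are advanced in place; the caller advances src.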
static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
                                               const uint16_t *src,
                                               int src_stride,
                                               uint16_t *refs[4],
                                               int ref_stride, int height) {
  int i;
  for (i = 0; i < height; i++) {
    __m256i r[8];

    // load src and all ref[]
    const __m256i s = _mm256_load_si256((const __m256i *)src);
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16));
    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16));
    r[2] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16));
    r[4] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16));
    r[6] = _mm256_loadu_si256((const __m256i *)refs[3]);
    r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16));

    // absolute differences between each ref[] and src
    r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
    r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2));
    r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
    r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2));
    r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s));
    r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2));
    r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s));
    r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2));

    // sum every abs diff
    sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1]));
    sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3]));
    sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5]));
    sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7]));

    src += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }
}

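// Compute the 4D SAD for a 32xN block: accumulate eight rows per pass in
// 16-bit lanes, widen each pass into the 32-bit sums_32[] accumulators, and
// reduce to the four final SADs.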
static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2(
    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
    int ref_stride, uint32_t sad_array[4], int n) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];
  __m256i sums_32[4];
  int i;

  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
  sums_32[0] = _mm256_setzero_si256();
  sums_32[1] = _mm256_setzero_si256();
  sums_32[2] = _mm256_setzero_si256();
  sums_32[3] = _mm256_setzero_si256();

  for (i = 0; i < (n / 8); ++i) {
    sums_16[0] = _mm256_setzero_si256();
    sums_16[1] = _mm256_setzero_si256();
    sums_16[2] = _mm256_setzero_si256();
    sums_16[3] = _mm256_setzero_si256();

    highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);

    /* The 16-bit lanes can only accumulate 8 rows safely, so widen the
     * current sums_16 and add it into the 32-bit sums_32 accumulators. */
    sums_32[0] = _mm256_add_epi32(
        sums_32[0],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
    sums_32[1] = _mm256_add_epi32(
        sums_32[1],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
    sums_32[2] = _mm256_add_epi32(
        sums_32[2],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
    sums_32[3] = _mm256_add_epi32(
        sums_32[3],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));

    src += src_stride << 3;  // advance past the 8 rows just processed
  }
  calc_final_4(sums_32, sad_array);
}

#define HIGHBD_SAD32XNX4D(n) \
  void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \
                                      const uint8_t *const ref_array[4], \
                                      int ref_stride, uint32_t sad_array[4]) { \
    highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
                           n); \
  }

#define HIGHBD_SADSKIP32XNx4D(n) \
  void vpx_highbd_sad_skip_32x##n##x4d_avx2( \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) { \
    highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
                           sad_array, n / 2); \
    sad_array[0] <<= 1; \
    sad_array[1] <<= 1; \
    sad_array[2] <<= 1; \
    sad_array[3] <<= 1; \
  }

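// Accumulate `height` rows of 16-bit absolute differences between a 16-wide
// src block and each of the four refs[] into sums_16[0..3]. Every 16-bit lane
// picks up one pixel per row, so callers limit each pass to at most 16 rows.
// The refs[] pointers are advanced in place; the caller advances src.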
static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
                                               const uint16_t *src,
                                               int src_stride,
                                               uint16_t *refs[4],
                                               int ref_stride, int height) {
  int i;
  for (i = 0; i < height; i++) {
    __m256i r[4];

    // load src and all ref[]
    const __m256i s = _mm256_load_si256((const __m256i *)src);
    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);

    // absolute differences between each ref[] and src
    r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
    r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s));
    r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
    r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s));

    // sum every abs diff
    sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]);
    sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]);
    sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]);
    sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]);

    src += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }
}

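// Compute the 4D SAD for a 16xN block: accumulate up to 16 rows per pass in
// 16-bit lanes, widen each pass into the 32-bit sums_32[] accumulators, and
// reduce to the four final SADs.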
static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2(
    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
    int ref_stride, uint32_t sad_array[4], int n) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];
  __m256i sums_32[4];
  const int height = VPXMIN(16, n);
  const int num_iters = n / height;
  int i;

  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
  sums_32[0] = _mm256_setzero_si256();
  sums_32[1] = _mm256_setzero_si256();
  sums_32[2] = _mm256_setzero_si256();
  sums_32[3] = _mm256_setzero_si256();

  for (i = 0; i < num_iters; ++i) {
    sums_16[0] = _mm256_setzero_si256();
    sums_16[1] = _mm256_setzero_si256();
    sums_16[2] = _mm256_setzero_si256();
    sums_16[3] = _mm256_setzero_si256();

    highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height);

    // The 16-bit lanes can only accumulate 16 rows safely, so widen the
    // current sums_16 and add it into the 32-bit sums_32 accumulators.
    sums_32[0] = _mm256_add_epi32(
        sums_32[0],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
    sums_32[1] = _mm256_add_epi32(
        sums_32[1],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
    sums_32[2] = _mm256_add_epi32(
        sums_32[2],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
    sums_32[3] = _mm256_add_epi32(
        sums_32[3],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));

    // Advance 16 rows; when height < 16 this is the only pass, so the
    // overshoot is harmless.
    src += src_stride << 4;
  }
  calc_final_4(sums_32, sad_array);
}

#define HIGHBD_SAD16XNX4D(n) \
  void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
                                      const uint8_t *const ref_array[4], \
                                      int ref_stride, uint32_t sad_array[4]) { \
    highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
                           n); \
  }

#define HIGHBD_SADSKIP16XNx4D(n) \
  void vpx_highbd_sad_skip_16x##n##x4d_avx2( \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) { \
    highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
                           sad_array, n / 2); \
    sad_array[0] <<= 1; \
    sad_array[1] <<= 1; \
    sad_array[2] <<= 1; \
    sad_array[3] <<= 1; \
  }

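// 16x16: a single 16-row pass fits in the 16-bit accumulators, so widen once
// and reduce.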
void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *const ref_array[4],
                                 int ref_stride, uint32_t sad_array[4]) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];

  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
  sums_16[0] = _mm256_setzero_si256();
  sums_16[1] = _mm256_setzero_si256();
  sums_16[2] = _mm256_setzero_si256();
  sums_16[3] = _mm256_setzero_si256();

  highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);

  {
    __m256i sums_32[4];
    sums_32[0] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
    sums_32[1] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
    sums_32[2] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
    sums_32[3] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
    calc_final_4(sums_32, sad_array);
  }
}

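// 16x8: a single 8-row pass fits in the 16-bit accumulators, so widen once
// and reduce.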
void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *const ref_array[4],
                                int ref_stride, uint32_t sad_array[4]) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];

  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
  sums_16[0] = _mm256_setzero_si256();
  sums_16[1] = _mm256_setzero_si256();
  sums_16[2] = _mm256_setzero_si256();
  sums_16[3] = _mm256_setzero_si256();

  highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);

  {
    __m256i sums_32[4];
    sums_32[0] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
    sums_32[1] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
    sums_32[2] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
    sums_32[3] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
    calc_final_4(sums_32, sad_array);
  }
}

// clang-format off
HIGHBD_SAD64XNX4D(64)
HIGHBD_SADSKIP64XNx4D(64)

HIGHBD_SAD64XNX4D(32)
HIGHBD_SADSKIP64XNx4D(32)

HIGHBD_SAD32XNX4D(64)
HIGHBD_SADSKIP32XNx4D(64)

HIGHBD_SAD32XNX4D(32)
HIGHBD_SADSKIP32XNx4D(32)

HIGHBD_SAD32XNX4D(16)
HIGHBD_SADSKIP32XNx4D(16)

HIGHBD_SAD16XNX4D(32)
HIGHBD_SADSKIP16XNx4D(32)

// 16x16 and 16x8 have dedicated implementations above, so only the skip
// variants are instantiated here.
HIGHBD_SADSKIP16XNx4D(16)

HIGHBD_SADSKIP16XNx4D(8)
// clang-format on