/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <stdlib.h>

#include "config/aom_dsp_rtcd.h"
#include "aom_ports/mem.h"

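// Computes, over an 8x8 block, the minimum and maximum absolute difference
// between the source block s (stride p) and the block d (stride dp).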
void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
                      int *min, int *max) {
  int i, j;
  *min = 255;
  *max = 0;
  for (i = 0; i < 8; ++i, s += p, d += dp) {
    for (j = 0; j < 8; ++j) {
      int diff = abs(s[j] - d[j]);
      *min = diff < *min ? diff : *min;
      *max = diff > *max ? diff : *max;
    }
  }
}

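// Returns the rounded average of a 4x4 block of 8-bit samples:
// (sum of the 16 pixels + 8) >> 4.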
unsigned int aom_avg_4x4_c(const uint8_t *s, int p) {
  int i, j;
  int sum = 0;
  for (i = 0; i < 4; ++i, s += p)
    for (j = 0; j < 4; sum += s[j], ++j) {
    }

  return (sum + 8) >> 4;
}

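// Returns the rounded average of an 8x8 block of 8-bit samples:
// (sum of the 64 pixels + 32) >> 6.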
unsigned int aom_avg_8x8_c(const uint8_t *s, int p) {
  int i, j;
  int sum = 0;
  for (i = 0; i < 8; ++i, s += p)
    for (j = 0; j < 8; sum += s[j], ++j) {
    }

  return (sum + 32) >> 6;
}

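// Computes the four 8x8 averages inside the 16x16 block whose top-left corner
// is at (x16_idx, y16_idx). k & 1 selects the 8-pixel column offset and
// k >> 1 the 8-pixel row offset, so avg[] is filled in raster order
// (top-left, top-right, bottom-left, bottom-right).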
void aom_avg_8x8_quad_c(const uint8_t *s, int p, int x16_idx, int y16_idx,
                        int *avg) {
  for (int k = 0; k < 4; k++) {
    const int x8_idx = x16_idx + ((k & 1) << 3);
    const int y8_idx = y16_idx + ((k >> 1) << 3);
    const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
    avg[k] = aom_avg_8x8_c(s_tmp, p);
  }
}

#if CONFIG_AV1_HIGHBITDEPTH
unsigned int aom_highbd_avg_8x8_c(const uint8_t *s8, int p) {
  int i, j;
  int sum = 0;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  for (i = 0; i < 8; ++i, s += p)
    for (j = 0; j < 8; sum += s[j], ++j) {
    }

  return (sum + 32) >> 6;
}

unsigned int aom_highbd_avg_4x4_c(const uint8_t *s8, int p) {
  int i, j;
  int sum = 0;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  for (i = 0; i < 4; ++i, s += p)
    for (j = 0; j < 4; sum += s[j], ++j) {
    }

  return (sum + 8) >> 4;
}

void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
                             int dp, int *min, int *max) {
  int i, j;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
  *min = 65535;
  *max = 0;
  for (i = 0; i < 8; ++i, s += p, d += dp) {
    for (j = 0; j < 8; ++j) {
      int diff = abs(s[j] - d[j]);
      *min = diff < *min ? diff : *min;
      *max = diff > *max ? diff : *max;
    }
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

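// One pass of a 4-point Hadamard butterfly over a column of src_diff. Note
// the >> 1 pre-scaling of the intermediate terms, which the 8x8 butterfly
// (hadamard_col8 below) does not apply.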
static void hadamard_col4(const int16_t *src_diff, ptrdiff_t src_stride,
                          int16_t *coeff) {
  int16_t b0 = (src_diff[0 * src_stride] + src_diff[1 * src_stride]) >> 1;
  int16_t b1 = (src_diff[0 * src_stride] - src_diff[1 * src_stride]) >> 1;
  int16_t b2 = (src_diff[2 * src_stride] + src_diff[3 * src_stride]) >> 1;
  int16_t b3 = (src_diff[2 * src_stride] - src_diff[3 * src_stride]) >> 1;

  coeff[0] = b0 + b2;
  coeff[1] = b1 + b3;
  coeff[2] = b0 - b2;
  coeff[3] = b1 - b3;
}

void aom_hadamard_4x4_c(const int16_t *src_diff, ptrdiff_t src_stride,
                        tran_low_t *coeff) {
  int idx;
  int16_t buffer[16];
  int16_t buffer2[16];
  int16_t *tmp_buf = &buffer[0];
  for (idx = 0; idx < 4; ++idx) {
    hadamard_col4(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
                                                   // dynamic range [-255, 255]
    tmp_buf += 4;
    ++src_diff;
  }

  tmp_buf = &buffer[0];
  for (idx = 0; idx < 4; ++idx) {
    hadamard_col4(tmp_buf, 4, buffer2 + 4 * idx);  // tmp_buf: 12 bit
    // dynamic range [-2040, 2040]
    // buffer2: 15 bit
    // dynamic range [-16320, 16320]
    ++tmp_buf;
  }

  // Extra transpose to match SSE2 behavior (i.e., aom_hadamard_4x4_sse2).
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      coeff[i * 4 + j] = (tran_low_t)buffer2[j * 4 + i];
    }
  }
}

// src_diff: first pass, 9 bit, dynamic range [-255, 255]
//           second pass, 12 bit, dynamic range [-2040, 2040]
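// Note: the outputs below are written in a permuted coefficient order rather
// than 0..7. As the comment above aom_highbd_hadamard_8x8_c notes, the exact
// order of the Hadamard output coefficients is not important to the callers,
// so no reordering is done here.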
static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
                          int16_t *coeff) {
  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];

  int16_t c0 = b0 + b2;
  int16_t c1 = b1 + b3;
  int16_t c2 = b0 - b2;
  int16_t c3 = b1 - b3;
  int16_t c4 = b4 + b6;
  int16_t c5 = b5 + b7;
  int16_t c6 = b4 - b6;
  int16_t c7 = b5 - b7;

  coeff[0] = c0 + c4;
  coeff[7] = c1 + c5;
  coeff[3] = c2 + c6;
  coeff[4] = c3 + c7;
  coeff[2] = c0 - c4;
  coeff[6] = c1 - c5;
  coeff[1] = c2 - c6;
  coeff[5] = c3 - c7;
}

void aom_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
                        tran_low_t *coeff) {
  int idx;
  int16_t buffer[64];
  int16_t buffer2[64];
  int16_t *tmp_buf = &buffer[0];
  for (idx = 0; idx < 8; ++idx) {
    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
                                                   // dynamic range [-255, 255]
    tmp_buf += 8;
    ++src_diff;
  }

  tmp_buf = &buffer[0];
  for (idx = 0; idx < 8; ++idx) {
    hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx);  // tmp_buf: 12 bit
    // dynamic range [-2040, 2040]
    // buffer2: 15 bit
    // dynamic range [-16320, 16320]
    ++tmp_buf;
  }

  // Extra transpose to match SSE2 behavior (i.e., aom_hadamard_8x8_sse2).
  for (int i = 0; i < 8; i++) {
    for (int j = 0; j < 8; j++) {
      coeff[i * 8 + j] = (tran_low_t)buffer2[j * 8 + i];
    }
  }
}

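// Low-precision ("lp") variant of aom_hadamard_8x8_c: the same transform, but
// the coefficients are written as int16_t rather than tran_low_t.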
void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
                           int16_t *coeff) {
  int16_t buffer[64];
  int16_t buffer2[64];
  int16_t *tmp_buf = &buffer[0];
  for (int idx = 0; idx < 8; ++idx) {
    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
                                                   // dynamic range [-255, 255]
    tmp_buf += 8;
    ++src_diff;
  }

  tmp_buf = &buffer[0];
  for (int idx = 0; idx < 8; ++idx) {
    hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx);  // tmp_buf: 12 bit
    // dynamic range [-2040, 2040]
    // buffer2: 15 bit
    // dynamic range [-16320, 16320]
    ++tmp_buf;
  }

  for (int idx = 0; idx < 64; ++idx) coeff[idx] = buffer2[idx];

  // Extra transpose to match SSE2 behavior (i.e., aom_hadamard_lp_8x8_sse2).
  for (int i = 0; i < 8; i++) {
    for (int j = 0; j < 8; j++) {
      coeff[i * 8 + j] = buffer2[j * 8 + i];
    }
  }
}

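// Applies the low-precision 8x8 Hadamard to two horizontally adjacent 8x8
// blocks (an 8x16 region of src_diff), writing 64 coefficients per block.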
void aom_hadamard_lp_8x8_dual_c(const int16_t *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  for (int i = 0; i < 2; i++) {
    aom_hadamard_lp_8x8_c(src_diff + (i * 8), src_stride,
                          (int16_t *)coeff + (i * 64));
  }
}

// In place 16x16 2D Hadamard transform
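// The 16x16 transform is built from four 8x8 transforms on the quadrants,
// followed by a cross-quadrant butterfly over the four corresponding
// coefficients; the >> 1 in that butterfly keeps the results within the
// 16-bit dynamic range noted in the comments below.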
void aom_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
                          tran_low_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
  }

  // coeff: 15 bit, dynamic range [-16320, 16320]
  for (idx = 0; idx < 64; ++idx) {
    tran_low_t a0 = coeff[0];
    tran_low_t a1 = coeff[64];
    tran_low_t a2 = coeff[128];
    tran_low_t a3 = coeff[192];

    tran_low_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
    tran_low_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
    tran_low_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
    tran_low_t b3 = (a2 - a3) >> 1;

    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
    coeff[64] = b1 + b3;
    coeff[128] = b0 - b2;
    coeff[192] = b1 - b3;

    ++coeff;
  }

  coeff -= 64;
  // Extra coefficient swap to match the AVX2 output (i.e.,
  // aom_hadamard_16x16_avx2). Note that this step is not needed to match the
  // SSE2 output.
  for (int i = 0; i < 16; i++) {
    for (int j = 0; j < 4; j++) {
      tran_low_t temp = coeff[i * 16 + 4 + j];
      coeff[i * 16 + 4 + j] = coeff[i * 16 + 8 + j];
      coeff[i * 16 + 8 + j] = temp;
    }
  }
}

void aom_hadamard_lp_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
                             int16_t *coeff) {
  for (int idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    aom_hadamard_lp_8x8_c(src_ptr, src_stride, coeff + idx * 64);
  }

  for (int idx = 0; idx < 64; ++idx) {
    int16_t a0 = coeff[0];
    int16_t a1 = coeff[64];
    int16_t a2 = coeff[128];
    int16_t a3 = coeff[192];

    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
    int16_t b3 = (a2 - a3) >> 1;

    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
    coeff[64] = b1 + b3;
    coeff[128] = b0 - b2;
    coeff[192] = b1 - b3;

    ++coeff;
  }
}

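// The 32x32 transform follows the same pattern: four 16x16 transforms on the
// quadrants, then a cross-quadrant butterfly. Here the butterfly uses a >> 2
// pre-scale, which (as the comments below note) keeps the coefficients within
// a 16-bit dynamic range even though they are stored as tran_low_t.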
void aom_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
                          tran_low_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    aom_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
  }

  // coeff: 16 bit, dynamic range [-32768, 32767]
  for (idx = 0; idx < 256; ++idx) {
    tran_low_t a0 = coeff[0];
    tran_low_t a1 = coeff[256];
    tran_low_t a2 = coeff[512];
    tran_low_t a3 = coeff[768];

    tran_low_t b0 = (a0 + a1) >> 2;  // (a0 + a1): 17 bit, [-65536, 65535]
    tran_low_t b1 = (a0 - a1) >> 2;  // b0-b3: 15 bit, dynamic range
    tran_low_t b2 = (a2 + a3) >> 2;  // [-16384, 16383]
    tran_low_t b3 = (a2 - a3) >> 2;

    coeff[0] = b0 + b2;  // 16 bit, [-32768, 32767]
    coeff[256] = b1 + b3;
    coeff[512] = b0 - b2;
    coeff[768] = b1 - b3;

    ++coeff;
  }
}

#if CONFIG_AV1_HIGHBITDEPTH
static void hadamard_highbd_col8_first_pass(const int16_t *src_diff,
                                            ptrdiff_t src_stride,
                                            int16_t *coeff) {
  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];

  int16_t c0 = b0 + b2;
  int16_t c1 = b1 + b3;
  int16_t c2 = b0 - b2;
  int16_t c3 = b1 - b3;
  int16_t c4 = b4 + b6;
  int16_t c5 = b5 + b7;
  int16_t c6 = b4 - b6;
  int16_t c7 = b5 - b7;

  coeff[0] = c0 + c4;
  coeff[7] = c1 + c5;
  coeff[3] = c2 + c6;
  coeff[4] = c3 + c7;
  coeff[2] = c0 - c4;
  coeff[6] = c1 - c5;
  coeff[1] = c2 - c6;
  coeff[5] = c3 - c7;
}

// src_diff: 16 bit, dynamic range [-32760, 32760]
// coeff: 19 bit
static void hadamard_highbd_col8_second_pass(const int16_t *src_diff,
                                             ptrdiff_t src_stride,
                                             int32_t *coeff) {
  int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
  int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
  int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
  int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
  int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
  int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
  int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
  int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];

  int32_t c0 = b0 + b2;
  int32_t c1 = b1 + b3;
  int32_t c2 = b0 - b2;
  int32_t c3 = b1 - b3;
  int32_t c4 = b4 + b6;
  int32_t c5 = b5 + b7;
  int32_t c6 = b4 - b6;
  int32_t c7 = b5 - b7;

  coeff[0] = c0 + c4;
  coeff[7] = c1 + c5;
  coeff[3] = c2 + c6;
  coeff[4] = c3 + c7;
  coeff[2] = c0 - c4;
  coeff[6] = c1 - c5;
  coeff[1] = c2 - c6;
  coeff[5] = c3 - c7;
}

// The order of the output coefficients of the Hadamard transform is not
// important. For optimization purposes the final transpose may be skipped.
void aom_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
                               tran_low_t *coeff) {
  int idx;
  int16_t buffer[64];
  int32_t buffer2[64];
  int16_t *tmp_buf = &buffer[0];
  for (idx = 0; idx < 8; ++idx) {
    // src_diff: 13 bit
    // buffer: 16 bit, dynamic range [-32760, 32760]
    hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf);
    tmp_buf += 8;
    ++src_diff;
  }

  tmp_buf = &buffer[0];
  for (idx = 0; idx < 8; ++idx) {
    // buffer: 16 bit
    // buffer2: 19 bit, dynamic range [-262080, 262080]
    hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx);
    ++tmp_buf;
  }

  for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
}

// In place 16x16 2D Hadamard transform
void aom_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
                                 tran_low_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 13 bit, dynamic range [-4095, 4095]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    aom_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
  }

  // coeff: 19 bit, dynamic range [-262080, 262080]
  for (idx = 0; idx < 64; ++idx) {
    tran_low_t a0 = coeff[0];
    tran_low_t a1 = coeff[64];
    tran_low_t a2 = coeff[128];
    tran_low_t a3 = coeff[192];

    tran_low_t b0 = (a0 + a1) >> 1;
    tran_low_t b1 = (a0 - a1) >> 1;
    tran_low_t b2 = (a2 + a3) >> 1;
    tran_low_t b3 = (a2 - a3) >> 1;

    // new coeff dynamic range: 20 bit
    coeff[0] = b0 + b2;
    coeff[64] = b1 + b3;
    coeff[128] = b0 - b2;
    coeff[192] = b1 - b3;

    ++coeff;
  }
}

void aom_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
                                 tran_low_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 13 bit, dynamic range [-4095, 4095]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    aom_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
  }

  // coeff: 20 bit
  for (idx = 0; idx < 256; ++idx) {
    tran_low_t a0 = coeff[0];
    tran_low_t a1 = coeff[256];
    tran_low_t a2 = coeff[512];
    tran_low_t a3 = coeff[768];

    tran_low_t b0 = (a0 + a1) >> 2;
    tran_low_t b1 = (a0 - a1) >> 2;
    tran_low_t b2 = (a2 + a3) >> 2;
    tran_low_t b3 = (a2 - a3) >> 2;

    // new coeff dynamic range: 20 bit
    coeff[0] = b0 + b2;
    coeff[256] = b1 + b3;
    coeff[512] = b0 - b2;
    coeff[768] = b1 - b3;

    ++coeff;
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

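// Illustrative use (a sketch, not code taken from this library's callers):
// SATD of an 8x8 residual block is typically obtained by applying the
// Hadamard transform and then summing absolute coefficients, e.g.
//   DECLARE_ALIGNED(16, tran_low_t, coeff[64]);
//   aom_hadamard_8x8_c(src_diff, diff_stride, coeff);  // hypothetical inputs
//   int satd = aom_satd_c(coeff, 64);
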
// coeff: 20 bits, dynamic range [-524287, 524287].
// length: value range {16, 32, 64, 128, 256, 512, 1024}.
int aom_satd_c(const tran_low_t *coeff, int length) {
  int i;
  int satd = 0;
  for (i = 0; i < length; ++i) satd += abs(coeff[i]);

  // satd: 30 bits, dynamic range [0, 524287 * 1024] (sum of absolute values).
  return satd;
}

int aom_satd_lp_c(const int16_t *coeff, int length) {
  int satd = 0;
  for (int i = 0; i < length; ++i) satd += abs(coeff[i]);

  // satd: 26 bits, dynamic range [0, 32640 * 1024] (sum of absolute values).
  return satd;
}

// Integer projection onto row vectors.
// height: value range {16, 32, 64, 128}.
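// For each of the `width` columns, sums `height` pixels down the column and
// scales the sum by >> norm_factor, producing one hbuf[] entry per column
// (a horizontal projection profile of the block).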
void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
                       const int width, const int height, int norm_factor) {
  assert(height >= 2);
  for (int idx = 0; idx < width; ++idx) {
    hbuf[idx] = 0;
    // hbuf[idx]: 15 bit, dynamic range [0, 32640].
    for (int i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
    // hbuf[idx]: 10 bit, dynamic range [0, 1020].
    hbuf[idx] >>= norm_factor;
    ++ref;
  }
}

// width: value range {16, 32, 64, 128}.
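// For each of the `height` rows, sums `width` pixels across the row and
// scales the sum by >> norm_factor, producing one vbuf[] entry per row
// (a vertical projection profile of the block).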
void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride,
                       const int width, const int height, int norm_factor) {
  for (int ht = 0; ht < height; ++ht) {
    int16_t sum = 0;
    // sum: 15 bit, dynamic range [0, 32640]
    for (int idx = 0; idx < width; ++idx) sum += ref[idx];
    vbuf[ht] = sum >> norm_factor;
    ref += ref_stride;
  }
}

// ref: [0 - 510]
// src: [0 - 510]
// bwl: {2, 3, 4, 5}
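// Computes an integer variance of the difference vector ref - src over
// width = 4 << bwl entries (e.g. bwl == 4 gives width 64):
//   var = sum(diff^2) - sum(diff)^2 / width,
// where the division by width is the >> (bwl + 2) below, since width is a
// power of two.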
int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) {
  int i;
  int width = 4 << bwl;
  int sse = 0, mean = 0, var;

  for (i = 0; i < width; ++i) {
    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
    mean += diff;                // mean: dynamic range 16 bits.
    sse += diff * diff;          // sse:  dynamic range 26 bits.
  }

  // (mean * mean): dynamic range 31 bits.
  // If width == 128, the mean can be 510 * 128 = 65280, and log2(65280 ** 2) ~=
  // 31.99, so it needs to be cast to unsigned int to compute its square.
  const unsigned int mean_abs = abs(mean);
  var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
  return var;
}
574