// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/loop_restoration.h"
#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
#include <smmintrin.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "src/dsp/common.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"

namespace libgav1 {
namespace dsp {
namespace {

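// Rounds the two 32-bit horizontal filter sums by kInterRoundBitsHorizontal,
// packs them to 16 bits with saturation, and clamps the result to
// [-offset, limit - offset] before storing to |wiener_buffer|.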
inline void WienerHorizontalClip(const __m128i s[2],
                                 int16_t* const wiener_buffer) {
  constexpr int offset =
      1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
  constexpr int limit = (offset << 2) - 1;
  const __m128i offsets = _mm_set1_epi16(-offset);
  const __m128i limits = _mm_set1_epi16(limit - offset);
  const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
  const __m128i sum0 = _mm_add_epi32(s[0], round);
  const __m128i sum1 = _mm_add_epi32(s[1], round);
  const __m128i rounded_sum0 = _mm_srai_epi32(sum0, kInterRoundBitsHorizontal);
  const __m128i rounded_sum1 = _mm_srai_epi32(sum1, kInterRoundBitsHorizontal);
  const __m128i rounded_sum = _mm_packs_epi32(rounded_sum0, rounded_sum1);
  const __m128i d0 = _mm_max_epi16(rounded_sum, offsets);
  const __m128i d1 = _mm_min_epi16(d0, limits);
  StoreAligned16(wiener_buffer, d1);
}

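// 7-tap horizontal Wiener filter. The Wiener taps are symmetric about the
// center, so pixels sharing a coefficient are added first (s0+s6, s1+s5,
// s2+s4) and each _mm_madd_epi16() then applies two coefficients at once.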
inline void WienerHorizontalTap7(const uint16_t* src,
                                 const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 const __m128i coefficients,
                                 int16_t** const wiener_buffer) {
  __m128i filter[2];
  filter[0] = _mm_shuffle_epi32(coefficients, 0x0);
  filter[1] = _mm_shuffle_epi32(coefficients, 0x55);
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i s[7], madds[4];
      s[0] = LoadUnaligned16(src + x + 0);
      s[1] = LoadUnaligned16(src + x + 1);
      s[2] = LoadUnaligned16(src + x + 2);
      s[3] = LoadUnaligned16(src + x + 3);
      s[4] = LoadUnaligned16(src + x + 4);
      s[5] = LoadUnaligned16(src + x + 5);
      s[6] = LoadUnaligned16(src + x + 6);
      const __m128i s06 = _mm_add_epi16(s[0], s[6]);
      const __m128i s15 = _mm_add_epi16(s[1], s[5]);
      const __m128i s24 = _mm_add_epi16(s[2], s[4]);
      const __m128i ss0 = _mm_unpacklo_epi16(s06, s15);
      const __m128i ss1 = _mm_unpackhi_epi16(s06, s15);
      const __m128i ss2 = _mm_unpacklo_epi16(s24, s[3]);
      const __m128i ss3 = _mm_unpackhi_epi16(s24, s[3]);
      madds[0] = _mm_madd_epi16(ss0, filter[0]);
      madds[1] = _mm_madd_epi16(ss1, filter[0]);
      madds[2] = _mm_madd_epi16(ss2, filter[1]);
      madds[3] = _mm_madd_epi16(ss3, filter[1]);
      madds[0] = _mm_add_epi32(madds[0], madds[2]);
      madds[1] = _mm_add_epi32(madds[1], madds[3]);
      WienerHorizontalClip(madds, *wiener_buffer + x);
      x += 8;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}

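// 5-tap horizontal Wiener filter. The 7 taps sum to 1 << kWienerFilterBits
// (128), so the center tap equals 128 - 2 * (c1 + c2). Rewriting the filter
// as c1 * (s0 + s4 - 2 * s2) + c2 * (s1 + s3 - 2 * s2) + 128 * s2 needs only
// the two outer coefficients; the 128 * s2 term is the shift by 7 below.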
inline void WienerHorizontalTap5(const uint16_t* src,
                                 const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 const __m128i coefficients,
                                 int16_t** const wiener_buffer) {
  const __m128i filter =
      _mm_shuffle_epi8(coefficients, _mm_set1_epi32(0x05040302));
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i s[5], madds[2];
      s[0] = LoadUnaligned16(src + x + 0);
      s[1] = LoadUnaligned16(src + x + 1);
      s[2] = LoadUnaligned16(src + x + 2);
      s[3] = LoadUnaligned16(src + x + 3);
      s[4] = LoadUnaligned16(src + x + 4);
      const __m128i s04 = _mm_add_epi16(s[0], s[4]);
      const __m128i s13 = _mm_add_epi16(s[1], s[3]);
      const __m128i s2d = _mm_add_epi16(s[2], s[2]);
      const __m128i s0m = _mm_sub_epi16(s04, s2d);
      const __m128i s1m = _mm_sub_epi16(s13, s2d);
      const __m128i ss0 = _mm_unpacklo_epi16(s0m, s1m);
      const __m128i ss1 = _mm_unpackhi_epi16(s0m, s1m);
      madds[0] = _mm_madd_epi16(ss0, filter);
      madds[1] = _mm_madd_epi16(ss1, filter);
      const __m128i s2_lo = _mm_unpacklo_epi16(s[2], _mm_setzero_si128());
      const __m128i s2_hi = _mm_unpackhi_epi16(s[2], _mm_setzero_si128());
      const __m128i s2x128_lo = _mm_slli_epi32(s2_lo, 7);
      const __m128i s2x128_hi = _mm_slli_epi32(s2_hi, 7);
      madds[0] = _mm_add_epi32(madds[0], s2x128_lo);
      madds[1] = _mm_add_epi32(madds[1], s2x128_hi);
      WienerHorizontalClip(madds, *wiener_buffer + x);
      x += 8;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}

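// 3-tap horizontal Wiener filter: a single madd per half applies the pair
// (c2, center) to (s0 + s2, s1).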
inline void WienerHorizontalTap3(const uint16_t* src,
                                 const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 const __m128i coefficients,
                                 int16_t** const wiener_buffer) {
  const auto filter = _mm_shuffle_epi32(coefficients, 0x55);
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i s[3], madds[2];
      s[0] = LoadUnaligned16(src + x + 0);
      s[1] = LoadUnaligned16(src + x + 1);
      s[2] = LoadUnaligned16(src + x + 2);
      const __m128i s02 = _mm_add_epi16(s[0], s[2]);
      const __m128i ss0 = _mm_unpacklo_epi16(s02, s[1]);
      const __m128i ss1 = _mm_unpackhi_epi16(s02, s[1]);
      madds[0] = _mm_madd_epi16(ss0, filter);
      madds[1] = _mm_madd_epi16(ss1, filter);
      WienerHorizontalClip(madds, *wiener_buffer + x);
      x += 8;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}

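// Degenerate 1-tap case: the only non-zero tap is the center tap 128, so
// filtering plus the horizontal rounding shift collapses to s << 4, which is
// always inside the clip range used by the other tap counts.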
inline void WienerHorizontalTap1(const uint16_t* src,
                                 const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 int16_t** const wiener_buffer) {
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      const __m128i s = LoadUnaligned16(src + x);
      const __m128i d = _mm_slli_epi16(s, 4);
      StoreAligned16(*wiener_buffer + x, d);
      x += 8;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}

inline __m128i WienerVertical7(const __m128i a[4], const __m128i filter[4]) {
  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
  const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
  const __m128i madd3 = _mm_madd_epi16(a[3], filter[3]);
  const __m128i madd01 = _mm_add_epi32(madd0, madd1);
  const __m128i madd23 = _mm_add_epi32(madd2, madd3);
  const __m128i sum = _mm_add_epi32(madd01, madd23);
  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
}

inline __m128i WienerVertical5(const __m128i a[3], const __m128i filter[3]) {
  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
  const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
  const __m128i madd01 = _mm_add_epi32(madd0, madd1);
  const __m128i sum = _mm_add_epi32(madd01, madd2);
  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
}

inline __m128i WienerVertical3(const __m128i a[2], const __m128i filter[2]) {
  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
  const __m128i sum = _mm_add_epi32(madd0, madd1);
  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
}

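// Packs the two 32-bit sum vectors to unsigned 16 bits with saturation and
// clamps to 1023, the largest 10-bit pixel value.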
inline __m128i WienerVerticalClip(const __m128i s[2]) {
  const __m128i d = _mm_packus_epi32(s[0], s[1]);
  return _mm_min_epu16(d, _mm_set1_epi16(1023));
}

inline __m128i WienerVerticalFilter7(const __m128i a[7],
                                     const __m128i filter[2]) {
  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
  __m128i b[4], c[2];
  b[0] = _mm_unpacklo_epi16(a[0], a[1]);
  b[1] = _mm_unpacklo_epi16(a[2], a[3]);
  b[2] = _mm_unpacklo_epi16(a[4], a[5]);
  b[3] = _mm_unpacklo_epi16(a[6], round);
  c[0] = WienerVertical7(b, filter);
  b[0] = _mm_unpackhi_epi16(a[0], a[1]);
  b[1] = _mm_unpackhi_epi16(a[2], a[3]);
  b[2] = _mm_unpackhi_epi16(a[4], a[5]);
  b[3] = _mm_unpackhi_epi16(a[6], round);
  c[1] = WienerVertical7(b, filter);
  return WienerVerticalClip(c);
}

inline __m128i WienerVerticalFilter5(const __m128i a[5],
                                     const __m128i filter[3]) {
  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
  __m128i b[3], c[2];
  b[0] = _mm_unpacklo_epi16(a[0], a[1]);
  b[1] = _mm_unpacklo_epi16(a[2], a[3]);
  b[2] = _mm_unpacklo_epi16(a[4], round);
  c[0] = WienerVertical5(b, filter);
  b[0] = _mm_unpackhi_epi16(a[0], a[1]);
  b[1] = _mm_unpackhi_epi16(a[2], a[3]);
  b[2] = _mm_unpackhi_epi16(a[4], round);
  c[1] = WienerVertical5(b, filter);
  return WienerVerticalClip(c);
}

inline __m128i WienerVerticalFilter3(const __m128i a[3],
                                     const __m128i filter[2]) {
  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
  __m128i b[2], c[2];
  b[0] = _mm_unpacklo_epi16(a[0], a[1]);
  b[1] = _mm_unpacklo_epi16(a[2], round);
  c[0] = WienerVertical3(b, filter);
  b[0] = _mm_unpackhi_epi16(a[0], a[1]);
  b[1] = _mm_unpackhi_epi16(a[2], round);
  c[1] = WienerVertical3(b, filter);
  return WienerVerticalClip(c);
}

inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
                                        const ptrdiff_t wiener_stride,
                                        const __m128i filter[2], __m128i a[7]) {
  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
  a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
  a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
  return WienerVerticalFilter7(a, filter);
}

inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
                                        const ptrdiff_t wiener_stride,
                                        const __m128i filter[3], __m128i a[5]) {
  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
  return WienerVerticalFilter5(a, filter);
}

inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
                                        const ptrdiff_t wiener_stride,
                                        const __m128i filter[2], __m128i a[3]) {
  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
  return WienerVerticalFilter3(a, filter);
}

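// 7-tap vertical Wiener filter. Rows are processed in pairs: the second
// output row reuses six of the seven rows loaded for the first (a[1]..a[7]
// versus a[0]..a[6]), so each iteration loads only one new row.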
inline void WienerVerticalTap7(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               const int16_t coefficients[4], uint16_t* dst,
                               const ptrdiff_t dst_stride) {
  const __m128i c = LoadLo8(coefficients);
  __m128i filter[4];
  filter[0] = _mm_shuffle_epi32(c, 0x0);
  filter[1] = _mm_shuffle_epi32(c, 0x55);
  filter[2] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
  filter[3] =
      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i a[8], d[2];
      d[0] = WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
      a[7] = LoadAligned16(wiener_buffer + x + 7 * width);
      d[1] = WienerVerticalFilter7(a + 1, filter);
      StoreAligned16(dst + x, d[0]);
      StoreAligned16(dst + dst_stride + x, d[1]);
      x += 8;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      __m128i a[7];
      const __m128i d =
          WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
      StoreAligned16(dst + x, d);
      x += 8;
    } while (x < width);
  }
}

inline void WienerVerticalTap5(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               const int16_t coefficients[3], uint16_t* dst,
                               const ptrdiff_t dst_stride) {
  const __m128i c = LoadLo8(coefficients);
  __m128i filter[3];
  filter[0] = _mm_shuffle_epi32(c, 0x0);
  filter[1] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
  filter[2] =
      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i a[6], d[2];
      d[0] = WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
      a[5] = LoadAligned16(wiener_buffer + x + 5 * width);
      d[1] = WienerVerticalFilter5(a + 1, filter);
      StoreAligned16(dst + x, d[0]);
      StoreAligned16(dst + dst_stride + x, d[1]);
      x += 8;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      __m128i a[5];
      const __m128i d =
          WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
      StoreAligned16(dst + x, d);
      x += 8;
    } while (x < width);
  }
}

inline void WienerVerticalTap3(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               const int16_t coefficients[2], uint16_t* dst,
                               const ptrdiff_t dst_stride) {
  __m128i filter[2];
  filter[0] = _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
  filter[1] =
      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i a[4], d[2];
      d[0] = WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
      a[3] = LoadAligned16(wiener_buffer + x + 3 * width);
      d[1] = WienerVerticalFilter3(a + 1, filter);
      StoreAligned16(dst + x, d[0]);
      StoreAligned16(dst + dst_stride + x, d[1]);
      x += 8;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      __m128i a[3];
      const __m128i d =
          WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
      StoreAligned16(dst + x, d);
      x += 8;
    } while (x < width);
  }
}

inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
                                     uint16_t* const dst) {
  const __m128i a = LoadAligned16(wiener_buffer);
  const __m128i b = _mm_add_epi16(a, _mm_set1_epi16(8));
  const __m128i c = _mm_srai_epi16(b, 4);
  const __m128i d = _mm_max_epi16(c, _mm_setzero_si128());
  const __m128i e = _mm_min_epi16(d, _mm_set1_epi16(1023));
  StoreAligned16(dst, e);
}

inline void WienerVerticalTap1(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               uint16_t* dst, const ptrdiff_t dst_stride) {
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
      WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
      x += 8;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
      x += 8;
    } while (x < width);
  }
}

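// Top-level Wiener filter. A horizontal pass writes clipped intermediates
// into |wiener_buffer|, then a vertical pass filters them into |dest|. The
// number of leading zero coefficients in each direction selects the
// 7/5/3/1-tap variant.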
void WienerFilter_SSE4_1(
    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_border,
    const ptrdiff_t top_border_stride,
    const void* LIBGAV1_RESTRICT const bottom_border,
    const ptrdiff_t bottom_border_stride, const int width, const int height,
    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
    void* LIBGAV1_RESTRICT const dest) {
  const int16_t* const number_leading_zero_coefficients =
      restoration_info.wiener_info.number_leading_zero_coefficients;
  const int number_rows_to_skip = std::max(
      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
      1);
  const ptrdiff_t wiener_stride = Align(width, 16);
  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
  // The values are saturated to 13 bits before storing.
  int16_t* wiener_buffer_horizontal =
      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;

  // horizontal filtering.
  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
  const int height_horizontal =
      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
  const int height_extra = (height_horizontal - height) >> 1;
  assert(height_extra <= 2);
  const auto* const src = static_cast<const uint16_t*>(source);
  const auto* const top = static_cast<const uint16_t*>(top_border);
  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
  const __m128i coefficients_horizontal =
      LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
                         top_border_stride, wiener_stride, height_extra,
                         coefficients_horizontal, &wiener_buffer_horizontal);
    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
                         coefficients_horizontal, &wiener_buffer_horizontal);
    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
                         height_extra, coefficients_horizontal,
                         &wiener_buffer_horizontal);
  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
                         top_border_stride, wiener_stride, height_extra,
                         coefficients_horizontal, &wiener_buffer_horizontal);
    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
                         coefficients_horizontal, &wiener_buffer_horizontal);
    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
                         height_extra, coefficients_horizontal,
                         &wiener_buffer_horizontal);
  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
    // The maximum over-reads happen here.
    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
                         top_border_stride, wiener_stride, height_extra,
                         coefficients_horizontal, &wiener_buffer_horizontal);
    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
                         coefficients_horizontal, &wiener_buffer_horizontal);
    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
                         height_extra, coefficients_horizontal,
                         &wiener_buffer_horizontal);
  } else {
    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
                         top_border_stride, wiener_stride, height_extra,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap1(src, stride, wiener_stride, height,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
                         height_extra, &wiener_buffer_horizontal);
  }

  // vertical filtering.
  // Over-writes up to 15 values.
  const int16_t* const filter_vertical =
      restoration_info.wiener_info.filter[WienerInfo::kVertical];
  auto* dst = static_cast<uint16_t*>(dest);
  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
    // Because the top row of |source| is a duplicate of the second row, and
    // the bottom row of |source| is a duplicate of the row above it, we can
    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
           sizeof(*wiener_buffer_horizontal) * wiener_stride);
    memcpy(restoration_buffer->wiener_buffer,
           restoration_buffer->wiener_buffer + wiener_stride,
           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
                       filter_vertical, dst, stride);
  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
                       height, filter_vertical + 1, dst, stride);
  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
                       wiener_stride, height, filter_vertical + 2, dst, stride);
  } else {
    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
                       wiener_stride, height, dst, stride);
  }
}

//------------------------------------------------------------------------------
// SGR

// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
// Pass 1 and 2 for Pass 2.
constexpr int kOverreadInBytesPass1 = 4;
constexpr int kOverreadInBytesPass2 = 8;

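// Aligned load/store helpers for the box-sum buffers. The *Msan variants
// forward the number of bytes read past |border| so that MSAN builds can
// mask the over-read lanes.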
inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
                               __m128i dst[2]) {
  dst[0] = LoadAligned16(src[0] + x);
  dst[1] = LoadAligned16(src[1] + x);
}

inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
                                   const ptrdiff_t x, const ptrdiff_t border,
                                   __m128i dst[2]) {
  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
}

inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
                               __m128i dst[3]) {
  dst[0] = LoadAligned16(src[0] + x);
  dst[1] = LoadAligned16(src[1] + x);
  dst[2] = LoadAligned16(src[2] + x);
}

inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
                                   const ptrdiff_t x, const ptrdiff_t border,
                                   __m128i dst[3]) {
  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
  dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
}

inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
  dst[0] = LoadAligned16(src + 0);
  dst[1] = LoadAligned16(src + 4);
}

inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
                                 const ptrdiff_t border, __m128i dst[2]) {
  dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
  dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
}

inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
                               __m128i dst[2][2]) {
  LoadAligned32U32(src[0] + x, dst[0]);
  LoadAligned32U32(src[1] + x, dst[1]);
}

inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
                                   const ptrdiff_t x, const ptrdiff_t border,
                                   __m128i dst[2][2]) {
  LoadAligned32U32Msan(src[0], x, border, dst[0]);
  LoadAligned32U32Msan(src[1], x, border, dst[1]);
}

inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
                               __m128i dst[3][2]) {
  LoadAligned32U32(src[0] + x, dst[0]);
  LoadAligned32U32(src[1] + x, dst[1]);
  LoadAligned32U32(src[2] + x, dst[2]);
}

inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
                                   const ptrdiff_t x, const ptrdiff_t border,
                                   __m128i dst[3][2]) {
  LoadAligned32U32Msan(src[0], x, border, dst[0]);
  LoadAligned32U32Msan(src[1], x, border, dst[1]);
  LoadAligned32U32Msan(src[2], x, border, dst[2]);
}

inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
  StoreAligned16(dst + 0, src[0]);
  StoreAligned16(dst + 8, src[1]);
}

inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
  StoreAligned16(dst + 0, src[0]);
  StoreAligned16(dst + 4, src[1]);
}

inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
  StoreAligned32U32(dst + 0, src + 0);
  StoreAligned32U32(dst + 8, src + 2);
}

// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
// functions. Some compilers may generate super inefficient code and the whole
// decoder could be 15% slower.

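// The helpers below emulate the NEON intrinsics they are named after (vaddl,
// vaddw, vmull, vrshr): widening adds and multiplies and rounding shifts
// built from unpack, madd and shift sequences.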
inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(s0, s1);
}

inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(s0, s1);
}

inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(src0, s1);
}

inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(src0, s1);
}

inline __m128i VmullNLo8(const __m128i src0, const int src1) {
  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
}

inline __m128i VmullNHi8(const __m128i src0, const int src1) {
  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
}

inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
  return _mm_madd_epi16(s0, s1);
}

inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
  return _mm_madd_epi16(s0, s1);
}

inline __m128i VrshrU16(const __m128i src0, const int src1) {
  const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
  return _mm_srli_epi16(sum, src1);
}

inline __m128i VrshrS32(const __m128i src0, const int src1) {
  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
  return _mm_srai_epi32(sum, src1);
}

inline __m128i VrshrU32(const __m128i src0, const int src1) {
  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
  return _mm_srli_epi32(sum, src1);
}

inline void Square(const __m128i src, __m128i dst[2]) {
  const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
  const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
  dst[0] = _mm_madd_epi16(s0, s0);
  dst[1] = _mm_madd_epi16(s1, s1);
}

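// The Prepare*() helpers build the sliding windows used by the box sums: from
// a pair of consecutive registers they produce 3 or 5 copies of the data,
// each shifted by one element, using _mm_alignr_epi8().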
template <int offset>
inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
}

inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
  dst[0] = src[0];
  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
}

inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
  dst[0] = src[0];
  dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
  dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
}

inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
  Prepare3_16(src, dst);
  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
}

inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
  Prepare3_32(src, dst);
  dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
  dst[4] = src[1];
}

inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
                       const __m128i src2) {
  const __m128i sum = _mm_add_epi16(src0, src1);
  return _mm_add_epi16(sum, src2);
}

inline __m128i Sum3_16(const __m128i src[3]) {
  return Sum3_16(src[0], src[1], src[2]);
}

inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
                       const __m128i src2) {
  const __m128i sum = _mm_add_epi32(src0, src1);
  return _mm_add_epi32(sum, src2);
}

inline __m128i Sum3_32(const __m128i src[3]) {
  return Sum3_32(src[0], src[1], src[2]);
}

inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
}

inline __m128i Sum3WLo16(const __m128i src[3]) {
  const __m128i sum = VaddlLo8(src[0], src[1]);
  return VaddwLo8(sum, src[2]);
}

inline __m128i Sum3WHi16(const __m128i src[3]) {
  const __m128i sum = VaddlHi8(src[0], src[1]);
  return VaddwHi8(sum, src[2]);
}

inline __m128i Sum5_16(const __m128i src[5]) {
  const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
  const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
  const __m128i sum = _mm_add_epi16(sum01, sum23);
  return _mm_add_epi16(sum, src[4]);
}

inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
                       const __m128i* const src2, const __m128i* const src3,
                       const __m128i* const src4) {
  const __m128i sum01 = _mm_add_epi32(*src0, *src1);
  const __m128i sum23 = _mm_add_epi32(*src2, *src3);
  const __m128i sum = _mm_add_epi32(sum01, sum23);
  return _mm_add_epi32(sum, *src4);
}

inline __m128i Sum5_32(const __m128i src[5]) {
  return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
}

inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
}

inline __m128i Sum3Horizontal16(const __m128i src[2]) {
  __m128i s[3];
  Prepare3_16(src, s);
  return Sum3_16(s);
}

inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_32(src + 0, s);
  dst[0] = Sum3_32(s);
  Prepare3_32(src + 1, s);
  dst[1] = Sum3_32(s);
}

inline __m128i Sum5Horizontal16(const __m128i src[2]) {
  __m128i s[5];
  Prepare5_16(src, s);
  return Sum5_16(s);
}

inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) {
  __m128i s[5];
  Prepare5_32(src + 0, s);
  dst[0] = Sum5_32(s);
  Prepare5_32(src + 1, s);
  dst[1] = Sum5_32(s);
}

void SumHorizontal16(const __m128i src[2], __m128i* const row3,
                     __m128i* const row5) {
  __m128i s[5];
  Prepare5_16(src, s);
  const __m128i sum04 = _mm_add_epi16(s[0], s[4]);
  *row3 = Sum3_16(s + 1);
  *row5 = _mm_add_epi16(sum04, *row3);
}

inline void SumHorizontal16(const __m128i src[3], __m128i* const row3_0,
                            __m128i* const row3_1, __m128i* const row5_0,
                            __m128i* const row5_1) {
  SumHorizontal16(src + 0, row3_0, row5_0);
  SumHorizontal16(src + 1, row3_1, row5_1);
}

void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3,
                     __m128i* const row_sq5) {
  const __m128i sum04 = _mm_add_epi32(src[0], src[4]);
  *row_sq3 = Sum3_32(src + 1);
  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
}

inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0,
                            __m128i* const row_sq3_1, __m128i* const row_sq5_0,
                            __m128i* const row_sq5_1) {
  __m128i s[5];
  Prepare5_32(src + 0, s);
  SumHorizontal32(s, row_sq3_0, row_sq5_0);
  Prepare5_32(src + 1, s);
  SumHorizontal32(s, row_sq3_1, row_sq5_1);
}

inline __m128i Sum343Lo(const __m128i ma3[3]) {
  const __m128i sum = Sum3WLo16(ma3);
  const __m128i sum3 = Sum3_16(sum, sum, sum);
  return VaddwLo8(sum3, ma3[1]);
}

inline __m128i Sum343Hi(const __m128i ma3[3]) {
  const __m128i sum = Sum3WHi16(ma3);
  const __m128i sum3 = Sum3_16(sum, sum, sum);
  return VaddwHi8(sum3, ma3[1]);
}

inline __m128i Sum343(const __m128i src[3]) {
  const __m128i sum = Sum3_32(src);
  const __m128i sum3 = Sum3_32(sum, sum, sum);
  return _mm_add_epi32(sum3, src[1]);
}

inline void Sum343(const __m128i src[3], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_32(src + 0, s);
  dst[0] = Sum343(s);
  Prepare3_32(src + 1, s);
  dst[1] = Sum343(s);
}

inline __m128i Sum565Lo(const __m128i src[3]) {
  const __m128i sum = Sum3WLo16(src);
  const __m128i sum4 = _mm_slli_epi16(sum, 2);
  const __m128i sum5 = _mm_add_epi16(sum4, sum);
  return VaddwLo8(sum5, src[1]);
}

inline __m128i Sum565Hi(const __m128i src[3]) {
  const __m128i sum = Sum3WHi16(src);
  const __m128i sum4 = _mm_slli_epi16(sum, 2);
  const __m128i sum5 = _mm_add_epi16(sum4, sum);
  return VaddwHi8(sum5, src[1]);
}

inline __m128i Sum565(const __m128i src[3]) {
  const __m128i sum = Sum3_32(src);
  const __m128i sum4 = _mm_slli_epi32(sum, 2);
  const __m128i sum5 = _mm_add_epi32(sum4, sum);
  return _mm_add_epi32(sum5, src[1]);
}

inline void Sum565(const __m128i src[3], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_32(src + 0, s);
  dst[0] = Sum565(s);
  Prepare3_32(src + 1, s);
  dst[1] = Sum565(s);
}

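// Computes 3x3 and 5x5 box sums (and sums of squares) for two rows in one
// pass. Each iteration consumes 16 pixels and carries the last source and
// square registers over as s[0]/sq[0..1] for the next iteration.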
inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
                   const ptrdiff_t width, const ptrdiff_t sum_stride,
                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
                   uint32_t* square_sum3, uint32_t* square_sum5) {
  const ptrdiff_t overread_in_bytes =
      kOverreadInBytesPass1 - sizeof(*src) * width;
  int y = 2;
  do {
    __m128i s[3], sq[6];
    s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
    Square(s[0], sq);
    ptrdiff_t x = sum_width;
    do {
      __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
      s[1] = LoadUnaligned16Msan(
          src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
      x -= 16;
      src += 16;
      s[2] = LoadUnaligned16Msan(
          src, overread_in_bytes + sizeof(*src) * (sum_width - x));
      Square(s[1], sq + 2);
      Square(s[2], sq + 4);
      SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]);
      StoreAligned32U16(sum3, row3);
      StoreAligned32U16(sum5, row5);
      SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
                      &row_sq5[1]);
      StoreAligned32U32(square_sum3 + 0, row_sq3);
      StoreAligned32U32(square_sum5 + 0, row_sq5);
      SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0],
                      &row_sq5[1]);
      StoreAligned32U32(square_sum3 + 8, row_sq3);
      StoreAligned32U32(square_sum5 + 8, row_sq5);
      s[0] = s[2];
      sq[0] = sq[4];
      sq[1] = sq[5];
      sum3 += 16;
      sum5 += 16;
      square_sum3 += 16;
      square_sum5 += 16;
    } while (x != 0);
    src += src_stride - sum_width;
    sum3 += sum_stride - sum_width;
    sum5 += sum_stride - sum_width;
    square_sum3 += sum_stride - sum_width;
    square_sum5 += sum_stride - sum_width;
  } while (--y != 0);
}

template <int size>
inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
                   const ptrdiff_t width, const ptrdiff_t sum_stride,
                   const ptrdiff_t sum_width, uint16_t* sums,
                   uint32_t* square_sums) {
  static_assert(size == 3 || size == 5, "");
  const ptrdiff_t overread_in_bytes =
      ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
      sizeof(*src) * width;
  int y = 2;
  do {
    __m128i s[3], sq[6];
    s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
    Square(s[0], sq);
    ptrdiff_t x = sum_width;
    do {
      __m128i row[2], row_sq[4];
      s[1] = LoadUnaligned16Msan(
          src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
      x -= 16;
      src += 16;
      s[2] = LoadUnaligned16Msan(
          src, overread_in_bytes + sizeof(*src) * (sum_width - x));
      Square(s[1], sq + 2);
      Square(s[2], sq + 4);
      if (size == 3) {
        row[0] = Sum3Horizontal16(s + 0);
        row[1] = Sum3Horizontal16(s + 1);
        Sum3Horizontal32(sq + 0, row_sq + 0);
        Sum3Horizontal32(sq + 2, row_sq + 2);
      } else {
        row[0] = Sum5Horizontal16(s + 0);
        row[1] = Sum5Horizontal16(s + 1);
        Sum5Horizontal32(sq + 0, row_sq + 0);
        Sum5Horizontal32(sq + 2, row_sq + 2);
      }
      StoreAligned32U16(sums, row);
      StoreAligned64U32(square_sums, row_sq);
      s[0] = s[2];
      sq[0] = sq[4];
      sq[1] = sq[5];
      sums += 16;
      square_sums += 16;
    } while (x != 0);
    src += src_stride - sum_width;
    sums += sum_stride - sum_width;
    square_sums += sum_stride - sum_width;
  } while (--y != 0);
}

template <int n>
inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
                           const uint32_t scale) {
  static_assert(n == 9 || n == 25, "");
  // a = |sum_sq|
  // d = |sum|
  // p = (a * n < d * d) ? 0 : a * n - d * d;
  const __m128i dxd = _mm_madd_epi16(sum, sum);
  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
  // Some compilers could do this for us but we make this explicit.
  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
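  // n is 9 or 25, so the multiply decomposes into shifts and adds:
  // x * 9 == x + (x << 3) and x * 25 == x + (x << 3) + (x << 4).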
  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
  const __m128i sub = _mm_sub_epi32(axn, dxd);
  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
  return VrshrU32(pxs, kSgrProjScaleBits);
}

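// 10-bit variant: |sum| and |sum_sq| are rounded down by 2 and 4 bits first,
// the bitdepth - 8 and 2 * (bitdepth - 8) downshifts the self-guided filter
// applies above 8-bit, which also keeps the products within 32 bits.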
template <int n>
inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
                           const uint32_t scale) {
  static_assert(n == 9 || n == 25, "");
  const __m128i b = VrshrU16(sum, 2);
  const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
  const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
  const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
  const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
  return _mm_packus_epi32(z0, z1);
}

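// b = ma * sum * one_over_n, with one_over_n == 164 for the 5x5 pass. 164
// does not fit the signed 8-bit operand of _mm_maddubs_epi16(), so the
// multiply uses one_over_n / 4 == 41 and shifts by two bits fewer at the
// end: ma * sum * 164 >> 12 == (ma * 41) * sum >> 10.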
inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
  // one_over_n == 164.
  constexpr uint32_t one_over_n =
      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
  // one_over_n_quarter == 41.
  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
  static_assert(one_over_n == one_over_n_quarter << 2, "");
  // |ma| is in range [0, 255].
  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
  const __m128i m0 = VmullLo16(m, sum);
  const __m128i m1 = VmullHi16(m, sum);
  b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
  b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
}

inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
  // one_over_n == 455.
  constexpr uint32_t one_over_n =
      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
  const __m128i m0 = VmullLo16(ma, sum);
  const __m128i m1 = VmullHi16(ma, sum);
  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
  b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
  b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
}

inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
                                  const uint32_t scale, __m128i* const sum,
                                  __m128i* const index) {
  __m128i sum_sq[2];
  *sum = Sum5_16(s5);
  Sum5_32(sq5, sum_sq);
  *index = CalculateMa<25>(*sum, sum_sq, scale);
}

inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
                                  const uint32_t scale, __m128i* const sum,
                                  __m128i* const index) {
  __m128i sum_sq[2];
  *sum = Sum3_16(s3);
  Sum3_32(sq3, sum_sq);
  *index = CalculateMa<9>(*sum, sum_sq, scale);
}

template <int n, int offset>
inline void LookupIntermediate(const __m128i sum, const __m128i index,
                               __m128i* const ma, __m128i b[2]) {
  static_assert(n == 9 || n == 25, "");
  static_assert(offset == 0 || offset == 8, "");
  const __m128i idx = _mm_packus_epi16(index, index);
  // |temp| is not actually stored and reloaded; the compiler keeps it in a
  // 64-bit general-purpose register, which is faster than using
  // _mm_extract_epi8().
  uint8_t temp[8];
  StoreLo8(temp, idx);
  // offset == 0 is assumed to be the first call to this function. The value is
  // mov'd to avoid -Wuninitialized warnings under gcc. mov should be at least
  // as fast as pinsrb, if not faster.
  if (offset == 0) {
    *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
  } else {
    *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
  }
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
  // b = ma * b * one_over_n
  // |ma| = [0, 255]
  // |sum| is a box sum with radius 1 or 2.
  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
  // When radius is 2 |n| is 25. |one_over_n| is 164.
  // When radius is 1 |n| is 9. |one_over_n| is 455.
  // |kSgrProjReciprocalBits| is 12.
  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
  __m128i maq;
  if (offset == 0) {
    maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
  } else {
    maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
  }
  if (n == 9) {
    CalculateB3(sum, maq, b);
  } else {
    CalculateB5(sum, maq, b);
  }
}

// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
// to get value 0 as the shuffle result. The most significant bit 1 comes
// either from the comparison instruction, or from the sign bit of the index.
inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
  __m128i mask;
  mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
  mask = _mm_or_si128(mask, index);
  return _mm_shuffle_epi8(table, mask);
}

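// _mm_cmpgt_epi8() yields 0 or -1 per byte, so adding the comparison result
// decrements |value| exactly where index > threshold. The threshold is
// biased by -128 to match the signed domain |index| was shifted into.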
AdjustValue(const __m128i value,const __m128i index,const int threshold)1131 inline __m128i AdjustValue(const __m128i value, const __m128i index,
1132                            const int threshold) {
1133   const __m128i thresholds = _mm_set1_epi8(threshold - 128);
1134   const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
1135   return _mm_add_epi8(value, offset);
1136 }
1137 
CalculateIntermediate(const __m128i sum[2],const __m128i index[2],__m128i * const ma,__m128i b0[2],__m128i b1[2])1138 inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
1139                                   __m128i* const ma, __m128i b0[2],
1140                                   __m128i b1[2]) {
1141   // Use table lookup to read elements whose indices are less than 48.
1142   const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
1143   const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
1144   const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
1145   const __m128i indices = _mm_packus_epi16(index[0], index[1]);
1146   __m128i idx;
1147   // Clip idx to 127 to apply signed comparison instructions.
1148   idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
1149   // All elements whose indices are no less than 48 are set to 0.
1150   // Get shuffle results for indices in range [0, 15].
1151   *ma = ShuffleIndex(c0, idx);
1152   // Get shuffle results for indices in range [16, 31].
1153   // Subtract 16 to utilize the sign bit of the index.
1154   idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
1155   const __m128i res1 = ShuffleIndex(c1, idx);
1156   // Use OR instruction to combine shuffle results together.
1157   *ma = _mm_or_si128(*ma, res1);
1158   // Get shuffle results for indices in range [32, 47].
1159   // Subtract 16 to utilize the sign bit of the index.
1160   idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
1161   const __m128i res2 = ShuffleIndex(c2, idx);
1162   *ma = _mm_or_si128(*ma, res2);
1163 
1164   // For elements whose indices are larger than 47, since their values change
1165   // only rarely as the index increases, we use comparison and arithmetic
1166   // operations to calculate their values.
1167   // Add -128 to apply signed comparison instructions.
1168   idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
1169   // Elements whose indices are larger than 47 (with value 0) are set to 5.
1170   *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
1171   *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
1172   *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
1173   *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
1174   *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
1175   *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
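  // After these five adjustments, lanes with index i > 47 hold 5 for
  // i in [48, 55], 4 for (55, 72], 3 for (72, 101], 2 for (101, 169],
  // 1 for (169, 254], and 0 for i == 255.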
1176 
1177   // b = ma * b * one_over_n
1178   // |ma| = [0, 255]
1179   // |sum| is a box sum with radius 1 or 2.
1180   // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
1181   // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
1182   // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
1183   // When radius is 2 |n| is 25. |one_over_n| is 164.
1184   // When radius is 1 |n| is 9. |one_over_n| is 455.
1185   // |kSgrProjReciprocalBits| is 12.
1186   // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
1187   // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
1188   const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
1189   CalculateB3(sum[0], maq0, b0);
1190   const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
1191   CalculateB3(sum[1], maq1, b1);
1192 }
1193 
1194 inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
1195                                   __m128i ma[2], __m128i b[4]) {
1196   __m128i mas;
1197   CalculateIntermediate(sum, index, &mas, b + 0, b + 2);
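  // |mas| holds 16 ma bytes. The next two statements append its low 8 bytes to
  // the previous batch's ma[0] and shift its high 8 bytes down into ma[1], so
  // a 3-byte window can slide across consecutive batches (see the
  // Prepare3_8() callers below).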
1198   ma[0] = _mm_unpacklo_epi64(ma[0], mas);
1199   ma[1] = _mm_srli_si128(mas, 8);
1200 }
1201 
1202 // Note: Replacing the slow LookupIntermediate() with a call to
1203 // CalculateIntermediate() when calculating 16 intermediate data points has
1204 // been tried. However, the compiler generates even slower code.
1205 template <int offset>
1206 inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
1207                                    const uint32_t scale, __m128i* const ma,
1208                                    __m128i b[2]) {
1209   static_assert(offset == 0 || offset == 8, "");
1210   __m128i sum, index;
1211   CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
1212   LookupIntermediate<25, offset>(sum, index, ma, b);
1213 }
1214 
1215 inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
1216                                    const uint32_t scale, __m128i* const ma,
1217                                    __m128i b[2]) {
1218   __m128i sum, index;
1219   CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
1220   LookupIntermediate<9, 0>(sum, index, ma, b);
1221 }
1222 
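// Store343_444() computes sum_b444 = 4 * (b0 + b1 + b2) and
// sum_b343 = sum_b444 - (b0 + b1 + b2) + b1 = 3 * b0 + 4 * b1 + 3 * b2, i.e.
// the (4, 4, 4) and (3, 4, 3) weightings that give these buffers their names.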
1223 inline void Store343_444(const __m128i b3[3], const ptrdiff_t x,
1224                          __m128i sum_b343[2], __m128i sum_b444[2],
1225                          uint32_t* const b343, uint32_t* const b444) {
1226   __m128i b[3], sum_b111[2];
1227   Prepare3_32(b3 + 0, b);
1228   sum_b111[0] = Sum3_32(b);
1229   sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
1230   sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
1231   sum_b343[0] = _mm_add_epi32(sum_b343[0], b[1]);
1232   Prepare3_32(b3 + 1, b);
1233   sum_b111[1] = Sum3_32(b);
1234   sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
1235   sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
1236   sum_b343[1] = _mm_add_epi32(sum_b343[1], b[1]);
1237   StoreAligned32U32(b444 + x, sum_b444);
1238   StoreAligned32U32(b343 + x, sum_b343);
1239 }
1240 
1241 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[3],
1242                            const ptrdiff_t x, __m128i* const sum_ma343,
1243                            __m128i* const sum_ma444, __m128i sum_b343[2],
1244                            __m128i sum_b444[2], uint16_t* const ma343,
1245                            uint16_t* const ma444, uint32_t* const b343,
1246                            uint32_t* const b444) {
1247   const __m128i sum_ma111 = Sum3WLo16(ma3);
1248   *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
1249   StoreAligned16(ma444 + x, *sum_ma444);
1250   const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
1251   *sum_ma343 = VaddwLo8(sum333, ma3[1]);
1252   StoreAligned16(ma343 + x, *sum_ma343);
1253   Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
1254 }
1255 
1256 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[3],
1257                            const ptrdiff_t x, __m128i* const sum_ma343,
1258                            __m128i* const sum_ma444, __m128i sum_b343[2],
1259                            __m128i sum_b444[2], uint16_t* const ma343,
1260                            uint16_t* const ma444, uint32_t* const b343,
1261                            uint32_t* const b444) {
1262   const __m128i sum_ma111 = Sum3WHi16(ma3);
1263   *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
1264   StoreAligned16(ma444 + x, *sum_ma444);
1265   const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
1266   *sum_ma343 = VaddwHi8(sum333, ma3[1]);
1267   StoreAligned16(ma343 + x, *sum_ma343);
1268   Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
1269 }
1270 
1271 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
1272                            const ptrdiff_t x, __m128i* const sum_ma343,
1273                            __m128i sum_b343[2], uint16_t* const ma343,
1274                            uint16_t* const ma444, uint32_t* const b343,
1275                            uint32_t* const b444) {
1276   __m128i sum_ma444, sum_b444[2];
1277   Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
1278                  ma444, b343, b444);
1279 }
1280 
1281 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1282                            const ptrdiff_t x, __m128i* const sum_ma343,
1283                            __m128i sum_b343[2], uint16_t* const ma343,
1284                            uint16_t* const ma444, uint32_t* const b343,
1285                            uint32_t* const b444) {
1286   __m128i sum_ma444, sum_b444[2];
1287   Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
1288                  ma444, b343, b444);
1289 }
1290 
1291 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
1292                            const ptrdiff_t x, uint16_t* const ma343,
1293                            uint16_t* const ma444, uint32_t* const b343,
1294                            uint32_t* const b444) {
1295   __m128i sum_ma343, sum_b343[2];
1296   Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
1297 }
1298 
1299 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1300                            const ptrdiff_t x, uint16_t* const ma343,
1301                            uint16_t* const ma444, uint32_t* const b343,
1302                            uint32_t* const b444) {
1303   __m128i sum_ma343, sum_b343[2];
1304   Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
1305 }
1306 
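// The *Lo() variants below handle the leftmost 8 pixels of a row (pair),
// priming the sliding sums before the main x loops take over in 16-pixel
// steps.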
1307 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
1308     const __m128i s[2][4], const uint32_t scale, uint16_t* const sum5[5],
1309     uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i* const ma,
1310     __m128i b[2]) {
1311   __m128i s5[2][5], sq5[5][2];
1312   Square(s[0][1], sq[0] + 2);
1313   Square(s[1][1], sq[1] + 2);
1314   s5[0][3] = Sum5Horizontal16(s[0]);
1315   StoreAligned16(sum5[3], s5[0][3]);
1316   s5[0][4] = Sum5Horizontal16(s[1]);
1317   StoreAligned16(sum5[4], s5[0][4]);
1318   Sum5Horizontal32(sq[0], sq5[3]);
1319   StoreAligned32U32(square_sum5[3], sq5[3]);
1320   Sum5Horizontal32(sq[1], sq5[4]);
1321   StoreAligned32U32(square_sum5[4], sq5[4]);
1322   LoadAligned16x3U16(sum5, 0, s5[0]);
1323   LoadAligned32x3U32(square_sum5, 0, sq5);
1324   CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
1325 }
1326 
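// Each call consumes 16 new pixels as two 8-pixel halves: the first half
// completes ma[0]/b[2..3] with offset 8, the second starts ma[1]/b[4..5] with
// offset 0.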
1327 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
1328     const __m128i s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x,
1329     const uint32_t scale, uint16_t* const sum5[5],
1330     uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma[2],
1331     __m128i b[6]) {
1332   __m128i s5[2][5], sq5[5][2];
1333   Square(s[0][2], sq[0] + 4);
1334   Square(s[1][2], sq[1] + 4);
1335   s5[0][3] = Sum5Horizontal16(s[0] + 1);
1336   s5[1][3] = Sum5Horizontal16(s[0] + 2);
1337   StoreAligned16(sum5[3] + x + 0, s5[0][3]);
1338   StoreAligned16(sum5[3] + x + 8, s5[1][3]);
1339   s5[0][4] = Sum5Horizontal16(s[1] + 1);
1340   s5[1][4] = Sum5Horizontal16(s[1] + 2);
1341   StoreAligned16(sum5[4] + x + 0, s5[0][4]);
1342   StoreAligned16(sum5[4] + x + 8, s5[1][4]);
1343   Sum5Horizontal32(sq[0] + 2, sq5[3]);
1344   StoreAligned32U32(square_sum5[3] + x, sq5[3]);
1345   Sum5Horizontal32(sq[1] + 2, sq5[4]);
1346   StoreAligned32U32(square_sum5[4] + x, sq5[4]);
1347   LoadAligned16x3U16(sum5, x, s5[0]);
1348   LoadAligned32x3U32(square_sum5, x, sq5);
1349   CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
1350 
1351   Square(s[0][3], sq[0] + 6);
1352   Square(s[1][3], sq[1] + 6);
1353   Sum5Horizontal32(sq[0] + 4, sq5[3]);
1354   StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
1355   Sum5Horizontal32(sq[1] + 4, sq5[4]);
1356   StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
1357   LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1358   LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1359   CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
1360 }
1361 
1362 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
1363     const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5],
1364     const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma,
1365     __m128i b[2]) {
1366   __m128i s5[5], sq5[5][2];
1367   Square(s[1], sq + 2);
1368   s5[3] = s5[4] = Sum5Horizontal16(s);
1369   Sum5Horizontal32(sq, sq5[3]);
1370   sq5[4][0] = sq5[3][0];
1371   sq5[4][1] = sq5[3][1];
1372   LoadAligned16x3U16(sum5, 0, s5);
1373   LoadAligned32x3U32(square_sum5, 0, sq5);
1374   CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
1375 }
1376 
1377 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
1378     const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
1379     const uint32_t scale, const uint16_t* const sum5[5],
1380     const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma[2],
1381     __m128i b[6]) {
1382   __m128i s5[2][5], sq5[5][2];
1383   Square(s[2], sq + 4);
1384   s5[0][3] = Sum5Horizontal16(s + 1);
1385   s5[1][3] = Sum5Horizontal16(s + 2);
1386   s5[0][4] = s5[0][3];
1387   s5[1][4] = s5[1][3];
1388   Sum5Horizontal32(sq + 2, sq5[3]);
1389   sq5[4][0] = sq5[3][0];
1390   sq5[4][1] = sq5[3][1];
1391   LoadAligned16x3U16(sum5, x, s5[0]);
1392   LoadAligned32x3U32(square_sum5, x, sq5);
1393   CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
1394 
1395   Square(s[3], sq + 6);
1396   Sum5Horizontal32(sq + 4, sq5[3]);
1397   sq5[4][0] = sq5[3][0];
1398   sq5[4][1] = sq5[3][1];
1399   LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1400   LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1401   CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
1402 }
1403 
1404 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
1405     const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3],
1406     uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma,
1407     __m128i b[2]) {
1408   __m128i s3[3], sq3[3][2];
1409   Square(s[1], sq + 2);
1410   s3[2] = Sum3Horizontal16(s);
1411   StoreAligned16(sum3[2], s3[2]);
1412   Sum3Horizontal32(sq, sq3[2]);
1413   StoreAligned32U32(square_sum3[2], sq3[2]);
1414   LoadAligned16x2U16(sum3, 0, s3);
1415   LoadAligned32x2U32(square_sum3, 0, sq3);
1416   CalculateIntermediate3(s3, sq3, scale, ma, b);
1417 }
1418 
1419 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
1420     const __m128i s[4], const ptrdiff_t x, const ptrdiff_t sum_width,
1421     const uint32_t scale, uint16_t* const sum3[3],
1422     uint32_t* const square_sum3[3], __m128i sq[8], __m128i ma[2],
1423     __m128i b[6]) {
1424   __m128i s3[4], sq3[3][2], sum[2], index[2];
1425   Square(s[2], sq + 4);
1426   s3[2] = Sum3Horizontal16(s + 1);
1427   s3[3] = Sum3Horizontal16(s + 2);
1428   StoreAligned32U16(sum3[2] + x, s3 + 2);
1429   Sum3Horizontal32(sq + 2, sq3[2]);
1430   StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
1431   LoadAligned16x2U16(sum3, x, s3);
1432   LoadAligned32x2U32(square_sum3, x, sq3);
1433   CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
1434 
1435   Square(s[3], sq + 6);
1436   Sum3Horizontal32(sq + 4, sq3[2]);
1437   StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
1438   LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
1439   LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
1440   CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
1441   CalculateIntermediate(sum, index, ma, b + 2);
1442 }
1443 
1444 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
1445     const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
1446     uint16_t* const sum5[5], uint32_t* const square_sum3[4],
1447     uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][2],
1448     __m128i b3[2][6], __m128i* const ma5, __m128i b5[2]) {
1449   __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
1450   Square(s[0][1], sq[0] + 2);
1451   Square(s[1][1], sq[1] + 2);
1452   SumHorizontal16(s[0], &s3[2], &s5[3]);
1453   SumHorizontal16(s[1], &s3[3], &s5[4]);
1454   StoreAligned16(sum3[2], s3[2]);
1455   StoreAligned16(sum3[3], s3[3]);
1456   StoreAligned16(sum5[3], s5[3]);
1457   StoreAligned16(sum5[4], s5[4]);
1458   SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1459   StoreAligned32U32(square_sum3[2], sq3[2]);
1460   StoreAligned32U32(square_sum5[3], sq5[3]);
1461   SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
1462   StoreAligned32U32(square_sum3[3], sq3[3]);
1463   StoreAligned32U32(square_sum5[4], sq5[4]);
1464   LoadAligned16x2U16(sum3, 0, s3);
1465   LoadAligned32x2U32(square_sum3, 0, sq3);
1466   LoadAligned16x3U16(sum5, 0, s5);
1467   LoadAligned32x3U32(square_sum5, 0, sq5);
1468   CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
1469   CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
1470   CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
1471   ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
1472   CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
1473 }
1474 
1475 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
1476     const __m128i s[2][4], const ptrdiff_t x, const uint16_t scales[2],
1477     uint16_t* const sum3[4], uint16_t* const sum5[5],
1478     uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
1479     const ptrdiff_t sum_width, __m128i sq[2][8], __m128i ma3[2][2],
1480     __m128i b3[2][6], __m128i ma5[2], __m128i b5[6]) {
1481   __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
1482   SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
1483   StoreAligned16(sum3[2] + x + 0, s3[0][2]);
1484   StoreAligned16(sum3[2] + x + 8, s3[1][2]);
1485   StoreAligned16(sum5[3] + x + 0, s5[0][3]);
1486   StoreAligned16(sum5[3] + x + 8, s5[1][3]);
1487   SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
1488   StoreAligned16(sum3[3] + x + 0, s3[0][3]);
1489   StoreAligned16(sum3[3] + x + 8, s3[1][3]);
1490   StoreAligned16(sum5[4] + x + 0, s5[0][4]);
1491   StoreAligned16(sum5[4] + x + 8, s5[1][4]);
1492   Square(s[0][2], sq[0] + 4);
1493   Square(s[1][2], sq[1] + 4);
1494   SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1495   StoreAligned32U32(square_sum3[2] + x, sq3[2]);
1496   StoreAligned32U32(square_sum5[3] + x, sq5[3]);
1497   SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
1498   StoreAligned32U32(square_sum3[3] + x, sq3[3]);
1499   StoreAligned32U32(square_sum5[4] + x, sq5[4]);
1500   LoadAligned16x2U16(sum3, x, s3[0]);
1501   LoadAligned32x2U32(square_sum3, x, sq3);
1502   CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
1503   CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
1504                         &index[1][0]);
1505   LoadAligned16x3U16(sum5, x, s5[0]);
1506   LoadAligned32x3U32(square_sum5, x, sq5);
1507   CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2);
1508 
1509   Square(s[0][3], sq[0] + 6);
1510   Square(s[1][3], sq[1] + 6);
1511   SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1512   StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
1513   StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
1514   SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
1515   StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
1516   StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
1517   LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
1518   LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
1519   CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
1520   CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
1521                         &index[1][1]);
1522   CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2);
1523   CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2);
1524   LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1525   LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1526   CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4);
1527 }
1528 
1529 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
1530     const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4],
1531     const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
1532     const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3,
1533     __m128i* const ma5, __m128i b3[2], __m128i b5[2]) {
1534   __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
1535   Square(s[1], sq + 2);
1536   SumHorizontal16(s, &s3[2], &s5[3]);
1537   SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1538   LoadAligned16x3U16(sum5, 0, s5);
1539   s5[4] = s5[3];
1540   LoadAligned32x3U32(square_sum5, 0, sq5);
1541   sq5[4][0] = sq5[3][0];
1542   sq5[4][1] = sq5[3][1];
1543   CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
1544   LoadAligned16x2U16(sum3, 0, s3);
1545   LoadAligned32x2U32(square_sum3, 0, sq3);
1546   CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
1547 }
1548 
1549 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
1550     const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
1551     const uint16_t scales[2], const uint16_t* const sum3[4],
1552     const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
1553     const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma3[2],
1554     __m128i ma5[2], __m128i b3[6], __m128i b5[6]) {
1555   __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
1556   Square(s[2], sq + 4);
1557   SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
1558   SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1559   LoadAligned16x3U16(sum5, x, s5[0]);
1560   s5[0][4] = s5[0][3];
1561   LoadAligned32x3U32(square_sum5, x, sq5);
1562   sq5[4][0] = sq5[3][0];
1563   sq5[4][1] = sq5[3][1];
1564   CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2);
1565   LoadAligned16x2U16(sum3, x, s3[0]);
1566   LoadAligned32x2U32(square_sum3, x, sq3);
1567   CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
1568 
1569   Square(s[3], sq + 6);
1570   SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1571   LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1572   s5[1][4] = s5[1][3];
1573   LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1574   sq5[4][0] = sq5[3][0];
1575   sq5[4][1] = sq5[3][1];
1576   CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4);
1577   LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
1578   LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
1579   CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
1580   CalculateIntermediate(sum, index, ma3, b3 + 2);
1581 }
1582 
1583 inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
1584                                     const uint16_t* const src1, const int width,
1585                                     const uint32_t scale,
1586                                     uint16_t* const sum5[5],
1587                                     uint32_t* const square_sum5[5],
1588                                     const ptrdiff_t sum_width, uint16_t* ma565,
1589                                     uint32_t* b565) {
1590   const ptrdiff_t overread_in_bytes =
1591       kOverreadInBytesPass1 - sizeof(*src0) * width;
1592   __m128i s[2][4], mas[2], sq[2][8], bs[6];
1593   s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
1594   s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
1595   s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
1596   s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
1597   Square(s[0][0], sq[0]);
1598   Square(s[1][0], sq[1]);
1599   BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
1600 
1601   int x = 0;
1602   do {
1603     __m128i ma5[3], ma[2], b[4];
1604     s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
1605                                   overread_in_bytes + sizeof(*src0) * (x + 16));
1606     s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
1607                                   overread_in_bytes + sizeof(*src0) * (x + 24));
1608     s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
1609                                   overread_in_bytes + sizeof(*src1) * (x + 16));
1610     s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
1611                                   overread_in_bytes + sizeof(*src1) * (x + 24));
1612     BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
1613                          bs);
1614     Prepare3_8<0>(mas, ma5);
1615     ma[0] = Sum565Lo(ma5);
1616     ma[1] = Sum565Hi(ma5);
1617     StoreAligned32U16(ma565, ma);
1618     Sum565(bs + 0, b + 0);
1619     Sum565(bs + 2, b + 2);
1620     StoreAligned64U32(b565, b);
1621     s[0][0] = s[0][2];
1622     s[0][1] = s[0][3];
1623     s[1][0] = s[1][2];
1624     s[1][1] = s[1][3];
1625     sq[0][2] = sq[0][6];
1626     sq[0][3] = sq[0][7];
1627     sq[1][2] = sq[1][6];
1628     sq[1][3] = sq[1][7];
1629     mas[0] = mas[1];
1630     bs[0] = bs[4];
1631     bs[1] = bs[5];
1632     ma565 += 16;
1633     b565 += 16;
1634     x += 16;
1635   } while (x < width);
1636 }
1637 
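// When |calculate444| is true, both the 343 and 444 running sums are stored;
// otherwise only the 343 sums are needed.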
1638 template <bool calculate444>
1639 LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
1640     const uint16_t* const src, const int width, const uint32_t scale,
1641     uint16_t* const sum3[3], uint32_t* const square_sum3[3],
1642     const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
1643     uint32_t* b444) {
1644   const ptrdiff_t overread_in_bytes =
1645       kOverreadInBytesPass2 - sizeof(*src) * width;
1646   __m128i s[4], mas[2], sq[8], bs[6];
1647   s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes + 0);
1648   s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes + 16);
1649   Square(s[0], sq);
1650   BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
1651 
1652   int x = 0;
1653   do {
1654     s[2] = LoadUnaligned16Msan(src + x + 16,
1655                                overread_in_bytes + sizeof(*src) * (x + 16));
1656     s[3] = LoadUnaligned16Msan(src + x + 24,
1657                                overread_in_bytes + sizeof(*src) * (x + 24));
1658     BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
1659                          bs);
1660     __m128i ma3[3];
1661     Prepare3_8<0>(mas, ma3);
1662     if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
1663       Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
1664       Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444);
1665       ma444 += 16;
1666       b444 += 16;
1667     } else {
1668       __m128i ma[2], b[4];
1669       ma[0] = Sum343Lo(ma3);
1670       ma[1] = Sum343Hi(ma3);
1671       StoreAligned32U16(ma343, ma);
1672       Sum343(bs + 0, b + 0);
1673       Sum343(bs + 2, b + 2);
1674       StoreAligned64U32(b343, b);
1675     }
1676     s[1] = s[3];
1677     sq[2] = sq[6];
1678     sq[3] = sq[7];
1679     mas[0] = mas[1];
1680     bs[0] = bs[4];
1681     bs[1] = bs[5];
1682     ma343 += 16;
1683     b343 += 16;
1684     x += 16;
1685   } while (x < width);
1686 }
1687 
1688 inline void BoxSumFilterPreProcess(
1689     const uint16_t* const src0, const uint16_t* const src1, const int width,
1690     const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
1691     uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
1692     const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
1693     uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
1694     uint32_t* b565) {
1695   const ptrdiff_t overread_in_bytes =
1696       kOverreadInBytesPass1 - sizeof(*src0) * width;
1697   __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
1698   s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
1699   s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
1700   s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
1701   s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
1702   Square(s[0][0], sq[0]);
1703   Square(s[1][0], sq[1]);
1704   BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
1705                         ma3, b3, &ma5[0], b5);
1706 
1707   int x = 0;
1708   do {
1709     __m128i ma[2], b[4], ma3x[3], ma5x[3];
1710     s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
1711                                   overread_in_bytes + sizeof(*src0) * (x + 16));
1712     s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
1713                                   overread_in_bytes + sizeof(*src0) * (x + 24));
1714     s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
1715                                   overread_in_bytes + sizeof(*src1) * (x + 16));
1716     s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
1717                                   overread_in_bytes + sizeof(*src1) * (x + 24));
1718     BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
1719                         sum_width, sq, ma3, b3, ma5, b5);
1720 
1721     Prepare3_8<0>(ma3[0], ma3x);
1722     ma[0] = Sum343Lo(ma3x);
1723     ma[1] = Sum343Hi(ma3x);
1724     StoreAligned32U16(ma343[0] + x, ma);
1725     Sum343(b3[0] + 0, b + 0);
1726     Sum343(b3[0] + 2, b + 2);
1727     StoreAligned64U32(b343[0] + x, b);
1728     Sum565(b5 + 0, b + 0);
1729     Sum565(b5 + 2, b + 2);
1730     StoreAligned64U32(b565, b);
1731     Prepare3_8<0>(ma3[1], ma3x);
1732     Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
1733     Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444);
1734     Prepare3_8<0>(ma5, ma5x);
1735     ma[0] = Sum565Lo(ma5x);
1736     ma[1] = Sum565Hi(ma5x);
1737     StoreAligned32U16(ma565, ma);
1738     s[0][0] = s[0][2];
1739     s[0][1] = s[0][3];
1740     s[1][0] = s[1][2];
1741     s[1][1] = s[1][3];
1742     sq[0][2] = sq[0][6];
1743     sq[0][3] = sq[0][7];
1744     sq[1][2] = sq[1][6];
1745     sq[1][3] = sq[1][7];
1746     ma3[0][0] = ma3[0][1];
1747     ma3[1][0] = ma3[1][1];
1748     ma5[0] = ma5[1];
1749     b3[0][0] = b3[0][4];
1750     b3[0][1] = b3[0][5];
1751     b3[1][0] = b3[1][4];
1752     b3[1][1] = b3[1][5];
1753     b5[0] = b5[4];
1754     b5[1] = b5[5];
1755     ma565 += 16;
1756     b565 += 16;
1757     x += 16;
1758   } while (x < width);
1759 }
1760 
1761 template <int shift>
1762 inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
1763   // ma: 255 * 32 = 8160 (13 bits)
1764   // b: 65088 * 32 = 2082816 (21 bits)
1765   // v: b - ma * 255 (22 bits)
1766   const __m128i v = _mm_sub_epi32(b, ma_x_src);
1767   // kSgrProjSgrBits = 8
1768   // kSgrProjRestoreBits = 4
1769   // shift = 4 or 5
1770   // v >> 8 or 9 (13 bits)
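  // (kSgrProjSgrBits + shift - kSgrProjRestoreBits is 8 + 4 - 4 = 8 or
  //  8 + 5 - 4 = 9.)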
1771   return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
1772 }
1773 
1774 template <int shift>
1775 inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
1776                                        const __m128i b[2]) {
1777   const __m128i ma_x_src_lo = VmullLo16(ma, src);
1778   const __m128i ma_x_src_hi = VmullHi16(ma, src);
1779   const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
1780   const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
1781   return _mm_packs_epi32(dst_lo, dst_hi);  // 13 bits
1782 }
1783 
1784 inline __m128i CalculateFilteredOutputPass1(const __m128i src,
1785                                             const __m128i ma[2],
1786                                             const __m128i b[2][2]) {
1787   const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
1788   __m128i b_sum[2];
1789   b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
1790   b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
1791   return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
1792 }
1793 
1794 inline __m128i CalculateFilteredOutputPass2(const __m128i src,
1795                                             const __m128i ma[3],
1796                                             const __m128i b[3][2]) {
1797   const __m128i ma_sum = Sum3_16(ma);
1798   __m128i b_sum[2];
1799   Sum3_32(b, b_sum);
1800   return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
1801 }
1802 
1803 inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
1804   const __m128i v_lo =
1805       VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
1806   const __m128i v_hi =
1807       VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
1808   const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
1809   return _mm_add_epi16(src, vv);
1810 }
1811 
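// SelfGuidedDoubleMultiplier() packs w0 and w2 into a single 32-bit lane and
// interleaves the two filter outputs, so one _mm_madd_epi16() yields
// w0 * filter[0] + w2 * filter[1] per pixel as a 32-bit result.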
1812 inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
1813                                           const __m128i filter[2], const int w0,
1814                                           const int w2) {
1815   __m128i v[2];
1816   const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
1817   const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
1818   const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
1819   v[0] = _mm_madd_epi16(w0_w2, f_lo);
1820   v[1] = _mm_madd_epi16(w0_w2, f_hi);
1821   return SelfGuidedFinal(src, v);
1822 }
1823 
1824 inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
1825                                           const __m128i filter, const int w0) {
1826   // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
1827   __m128i v[2];
1828   v[0] = VmullNLo8(filter, w0);
1829   v[1] = VmullNHi8(filter, w0);
1830   return SelfGuidedFinal(src, v);
1831 }
1832 
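// Clamp the filtered output to the 10-bit pixel range [0, 1023] before
// storing.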
1833 inline void ClipAndStore(uint16_t* const dst, const __m128i val) {
1834   const __m128i val0 = _mm_max_epi16(val, _mm_setzero_si128());
1835   const __m128i val1 = _mm_min_epi16(val0, _mm_set1_epi16(1023));
1836   StoreAligned16(dst, val1);
1837 }
1838 
1839 LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
1840     const uint16_t* const src, const uint16_t* const src0,
1841     const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
1842     uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
1843     const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
1844     uint32_t* const b565[2], uint16_t* const dst) {
1845   const ptrdiff_t overread_in_bytes =
1846       kOverreadInBytesPass1 - sizeof(*src0) * width;
1847   __m128i s[2][4], mas[2], sq[2][8], bs[6];
1848   s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
1849   s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
1850   s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
1851   s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
1852   Square(s[0][0], sq[0]);
1853   Square(s[1][0], sq[1]);
1854   BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
1855 
1856   int x = 0;
1857   do {
1858     __m128i ma[2], ma5[3], b[2][2], p[2];
1859     s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
1860                                   overread_in_bytes + sizeof(*src0) * (x + 16));
1861     s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
1862                                   overread_in_bytes + sizeof(*src0) * (x + 24));
1863     s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
1864                                   overread_in_bytes + sizeof(*src1) * (x + 16));
1865     s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
1866                                   overread_in_bytes + sizeof(*src1) * (x + 24));
1867     BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
1868                          bs);
1869     Prepare3_8<0>(mas, ma5);
1870     ma[1] = Sum565Lo(ma5);
1871     StoreAligned16(ma565[1] + x, ma[1]);
1872     Sum565(bs, b[1]);
1873     StoreAligned32U32(b565[1] + x, b[1]);
1874     const __m128i sr0_lo = LoadAligned16(src + x + 0);
1875     const __m128i sr1_lo = LoadAligned16(src + stride + x + 0);
1876     ma[0] = LoadAligned16(ma565[0] + x);
1877     LoadAligned32U32(b565[0] + x, b[0]);
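    // sr0 uses the 565 sums of two consecutive row pairs (shift 5); sr1 only
    // has the current pair's sums, which the smaller shift (4) implicitly
    // doubles.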
1878     p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
1879     p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
1880     const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
1881     const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
1882 
1883     ma[1] = Sum565Hi(ma5);
1884     StoreAligned16(ma565[1] + x + 8, ma[1]);
1885     Sum565(bs + 2, b[1]);
1886     StoreAligned32U32(b565[1] + x + 8, b[1]);
1887     const __m128i sr0_hi = LoadAligned16(src + x + 8);
1888     const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
1889     ma[0] = LoadAligned16(ma565[0] + x + 8);
1890     LoadAligned32U32(b565[0] + x + 8, b[0]);
1891     p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
1892     p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
1893     const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
1894     ClipAndStore(dst + x + 0, d00);
1895     ClipAndStore(dst + x + 8, d01);
1896     const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
1897     ClipAndStore(dst + stride + x + 0, d10);
1898     ClipAndStore(dst + stride + x + 8, d11);
1899     s[0][0] = s[0][2];
1900     s[0][1] = s[0][3];
1901     s[1][0] = s[1][2];
1902     s[1][1] = s[1][3];
1903     sq[0][2] = sq[0][6];
1904     sq[0][3] = sq[0][7];
1905     sq[1][2] = sq[1][6];
1906     sq[1][3] = sq[1][7];
1907     mas[0] = mas[1];
1908     bs[0] = bs[4];
1909     bs[1] = bs[5];
1910     x += 16;
1911   } while (x < width);
1912 }
1913 
1914 inline void BoxFilterPass1LastRow(
1915     const uint16_t* const src, const uint16_t* const src0, const int width,
1916     const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
1917     uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
1918     uint32_t* b565, uint16_t* const dst) {
1919   const ptrdiff_t overread_in_bytes =
1920       kOverreadInBytesPass1 - sizeof(*src0) * width;
1921   __m128i s[4], mas[2], sq[8], bs[6];
1922   s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
1923   s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
1924   Square(s[0], sq);
1925   BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
1926 
1927   int x = 0;
1928   do {
1929     __m128i ma[2], ma5[3], b[2][2];
1930     s[2] = LoadUnaligned16Msan(src0 + x + 16,
1931                                overread_in_bytes + sizeof(*src0) * (x + 16));
1932     s[3] = LoadUnaligned16Msan(src0 + x + 24,
1933                                overread_in_bytes + sizeof(*src0) * (x + 24));
1934     BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
1935                                 sq, mas, bs);
1936     Prepare3_8<0>(mas, ma5);
1937     ma[1] = Sum565Lo(ma5);
1938     Sum565(bs, b[1]);
1939     ma[0] = LoadAligned16(ma565);
1940     LoadAligned32U32(b565, b[0]);
1941     const __m128i sr_lo = LoadAligned16(src + x + 0);
1942     __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
1943     const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
1944 
1945     ma[1] = Sum565Hi(ma5);
1946     Sum565(bs + 2, b[1]);
1947     ma[0] = LoadAligned16(ma565 + 8);
1948     LoadAligned32U32(b565 + 8, b[0]);
1949     const __m128i sr_hi = LoadAligned16(src + x + 8);
1950     p = CalculateFilteredOutputPass1(sr_hi, ma, b);
1951     const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
1952     ClipAndStore(dst + x + 0, d0);
1953     ClipAndStore(dst + x + 8, d1);
1954     s[1] = s[3];
1955     sq[2] = sq[6];
1956     sq[3] = sq[7];
1957     mas[0] = mas[1];
1958     bs[0] = bs[4];
1959     bs[1] = bs[5];
1960     ma565 += 16;
1961     b565 += 16;
1962     x += 16;
1963   } while (x < width);
1964 }
1965 
1966 LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
1967     const uint16_t* const src, const uint16_t* const src0, const int width,
1968     const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
1969     uint16_t* const sum3[3], uint32_t* const square_sum3[3],
1970     uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
1971     uint32_t* const b444[2], uint16_t* const dst) {
1972   const ptrdiff_t overread_in_bytes =
1973       kOverreadInBytesPass2 - sizeof(*src0) * width;
1974   __m128i s[4], mas[2], sq[8], bs[6];
1975   s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
1976   s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
1977   Square(s[0], sq);
1978   BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
1979 
1980   int x = 0;
1981   do {
1982     s[2] = LoadUnaligned16Msan(src0 + x + 16,
1983                                overread_in_bytes + sizeof(*src0) * (x + 16));
1984     s[3] = LoadUnaligned16Msan(src0 + x + 24,
1985                                overread_in_bytes + sizeof(*src0) * (x + 24));
1986     BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
1987                          bs);
1988     __m128i ma[3], b[3][2], ma3[3];
1989     Prepare3_8<0>(mas, ma3);
1990     Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
1991                    b444[1]);
1992     const __m128i sr_lo = LoadAligned16(src + x + 0);
1993     ma[0] = LoadAligned16(ma343[0] + x);
1994     ma[1] = LoadAligned16(ma444[0] + x);
1995     LoadAligned32U32(b343[0] + x, b[0]);
1996     LoadAligned32U32(b444[0] + x, b[1]);
1997     const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
1998 
1999     Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1],
2000                    b343[2], b444[1]);
2001     const __m128i sr_hi = LoadAligned16(src + x + 8);
2002     ma[0] = LoadAligned16(ma343[0] + x + 8);
2003     ma[1] = LoadAligned16(ma444[0] + x + 8);
2004     LoadAligned32U32(b343[0] + x + 8, b[0]);
2005     LoadAligned32U32(b444[0] + x + 8, b[1]);
2006     const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
2007     const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
2008     const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
2009     ClipAndStore(dst + x + 0, d0);
2010     ClipAndStore(dst + x + 8, d1);
2011     s[1] = s[3];
2012     sq[2] = sq[6];
2013     sq[3] = sq[7];
2014     mas[0] = mas[1];
2015     bs[0] = bs[4];
2016     bs[1] = bs[5];
2017     x += 16;
2018   } while (x < width);
2019 }
2020 
2021 LIBGAV1_ALWAYS_INLINE void BoxFilter(
2022     const uint16_t* const src, const uint16_t* const src0,
2023     const uint16_t* const src1, const ptrdiff_t stride, const int width,
2024     const uint16_t scales[2], const int16_t w0, const int16_t w2,
2025     uint16_t* const sum3[4], uint16_t* const sum5[5],
2026     uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
2027     const ptrdiff_t sum_width, uint16_t* const ma343[4],
2028     uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
2029     uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
2030   const ptrdiff_t overread_in_bytes =
2031       kOverreadInBytesPass1 - sizeof(*src0) * width;
2032   __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
2033   s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
2034   s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
2035   s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
2036   s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
2037   Square(s[0][0], sq[0]);
2038   Square(s[1][0], sq[1]);
2039   BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
2040                         ma3, b3, &ma5[0], b5);
2041 
2042   int x = 0;
2043   do {
2044     __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
2045     s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
2046                                   overread_in_bytes + sizeof(*src0) * (x + 16));
2047     s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
2048                                   overread_in_bytes + sizeof(*src0) * (x + 24));
2049     s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
2050                                   overread_in_bytes + sizeof(*src1) * (x + 16));
2051     s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
2052                                   overread_in_bytes + sizeof(*src1) * (x + 24));
2053     BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
2054                         sum_width, sq, ma3, b3, ma5, b5);
2055     Prepare3_8<0>(ma3[0], ma3x[0]);
2056     Prepare3_8<0>(ma3[1], ma3x[1]);
2057     Prepare3_8<0>(ma5, ma5x);
2058     Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
2059                    ma343[2], ma444[1], b343[2], b444[1]);
2060     Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
2061                    b343[3], b444[2]);
2062     ma[0][1] = Sum565Lo(ma5x);
2063     StoreAligned16(ma565[1] + x, ma[0][1]);
2064     Sum565(b5, b[0][1]);
2065     StoreAligned32U32(b565[1] + x, b[0][1]);
2066     const __m128i sr0_lo = LoadAligned16(src + x);
2067     const __m128i sr1_lo = LoadAligned16(src + stride + x);
2068     ma[0][0] = LoadAligned16(ma565[0] + x);
2069     LoadAligned32U32(b565[0] + x, b[0][0]);
2070     p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
2071     p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
2072     ma[1][0] = LoadAligned16(ma343[0] + x);
2073     ma[1][1] = LoadAligned16(ma444[0] + x);
2074     LoadAligned32U32(b343[0] + x, b[1][0]);
2075     LoadAligned32U32(b444[0] + x, b[1][1]);
2076     p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
2077     const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
2078     ma[2][0] = LoadAligned16(ma343[1] + x);
2079     LoadAligned32U32(b343[1] + x, b[2][0]);
2080     p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
2081     const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
2082 
2083     Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2],
2084                    b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
2085     Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3],
2086                    ma444[2], b343[3], b444[2]);
2087     ma[0][1] = Sum565Hi(ma5x);
2088     StoreAligned16(ma565[1] + x + 8, ma[0][1]);
2089     Sum565(b5 + 2, b[0][1]);
2090     StoreAligned32U32(b565[1] + x + 8, b[0][1]);
2091     const __m128i sr0_hi = LoadAligned16(src + x + 8);
2092     const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
2093     ma[0][0] = LoadAligned16(ma565[0] + x + 8);
2094     LoadAligned32U32(b565[0] + x + 8, b[0][0]);
2095     p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
2096     p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
2097     ma[1][0] = LoadAligned16(ma343[0] + x + 8);
2098     ma[1][1] = LoadAligned16(ma444[0] + x + 8);
2099     LoadAligned32U32(b343[0] + x + 8, b[1][0]);
2100     LoadAligned32U32(b444[0] + x + 8, b[1][1]);
2101     p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
2102     const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
2103     ClipAndStore(dst + x + 0, d00);
2104     ClipAndStore(dst + x + 8, d01);
2105     ma[2][0] = LoadAligned16(ma343[1] + x + 8);
2106     LoadAligned32U32(b343[1] + x + 8, b[2][0]);
2107     p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
2108     const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
2109     ClipAndStore(dst + stride + x + 0, d10);
2110     ClipAndStore(dst + stride + x + 8, d11);
2111     s[0][0] = s[0][2];
2112     s[0][1] = s[0][3];
2113     s[1][0] = s[1][2];
2114     s[1][1] = s[1][3];
2115     sq[0][2] = sq[0][6];
2116     sq[0][3] = sq[0][7];
2117     sq[1][2] = sq[1][6];
2118     sq[1][3] = sq[1][7];
2119     ma3[0][0] = ma3[0][1];
2120     ma3[1][0] = ma3[1][1];
2121     ma5[0] = ma5[1];
2122     b3[0][0] = b3[0][4];
2123     b3[0][1] = b3[0][5];
2124     b3[1][0] = b3[1][4];
2125     b3[1][1] = b3[1][5];
2126     b5[0] = b5[4];
2127     b5[1] = b5[5];
2128     x += 16;
2129   } while (x < width);
2130 }
2131 
2132 inline void BoxFilterLastRow(
2133     const uint16_t* const src, const uint16_t* const src0, const int width,
2134     const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
2135     const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
2136     uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
2137     uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
2138     uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
2139     uint16_t* const dst) {
2140   const ptrdiff_t overread_in_bytes =
2141       kOverreadInBytesPass1 - sizeof(*src0) * width;
2142   __m128i s[4], ma3[2], ma5[2], sq[8], b3[6], b5[6], ma[3], b[3][2];
2143   s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
2144   s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
2145   Square(s[0], sq);
2146   BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
2147                                sq, &ma3[0], &ma5[0], b3, b5);
2148 
2149   int x = 0;
2150   do {
2151     __m128i ma3x[3], ma5x[3], p[2];
2152     s[2] = LoadUnaligned16Msan(src0 + x + 16,
2153                                overread_in_bytes + sizeof(*src0) * (x + 16));
2154     s[3] = LoadUnaligned16Msan(src0 + x + 24,
2155                                overread_in_bytes + sizeof(*src0) * (x + 24));
2156     BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
2157                                square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
2158     Prepare3_8<0>(ma3, ma3x);
2159     Prepare3_8<0>(ma5, ma5x);
2160     ma[1] = Sum565Lo(ma5x);
2161     Sum565(b5, b[1]);
2162     ma[2] = Sum343Lo(ma3x);
2163     Sum343(b3, b[2]);
2164     const __m128i sr_lo = LoadAligned16(src + x + 0);
2165     ma[0] = LoadAligned16(ma565 + x);
2166     LoadAligned32U32(b565 + x, b[0]);
2167     p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
2168     ma[0] = LoadAligned16(ma343 + x);
2169     ma[1] = LoadAligned16(ma444 + x);
2170     LoadAligned32U32(b343 + x, b[0]);
2171     LoadAligned32U32(b444 + x, b[1]);
2172     p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
2173     const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
2174 
2175     ma[1] = Sum565Hi(ma5x);
2176     Sum565(b5 + 2, b[1]);
2177     ma[2] = Sum343Hi(ma3x);
2178     Sum343(b3 + 2, b[2]);
2179     const __m128i sr_hi = LoadAligned16(src + x + 8);
2180     ma[0] = LoadAligned16(ma565 + x + 8);
2181     LoadAligned32U32(b565 + x + 8, b[0]);
2182     p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
2183     ma[0] = LoadAligned16(ma343 + x + 8);
2184     ma[1] = LoadAligned16(ma444 + x + 8);
2185     LoadAligned32U32(b343 + x + 8, b[0]);
2186     LoadAligned32U32(b444 + x + 8, b[1]);
2187     p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
2188     const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
2189     ClipAndStore(dst + x + 0, d0);
2190     ClipAndStore(dst + x + 8, d1);
2191     s[1] = s[3];
2192     sq[2] = sq[6];
2193     sq[3] = sq[7];
2194     ma3[0] = ma3[1];
2195     ma5[0] = ma5[1];
2196     b3[0] = b3[4];
2197     b3[1] = b3[5];
2198     b5[0] = b5[4];
2199     b5[1] = b5[5];
2200     x += 16;
2201   } while (x < width);
2202 }
2203 
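// Runs both SGR passes (5x5 and 3x3 boxes) over the unit and blends their
// corrections with the source. A sketch of the per-pixel result, assuming the
// exact rounding lives in SelfGuidedDoubleMultiplier:
//   dst = Clip(src + ((w0 * p0 + w2 * p1) >>
//                     (kSgrProjRestoreBits + kSgrProjPrecisionBits)))
// where p0 and p1 are the pass 1 and pass 2 corrections.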
LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
    const RestorationUnitInfo& restoration_info, const uint16_t* src,
    const ptrdiff_t stride, const uint16_t* const top_border,
    const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
    const ptrdiff_t bottom_border_stride, const int width, const int height,
    SgrBuffer* const sgr_buffer, uint16_t* dst) {
  const auto temp_stride = Align<ptrdiff_t>(width, 16);
  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
  const auto sum_stride = temp_stride + 16;
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
  sum3[0] = sgr_buffer->sum3;
  square_sum3[0] = sgr_buffer->square_sum3;
  ma343[0] = sgr_buffer->ma343;
  b343[0] = sgr_buffer->b343;
  for (int i = 1; i <= 3; ++i) {
    sum3[i] = sum3[i - 1] + sum_stride;
    square_sum3[i] = square_sum3[i - 1] + sum_stride;
    ma343[i] = ma343[i - 1] + temp_stride;
    b343[i] = b343[i - 1] + temp_stride;
  }
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;
  for (int i = 1; i <= 4; ++i) {
    sum5[i] = sum5[i - 1] + sum_stride;
    square_sum5[i] = square_sum5[i - 1] + sum_stride;
  }
  ma444[0] = sgr_buffer->ma444;
  b444[0] = sgr_buffer->b444;
  for (int i = 1; i <= 2; ++i) {
    ma444[i] = ma444[i - 1] + temp_stride;
    b444[i] = b444[i - 1] + temp_stride;
  }
  ma565[0] = sgr_buffer->ma565;
  ma565[1] = ma565[0] + temp_stride;
  b565[0] = sgr_buffer->b565;
  b565[1] = b565[0] + temp_stride;
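  // The pointer arrays above form ring buffers over |sgr_buffer|: the next
  // steps seed them from the top border, preprocess the first rows, and the
  // main loop then rotates them as it walks down the unit.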
  assert(scales[0] != 0);
  assert(scales[1] != 0);
  BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
         sum5[1], square_sum3[0], square_sum5[1]);
  sum5[0] = sum5[1];
  square_sum5[0] = square_sum5[1];
  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
                         b343, b444[0], b565[0]);
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;

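  // Main loop: each iteration consumes and emits two rows, rotating the sum
  // rows by two so the oldest entries are overwritten by the incoming rows.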
  for (int y = (height >> 1) - 1; y > 0; --y) {
    Circulate4PointersBy2<uint16_t>(sum3);
    Circulate4PointersBy2<uint32_t>(square_sum3);
    Circulate5PointersBy2<uint16_t>(sum5);
    Circulate5PointersBy2<uint32_t>(square_sum5);
    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
              ma343, ma444, ma565, b343, b444, b565, dst);
    src += 2 * stride;
    dst += 2 * stride;
    Circulate4PointersBy2<uint16_t>(ma343);
    Circulate4PointersBy2<uint32_t>(b343);
    std::swap(ma444[0], ma444[2]);
    std::swap(b444[0], b444[2]);
    std::swap(ma565[0], ma565[1]);
    std::swap(b565[0], b565[1]);
  }

  Circulate4PointersBy2<uint16_t>(sum3);
  Circulate4PointersBy2<uint32_t>(square_sum3);
  Circulate5PointersBy2<uint16_t>(sum5);
  Circulate5PointersBy2<uint32_t>(square_sum5);
  if ((height & 1) == 0 || height > 1) {
    const uint16_t* sr[2];
    if ((height & 1) == 0) {
      sr[0] = bottom_border;
      sr[1] = bottom_border + bottom_border_stride;
    } else {
      sr[0] = src + 2 * stride;
      sr[1] = bottom_border;
    }
    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
              b444, b565, dst);
  }
  if ((height & 1) != 0) {
    if (height > 1) {
      src += 2 * stride;
      dst += 2 * stride;
      Circulate4PointersBy2<uint16_t>(sum3);
      Circulate4PointersBy2<uint32_t>(square_sum3);
      Circulate5PointersBy2<uint16_t>(sum5);
      Circulate5PointersBy2<uint32_t>(square_sum5);
      Circulate4PointersBy2<uint16_t>(ma343);
      Circulate4PointersBy2<uint32_t>(b343);
      std::swap(ma444[0], ma444[2]);
      std::swap(b444[0], b444[2]);
      std::swap(ma565[0], ma565[1]);
      std::swap(b565[0], b565[1]);
    }
    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
                     b444[0], b565[0], dst);
  }
}

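// Pass-1-only variant (5x5 box, radius 2). Chosen when the second SGR radius
// is 0, so only weight |w0| and the 565 intermediates are needed.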
inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
                                  const uint16_t* src, const ptrdiff_t stride,
                                  const uint16_t* const top_border,
                                  const ptrdiff_t top_border_stride,
                                  const uint16_t* bottom_border,
                                  const ptrdiff_t bottom_border_stride,
                                  const int width, const int height,
                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
  const auto temp_stride = Align<ptrdiff_t>(width, 16);
  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
  const auto sum_stride = temp_stride + 16;
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
  uint16_t *sum5[5], *ma565[2];
  uint32_t *square_sum5[5], *b565[2];
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;
  for (int i = 1; i <= 4; ++i) {
    sum5[i] = sum5[i - 1] + sum_stride;
    square_sum5[i] = square_sum5[i - 1] + sum_stride;
  }
  ma565[0] = sgr_buffer->ma565;
  ma565[1] = ma565[0] + temp_stride;
  b565[0] = sgr_buffer->b565;
  b565[1] = b565[0] + temp_stride;
  assert(scale != 0);
  BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
            sum5[1], square_sum5[1]);
  sum5[0] = sum5[1];
  square_sum5[0] = square_sum5[1];
  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
                          ma565[0], b565[0]);
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;

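  // Main loop: two rows per iteration, rotating the five sum rows and
  // swapping the two 565 intermediate rows.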
  for (int y = (height >> 1) - 1; y > 0; --y) {
    Circulate5PointersBy2<uint16_t>(sum5);
    Circulate5PointersBy2<uint32_t>(square_sum5);
    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
    src += 2 * stride;
    dst += 2 * stride;
    std::swap(ma565[0], ma565[1]);
    std::swap(b565[0], b565[1]);
  }

  Circulate5PointersBy2<uint16_t>(sum5);
  Circulate5PointersBy2<uint32_t>(square_sum5);
  if ((height & 1) == 0 || height > 1) {
    const uint16_t* sr[2];
    if ((height & 1) == 0) {
      sr[0] = bottom_border;
      sr[1] = bottom_border + bottom_border_stride;
    } else {
      sr[0] = src + 2 * stride;
      sr[1] = bottom_border;
    }
    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
                   sum_width, scale, w0, ma565, b565, dst);
  }
  if ((height & 1) != 0) {
    src += 3;
    if (height > 1) {
      src += 2 * stride;
      dst += 2 * stride;
      std::swap(ma565[0], ma565[1]);
      std::swap(b565[0], b565[1]);
      Circulate5PointersBy2<uint16_t>(sum5);
      Circulate5PointersBy2<uint32_t>(square_sum5);
    }
    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
                          b565[0], dst);
  }
}

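// Pass-2-only variant (3x3 box, radius 1). Chosen when the first SGR radius
// is 0; with multiplier[0] asserted to be 0, the effective weight becomes
// w0 = (1 << kSgrProjPrecisionBits) - w1.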
inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
                                  const uint16_t* src, const ptrdiff_t stride,
                                  const uint16_t* const top_border,
                                  const ptrdiff_t top_border_stride,
                                  const uint16_t* bottom_border,
                                  const ptrdiff_t bottom_border_stride,
                                  const int width, const int height,
                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
  const auto temp_stride = Align<ptrdiff_t>(width, 16);
  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
  const auto sum_stride = temp_stride + 16;
  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
  uint16_t *sum3[3], *ma343[3], *ma444[2];
  uint32_t *square_sum3[3], *b343[3], *b444[2];
  sum3[0] = sgr_buffer->sum3;
  square_sum3[0] = sgr_buffer->square_sum3;
  ma343[0] = sgr_buffer->ma343;
  b343[0] = sgr_buffer->b343;
  for (int i = 1; i <= 2; ++i) {
    sum3[i] = sum3[i - 1] + sum_stride;
    square_sum3[i] = square_sum3[i - 1] + sum_stride;
    ma343[i] = ma343[i - 1] + temp_stride;
    b343[i] = b343[i - 1] + temp_stride;
  }
  ma444[0] = sgr_buffer->ma444;
  ma444[1] = ma444[0] + temp_stride;
  b444[0] = sgr_buffer->b444;
  b444[1] = b444[0] + temp_stride;
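  // Unlike pass 1, pass 2 advances one row at a time and needs two rows of
  // preprocessing before the first output row: one over |src| and one over
  // the following row (or the bottom border when height == 1).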
  assert(scale != 0);
  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
            sum3[0], square_sum3[0]);
  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
                                 sum_width, ma343[0], nullptr, b343[0],
                                 nullptr);
  Circulate3PointersBy1<uint16_t>(sum3);
  Circulate3PointersBy1<uint32_t>(square_sum3);
  const uint16_t* s;
  if (height > 1) {
    s = src + stride;
  } else {
    s = bottom_border;
    bottom_border += bottom_border_stride;
  }
  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
                                ma343[1], ma444[0], b343[1], b444[0]);

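  // Main loop over the interior rows, followed by up to two trailing rows
  // that read from the bottom border.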
  for (int y = height - 2; y > 0; --y) {
    Circulate3PointersBy1<uint16_t>(sum3);
    Circulate3PointersBy1<uint32_t>(square_sum3);
    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
                   square_sum3, ma343, ma444, b343, b444, dst);
    src += stride;
    dst += stride;
    Circulate3PointersBy1<uint16_t>(ma343);
    Circulate3PointersBy1<uint32_t>(b343);
    std::swap(ma444[0], ma444[1]);
    std::swap(b444[0], b444[1]);
  }

  int y = std::min(height, 2);
  src += 2;
  do {
    Circulate3PointersBy1<uint16_t>(sum3);
    Circulate3PointersBy1<uint32_t>(square_sum3);
    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
                   square_sum3, ma343, ma444, b343, b444, dst);
    src += stride;
    dst += stride;
    bottom_border += bottom_border_stride;
    Circulate3PointersBy1<uint16_t>(ma343);
    Circulate3PointersBy1<uint32_t>(b343);
    std::swap(ma444[0], ma444[1]);
    std::swap(b444[0], b444[1]);
  } while (--y != 0);
}

// If |width| is not a multiple of 16, up to 15 extra pixels are written to
// |dest| at the end of each row. The overwrite is safe, as those pixels are
// not part of the visible frame.
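// Dispatch on the SGR parameter set: a zero radius disables the matching
// pass. The source and border pointers are pre-offset by radius + 1 columns
// (3 for the 5x5 pass, 2 for the 3x3 pass).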
void SelfGuidedFilter_SSE4_1(
    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_border,
    const ptrdiff_t top_border_stride,
    const void* LIBGAV1_RESTRICT const bottom_border,
    const ptrdiff_t bottom_border_stride, const int width, const int height,
    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
    void* LIBGAV1_RESTRICT const dest) {
  const int index = restoration_info.sgr_proj_info.index;
  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
  const auto* const src = static_cast<const uint16_t*>(source);
  const auto* const top = static_cast<const uint16_t*>(top_border);
  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
  auto* const dst = static_cast<uint16_t*>(dest);
  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
  if (radius_pass_1 == 0) {
    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
    // following assertion.
    assert(radius_pass_0 != 0);
    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
                          top_border_stride, bottom - 3, bottom_border_stride,
                          width, height, sgr_buffer, dst);
  } else if (radius_pass_0 == 0) {
    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
                          top_border_stride, bottom - 2, bottom_border_stride,
                          width, height, sgr_buffer, dst);
  } else {
    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
                     top_border_stride, bottom - 3, bottom_border_stride, width,
                     height, sgr_buffer, dst);
  }
}

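// Registers the SSE4.1 loop restoration functions in the 10bpp dispatch
// table. Each entry can be compiled out via its DSP_ENABLED check; the
// static_cast<void> uses silence unused-symbol warnings in that case.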
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);
  static_cast<void>(dsp);
#if DSP_ENABLED_10BPP_SSE4_1(WienerFilter)
  dsp->loop_restorations[0] = WienerFilter_SSE4_1;
#else
  static_cast<void>(WienerFilter_SSE4_1);
#endif
#if DSP_ENABLED_10BPP_SSE4_1(SelfGuidedFilter)
  dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
#else
  static_cast<void>(SelfGuidedFilter_SSE4_1);
#endif
}

}  // namespace

void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else   // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10)
namespace libgav1 {
namespace dsp {

void LoopRestorationInit10bpp_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10