1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/loop_restoration.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
19 #include <smmintrin.h>
20
21 #include <algorithm>
22 #include <cassert>
23 #include <cstddef>
24 #include <cstdint>
25 #include <cstring>
26
27 #include "src/dsp/common.h"
28 #include "src/dsp/constants.h"
29 #include "src/dsp/dsp.h"
30 #include "src/dsp/x86/common_sse4.h"
31 #include "src/utils/common.h"
32 #include "src/utils/constants.h"
33
34 namespace libgav1 {
35 namespace dsp {
36 namespace {
37
WienerHorizontalClip(const __m128i s[2],int16_t * const wiener_buffer)38 inline void WienerHorizontalClip(const __m128i s[2],
39 int16_t* const wiener_buffer) {
40 constexpr int offset =
41 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
42 constexpr int limit = (offset << 2) - 1;
43 const __m128i offsets = _mm_set1_epi16(-offset);
44 const __m128i limits = _mm_set1_epi16(limit - offset);
45 const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
46 const __m128i sum0 = _mm_add_epi32(s[0], round);
47 const __m128i sum1 = _mm_add_epi32(s[1], round);
48 const __m128i rounded_sum0 = _mm_srai_epi32(sum0, kInterRoundBitsHorizontal);
49 const __m128i rounded_sum1 = _mm_srai_epi32(sum1, kInterRoundBitsHorizontal);
50 const __m128i rounded_sum = _mm_packs_epi32(rounded_sum0, rounded_sum1);
51 const __m128i d0 = _mm_max_epi16(rounded_sum, offsets);
52 const __m128i d1 = _mm_min_epi16(d0, limits);
53 StoreAligned16(wiener_buffer, d1);
54 }
55
WienerHorizontalTap7(const uint16_t * src,const ptrdiff_t src_stride,const ptrdiff_t width,const int height,const __m128i coefficients,int16_t ** const wiener_buffer)56 inline void WienerHorizontalTap7(const uint16_t* src,
57 const ptrdiff_t src_stride,
58 const ptrdiff_t width, const int height,
59 const __m128i coefficients,
60 int16_t** const wiener_buffer) {
61 __m128i filter[2];
62 filter[0] = _mm_shuffle_epi32(coefficients, 0x0);
63 filter[1] = _mm_shuffle_epi32(coefficients, 0x55);
64 for (int y = height; y != 0; --y) {
65 ptrdiff_t x = 0;
66 do {
67 __m128i s[7], madds[4];
68 s[0] = LoadUnaligned16(src + x + 0);
69 s[1] = LoadUnaligned16(src + x + 1);
70 s[2] = LoadUnaligned16(src + x + 2);
71 s[3] = LoadUnaligned16(src + x + 3);
72 s[4] = LoadUnaligned16(src + x + 4);
73 s[5] = LoadUnaligned16(src + x + 5);
74 s[6] = LoadUnaligned16(src + x + 6);
75 const __m128i s06 = _mm_add_epi16(s[0], s[6]);
76 const __m128i s15 = _mm_add_epi16(s[1], s[5]);
77 const __m128i s24 = _mm_add_epi16(s[2], s[4]);
78 const __m128i ss0 = _mm_unpacklo_epi16(s06, s15);
79 const __m128i ss1 = _mm_unpackhi_epi16(s06, s15);
80 const __m128i ss2 = _mm_unpacklo_epi16(s24, s[3]);
81 const __m128i ss3 = _mm_unpackhi_epi16(s24, s[3]);
82 madds[0] = _mm_madd_epi16(ss0, filter[0]);
83 madds[1] = _mm_madd_epi16(ss1, filter[0]);
84 madds[2] = _mm_madd_epi16(ss2, filter[1]);
85 madds[3] = _mm_madd_epi16(ss3, filter[1]);
86 madds[0] = _mm_add_epi32(madds[0], madds[2]);
87 madds[1] = _mm_add_epi32(madds[1], madds[3]);
88 WienerHorizontalClip(madds, *wiener_buffer + x);
89 x += 8;
90 } while (x < width);
91 src += src_stride;
92 *wiener_buffer += width;
93 }
94 }
95
WienerHorizontalTap5(const uint16_t * src,const ptrdiff_t src_stride,const ptrdiff_t width,const int height,const __m128i coefficients,int16_t ** const wiener_buffer)96 inline void WienerHorizontalTap5(const uint16_t* src,
97 const ptrdiff_t src_stride,
98 const ptrdiff_t width, const int height,
99 const __m128i coefficients,
100 int16_t** const wiener_buffer) {
101 const __m128i filter =
102 _mm_shuffle_epi8(coefficients, _mm_set1_epi32(0x05040302));
103 for (int y = height; y != 0; --y) {
104 ptrdiff_t x = 0;
105 do {
106 __m128i s[5], madds[2];
107 s[0] = LoadUnaligned16(src + x + 0);
108 s[1] = LoadUnaligned16(src + x + 1);
109 s[2] = LoadUnaligned16(src + x + 2);
110 s[3] = LoadUnaligned16(src + x + 3);
111 s[4] = LoadUnaligned16(src + x + 4);
112 const __m128i s04 = _mm_add_epi16(s[0], s[4]);
113 const __m128i s13 = _mm_add_epi16(s[1], s[3]);
114 const __m128i s2d = _mm_add_epi16(s[2], s[2]);
115 const __m128i s0m = _mm_sub_epi16(s04, s2d);
116 const __m128i s1m = _mm_sub_epi16(s13, s2d);
117 const __m128i ss0 = _mm_unpacklo_epi16(s0m, s1m);
118 const __m128i ss1 = _mm_unpackhi_epi16(s0m, s1m);
119 madds[0] = _mm_madd_epi16(ss0, filter);
120 madds[1] = _mm_madd_epi16(ss1, filter);
121 const __m128i s2_lo = _mm_unpacklo_epi16(s[2], _mm_setzero_si128());
122 const __m128i s2_hi = _mm_unpackhi_epi16(s[2], _mm_setzero_si128());
123 const __m128i s2x128_lo = _mm_slli_epi32(s2_lo, 7);
124 const __m128i s2x128_hi = _mm_slli_epi32(s2_hi, 7);
125 madds[0] = _mm_add_epi32(madds[0], s2x128_lo);
126 madds[1] = _mm_add_epi32(madds[1], s2x128_hi);
127 WienerHorizontalClip(madds, *wiener_buffer + x);
128 x += 8;
129 } while (x < width);
130 src += src_stride;
131 *wiener_buffer += width;
132 }
133 }
134
WienerHorizontalTap3(const uint16_t * src,const ptrdiff_t src_stride,const ptrdiff_t width,const int height,const __m128i coefficients,int16_t ** const wiener_buffer)135 inline void WienerHorizontalTap3(const uint16_t* src,
136 const ptrdiff_t src_stride,
137 const ptrdiff_t width, const int height,
138 const __m128i coefficients,
139 int16_t** const wiener_buffer) {
140 const auto filter = _mm_shuffle_epi32(coefficients, 0x55);
141 for (int y = height; y != 0; --y) {
142 ptrdiff_t x = 0;
143 do {
144 __m128i s[3], madds[2];
145 s[0] = LoadUnaligned16(src + x + 0);
146 s[1] = LoadUnaligned16(src + x + 1);
147 s[2] = LoadUnaligned16(src + x + 2);
148 const __m128i s02 = _mm_add_epi16(s[0], s[2]);
149 const __m128i ss0 = _mm_unpacklo_epi16(s02, s[1]);
150 const __m128i ss1 = _mm_unpackhi_epi16(s02, s[1]);
151 madds[0] = _mm_madd_epi16(ss0, filter);
152 madds[1] = _mm_madd_epi16(ss1, filter);
153 WienerHorizontalClip(madds, *wiener_buffer + x);
154 x += 8;
155 } while (x < width);
156 src += src_stride;
157 *wiener_buffer += width;
158 }
159 }
160
WienerHorizontalTap1(const uint16_t * src,const ptrdiff_t src_stride,const ptrdiff_t width,const int height,int16_t ** const wiener_buffer)161 inline void WienerHorizontalTap1(const uint16_t* src,
162 const ptrdiff_t src_stride,
163 const ptrdiff_t width, const int height,
164 int16_t** const wiener_buffer) {
165 for (int y = height; y != 0; --y) {
166 ptrdiff_t x = 0;
167 do {
168 const __m128i s = LoadUnaligned16(src + x);
169 const __m128i d = _mm_slli_epi16(s, 4);
170 StoreAligned16(*wiener_buffer + x, d);
171 x += 8;
172 } while (x < width);
173 src += src_stride;
174 *wiener_buffer += width;
175 }
176 }
177
WienerVertical7(const __m128i a[4],const __m128i filter[4])178 inline __m128i WienerVertical7(const __m128i a[4], const __m128i filter[4]) {
179 const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
180 const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
181 const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
182 const __m128i madd3 = _mm_madd_epi16(a[3], filter[3]);
183 const __m128i madd01 = _mm_add_epi32(madd0, madd1);
184 const __m128i madd23 = _mm_add_epi32(madd2, madd3);
185 const __m128i sum = _mm_add_epi32(madd01, madd23);
186 return _mm_srai_epi32(sum, kInterRoundBitsVertical);
187 }
188
WienerVertical5(const __m128i a[3],const __m128i filter[3])189 inline __m128i WienerVertical5(const __m128i a[3], const __m128i filter[3]) {
190 const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
191 const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
192 const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
193 const __m128i madd01 = _mm_add_epi32(madd0, madd1);
194 const __m128i sum = _mm_add_epi32(madd01, madd2);
195 return _mm_srai_epi32(sum, kInterRoundBitsVertical);
196 }
197
WienerVertical3(const __m128i a[2],const __m128i filter[2])198 inline __m128i WienerVertical3(const __m128i a[2], const __m128i filter[2]) {
199 const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
200 const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
201 const __m128i sum = _mm_add_epi32(madd0, madd1);
202 return _mm_srai_epi32(sum, kInterRoundBitsVertical);
203 }
204
WienerVerticalClip(const __m128i s[2])205 inline __m128i WienerVerticalClip(const __m128i s[2]) {
206 const __m128i d = _mm_packus_epi32(s[0], s[1]);
207 return _mm_min_epu16(d, _mm_set1_epi16(1023));
208 }
209
WienerVerticalFilter7(const __m128i a[7],const __m128i filter[2])210 inline __m128i WienerVerticalFilter7(const __m128i a[7],
211 const __m128i filter[2]) {
212 const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
213 __m128i b[4], c[2];
214 b[0] = _mm_unpacklo_epi16(a[0], a[1]);
215 b[1] = _mm_unpacklo_epi16(a[2], a[3]);
216 b[2] = _mm_unpacklo_epi16(a[4], a[5]);
217 b[3] = _mm_unpacklo_epi16(a[6], round);
218 c[0] = WienerVertical7(b, filter);
219 b[0] = _mm_unpackhi_epi16(a[0], a[1]);
220 b[1] = _mm_unpackhi_epi16(a[2], a[3]);
221 b[2] = _mm_unpackhi_epi16(a[4], a[5]);
222 b[3] = _mm_unpackhi_epi16(a[6], round);
223 c[1] = WienerVertical7(b, filter);
224 return WienerVerticalClip(c);
225 }
226
WienerVerticalFilter5(const __m128i a[5],const __m128i filter[3])227 inline __m128i WienerVerticalFilter5(const __m128i a[5],
228 const __m128i filter[3]) {
229 const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
230 __m128i b[3], c[2];
231 b[0] = _mm_unpacklo_epi16(a[0], a[1]);
232 b[1] = _mm_unpacklo_epi16(a[2], a[3]);
233 b[2] = _mm_unpacklo_epi16(a[4], round);
234 c[0] = WienerVertical5(b, filter);
235 b[0] = _mm_unpackhi_epi16(a[0], a[1]);
236 b[1] = _mm_unpackhi_epi16(a[2], a[3]);
237 b[2] = _mm_unpackhi_epi16(a[4], round);
238 c[1] = WienerVertical5(b, filter);
239 return WienerVerticalClip(c);
240 }
241
WienerVerticalFilter3(const __m128i a[3],const __m128i filter[2])242 inline __m128i WienerVerticalFilter3(const __m128i a[3],
243 const __m128i filter[2]) {
244 const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
245 __m128i b[2], c[2];
246 b[0] = _mm_unpacklo_epi16(a[0], a[1]);
247 b[1] = _mm_unpacklo_epi16(a[2], round);
248 c[0] = WienerVertical3(b, filter);
249 b[0] = _mm_unpackhi_epi16(a[0], a[1]);
250 b[1] = _mm_unpackhi_epi16(a[2], round);
251 c[1] = WienerVertical3(b, filter);
252 return WienerVerticalClip(c);
253 }
254
WienerVerticalTap7Kernel(const int16_t * wiener_buffer,const ptrdiff_t wiener_stride,const __m128i filter[2],__m128i a[7])255 inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
256 const ptrdiff_t wiener_stride,
257 const __m128i filter[2], __m128i a[7]) {
258 a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
259 a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
260 a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
261 a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
262 a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
263 a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
264 a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
265 return WienerVerticalFilter7(a, filter);
266 }
267
WienerVerticalTap5Kernel(const int16_t * wiener_buffer,const ptrdiff_t wiener_stride,const __m128i filter[3],__m128i a[5])268 inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
269 const ptrdiff_t wiener_stride,
270 const __m128i filter[3], __m128i a[5]) {
271 a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
272 a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
273 a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
274 a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
275 a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
276 return WienerVerticalFilter5(a, filter);
277 }
278
WienerVerticalTap3Kernel(const int16_t * wiener_buffer,const ptrdiff_t wiener_stride,const __m128i filter[2],__m128i a[3])279 inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
280 const ptrdiff_t wiener_stride,
281 const __m128i filter[2], __m128i a[3]) {
282 a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
283 a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
284 a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
285 return WienerVerticalFilter3(a, filter);
286 }
287
WienerVerticalTap7(const int16_t * wiener_buffer,const ptrdiff_t width,const int height,const int16_t coefficients[4],uint16_t * dst,const ptrdiff_t dst_stride)288 inline void WienerVerticalTap7(const int16_t* wiener_buffer,
289 const ptrdiff_t width, const int height,
290 const int16_t coefficients[4], uint16_t* dst,
291 const ptrdiff_t dst_stride) {
292 const __m128i c = LoadLo8(coefficients);
293 __m128i filter[4];
294 filter[0] = _mm_shuffle_epi32(c, 0x0);
295 filter[1] = _mm_shuffle_epi32(c, 0x55);
296 filter[2] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
297 filter[3] =
298 _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
299 for (int y = height >> 1; y > 0; --y) {
300 ptrdiff_t x = 0;
301 do {
302 __m128i a[8], d[2];
303 d[0] = WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
304 a[7] = LoadAligned16(wiener_buffer + x + 7 * width);
305 d[1] = WienerVerticalFilter7(a + 1, filter);
306 StoreAligned16(dst + x, d[0]);
307 StoreAligned16(dst + dst_stride + x, d[1]);
308 x += 8;
309 } while (x < width);
310 dst += 2 * dst_stride;
311 wiener_buffer += 2 * width;
312 }
313
314 if ((height & 1) != 0) {
315 ptrdiff_t x = 0;
316 do {
317 __m128i a[7];
318 const __m128i d =
319 WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
320 StoreAligned16(dst + x, d);
321 x += 8;
322 } while (x < width);
323 }
324 }
325
WienerVerticalTap5(const int16_t * wiener_buffer,const ptrdiff_t width,const int height,const int16_t coefficients[3],uint16_t * dst,const ptrdiff_t dst_stride)326 inline void WienerVerticalTap5(const int16_t* wiener_buffer,
327 const ptrdiff_t width, const int height,
328 const int16_t coefficients[3], uint16_t* dst,
329 const ptrdiff_t dst_stride) {
330 const __m128i c = LoadLo8(coefficients);
331 __m128i filter[3];
332 filter[0] = _mm_shuffle_epi32(c, 0x0);
333 filter[1] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
334 filter[2] =
335 _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
336 for (int y = height >> 1; y > 0; --y) {
337 ptrdiff_t x = 0;
338 do {
339 __m128i a[6], d[2];
340 d[0] = WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
341 a[5] = LoadAligned16(wiener_buffer + x + 5 * width);
342 d[1] = WienerVerticalFilter5(a + 1, filter);
343 StoreAligned16(dst + x, d[0]);
344 StoreAligned16(dst + dst_stride + x, d[1]);
345 x += 8;
346 } while (x < width);
347 dst += 2 * dst_stride;
348 wiener_buffer += 2 * width;
349 }
350
351 if ((height & 1) != 0) {
352 ptrdiff_t x = 0;
353 do {
354 __m128i a[5];
355 const __m128i d =
356 WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
357 StoreAligned16(dst + x, d);
358 x += 8;
359 } while (x < width);
360 }
361 }
362
WienerVerticalTap3(const int16_t * wiener_buffer,const ptrdiff_t width,const int height,const int16_t coefficients[2],uint16_t * dst,const ptrdiff_t dst_stride)363 inline void WienerVerticalTap3(const int16_t* wiener_buffer,
364 const ptrdiff_t width, const int height,
365 const int16_t coefficients[2], uint16_t* dst,
366 const ptrdiff_t dst_stride) {
367 __m128i filter[2];
368 filter[0] = _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
369 filter[1] =
370 _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
371 for (int y = height >> 1; y > 0; --y) {
372 ptrdiff_t x = 0;
373 do {
374 __m128i a[4], d[2];
375 d[0] = WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
376 a[3] = LoadAligned16(wiener_buffer + x + 3 * width);
377 d[1] = WienerVerticalFilter3(a + 1, filter);
378 StoreAligned16(dst + x, d[0]);
379 StoreAligned16(dst + dst_stride + x, d[1]);
380 x += 8;
381 } while (x < width);
382 dst += 2 * dst_stride;
383 wiener_buffer += 2 * width;
384 }
385
386 if ((height & 1) != 0) {
387 ptrdiff_t x = 0;
388 do {
389 __m128i a[3];
390 const __m128i d =
391 WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
392 StoreAligned16(dst + x, d);
393 x += 8;
394 } while (x < width);
395 }
396 }
397
WienerVerticalTap1Kernel(const int16_t * const wiener_buffer,uint16_t * const dst)398 inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
399 uint16_t* const dst) {
400 const __m128i a = LoadAligned16(wiener_buffer);
401 const __m128i b = _mm_add_epi16(a, _mm_set1_epi16(8));
402 const __m128i c = _mm_srai_epi16(b, 4);
403 const __m128i d = _mm_max_epi16(c, _mm_setzero_si128());
404 const __m128i e = _mm_min_epi16(d, _mm_set1_epi16(1023));
405 StoreAligned16(dst, e);
406 }
407
WienerVerticalTap1(const int16_t * wiener_buffer,const ptrdiff_t width,const int height,uint16_t * dst,const ptrdiff_t dst_stride)408 inline void WienerVerticalTap1(const int16_t* wiener_buffer,
409 const ptrdiff_t width, const int height,
410 uint16_t* dst, const ptrdiff_t dst_stride) {
411 for (int y = height >> 1; y > 0; --y) {
412 ptrdiff_t x = 0;
413 do {
414 WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
415 WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
416 x += 8;
417 } while (x < width);
418 dst += 2 * dst_stride;
419 wiener_buffer += 2 * width;
420 }
421
422 if ((height & 1) != 0) {
423 ptrdiff_t x = 0;
424 do {
425 WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
426 x += 8;
427 } while (x < width);
428 }
429 }
430
WienerFilter_SSE4_1(const RestorationUnitInfo & LIBGAV1_RESTRICT restoration_info,const void * LIBGAV1_RESTRICT const source,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_border,const ptrdiff_t top_border_stride,const void * LIBGAV1_RESTRICT const bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,RestorationBuffer * LIBGAV1_RESTRICT const restoration_buffer,void * LIBGAV1_RESTRICT const dest)431 void WienerFilter_SSE4_1(
432 const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
433 const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
434 const void* LIBGAV1_RESTRICT const top_border,
435 const ptrdiff_t top_border_stride,
436 const void* LIBGAV1_RESTRICT const bottom_border,
437 const ptrdiff_t bottom_border_stride, const int width, const int height,
438 RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
439 void* LIBGAV1_RESTRICT const dest) {
440 const int16_t* const number_leading_zero_coefficients =
441 restoration_info.wiener_info.number_leading_zero_coefficients;
442 const int number_rows_to_skip = std::max(
443 static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
444 1);
445 const ptrdiff_t wiener_stride = Align(width, 16);
446 int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
447 // The values are saturated to 13 bits before storing.
448 int16_t* wiener_buffer_horizontal =
449 wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
450
451 // horizontal filtering.
452 // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
453 const int height_horizontal =
454 height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
455 const int height_extra = (height_horizontal - height) >> 1;
456 assert(height_extra <= 2);
457 const auto* const src = static_cast<const uint16_t*>(source);
458 const auto* const top = static_cast<const uint16_t*>(top_border);
459 const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
460 const __m128i coefficients_horizontal =
461 LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
462 if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
463 WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
464 top_border_stride, wiener_stride, height_extra,
465 coefficients_horizontal, &wiener_buffer_horizontal);
466 WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
467 coefficients_horizontal, &wiener_buffer_horizontal);
468 WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
469 height_extra, coefficients_horizontal,
470 &wiener_buffer_horizontal);
471 } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
472 WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
473 top_border_stride, wiener_stride, height_extra,
474 coefficients_horizontal, &wiener_buffer_horizontal);
475 WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
476 coefficients_horizontal, &wiener_buffer_horizontal);
477 WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
478 height_extra, coefficients_horizontal,
479 &wiener_buffer_horizontal);
480 } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
481 // The maximum over-reads happen here.
482 WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
483 top_border_stride, wiener_stride, height_extra,
484 coefficients_horizontal, &wiener_buffer_horizontal);
485 WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
486 coefficients_horizontal, &wiener_buffer_horizontal);
487 WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
488 height_extra, coefficients_horizontal,
489 &wiener_buffer_horizontal);
490 } else {
491 assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
492 WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
493 top_border_stride, wiener_stride, height_extra,
494 &wiener_buffer_horizontal);
495 WienerHorizontalTap1(src, stride, wiener_stride, height,
496 &wiener_buffer_horizontal);
497 WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
498 height_extra, &wiener_buffer_horizontal);
499 }
500
501 // vertical filtering.
502 // Over-writes up to 15 values.
503 const int16_t* const filter_vertical =
504 restoration_info.wiener_info.filter[WienerInfo::kVertical];
505 auto* dst = static_cast<uint16_t*>(dest);
506 if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
507 // Because the top row of |source| is a duplicate of the second row, and the
508 // bottom row of |source| is a duplicate of its above row, we can duplicate
509 // the top and bottom row of |wiener_buffer| accordingly.
510 memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
511 sizeof(*wiener_buffer_horizontal) * wiener_stride);
512 memcpy(restoration_buffer->wiener_buffer,
513 restoration_buffer->wiener_buffer + wiener_stride,
514 sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
515 WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
516 filter_vertical, dst, stride);
517 } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
518 WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
519 height, filter_vertical + 1, dst, stride);
520 } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
521 WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
522 wiener_stride, height, filter_vertical + 2, dst, stride);
523 } else {
524 assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
525 WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
526 wiener_stride, height, dst, stride);
527 }
528 }
529
530 //------------------------------------------------------------------------------
531 // SGR
532
533 // SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
534 // Pass 1 and 2 for Pass 2.
535 constexpr int kOverreadInBytesPass1 = 4;
536 constexpr int kOverreadInBytesPass2 = 8;
537
LoadAligned16x2U16(const uint16_t * const src[2],const ptrdiff_t x,__m128i dst[2])538 inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
539 __m128i dst[2]) {
540 dst[0] = LoadAligned16(src[0] + x);
541 dst[1] = LoadAligned16(src[1] + x);
542 }
543
LoadAligned16x2U16Msan(const uint16_t * const src[2],const ptrdiff_t x,const ptrdiff_t border,__m128i dst[2])544 inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
545 const ptrdiff_t x, const ptrdiff_t border,
546 __m128i dst[2]) {
547 dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
548 dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
549 }
550
LoadAligned16x3U16(const uint16_t * const src[3],const ptrdiff_t x,__m128i dst[3])551 inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
552 __m128i dst[3]) {
553 dst[0] = LoadAligned16(src[0] + x);
554 dst[1] = LoadAligned16(src[1] + x);
555 dst[2] = LoadAligned16(src[2] + x);
556 }
557
LoadAligned16x3U16Msan(const uint16_t * const src[3],const ptrdiff_t x,const ptrdiff_t border,__m128i dst[3])558 inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
559 const ptrdiff_t x, const ptrdiff_t border,
560 __m128i dst[3]) {
561 dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
562 dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
563 dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
564 }
565
LoadAligned32U32(const uint32_t * const src,__m128i dst[2])566 inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
567 dst[0] = LoadAligned16(src + 0);
568 dst[1] = LoadAligned16(src + 4);
569 }
570
LoadAligned32U32Msan(const uint32_t * const src,const ptrdiff_t x,const ptrdiff_t border,__m128i dst[2])571 inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
572 const ptrdiff_t border, __m128i dst[2]) {
573 dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
574 dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
575 }
576
LoadAligned32x2U32(const uint32_t * const src[2],const ptrdiff_t x,__m128i dst[2][2])577 inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
578 __m128i dst[2][2]) {
579 LoadAligned32U32(src[0] + x, dst[0]);
580 LoadAligned32U32(src[1] + x, dst[1]);
581 }
582
LoadAligned32x2U32Msan(const uint32_t * const src[2],const ptrdiff_t x,const ptrdiff_t border,__m128i dst[2][2])583 inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
584 const ptrdiff_t x, const ptrdiff_t border,
585 __m128i dst[2][2]) {
586 LoadAligned32U32Msan(src[0], x, border, dst[0]);
587 LoadAligned32U32Msan(src[1], x, border, dst[1]);
588 }
589
LoadAligned32x3U32(const uint32_t * const src[3],const ptrdiff_t x,__m128i dst[3][2])590 inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
591 __m128i dst[3][2]) {
592 LoadAligned32U32(src[0] + x, dst[0]);
593 LoadAligned32U32(src[1] + x, dst[1]);
594 LoadAligned32U32(src[2] + x, dst[2]);
595 }
596
LoadAligned32x3U32Msan(const uint32_t * const src[3],const ptrdiff_t x,const ptrdiff_t border,__m128i dst[3][2])597 inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
598 const ptrdiff_t x, const ptrdiff_t border,
599 __m128i dst[3][2]) {
600 LoadAligned32U32Msan(src[0], x, border, dst[0]);
601 LoadAligned32U32Msan(src[1], x, border, dst[1]);
602 LoadAligned32U32Msan(src[2], x, border, dst[2]);
603 }
604
StoreAligned32U16(uint16_t * const dst,const __m128i src[2])605 inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
606 StoreAligned16(dst + 0, src[0]);
607 StoreAligned16(dst + 8, src[1]);
608 }
609
StoreAligned32U32(uint32_t * const dst,const __m128i src[2])610 inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
611 StoreAligned16(dst + 0, src[0]);
612 StoreAligned16(dst + 4, src[1]);
613 }
614
StoreAligned64U32(uint32_t * const dst,const __m128i src[4])615 inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
616 StoreAligned32U32(dst + 0, src + 0);
617 StoreAligned32U32(dst + 8, src + 2);
618 }
619
620 // Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
621 // functions. Some compilers may generate super inefficient code and the whole
622 // decoder could be 15% slower.
623
VaddlLo8(const __m128i src0,const __m128i src1)624 inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
625 const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
626 const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
627 return _mm_add_epi16(s0, s1);
628 }
629
VaddlHi8(const __m128i src0,const __m128i src1)630 inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
631 const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
632 const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
633 return _mm_add_epi16(s0, s1);
634 }
635
VaddwLo8(const __m128i src0,const __m128i src1)636 inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
637 const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
638 return _mm_add_epi16(src0, s1);
639 }
640
VaddwHi8(const __m128i src0,const __m128i src1)641 inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
642 const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
643 return _mm_add_epi16(src0, s1);
644 }
645
VmullNLo8(const __m128i src0,const int src1)646 inline __m128i VmullNLo8(const __m128i src0, const int src1) {
647 const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
648 return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
649 }
650
VmullNHi8(const __m128i src0,const int src1)651 inline __m128i VmullNHi8(const __m128i src0, const int src1) {
652 const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
653 return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
654 }
655
VmullLo16(const __m128i src0,const __m128i src1)656 inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
657 const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
658 const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
659 return _mm_madd_epi16(s0, s1);
660 }
661
VmullHi16(const __m128i src0,const __m128i src1)662 inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
663 const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
664 const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
665 return _mm_madd_epi16(s0, s1);
666 }
667
VrshrU16(const __m128i src0,const int src1)668 inline __m128i VrshrU16(const __m128i src0, const int src1) {
669 const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
670 return _mm_srli_epi16(sum, src1);
671 }
672
VrshrS32(const __m128i src0,const int src1)673 inline __m128i VrshrS32(const __m128i src0, const int src1) {
674 const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
675 return _mm_srai_epi32(sum, src1);
676 }
677
VrshrU32(const __m128i src0,const int src1)678 inline __m128i VrshrU32(const __m128i src0, const int src1) {
679 const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
680 return _mm_srli_epi32(sum, src1);
681 }
682
Square(const __m128i src,__m128i dst[2])683 inline void Square(const __m128i src, __m128i dst[2]) {
684 const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
685 const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
686 dst[0] = _mm_madd_epi16(s0, s0);
687 dst[1] = _mm_madd_epi16(s1, s1);
688 }
689
690 template <int offset>
Prepare3_8(const __m128i src[2],__m128i dst[3])691 inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
692 dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
693 dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
694 dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
695 }
696
Prepare3_16(const __m128i src[2],__m128i dst[3])697 inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
698 dst[0] = src[0];
699 dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
700 dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
701 }
702
Prepare3_32(const __m128i src[2],__m128i dst[3])703 inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
704 dst[0] = src[0];
705 dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
706 dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
707 }
708
Prepare5_16(const __m128i src[2],__m128i dst[5])709 inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
710 Prepare3_16(src, dst);
711 dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
712 dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
713 }
714
Prepare5_32(const __m128i src[2],__m128i dst[5])715 inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
716 Prepare3_32(src, dst);
717 dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
718 dst[4] = src[1];
719 }
720
Sum3_16(const __m128i src0,const __m128i src1,const __m128i src2)721 inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
722 const __m128i src2) {
723 const __m128i sum = _mm_add_epi16(src0, src1);
724 return _mm_add_epi16(sum, src2);
725 }
726
Sum3_16(const __m128i src[3])727 inline __m128i Sum3_16(const __m128i src[3]) {
728 return Sum3_16(src[0], src[1], src[2]);
729 }
730
Sum3_32(const __m128i src0,const __m128i src1,const __m128i src2)731 inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
732 const __m128i src2) {
733 const __m128i sum = _mm_add_epi32(src0, src1);
734 return _mm_add_epi32(sum, src2);
735 }
736
Sum3_32(const __m128i src[3])737 inline __m128i Sum3_32(const __m128i src[3]) {
738 return Sum3_32(src[0], src[1], src[2]);
739 }
740
Sum3_32(const __m128i src[3][2],__m128i dst[2])741 inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
742 dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
743 dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
744 }
745
Sum3WLo16(const __m128i src[3])746 inline __m128i Sum3WLo16(const __m128i src[3]) {
747 const __m128i sum = VaddlLo8(src[0], src[1]);
748 return VaddwLo8(sum, src[2]);
749 }
750
Sum3WHi16(const __m128i src[3])751 inline __m128i Sum3WHi16(const __m128i src[3]) {
752 const __m128i sum = VaddlHi8(src[0], src[1]);
753 return VaddwHi8(sum, src[2]);
754 }
755
Sum5_16(const __m128i src[5])756 inline __m128i Sum5_16(const __m128i src[5]) {
757 const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
758 const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
759 const __m128i sum = _mm_add_epi16(sum01, sum23);
760 return _mm_add_epi16(sum, src[4]);
761 }
762
Sum5_32(const __m128i * const src0,const __m128i * const src1,const __m128i * const src2,const __m128i * const src3,const __m128i * const src4)763 inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
764 const __m128i* const src2, const __m128i* const src3,
765 const __m128i* const src4) {
766 const __m128i sum01 = _mm_add_epi32(*src0, *src1);
767 const __m128i sum23 = _mm_add_epi32(*src2, *src3);
768 const __m128i sum = _mm_add_epi32(sum01, sum23);
769 return _mm_add_epi32(sum, *src4);
770 }
771
Sum5_32(const __m128i src[5])772 inline __m128i Sum5_32(const __m128i src[5]) {
773 return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
774 }
775
Sum5_32(const __m128i src[5][2],__m128i dst[2])776 inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
777 dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
778 dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
779 }
780
Sum3Horizontal16(const __m128i src[2])781 inline __m128i Sum3Horizontal16(const __m128i src[2]) {
782 __m128i s[3];
783 Prepare3_16(src, s);
784 return Sum3_16(s);
785 }
786
Sum3Horizontal32(const __m128i src[3],__m128i dst[2])787 inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) {
788 __m128i s[3];
789 Prepare3_32(src + 0, s);
790 dst[0] = Sum3_32(s);
791 Prepare3_32(src + 1, s);
792 dst[1] = Sum3_32(s);
793 }
794
Sum5Horizontal16(const __m128i src[2])795 inline __m128i Sum5Horizontal16(const __m128i src[2]) {
796 __m128i s[5];
797 Prepare5_16(src, s);
798 return Sum5_16(s);
799 }
800
Sum5Horizontal32(const __m128i src[3],__m128i dst[2])801 inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) {
802 __m128i s[5];
803 Prepare5_32(src + 0, s);
804 dst[0] = Sum5_32(s);
805 Prepare5_32(src + 1, s);
806 dst[1] = Sum5_32(s);
807 }
808
SumHorizontal16(const __m128i src[2],__m128i * const row3,__m128i * const row5)809 void SumHorizontal16(const __m128i src[2], __m128i* const row3,
810 __m128i* const row5) {
811 __m128i s[5];
812 Prepare5_16(src, s);
813 const __m128i sum04 = _mm_add_epi16(s[0], s[4]);
814 *row3 = Sum3_16(s + 1);
815 *row5 = _mm_add_epi16(sum04, *row3);
816 }
817
SumHorizontal16(const __m128i src[3],__m128i * const row3_0,__m128i * const row3_1,__m128i * const row5_0,__m128i * const row5_1)818 inline void SumHorizontal16(const __m128i src[3], __m128i* const row3_0,
819 __m128i* const row3_1, __m128i* const row5_0,
820 __m128i* const row5_1) {
821 SumHorizontal16(src + 0, row3_0, row5_0);
822 SumHorizontal16(src + 1, row3_1, row5_1);
823 }
824
SumHorizontal32(const __m128i src[5],__m128i * const row_sq3,__m128i * const row_sq5)825 void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3,
826 __m128i* const row_sq5) {
827 const __m128i sum04 = _mm_add_epi32(src[0], src[4]);
828 *row_sq3 = Sum3_32(src + 1);
829 *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
830 }
831
SumHorizontal32(const __m128i src[3],__m128i * const row_sq3_0,__m128i * const row_sq3_1,__m128i * const row_sq5_0,__m128i * const row_sq5_1)832 inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0,
833 __m128i* const row_sq3_1, __m128i* const row_sq5_0,
834 __m128i* const row_sq5_1) {
835 __m128i s[5];
836 Prepare5_32(src + 0, s);
837 SumHorizontal32(s, row_sq3_0, row_sq5_0);
838 Prepare5_32(src + 1, s);
839 SumHorizontal32(s, row_sq3_1, row_sq5_1);
840 }
841
Sum343Lo(const __m128i ma3[3])842 inline __m128i Sum343Lo(const __m128i ma3[3]) {
843 const __m128i sum = Sum3WLo16(ma3);
844 const __m128i sum3 = Sum3_16(sum, sum, sum);
845 return VaddwLo8(sum3, ma3[1]);
846 }
847
Sum343Hi(const __m128i ma3[3])848 inline __m128i Sum343Hi(const __m128i ma3[3]) {
849 const __m128i sum = Sum3WHi16(ma3);
850 const __m128i sum3 = Sum3_16(sum, sum, sum);
851 return VaddwHi8(sum3, ma3[1]);
852 }
853
Sum343(const __m128i src[3])854 inline __m128i Sum343(const __m128i src[3]) {
855 const __m128i sum = Sum3_32(src);
856 const __m128i sum3 = Sum3_32(sum, sum, sum);
857 return _mm_add_epi32(sum3, src[1]);
858 }
859
Sum343(const __m128i src[3],__m128i dst[2])860 inline void Sum343(const __m128i src[3], __m128i dst[2]) {
861 __m128i s[3];
862 Prepare3_32(src + 0, s);
863 dst[0] = Sum343(s);
864 Prepare3_32(src + 1, s);
865 dst[1] = Sum343(s);
866 }
867
Sum565Lo(const __m128i src[3])868 inline __m128i Sum565Lo(const __m128i src[3]) {
869 const __m128i sum = Sum3WLo16(src);
870 const __m128i sum4 = _mm_slli_epi16(sum, 2);
871 const __m128i sum5 = _mm_add_epi16(sum4, sum);
872 return VaddwLo8(sum5, src[1]);
873 }
874
Sum565Hi(const __m128i src[3])875 inline __m128i Sum565Hi(const __m128i src[3]) {
876 const __m128i sum = Sum3WHi16(src);
877 const __m128i sum4 = _mm_slli_epi16(sum, 2);
878 const __m128i sum5 = _mm_add_epi16(sum4, sum);
879 return VaddwHi8(sum5, src[1]);
880 }
881
Sum565(const __m128i src[3])882 inline __m128i Sum565(const __m128i src[3]) {
883 const __m128i sum = Sum3_32(src);
884 const __m128i sum4 = _mm_slli_epi32(sum, 2);
885 const __m128i sum5 = _mm_add_epi32(sum4, sum);
886 return _mm_add_epi32(sum5, src[1]);
887 }
888
Sum565(const __m128i src[3],__m128i dst[2])889 inline void Sum565(const __m128i src[3], __m128i dst[2]) {
890 __m128i s[3];
891 Prepare3_32(src + 0, s);
892 dst[0] = Sum565(s);
893 Prepare3_32(src + 1, s);
894 dst[1] = Sum565(s);
895 }
896
BoxSum(const uint16_t * src,const ptrdiff_t src_stride,const ptrdiff_t width,const ptrdiff_t sum_stride,const ptrdiff_t sum_width,uint16_t * sum3,uint16_t * sum5,uint32_t * square_sum3,uint32_t * square_sum5)897 inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
898 const ptrdiff_t width, const ptrdiff_t sum_stride,
899 const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
900 uint32_t* square_sum3, uint32_t* square_sum5) {
901 const ptrdiff_t overread_in_bytes =
902 kOverreadInBytesPass1 - sizeof(*src) * width;
903 int y = 2;
904 do {
905 __m128i s[3], sq[6];
906 s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
907 Square(s[0], sq);
908 ptrdiff_t x = sum_width;
909 do {
910 __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
911 s[1] = LoadUnaligned16Msan(
912 src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
913 x -= 16;
914 src += 16;
915 s[2] = LoadUnaligned16Msan(
916 src, overread_in_bytes + sizeof(*src) * (sum_width - x));
917 Square(s[1], sq + 2);
918 Square(s[2], sq + 4);
919 SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]);
920 StoreAligned32U16(sum3, row3);
921 StoreAligned32U16(sum5, row5);
922 SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
923 &row_sq5[1]);
924 StoreAligned32U32(square_sum3 + 0, row_sq3);
925 StoreAligned32U32(square_sum5 + 0, row_sq5);
926 SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0],
927 &row_sq5[1]);
928 StoreAligned32U32(square_sum3 + 8, row_sq3);
929 StoreAligned32U32(square_sum5 + 8, row_sq5);
930 s[0] = s[2];
931 sq[0] = sq[4];
932 sq[1] = sq[5];
933 sum3 += 16;
934 sum5 += 16;
935 square_sum3 += 16;
936 square_sum5 += 16;
937 } while (x != 0);
938 src += src_stride - sum_width;
939 sum3 += sum_stride - sum_width;
940 sum5 += sum_stride - sum_width;
941 square_sum3 += sum_stride - sum_width;
942 square_sum5 += sum_stride - sum_width;
943 } while (--y != 0);
944 }
945
946 template <int size>
BoxSum(const uint16_t * src,const ptrdiff_t src_stride,const ptrdiff_t width,const ptrdiff_t sum_stride,const ptrdiff_t sum_width,uint16_t * sums,uint32_t * square_sums)947 inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
948 const ptrdiff_t width, const ptrdiff_t sum_stride,
949 const ptrdiff_t sum_width, uint16_t* sums,
950 uint32_t* square_sums) {
951 static_assert(size == 3 || size == 5, "");
952 const ptrdiff_t overread_in_bytes =
953 ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
954 sizeof(*src) * width;
955 int y = 2;
956 do {
957 __m128i s[3], sq[6];
958 s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
959 Square(s[0], sq);
960 ptrdiff_t x = sum_width;
961 do {
962 __m128i row[2], row_sq[4];
963 s[1] = LoadUnaligned16Msan(
964 src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
965 x -= 16;
966 src += 16;
967 s[2] = LoadUnaligned16Msan(
968 src, overread_in_bytes + sizeof(*src) * (sum_width - x));
969 Square(s[1], sq + 2);
970 Square(s[2], sq + 4);
971 if (size == 3) {
972 row[0] = Sum3Horizontal16(s + 0);
973 row[1] = Sum3Horizontal16(s + 1);
974 Sum3Horizontal32(sq + 0, row_sq + 0);
975 Sum3Horizontal32(sq + 2, row_sq + 2);
976 } else {
977 row[0] = Sum5Horizontal16(s + 0);
978 row[1] = Sum5Horizontal16(s + 1);
979 Sum5Horizontal32(sq + 0, row_sq + 0);
980 Sum5Horizontal32(sq + 2, row_sq + 2);
981 }
982 StoreAligned32U16(sums, row);
983 StoreAligned64U32(square_sums, row_sq);
984 s[0] = s[2];
985 sq[0] = sq[4];
986 sq[1] = sq[5];
987 sums += 16;
988 square_sums += 16;
989 } while (x != 0);
990 src += src_stride - sum_width;
991 sums += sum_stride - sum_width;
992 square_sums += sum_stride - sum_width;
993 } while (--y != 0);
994 }
995
996 template <int n>
CalculateMa(const __m128i sum,const __m128i sum_sq,const uint32_t scale)997 inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
998 const uint32_t scale) {
999 static_assert(n == 9 || n == 25, "");
1000 // a = |sum_sq|
1001 // d = |sum|
1002 // p = (a * n < d * d) ? 0 : a * n - d * d;
1003 const __m128i dxd = _mm_madd_epi16(sum, sum);
1004 // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
1005 // Some compilers could do this for us but we make this explicit.
1006 // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
1007 __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
1008 if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
1009 const __m128i sub = _mm_sub_epi32(axn, dxd);
1010 const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
1011 const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
1012 return VrshrU32(pxs, kSgrProjScaleBits);
1013 }
1014
1015 template <int n>
CalculateMa(const __m128i sum,const __m128i sum_sq[2],const uint32_t scale)1016 inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
1017 const uint32_t scale) {
1018 static_assert(n == 9 || n == 25, "");
1019 const __m128i b = VrshrU16(sum, 2);
1020 const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
1021 const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
1022 const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
1023 const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
1024 return _mm_packus_epi32(z0, z1);
1025 }
1026
CalculateB5(const __m128i sum,const __m128i ma,__m128i b[2])1027 inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
1028 // one_over_n == 164.
1029 constexpr uint32_t one_over_n =
1030 ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
1031 // one_over_n_quarter == 41.
1032 constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
1033 static_assert(one_over_n == one_over_n_quarter << 2, "");
1034 // |ma| is in range [0, 255].
1035 const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
1036 const __m128i m0 = VmullLo16(m, sum);
1037 const __m128i m1 = VmullHi16(m, sum);
1038 b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
1039 b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
1040 }
1041
CalculateB3(const __m128i sum,const __m128i ma,__m128i b[2])1042 inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
1043 // one_over_n == 455.
1044 constexpr uint32_t one_over_n =
1045 ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
1046 const __m128i m0 = VmullLo16(ma, sum);
1047 const __m128i m1 = VmullHi16(ma, sum);
1048 const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
1049 const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
1050 b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
1051 b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
1052 }
1053
CalculateSumAndIndex5(const __m128i s5[5],const __m128i sq5[5][2],const uint32_t scale,__m128i * const sum,__m128i * const index)1054 inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
1055 const uint32_t scale, __m128i* const sum,
1056 __m128i* const index) {
1057 __m128i sum_sq[2];
1058 *sum = Sum5_16(s5);
1059 Sum5_32(sq5, sum_sq);
1060 *index = CalculateMa<25>(*sum, sum_sq, scale);
1061 }
1062
CalculateSumAndIndex3(const __m128i s3[3],const __m128i sq3[3][2],const uint32_t scale,__m128i * const sum,__m128i * const index)1063 inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
1064 const uint32_t scale, __m128i* const sum,
1065 __m128i* const index) {
1066 __m128i sum_sq[2];
1067 *sum = Sum3_16(s3);
1068 Sum3_32(sq3, sum_sq);
1069 *index = CalculateMa<9>(*sum, sum_sq, scale);
1070 }
1071
1072 template <int n, int offset>
LookupIntermediate(const __m128i sum,const __m128i index,__m128i * const ma,__m128i b[2])1073 inline void LookupIntermediate(const __m128i sum, const __m128i index,
1074 __m128i* const ma, __m128i b[2]) {
1075 static_assert(n == 9 || n == 25, "");
1076 static_assert(offset == 0 || offset == 8, "");
1077 const __m128i idx = _mm_packus_epi16(index, index);
1078 // Actually it's not stored and loaded. The compiler will use a 64-bit
1079 // general-purpose register to process. Faster than using _mm_extract_epi8().
1080 uint8_t temp[8];
1081 StoreLo8(temp, idx);
1082 // offset == 0 is assumed to be the first call to this function. The value is
1083 // mov'd to avoid -Wuninitialized warnings under gcc. mov should at least
1084 // equivalent if not faster than pinsrb.
1085 if (offset == 0) {
1086 *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
1087 } else {
1088 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
1089 }
1090 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
1091 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
1092 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
1093 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
1094 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
1095 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
1096 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
1097 // b = ma * b * one_over_n
1098 // |ma| = [0, 255]
1099 // |sum| is a box sum with radius 1 or 2.
1100 // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
1101 // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
1102 // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
1103 // When radius is 2 |n| is 25. |one_over_n| is 164.
1104 // When radius is 1 |n| is 9. |one_over_n| is 455.
1105 // |kSgrProjReciprocalBits| is 12.
1106 // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
1107 // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
1108 __m128i maq;
1109 if (offset == 0) {
1110 maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
1111 } else {
1112 maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
1113 }
1114 if (n == 9) {
1115 CalculateB3(sum, maq, b);
1116 } else {
1117 CalculateB5(sum, maq, b);
1118 }
1119 }
1120
1121 // Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
1122 // to get value 0 as the shuffle result. The most significiant bit 1 comes
1123 // either from the comparison instruction, or from the sign bit of the index.
ShuffleIndex(const __m128i table,const __m128i index)1124 inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
1125 __m128i mask;
1126 mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
1127 mask = _mm_or_si128(mask, index);
1128 return _mm_shuffle_epi8(table, mask);
1129 }
1130
AdjustValue(const __m128i value,const __m128i index,const int threshold)1131 inline __m128i AdjustValue(const __m128i value, const __m128i index,
1132 const int threshold) {
1133 const __m128i thresholds = _mm_set1_epi8(threshold - 128);
1134 const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
1135 return _mm_add_epi8(value, offset);
1136 }
1137
CalculateIntermediate(const __m128i sum[2],const __m128i index[2],__m128i * const ma,__m128i b0[2],__m128i b1[2])1138 inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
1139 __m128i* const ma, __m128i b0[2],
1140 __m128i b1[2]) {
1141 // Use table lookup to read elements whose indices are less than 48.
1142 const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
1143 const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
1144 const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
1145 const __m128i indices = _mm_packus_epi16(index[0], index[1]);
1146 __m128i idx;
1147 // Clip idx to 127 to apply signed comparison instructions.
1148 idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
1149 // All elements whose indices are less than 48 are set to 0.
1150 // Get shuffle results for indices in range [0, 15].
1151 *ma = ShuffleIndex(c0, idx);
1152 // Get shuffle results for indices in range [16, 31].
1153 // Subtract 16 to utilize the sign bit of the index.
1154 idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
1155 const __m128i res1 = ShuffleIndex(c1, idx);
1156 // Use OR instruction to combine shuffle results together.
1157 *ma = _mm_or_si128(*ma, res1);
1158 // Get shuffle results for indices in range [32, 47].
1159 // Subtract 16 to utilize the sign bit of the index.
1160 idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
1161 const __m128i res2 = ShuffleIndex(c2, idx);
1162 *ma = _mm_or_si128(*ma, res2);
1163
1164 // For elements whose indices are larger than 47, since they seldom change
1165 // values with the increase of the index, we use comparison and arithmetic
1166 // operations to calculate their values.
1167 // Add -128 to apply signed comparison instructions.
1168 idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
1169 // Elements whose indices are larger than 47 (with value 0) are set to 5.
1170 *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
1171 *ma = AdjustValue(*ma, idx, 55); // 55 is the last index which value is 5.
1172 *ma = AdjustValue(*ma, idx, 72); // 72 is the last index which value is 4.
1173 *ma = AdjustValue(*ma, idx, 101); // 101 is the last index which value is 3.
1174 *ma = AdjustValue(*ma, idx, 169); // 169 is the last index which value is 2.
1175 *ma = AdjustValue(*ma, idx, 254); // 254 is the last index which value is 1.
1176
1177 // b = ma * b * one_over_n
1178 // |ma| = [0, 255]
1179 // |sum| is a box sum with radius 1 or 2.
1180 // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
1181 // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
1182 // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
1183 // When radius is 2 |n| is 25. |one_over_n| is 164.
1184 // When radius is 1 |n| is 9. |one_over_n| is 455.
1185 // |kSgrProjReciprocalBits| is 12.
1186 // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
1187 // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
1188 const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
1189 CalculateB3(sum[0], maq0, b0);
1190 const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
1191 CalculateB3(sum[1], maq1, b1);
1192 }
1193
CalculateIntermediate(const __m128i sum[2],const __m128i index[2],__m128i ma[2],__m128i b[4])1194 inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
1195 __m128i ma[2], __m128i b[4]) {
1196 __m128i mas;
1197 CalculateIntermediate(sum, index, &mas, b + 0, b + 2);
1198 ma[0] = _mm_unpacklo_epi64(ma[0], mas);
1199 ma[1] = _mm_srli_si128(mas, 8);
1200 }
1201
1202 // Note: It has been tried to call CalculateIntermediate() to replace the slow
1203 // LookupIntermediate() when calculating 16 intermediate data points. However,
1204 // the compiler generates even slower code.
1205 template <int offset>
CalculateIntermediate5(const __m128i s5[5],const __m128i sq5[5][2],const uint32_t scale,__m128i * const ma,__m128i b[2])1206 inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
1207 const uint32_t scale, __m128i* const ma,
1208 __m128i b[2]) {
1209 static_assert(offset == 0 || offset == 8, "");
1210 __m128i sum, index;
1211 CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
1212 LookupIntermediate<25, offset>(sum, index, ma, b);
1213 }
1214
CalculateIntermediate3(const __m128i s3[3],const __m128i sq3[3][2],const uint32_t scale,__m128i * const ma,__m128i b[2])1215 inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
1216 const uint32_t scale, __m128i* const ma,
1217 __m128i b[2]) {
1218 __m128i sum, index;
1219 CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
1220 LookupIntermediate<9, 0>(sum, index, ma, b);
1221 }
1222
Store343_444(const __m128i b3[3],const ptrdiff_t x,__m128i sum_b343[2],__m128i sum_b444[2],uint32_t * const b343,uint32_t * const b444)1223 inline void Store343_444(const __m128i b3[3], const ptrdiff_t x,
1224 __m128i sum_b343[2], __m128i sum_b444[2],
1225 uint32_t* const b343, uint32_t* const b444) {
1226 __m128i b[3], sum_b111[2];
1227 Prepare3_32(b3 + 0, b);
1228 sum_b111[0] = Sum3_32(b);
1229 sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
1230 sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
1231 sum_b343[0] = _mm_add_epi32(sum_b343[0], b[1]);
1232 Prepare3_32(b3 + 1, b);
1233 sum_b111[1] = Sum3_32(b);
1234 sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
1235 sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
1236 sum_b343[1] = _mm_add_epi32(sum_b343[1], b[1]);
1237 StoreAligned32U32(b444 + x, sum_b444);
1238 StoreAligned32U32(b343 + x, sum_b343);
1239 }
1240
Store343_444Lo(const __m128i ma3[3],const __m128i b3[3],const ptrdiff_t x,__m128i * const sum_ma343,__m128i * const sum_ma444,__m128i sum_b343[2],__m128i sum_b444[2],uint16_t * const ma343,uint16_t * const ma444,uint32_t * const b343,uint32_t * const b444)1241 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[3],
1242 const ptrdiff_t x, __m128i* const sum_ma343,
1243 __m128i* const sum_ma444, __m128i sum_b343[2],
1244 __m128i sum_b444[2], uint16_t* const ma343,
1245 uint16_t* const ma444, uint32_t* const b343,
1246 uint32_t* const b444) {
1247 const __m128i sum_ma111 = Sum3WLo16(ma3);
1248 *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
1249 StoreAligned16(ma444 + x, *sum_ma444);
1250 const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
1251 *sum_ma343 = VaddwLo8(sum333, ma3[1]);
1252 StoreAligned16(ma343 + x, *sum_ma343);
1253 Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
1254 }
1255
1256 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[3],
1257 const ptrdiff_t x, __m128i* const sum_ma343,
1258 __m128i* const sum_ma444, __m128i sum_b343[2],
1259 __m128i sum_b444[2], uint16_t* const ma343,
1260 uint16_t* const ma444, uint32_t* const b343,
1261 uint32_t* const b444) {
1262 const __m128i sum_ma111 = Sum3WHi16(ma3);
1263 *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
1264 StoreAligned16(ma444 + x, *sum_ma444);
1265 const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
1266 *sum_ma343 = VaddwHi8(sum333, ma3[1]);
1267 StoreAligned16(ma343 + x, *sum_ma343);
1268 Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
1269 }
1270
1271 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
1272 const ptrdiff_t x, __m128i* const sum_ma343,
1273 __m128i sum_b343[2], uint16_t* const ma343,
1274 uint16_t* const ma444, uint32_t* const b343,
1275 uint32_t* const b444) {
1276 __m128i sum_ma444, sum_b444[2];
1277 Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
1278 ma444, b343, b444);
1279 }
1280
1281 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1282 const ptrdiff_t x, __m128i* const sum_ma343,
1283 __m128i sum_b343[2], uint16_t* const ma343,
1284 uint16_t* const ma444, uint32_t* const b343,
1285 uint32_t* const b444) {
1286 __m128i sum_ma444, sum_b444[2];
1287 Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
1288 ma444, b343, b444);
1289 }
1290
1291 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
1292 const ptrdiff_t x, uint16_t* const ma343,
1293 uint16_t* const ma444, uint32_t* const b343,
1294 uint32_t* const b444) {
1295 __m128i sum_ma343, sum_b343[2];
1296 Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
1297 }
1298
1299 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1300 const ptrdiff_t x, uint16_t* const ma343,
1301 uint16_t* const ma444, uint32_t* const b343,
1302 uint32_t* const b444) {
1303 __m128i sum_ma343, sum_b343[2];
1304 Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
1305 }
1306
1307 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
1308 const __m128i s[2][4], const uint32_t scale, uint16_t* const sum5[5],
1309 uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i* const ma,
1310 __m128i b[2]) {
1311 __m128i s5[2][5], sq5[5][2];
1312 Square(s[0][1], sq[0] + 2);
1313 Square(s[1][1], sq[1] + 2);
1314 s5[0][3] = Sum5Horizontal16(s[0]);
1315 StoreAligned16(sum5[3], s5[0][3]);
1316 s5[0][4] = Sum5Horizontal16(s[1]);
1317 StoreAligned16(sum5[4], s5[0][4]);
1318 Sum5Horizontal32(sq[0], sq5[3]);
1319 StoreAligned32U32(square_sum5[3], sq5[3]);
1320 Sum5Horizontal32(sq[1], sq5[4]);
1321 StoreAligned32U32(square_sum5[4], sq5[4]);
1322 LoadAligned16x3U16(sum5, 0, s5[0]);
1323 LoadAligned32x3U32(square_sum5, 0, sq5);
1324 CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
1325 }
1326
1327 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
1328 const __m128i s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x,
1329 const uint32_t scale, uint16_t* const sum5[5],
1330 uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma[2],
1331 __m128i b[6]) {
1332 __m128i s5[2][5], sq5[5][2];
1333 Square(s[0][2], sq[0] + 4);
1334 Square(s[1][2], sq[1] + 4);
1335 s5[0][3] = Sum5Horizontal16(s[0] + 1);
1336 s5[1][3] = Sum5Horizontal16(s[0] + 2);
1337 StoreAligned16(sum5[3] + x + 0, s5[0][3]);
1338 StoreAligned16(sum5[3] + x + 8, s5[1][3]);
1339 s5[0][4] = Sum5Horizontal16(s[1] + 1);
1340 s5[1][4] = Sum5Horizontal16(s[1] + 2);
1341 StoreAligned16(sum5[4] + x + 0, s5[0][4]);
1342 StoreAligned16(sum5[4] + x + 8, s5[1][4]);
1343 Sum5Horizontal32(sq[0] + 2, sq5[3]);
1344 StoreAligned32U32(square_sum5[3] + x, sq5[3]);
1345 Sum5Horizontal32(sq[1] + 2, sq5[4]);
1346 StoreAligned32U32(square_sum5[4] + x, sq5[4]);
1347 LoadAligned16x3U16(sum5, x, s5[0]);
1348 LoadAligned32x3U32(square_sum5, x, sq5);
1349 CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
1350
1351 Square(s[0][3], sq[0] + 6);
1352 Square(s[1][3], sq[1] + 6);
1353 Sum5Horizontal32(sq[0] + 4, sq5[3]);
1354 StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
1355 Sum5Horizontal32(sq[1] + 4, sq5[4]);
1356 StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
1357 LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1358 LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1359 CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
1360 }
1361
1362 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
1363 const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5],
1364 const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma,
1365 __m128i b[2]) {
1366 __m128i s5[5], sq5[5][2];
1367 Square(s[1], sq + 2);
1368 s5[3] = s5[4] = Sum5Horizontal16(s);
1369 Sum5Horizontal32(sq, sq5[3]);
1370 sq5[4][0] = sq5[3][0];
1371 sq5[4][1] = sq5[3][1];
1372 LoadAligned16x3U16(sum5, 0, s5);
1373 LoadAligned32x3U32(square_sum5, 0, sq5);
1374 CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
1375 }
1376
1377 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
1378 const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
1379 const uint32_t scale, const uint16_t* const sum5[5],
1380 const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma[2],
1381 __m128i b[6]) {
1382 __m128i s5[2][5], sq5[5][2];
1383 Square(s[2], sq + 4);
1384 s5[0][3] = Sum5Horizontal16(s + 1);
1385 s5[1][3] = Sum5Horizontal16(s + 2);
1386 s5[0][4] = s5[0][3];
1387 s5[1][4] = s5[1][3];
1388 Sum5Horizontal32(sq + 2, sq5[3]);
1389 sq5[4][0] = sq5[3][0];
1390 sq5[4][1] = sq5[3][1];
1391 LoadAligned16x3U16(sum5, x, s5[0]);
1392 LoadAligned32x3U32(square_sum5, x, sq5);
1393 CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
1394
1395 Square(s[3], sq + 6);
1396 Sum5Horizontal32(sq + 4, sq5[3]);
1397 sq5[4][0] = sq5[3][0];
1398 sq5[4][1] = sq5[3][1];
1399 LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1400 LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1401 CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
1402 }
1403
1404 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
1405 const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3],
1406 uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma,
1407 __m128i b[2]) {
1408 __m128i s3[3], sq3[3][2];
1409 Square(s[1], sq + 2);
1410 s3[2] = Sum3Horizontal16(s);
1411 StoreAligned16(sum3[2], s3[2]);
1412 Sum3Horizontal32(sq, sq3[2]);
1413 StoreAligned32U32(square_sum3[2], sq3[2]);
1414 LoadAligned16x2U16(sum3, 0, s3);
1415 LoadAligned32x2U32(square_sum3, 0, sq3);
1416 CalculateIntermediate3(s3, sq3, scale, ma, b);
1417 }
1418
1419 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
1420 const __m128i s[4], const ptrdiff_t x, const ptrdiff_t sum_width,
1421 const uint32_t scale, uint16_t* const sum3[3],
1422 uint32_t* const square_sum3[3], __m128i sq[8], __m128i ma[2],
1423 __m128i b[6]) {
1424 __m128i s3[4], sq3[3][2], sum[2], index[2];
1425 Square(s[2], sq + 4);
1426 s3[2] = Sum3Horizontal16(s + 1);
1427 s3[3] = Sum3Horizontal16(s + 2);
1428 StoreAligned32U16(sum3[2] + x, s3 + 2);
1429 Sum3Horizontal32(sq + 2, sq3[2]);
1430 StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
1431 LoadAligned16x2U16(sum3, x, s3);
1432 LoadAligned32x2U32(square_sum3, x, sq3);
1433 CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
1434
1435 Square(s[3], sq + 6);
1436 Sum3Horizontal32(sq + 4, sq3[2]);
1437 StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
1438 LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
1439 LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
1440 CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
1441 CalculateIntermediate(sum, index, ma, b + 2);
1442 }
1443
1444 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
1445 const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
1446 uint16_t* const sum5[5], uint32_t* const square_sum3[4],
1447 uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][2],
1448 __m128i b3[2][6], __m128i* const ma5, __m128i b5[2]) {
1449 __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
1450 Square(s[0][1], sq[0] + 2);
1451 Square(s[1][1], sq[1] + 2);
1452 SumHorizontal16(s[0], &s3[2], &s5[3]);
1453 SumHorizontal16(s[1], &s3[3], &s5[4]);
1454 StoreAligned16(sum3[2], s3[2]);
1455 StoreAligned16(sum3[3], s3[3]);
1456 StoreAligned16(sum5[3], s5[3]);
1457 StoreAligned16(sum5[4], s5[4]);
1458 SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1459 StoreAligned32U32(square_sum3[2], sq3[2]);
1460 StoreAligned32U32(square_sum5[3], sq5[3]);
1461 SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
1462 StoreAligned32U32(square_sum3[3], sq3[3]);
1463 StoreAligned32U32(square_sum5[4], sq5[4]);
1464 LoadAligned16x2U16(sum3, 0, s3);
1465 LoadAligned32x2U32(square_sum3, 0, sq3);
1466 LoadAligned16x3U16(sum5, 0, s5);
1467 LoadAligned32x3U32(square_sum5, 0, sq5);
1468 CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
1469 CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
1470 CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
1471 ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
1472 CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
1473 }
1474
1475 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
1476 const __m128i s[2][4], const ptrdiff_t x, const uint16_t scales[2],
1477 uint16_t* const sum3[4], uint16_t* const sum5[5],
1478 uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
1479 const ptrdiff_t sum_width, __m128i sq[2][8], __m128i ma3[2][2],
1480 __m128i b3[2][6], __m128i ma5[2], __m128i b5[6]) {
1481 __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
1482 SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
1483 StoreAligned16(sum3[2] + x + 0, s3[0][2]);
1484 StoreAligned16(sum3[2] + x + 8, s3[1][2]);
1485 StoreAligned16(sum5[3] + x + 0, s5[0][3]);
1486 StoreAligned16(sum5[3] + x + 8, s5[1][3]);
1487 SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
1488 StoreAligned16(sum3[3] + x + 0, s3[0][3]);
1489 StoreAligned16(sum3[3] + x + 8, s3[1][3]);
1490 StoreAligned16(sum5[4] + x + 0, s5[0][4]);
1491 StoreAligned16(sum5[4] + x + 8, s5[1][4]);
1492 Square(s[0][2], sq[0] + 4);
1493 Square(s[1][2], sq[1] + 4);
1494 SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1495 StoreAligned32U32(square_sum3[2] + x, sq3[2]);
1496 StoreAligned32U32(square_sum5[3] + x, sq5[3]);
1497 SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
1498 StoreAligned32U32(square_sum3[3] + x, sq3[3]);
1499 StoreAligned32U32(square_sum5[4] + x, sq5[4]);
1500 LoadAligned16x2U16(sum3, x, s3[0]);
1501 LoadAligned32x2U32(square_sum3, x, sq3);
1502 CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
1503 CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
1504 &index[1][0]);
1505 LoadAligned16x3U16(sum5, x, s5[0]);
1506 LoadAligned32x3U32(square_sum5, x, sq5);
1507 CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2);
1508
1509 Square(s[0][3], sq[0] + 6);
1510 Square(s[1][3], sq[1] + 6);
1511 SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1512 StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
1513 StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
1514 SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
1515 StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
1516 StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
1517 LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
1518 LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
1519 CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
1520 CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
1521 &index[1][1]);
1522 CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2);
1523 CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2);
1524 LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1525 LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1526 CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4);
1527 }
1528
1529 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
1530 const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4],
1531 const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
1532 const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3,
1533 __m128i* const ma5, __m128i b3[2], __m128i b5[2]) {
1534 __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
1535 Square(s[1], sq + 2);
1536 SumHorizontal16(s, &s3[2], &s5[3]);
1537 SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1538 LoadAligned16x3U16(sum5, 0, s5);
1539 s5[4] = s5[3];
1540 LoadAligned32x3U32(square_sum5, 0, sq5);
1541 sq5[4][0] = sq5[3][0];
1542 sq5[4][1] = sq5[3][1];
1543 CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
1544 LoadAligned16x2U16(sum3, 0, s3);
1545 LoadAligned32x2U32(square_sum3, 0, sq3);
1546 CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
1547 }
1548
1549 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
1550 const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
1551 const uint16_t scales[2], const uint16_t* const sum3[4],
1552 const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
1553 const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma3[2],
1554 __m128i ma5[2], __m128i b3[6], __m128i b5[6]) {
1555 __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
1556 Square(s[2], sq + 4);
1557 SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
1558 SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1559 LoadAligned16x3U16(sum5, x, s5[0]);
1560 s5[0][4] = s5[0][3];
1561 LoadAligned32x3U32(square_sum5, x, sq5);
1562 sq5[4][0] = sq5[3][0];
1563 sq5[4][1] = sq5[3][1];
1564 CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2);
1565 LoadAligned16x2U16(sum3, x, s3[0]);
1566 LoadAligned32x2U32(square_sum3, x, sq3);
1567 CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
1568
1569 Square(s[3], sq + 6);
1570 SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1571 LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1572 s5[1][4] = s5[1][3];
1573 LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1574 sq5[4][0] = sq5[3][0];
1575 sq5[4][1] = sq5[3][1];
1576 CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4);
1577 LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
1578 LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
1579 CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
1580 CalculateIntermediate(sum, index, ma3, b3 + 2);
1581 }
1582
1583 inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
1584 const uint16_t* const src1, const int width,
1585 const uint32_t scale,
1586 uint16_t* const sum5[5],
1587 uint32_t* const square_sum5[5],
1588 const ptrdiff_t sum_width, uint16_t* ma565,
1589 uint32_t* b565) {
1590 const ptrdiff_t overread_in_bytes =
1591 kOverreadInBytesPass1 - sizeof(*src0) * width;
1592 __m128i s[2][4], mas[2], sq[2][8], bs[6];
1593 s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
1594 s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
1595 s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
1596 s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
1597 Square(s[0][0], sq[0]);
1598 Square(s[1][0], sq[1]);
1599 BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
1600
1601 int x = 0;
1602 do {
1603 __m128i ma5[3], ma[2], b[4];
1604 s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
1605 overread_in_bytes + sizeof(*src0) * (x + 16));
1606 s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
1607 overread_in_bytes + sizeof(*src0) * (x + 24));
1608 s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
1609 overread_in_bytes + sizeof(*src1) * (x + 16));
1610 s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
1611 overread_in_bytes + sizeof(*src1) * (x + 24));
1612 BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
1613 bs);
1614 Prepare3_8<0>(mas, ma5);
1615 ma[0] = Sum565Lo(ma5);
1616 ma[1] = Sum565Hi(ma5);
1617 StoreAligned32U16(ma565, ma);
1618 Sum565(bs + 0, b + 0);
1619 Sum565(bs + 2, b + 2);
1620 StoreAligned64U32(b565, b);
1621 s[0][0] = s[0][2];
1622 s[0][1] = s[0][3];
1623 s[1][0] = s[1][2];
1624 s[1][1] = s[1][3];
1625 sq[0][2] = sq[0][6];
1626 sq[0][3] = sq[0][7];
1627 sq[1][2] = sq[1][6];
1628 sq[1][3] = sq[1][7];
1629 mas[0] = mas[1];
1630 bs[0] = bs[4];
1631 bs[1] = bs[5];
1632 ma565 += 16;
1633 b565 += 16;
1634 x += 16;
1635 } while (x < width);
1636 }
1637
1638 template <bool calculate444>
1639 LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
1640 const uint16_t* const src, const int width, const uint32_t scale,
1641 uint16_t* const sum3[3], uint32_t* const square_sum3[3],
1642 const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
1643 uint32_t* b444) {
1644 const ptrdiff_t overread_in_bytes =
1645 kOverreadInBytesPass2 - sizeof(*src) * width;
1646 __m128i s[4], mas[2], sq[8], bs[6];
1647 s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes + 0);
1648 s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes + 16);
1649 Square(s[0], sq);
1650 BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
1651
1652 int x = 0;
1653 do {
1654 s[2] = LoadUnaligned16Msan(src + x + 16,
1655 overread_in_bytes + sizeof(*src) * (x + 16));
1656 s[3] = LoadUnaligned16Msan(src + x + 24,
1657 overread_in_bytes + sizeof(*src) * (x + 24));
1658 BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
1659 bs);
1660 __m128i ma3[3];
1661 Prepare3_8<0>(mas, ma3);
1662 if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
1663 Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
1664 Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444);
1665 ma444 += 16;
1666 b444 += 16;
1667 } else {
1668 __m128i ma[2], b[4];
1669 ma[0] = Sum343Lo(ma3);
1670 ma[1] = Sum343Hi(ma3);
1671 StoreAligned32U16(ma343, ma);
1672 Sum343(bs + 0, b + 0);
1673 Sum343(bs + 2, b + 2);
1674 StoreAligned64U32(b343, b);
1675 }
1676 s[1] = s[3];
1677 sq[2] = sq[6];
1678 sq[3] = sq[7];
1679 mas[0] = mas[1];
1680 bs[0] = bs[4];
1681 bs[1] = bs[5];
1682 ma343 += 16;
1683 b343 += 16;
1684 x += 16;
1685 } while (x < width);
1686 }
1687
1688 inline void BoxSumFilterPreProcess(
1689 const uint16_t* const src0, const uint16_t* const src1, const int width,
1690 const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
1691 uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
1692 const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
1693 uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
1694 uint32_t* b565) {
1695 const ptrdiff_t overread_in_bytes =
1696 kOverreadInBytesPass1 - sizeof(*src0) * width;
1697 __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
1698 s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
1699 s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
1700 s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
1701 s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
1702 Square(s[0][0], sq[0]);
1703 Square(s[1][0], sq[1]);
1704 BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
1705 ma3, b3, &ma5[0], b5);
1706
1707 int x = 0;
1708 do {
1709 __m128i ma[2], b[4], ma3x[3], ma5x[3];
1710 s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
1711 overread_in_bytes + sizeof(*src0) * (x + 16));
1712 s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
1713 overread_in_bytes + sizeof(*src0) * (x + 24));
1714 s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
1715 overread_in_bytes + sizeof(*src1) * (x + 16));
1716 s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
1717 overread_in_bytes + sizeof(*src1) * (x + 24));
1718 BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
1719 sum_width, sq, ma3, b3, ma5, b5);
1720
1721 Prepare3_8<0>(ma3[0], ma3x);
1722 ma[0] = Sum343Lo(ma3x);
1723 ma[1] = Sum343Hi(ma3x);
1724 StoreAligned32U16(ma343[0] + x, ma);
1725 Sum343(b3[0] + 0, b + 0);
1726 Sum343(b3[0] + 2, b + 2);
1727 StoreAligned64U32(b343[0] + x, b);
1728 Sum565(b5 + 0, b + 0);
1729 Sum565(b5 + 2, b + 2);
1730 StoreAligned64U32(b565, b);
1731 Prepare3_8<0>(ma3[1], ma3x);
1732 Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
1733 Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444);
1734 Prepare3_8<0>(ma5, ma5x);
1735 ma[0] = Sum565Lo(ma5x);
1736 ma[1] = Sum565Hi(ma5x);
1737 StoreAligned32U16(ma565, ma);
1738 s[0][0] = s[0][2];
1739 s[0][1] = s[0][3];
1740 s[1][0] = s[1][2];
1741 s[1][1] = s[1][3];
1742 sq[0][2] = sq[0][6];
1743 sq[0][3] = sq[0][7];
1744 sq[1][2] = sq[1][6];
1745 sq[1][3] = sq[1][7];
1746 ma3[0][0] = ma3[0][1];
1747 ma3[1][0] = ma3[1][1];
1748 ma5[0] = ma5[1];
1749 b3[0][0] = b3[0][4];
1750 b3[0][1] = b3[0][5];
1751 b3[1][0] = b3[1][4];
1752 b3[1][1] = b3[1][5];
1753 b5[0] = b5[4];
1754 b5[1] = b5[5];
1755 ma565 += 16;
1756 b565 += 16;
1757 x += 16;
1758 } while (x < width);
1759 }
1760
1761 template <int shift>
1762 inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
1763 // ma: 255 * 32 = 8160 (13 bits)
1764 // b: 65088 * 32 = 2082816 (21 bits)
1765 // v: b - ma * 255 (22 bits)
1766 const __m128i v = _mm_sub_epi32(b, ma_x_src);
1767 // kSgrProjSgrBits = 8
1768 // kSgrProjRestoreBits = 4
1769 // shift = 4 or 5
1770 // v >> 8 or 9 (13 bits)
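  // e.g. shift = 4 gives a shift of 8 + 4 - 4 = 8 and shift = 5 gives
  // 8 + 5 - 4 = 9, matching the "v >> 8 or 9" note above.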
1771 return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
1772 }
1773
1774 template <int shift>
1775 inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
1776 const __m128i b[2]) {
1777 const __m128i ma_x_src_lo = VmullLo16(ma, src);
1778 const __m128i ma_x_src_hi = VmullHi16(ma, src);
1779 const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
1780 const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
1781 return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
1782 }
1783
1784 inline __m128i CalculateFilteredOutputPass1(const __m128i src,
1785 const __m128i ma[2],
1786 const __m128i b[2][2]) {
1787 const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
1788 __m128i b_sum[2];
1789 b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
1790 b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
1791 return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
1792 }
1793
1794 inline __m128i CalculateFilteredOutputPass2(const __m128i src,
1795 const __m128i ma[3],
1796 const __m128i b[3][2]) {
1797 const __m128i ma_sum = Sum3_16(ma);
1798 __m128i b_sum[2];
1799 Sum3_32(b, b_sum);
1800 return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
1801 }
1802
1803 inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
1804 const __m128i v_lo =
1805 VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
1806 const __m128i v_hi =
1807 VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
1808 const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
1809 return _mm_add_epi16(src, vv);
1810 }
1811
1812 inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
1813 const __m128i filter[2], const int w0,
1814 const int w2) {
1815 __m128i v[2];
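  // Packing w0 into the low and w2 into the high 16 bits of each 32-bit lane
  // lets _mm_madd_epi16 compute w0 * filter[0] + w2 * filter[1] per pixel once
  // the two filter outputs are interleaved below.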
1816 const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
1817 const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
1818 const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
1819 v[0] = _mm_madd_epi16(w0_w2, f_lo);
1820 v[1] = _mm_madd_epi16(w0_w2, f_hi);
1821 return SelfGuidedFinal(src, v);
1822 }
1823
1824 inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
1825 const __m128i filter, const int w0) {
1826 // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
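  // |filter| fits in 13 bits (see CalculateFilteredOutput()), so multiplying
  // by a weight in [-96, 96] stays comfortably within a 32-bit lane.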
1827 __m128i v[2];
1828 v[0] = VmullNLo8(filter, w0);
1829 v[1] = VmullNHi8(filter, w0);
1830 return SelfGuidedFinal(src, v);
1831 }
1832
1833 inline void ClipAndStore(uint16_t* const dst, const __m128i val) {
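  // Clamp the signed 16-bit result to the 10-bit pixel range [0, 1023] used
  // by this path.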
1834 const __m128i val0 = _mm_max_epi16(val, _mm_setzero_si128());
1835 const __m128i val1 = _mm_min_epi16(val0, _mm_set1_epi16(1023));
1836 StoreAligned16(dst, val1);
1837 }
1838
1839 LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
1840 const uint16_t* const src, const uint16_t* const src0,
1841 const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
1842 uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
1843 const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
1844 uint32_t* const b565[2], uint16_t* const dst) {
1845 const ptrdiff_t overread_in_bytes =
1846 kOverreadInBytesPass1 - sizeof(*src0) * width;
1847 __m128i s[2][4], mas[2], sq[2][8], bs[6];
1848 s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
1849 s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
1850 s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
1851 s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
1852 Square(s[0][0], sq[0]);
1853 Square(s[1][0], sq[1]);
1854 BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
1855
1856 int x = 0;
1857 do {
1858 __m128i ma[2], ma5[3], b[2][2], p[2];
1859 s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
1860 overread_in_bytes + sizeof(*src0) * (x + 16));
1861 s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
1862 overread_in_bytes + sizeof(*src0) * (x + 24));
1863 s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
1864 overread_in_bytes + sizeof(*src1) * (x + 16));
1865 s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
1866 overread_in_bytes + sizeof(*src1) * (x + 24));
1867 BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
1868 bs);
1869 Prepare3_8<0>(mas, ma5);
1870 ma[1] = Sum565Lo(ma5);
1871 StoreAligned16(ma565[1] + x, ma[1]);
1872 Sum565(bs, b[1]);
1873 StoreAligned32U32(b565[1] + x, b[1]);
1874 const __m128i sr0_lo = LoadAligned16(src + x + 0);
1875 const __m128i sr1_lo = LoadAligned16(src + stride + x + 0);
1876 ma[0] = LoadAligned16(ma565[0] + x);
1877 LoadAligned32U32(b565[0] + x, b[0]);
1878 p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
1879 p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
1880 const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
1881 const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
1882
1883 ma[1] = Sum565Hi(ma5);
1884 StoreAligned16(ma565[1] + x + 8, ma[1]);
1885 Sum565(bs + 2, b[1]);
1886 StoreAligned32U32(b565[1] + x + 8, b[1]);
1887 const __m128i sr0_hi = LoadAligned16(src + x + 8);
1888 const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
1889 ma[0] = LoadAligned16(ma565[0] + x + 8);
1890 LoadAligned32U32(b565[0] + x + 8, b[0]);
1891 p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
1892 p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
1893 const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
1894 ClipAndStore(dst + x + 0, d00);
1895 ClipAndStore(dst + x + 8, d01);
1896 const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
1897 ClipAndStore(dst + stride + x + 0, d10);
1898 ClipAndStore(dst + stride + x + 8, d11);
1899 s[0][0] = s[0][2];
1900 s[0][1] = s[0][3];
1901 s[1][0] = s[1][2];
1902 s[1][1] = s[1][3];
1903 sq[0][2] = sq[0][6];
1904 sq[0][3] = sq[0][7];
1905 sq[1][2] = sq[1][6];
1906 sq[1][3] = sq[1][7];
1907 mas[0] = mas[1];
1908 bs[0] = bs[4];
1909 bs[1] = bs[5];
1910 x += 16;
1911 } while (x < width);
1912 }
1913
1914 inline void BoxFilterPass1LastRow(
1915 const uint16_t* const src, const uint16_t* const src0, const int width,
1916 const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
1917 uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
1918 uint32_t* b565, uint16_t* const dst) {
1919 const ptrdiff_t overread_in_bytes =
1920 kOverreadInBytesPass1 - sizeof(*src0) * width;
1921 __m128i s[4], mas[2], sq[8], bs[6];
1922 s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
1923 s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
1924 Square(s[0], sq);
1925 BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
1926
1927 int x = 0;
1928 do {
1929 __m128i ma[2], ma5[3], b[2][2];
1930 s[2] = LoadUnaligned16Msan(src0 + x + 16,
1931 overread_in_bytes + sizeof(*src0) * (x + 16));
1932 s[3] = LoadUnaligned16Msan(src0 + x + 24,
1933 overread_in_bytes + sizeof(*src0) * (x + 24));
1934 BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
1935 sq, mas, bs);
1936 Prepare3_8<0>(mas, ma5);
1937 ma[1] = Sum565Lo(ma5);
1938 Sum565(bs, b[1]);
1939 ma[0] = LoadAligned16(ma565);
1940 LoadAligned32U32(b565, b[0]);
1941 const __m128i sr_lo = LoadAligned16(src + x + 0);
1942 __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
1943 const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
1944
1945 ma[1] = Sum565Hi(ma5);
1946 Sum565(bs + 2, b[1]);
1947 ma[0] = LoadAligned16(ma565 + 8);
1948 LoadAligned32U32(b565 + 8, b[0]);
1949 const __m128i sr_hi = LoadAligned16(src + x + 8);
1950 p = CalculateFilteredOutputPass1(sr_hi, ma, b);
1951 const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
1952 ClipAndStore(dst + x + 0, d0);
1953 ClipAndStore(dst + x + 8, d1);
1954 s[1] = s[3];
1955 sq[2] = sq[6];
1956 sq[3] = sq[7];
1957 mas[0] = mas[1];
1958 bs[0] = bs[4];
1959 bs[1] = bs[5];
1960 ma565 += 16;
1961 b565 += 16;
1962 x += 16;
1963 } while (x < width);
1964 }
1965
1966 LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
1967 const uint16_t* const src, const uint16_t* const src0, const int width,
1968 const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
1969 uint16_t* const sum3[3], uint32_t* const square_sum3[3],
1970 uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
1971 uint32_t* const b444[2], uint16_t* const dst) {
1972 const ptrdiff_t overread_in_bytes =
1973 kOverreadInBytesPass2 - sizeof(*src0) * width;
1974 __m128i s[4], mas[2], sq[8], bs[6];
1975 s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
1976 s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
1977 Square(s[0], sq);
1978 BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
1979
1980 int x = 0;
1981 do {
1982 s[2] = LoadUnaligned16Msan(src0 + x + 16,
1983 overread_in_bytes + sizeof(*src0) * (x + 16));
1984 s[3] = LoadUnaligned16Msan(src0 + x + 24,
1985 overread_in_bytes + sizeof(*src0) * (x + 24));
1986 BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
1987 bs);
1988 __m128i ma[3], b[3][2], ma3[3];
1989 Prepare3_8<0>(mas, ma3);
1990 Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
1991 b444[1]);
1992 const __m128i sr_lo = LoadAligned16(src + x + 0);
1993 ma[0] = LoadAligned16(ma343[0] + x);
1994 ma[1] = LoadAligned16(ma444[0] + x);
1995 LoadAligned32U32(b343[0] + x, b[0]);
1996 LoadAligned32U32(b444[0] + x, b[1]);
1997 const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
1998
1999 Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1],
2000 b343[2], b444[1]);
2001 const __m128i sr_hi = LoadAligned16(src + x + 8);
2002 ma[0] = LoadAligned16(ma343[0] + x + 8);
2003 ma[1] = LoadAligned16(ma444[0] + x + 8);
2004 LoadAligned32U32(b343[0] + x + 8, b[0]);
2005 LoadAligned32U32(b444[0] + x + 8, b[1]);
2006 const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
2007 const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
2008 const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
2009 ClipAndStore(dst + x + 0, d0);
2010 ClipAndStore(dst + x + 8, d1);
2011 s[1] = s[3];
2012 sq[2] = sq[6];
2013 sq[3] = sq[7];
2014 mas[0] = mas[1];
2015 bs[0] = bs[4];
2016 bs[1] = bs[5];
2017 x += 16;
2018 } while (x < width);
2019 }
2020
2021 LIBGAV1_ALWAYS_INLINE void BoxFilter(
2022 const uint16_t* const src, const uint16_t* const src0,
2023 const uint16_t* const src1, const ptrdiff_t stride, const int width,
2024 const uint16_t scales[2], const int16_t w0, const int16_t w2,
2025 uint16_t* const sum3[4], uint16_t* const sum5[5],
2026 uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
2027 const ptrdiff_t sum_width, uint16_t* const ma343[4],
2028 uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
2029 uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
2030 const ptrdiff_t overread_in_bytes =
2031 kOverreadInBytesPass1 - sizeof(*src0) * width;
2032 __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
2033 s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
2034 s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
2035 s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
2036 s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
2037 Square(s[0][0], sq[0]);
2038 Square(s[1][0], sq[1]);
2039 BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
2040 ma3, b3, &ma5[0], b5);
2041
2042 int x = 0;
2043 do {
2044 __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
2045 s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
2046 overread_in_bytes + sizeof(*src0) * (x + 16));
2047 s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
2048 overread_in_bytes + sizeof(*src0) * (x + 24));
2049 s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
2050 overread_in_bytes + sizeof(*src1) * (x + 16));
2051 s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
2052 overread_in_bytes + sizeof(*src1) * (x + 24));
2053 BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
2054 sum_width, sq, ma3, b3, ma5, b5);
2055 Prepare3_8<0>(ma3[0], ma3x[0]);
2056 Prepare3_8<0>(ma3[1], ma3x[1]);
2057 Prepare3_8<0>(ma5, ma5x);
2058 Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
2059 ma343[2], ma444[1], b343[2], b444[1]);
2060 Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
2061 b343[3], b444[2]);
2062 ma[0][1] = Sum565Lo(ma5x);
2063 StoreAligned16(ma565[1] + x, ma[0][1]);
2064 Sum565(b5, b[0][1]);
2065 StoreAligned32U32(b565[1] + x, b[0][1]);
2066 const __m128i sr0_lo = LoadAligned16(src + x);
2067 const __m128i sr1_lo = LoadAligned16(src + stride + x);
2068 ma[0][0] = LoadAligned16(ma565[0] + x);
2069 LoadAligned32U32(b565[0] + x, b[0][0]);
2070 p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
2071 p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
2072 ma[1][0] = LoadAligned16(ma343[0] + x);
2073 ma[1][1] = LoadAligned16(ma444[0] + x);
2074 LoadAligned32U32(b343[0] + x, b[1][0]);
2075 LoadAligned32U32(b444[0] + x, b[1][1]);
2076 p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
2077 const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
2078 ma[2][0] = LoadAligned16(ma343[1] + x);
2079 LoadAligned32U32(b343[1] + x, b[2][0]);
2080 p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
2081 const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
2082
2083 Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2],
2084 b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
2085 Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3],
2086 ma444[2], b343[3], b444[2]);
2087 ma[0][1] = Sum565Hi(ma5x);
2088 StoreAligned16(ma565[1] + x + 8, ma[0][1]);
2089 Sum565(b5 + 2, b[0][1]);
2090 StoreAligned32U32(b565[1] + x + 8, b[0][1]);
2091 const __m128i sr0_hi = LoadAligned16(src + x + 8);
2092 const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
2093 ma[0][0] = LoadAligned16(ma565[0] + x + 8);
2094 LoadAligned32U32(b565[0] + x + 8, b[0][0]);
2095 p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
2096 p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
2097 ma[1][0] = LoadAligned16(ma343[0] + x + 8);
2098 ma[1][1] = LoadAligned16(ma444[0] + x + 8);
2099 LoadAligned32U32(b343[0] + x + 8, b[1][0]);
2100 LoadAligned32U32(b444[0] + x + 8, b[1][1]);
2101 p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
2102 const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
2103 ClipAndStore(dst + x + 0, d00);
2104 ClipAndStore(dst + x + 8, d01);
2105 ma[2][0] = LoadAligned16(ma343[1] + x + 8);
2106 LoadAligned32U32(b343[1] + x + 8, b[2][0]);
2107 p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
2108 const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
2109 ClipAndStore(dst + stride + x + 0, d10);
2110 ClipAndStore(dst + stride + x + 8, d11);
2111 s[0][0] = s[0][2];
2112 s[0][1] = s[0][3];
2113 s[1][0] = s[1][2];
2114 s[1][1] = s[1][3];
2115 sq[0][2] = sq[0][6];
2116 sq[0][3] = sq[0][7];
2117 sq[1][2] = sq[1][6];
2118 sq[1][3] = sq[1][7];
2119 ma3[0][0] = ma3[0][1];
2120 ma3[1][0] = ma3[1][1];
2121 ma5[0] = ma5[1];
2122 b3[0][0] = b3[0][4];
2123 b3[0][1] = b3[0][5];
2124 b3[1][0] = b3[1][4];
2125 b3[1][1] = b3[1][5];
2126 b5[0] = b5[4];
2127 b5[1] = b5[5];
2128 x += 16;
2129 } while (x < width);
2130 }
2131
2132 inline void BoxFilterLastRow(
2133 const uint16_t* const src, const uint16_t* const src0, const int width,
2134 const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
2135 const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
2136 uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
2137 uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
2138 uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
2139 uint16_t* const dst) {
2140 const ptrdiff_t overread_in_bytes =
2141 kOverreadInBytesPass1 - sizeof(*src0) * width;
2142 __m128i s[4], ma3[2], ma5[2], sq[8], b3[6], b5[6], ma[3], b[3][2];
2143 s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
2144 s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
2145 Square(s[0], sq);
2146 BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
2147 sq, &ma3[0], &ma5[0], b3, b5);
2148
2149 int x = 0;
2150 do {
2151 __m128i ma3x[3], ma5x[3], p[2];
2152 s[2] = LoadUnaligned16Msan(src0 + x + 16,
2153 overread_in_bytes + sizeof(*src0) * (x + 16));
2154 s[3] = LoadUnaligned16Msan(src0 + x + 24,
2155 overread_in_bytes + sizeof(*src0) * (x + 24));
2156 BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
2157 square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
2158 Prepare3_8<0>(ma3, ma3x);
2159 Prepare3_8<0>(ma5, ma5x);
2160 ma[1] = Sum565Lo(ma5x);
2161 Sum565(b5, b[1]);
2162 ma[2] = Sum343Lo(ma3x);
2163 Sum343(b3, b[2]);
2164 const __m128i sr_lo = LoadAligned16(src + x + 0);
2165 ma[0] = LoadAligned16(ma565 + x);
2166 LoadAligned32U32(b565 + x, b[0]);
2167 p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
2168 ma[0] = LoadAligned16(ma343 + x);
2169 ma[1] = LoadAligned16(ma444 + x);
2170 LoadAligned32U32(b343 + x, b[0]);
2171 LoadAligned32U32(b444 + x, b[1]);
2172 p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
2173 const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
2174
2175 ma[1] = Sum565Hi(ma5x);
2176 Sum565(b5 + 2, b[1]);
2177 ma[2] = Sum343Hi(ma3x);
2178 Sum343(b3 + 2, b[2]);
2179 const __m128i sr_hi = LoadAligned16(src + x + 8);
2180 ma[0] = LoadAligned16(ma565 + x + 8);
2181 LoadAligned32U32(b565 + x + 8, b[0]);
2182 p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
2183 ma[0] = LoadAligned16(ma343 + x + 8);
2184 ma[1] = LoadAligned16(ma444 + x + 8);
2185 LoadAligned32U32(b343 + x + 8, b[0]);
2186 LoadAligned32U32(b444 + x + 8, b[1]);
2187 p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
2188 const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
2189 ClipAndStore(dst + x + 0, d0);
2190 ClipAndStore(dst + x + 8, d1);
2191 s[1] = s[3];
2192 sq[2] = sq[6];
2193 sq[3] = sq[7];
2194 ma3[0] = ma3[1];
2195 ma5[0] = ma5[1];
2196 b3[0] = b3[4];
2197 b3[1] = b3[5];
2198 b5[0] = b5[4];
2199 b5[1] = b5[5];
2200 x += 16;
2201 } while (x < width);
2202 }
2203
2204 LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
2205 const RestorationUnitInfo& restoration_info, const uint16_t* src,
2206 const ptrdiff_t stride, const uint16_t* const top_border,
2207 const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
2208 const ptrdiff_t bottom_border_stride, const int width, const int height,
2209 SgrBuffer* const sgr_buffer, uint16_t* dst) {
2210 const auto temp_stride = Align<ptrdiff_t>(width, 16);
2211 const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
2212 const auto sum_stride = temp_stride + 16;
2213 const int sgr_proj_index = restoration_info.sgr_proj_info.index;
2214 const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
2215 const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
2216 const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
2217 const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
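  // By construction w0 + w1 + w2 == (1 << kSgrProjPrecisionBits), so the
  // weighted combination of the two box filter passes and the source is
  // normalized.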
2218 uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
2219 uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
2220 sum3[0] = sgr_buffer->sum3;
2221 square_sum3[0] = sgr_buffer->square_sum3;
2222 ma343[0] = sgr_buffer->ma343;
2223 b343[0] = sgr_buffer->b343;
2224 for (int i = 1; i <= 3; ++i) {
2225 sum3[i] = sum3[i - 1] + sum_stride;
2226 square_sum3[i] = square_sum3[i - 1] + sum_stride;
2227 ma343[i] = ma343[i - 1] + temp_stride;
2228 b343[i] = b343[i - 1] + temp_stride;
2229 }
2230 sum5[0] = sgr_buffer->sum5;
2231 square_sum5[0] = sgr_buffer->square_sum5;
2232 for (int i = 1; i <= 4; ++i) {
2233 sum5[i] = sum5[i - 1] + sum_stride;
2234 square_sum5[i] = square_sum5[i - 1] + sum_stride;
2235 }
2236 ma444[0] = sgr_buffer->ma444;
2237 b444[0] = sgr_buffer->b444;
2238 for (int i = 1; i <= 2; ++i) {
2239 ma444[i] = ma444[i - 1] + temp_stride;
2240 b444[i] = b444[i - 1] + temp_stride;
2241 }
2242 ma565[0] = sgr_buffer->ma565;
2243 ma565[1] = ma565[0] + temp_stride;
2244 b565[0] = sgr_buffer->b565;
2245 b565[1] = b565[0] + temp_stride;
2246 assert(scales[0] != 0);
2247 assert(scales[1] != 0);
2248 BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
2249 sum5[1], square_sum3[0], square_sum5[1]);
2250 sum5[0] = sum5[1];
2251 square_sum5[0] = square_sum5[1];
2252 const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
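  // For a single-row unit there is no second source row, so the second
  // preprocess row is taken from the bottom border instead.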
2253 BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
2254 square_sum5, sum_width, ma343, ma444[0], ma565[0],
2255 b343, b444[0], b565[0]);
2256 sum5[0] = sgr_buffer->sum5;
2257 square_sum5[0] = sgr_buffer->square_sum5;
2258
2259 for (int y = (height >> 1) - 1; y > 0; --y) {
2260 Circulate4PointersBy2<uint16_t>(sum3);
2261 Circulate4PointersBy2<uint32_t>(square_sum3);
2262 Circulate5PointersBy2<uint16_t>(sum5);
2263 Circulate5PointersBy2<uint32_t>(square_sum5);
2264 BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
2265 scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
2266 ma343, ma444, ma565, b343, b444, b565, dst);
2267 src += 2 * stride;
2268 dst += 2 * stride;
2269 Circulate4PointersBy2<uint16_t>(ma343);
2270 Circulate4PointersBy2<uint32_t>(b343);
2271 std::swap(ma444[0], ma444[2]);
2272 std::swap(b444[0], b444[2]);
2273 std::swap(ma565[0], ma565[1]);
2274 std::swap(b565[0], b565[1]);
2275 }
2276
2277 Circulate4PointersBy2<uint16_t>(sum3);
2278 Circulate4PointersBy2<uint32_t>(square_sum3);
2279 Circulate5PointersBy2<uint16_t>(sum5);
2280 Circulate5PointersBy2<uint32_t>(square_sum5);
2281 if ((height & 1) == 0 || height > 1) {
2282 const uint16_t* sr[2];
2283 if ((height & 1) == 0) {
2284 sr[0] = bottom_border;
2285 sr[1] = bottom_border + bottom_border_stride;
2286 } else {
2287 sr[0] = src + 2 * stride;
2288 sr[1] = bottom_border;
2289 }
2290 BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
2291 square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
2292 b444, b565, dst);
2293 }
2294 if ((height & 1) != 0) {
2295 if (height > 1) {
2296 src += 2 * stride;
2297 dst += 2 * stride;
2298 Circulate4PointersBy2<uint16_t>(sum3);
2299 Circulate4PointersBy2<uint32_t>(square_sum3);
2300 Circulate5PointersBy2<uint16_t>(sum5);
2301 Circulate5PointersBy2<uint32_t>(square_sum5);
2302 Circulate4PointersBy2<uint16_t>(ma343);
2303 Circulate4PointersBy2<uint32_t>(b343);
2304 std::swap(ma444[0], ma444[2]);
2305 std::swap(b444[0], b444[2]);
2306 std::swap(ma565[0], ma565[1]);
2307 std::swap(b565[0], b565[1]);
2308 }
2309 BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
2310 sum_width, scales, w0, w2, sum3, sum5, square_sum3,
2311 square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
2312 b444[0], b565[0], dst);
2313 }
2314 }
2315
2316 inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
2317 const uint16_t* src, const ptrdiff_t stride,
2318 const uint16_t* const top_border,
2319 const ptrdiff_t top_border_stride,
2320 const uint16_t* bottom_border,
2321 const ptrdiff_t bottom_border_stride,
2322 const int width, const int height,
2323 SgrBuffer* const sgr_buffer, uint16_t* dst) {
2324 const auto temp_stride = Align<ptrdiff_t>(width, 16);
2325 const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
2326 const auto sum_stride = temp_stride + 16;
2327 const int sgr_proj_index = restoration_info.sgr_proj_info.index;
2328 const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
2329 const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
2330 uint16_t *sum5[5], *ma565[2];
2331 uint32_t *square_sum5[5], *b565[2];
2332 sum5[0] = sgr_buffer->sum5;
2333 square_sum5[0] = sgr_buffer->square_sum5;
2334 for (int i = 1; i <= 4; ++i) {
2335 sum5[i] = sum5[i - 1] + sum_stride;
2336 square_sum5[i] = square_sum5[i - 1] + sum_stride;
2337 }
2338 ma565[0] = sgr_buffer->ma565;
2339 ma565[1] = ma565[0] + temp_stride;
2340 b565[0] = sgr_buffer->b565;
2341 b565[1] = b565[0] + temp_stride;
2342 assert(scale != 0);
2343 BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
2344 sum5[1], square_sum5[1]);
2345 sum5[0] = sum5[1];
2346 square_sum5[0] = square_sum5[1];
2347 const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
2348 BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
2349 ma565[0], b565[0]);
2350 sum5[0] = sgr_buffer->sum5;
2351 square_sum5[0] = sgr_buffer->square_sum5;
2352
2353 for (int y = (height >> 1) - 1; y > 0; --y) {
2354 Circulate5PointersBy2<uint16_t>(sum5);
2355 Circulate5PointersBy2<uint32_t>(square_sum5);
2356 BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
2357 square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
2358 src += 2 * stride;
2359 dst += 2 * stride;
2360 std::swap(ma565[0], ma565[1]);
2361 std::swap(b565[0], b565[1]);
2362 }
2363
2364 Circulate5PointersBy2<uint16_t>(sum5);
2365 Circulate5PointersBy2<uint32_t>(square_sum5);
2366 if ((height & 1) == 0 || height > 1) {
2367 const uint16_t* sr[2];
2368 if ((height & 1) == 0) {
2369 sr[0] = bottom_border;
2370 sr[1] = bottom_border + bottom_border_stride;
2371 } else {
2372 sr[0] = src + 2 * stride;
2373 sr[1] = bottom_border;
2374 }
2375 BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
2376 sum_width, scale, w0, ma565, b565, dst);
2377 }
2378 if ((height & 1) != 0) {
2379 src += 3;
2380 if (height > 1) {
2381 src += 2 * stride;
2382 dst += 2 * stride;
2383 std::swap(ma565[0], ma565[1]);
2384 std::swap(b565[0], b565[1]);
2385 Circulate5PointersBy2<uint16_t>(sum5);
2386 Circulate5PointersBy2<uint32_t>(square_sum5);
2387 }
2388 BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
2389 sum_width, scale, w0, sum5, square_sum5, ma565[0],
2390 b565[0], dst);
2391 }
2392 }
2393
2394 inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
2395 const uint16_t* src, const ptrdiff_t stride,
2396 const uint16_t* const top_border,
2397 const ptrdiff_t top_border_stride,
2398 const uint16_t* bottom_border,
2399 const ptrdiff_t bottom_border_stride,
2400 const int width, const int height,
2401 SgrBuffer* const sgr_buffer, uint16_t* dst) {
2402 assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
2403 const auto temp_stride = Align<ptrdiff_t>(width, 16);
2404 const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
2405 const auto sum_stride = temp_stride + 16;
2406 const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
2407 const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
2408 const int sgr_proj_index = restoration_info.sgr_proj_info.index;
2409 const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
2410 uint16_t *sum3[3], *ma343[3], *ma444[2];
2411 uint32_t *square_sum3[3], *b343[3], *b444[2];
2412 sum3[0] = sgr_buffer->sum3;
2413 square_sum3[0] = sgr_buffer->square_sum3;
2414 ma343[0] = sgr_buffer->ma343;
2415 b343[0] = sgr_buffer->b343;
2416 for (int i = 1; i <= 2; ++i) {
2417 sum3[i] = sum3[i - 1] + sum_stride;
2418 square_sum3[i] = square_sum3[i - 1] + sum_stride;
2419 ma343[i] = ma343[i - 1] + temp_stride;
2420 b343[i] = b343[i - 1] + temp_stride;
2421 }
2422 ma444[0] = sgr_buffer->ma444;
2423 ma444[1] = ma444[0] + temp_stride;
2424 b444[0] = sgr_buffer->b444;
2425 b444[1] = b444[0] + temp_stride;
2426 assert(scale != 0);
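  // Seed the 3x3 box sums from the top border, then pre-process the first two
  // input rows (or the bottom border when the unit is only one row tall).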
  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
            sum3[0], square_sum3[0]);
  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
                                 sum_width, ma343[0], nullptr, b343[0],
                                 nullptr);
  Circulate3PointersBy1<uint16_t>(sum3);
  Circulate3PointersBy1<uint32_t>(square_sum3);
  const uint16_t* s;
  if (height > 1) {
    s = src + stride;
  } else {
    s = bottom_border;
    bottom_border += bottom_border_stride;
  }
  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
                                ma343[1], ma444[0], b343[1], b444[0]);

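  // Main loop: one output row per iteration; the 3-row sums and the
  // ma343/b343 buffers rotate by one row each time.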
  for (int y = height - 2; y > 0; --y) {
    Circulate3PointersBy1<uint16_t>(sum3);
    Circulate3PointersBy1<uint32_t>(square_sum3);
    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
                   square_sum3, ma343, ma444, b343, b444, dst);
    src += stride;
    dst += stride;
    Circulate3PointersBy1<uint16_t>(ma343);
    Circulate3PointersBy1<uint32_t>(b343);
    std::swap(ma444[0], ma444[1]);
    std::swap(b444[0], b444[1]);
  }

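  // Filter the remaining one or two rows against the bottom border.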
  int y = std::min(height, 2);
  src += 2;
  do {
    Circulate3PointersBy1<uint16_t>(sum3);
    Circulate3PointersBy1<uint32_t>(square_sum3);
    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
                   square_sum3, ma343, ma444, b343, b444, dst);
    src += stride;
    dst += stride;
    bottom_border += bottom_border_stride;
    Circulate3PointersBy1<uint16_t>(ma343);
    Circulate3PointersBy1<uint32_t>(b343);
    std::swap(ma444[0], ma444[1]);
    std::swap(b444[0], b444[1]);
  } while (--y != 0);
}

// If |width| is not a multiple of 16, up to 15 more pixels are written to
// |dest| at the end of each row. It is safe to overwrite the output as it will
// not be part of the visible frame.
void SelfGuidedFilter_SSE4_1(
    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_border,
    const ptrdiff_t top_border_stride,
    const void* LIBGAV1_RESTRICT const bottom_border,
    const ptrdiff_t bottom_border_stride, const int width, const int height,
    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
    void* LIBGAV1_RESTRICT const dest) {
  const int index = restoration_info.sgr_proj_info.index;
  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
  const auto* const src = static_cast<const uint16_t*>(source);
  const auto* const top = static_cast<const uint16_t*>(top_border);
  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
  auto* const dst = static_cast<uint16_t*>(dest);
  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
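  // Dispatch on the pass radii: pass 1 only (5x5 box), pass 2 only (3x3 box),
  // or both passes when neither radius is zero.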
  if (radius_pass_1 == 0) {
    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
    // following assertion.
    assert(radius_pass_0 != 0);
    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
                          top_border_stride, bottom - 3, bottom_border_stride,
                          width, height, sgr_buffer, dst);
  } else if (radius_pass_0 == 0) {
    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
                          top_border_stride, bottom - 2, bottom_border_stride,
                          width, height, sgr_buffer, dst);
  } else {
    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
                     top_border_stride, bottom - 3, bottom_border_stride, width,
                     height, sgr_buffer, dst);
  }
}

void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);
  static_cast<void>(dsp);
#if DSP_ENABLED_10BPP_SSE4_1(WienerFilter)
  dsp->loop_restorations[0] = WienerFilter_SSE4_1;
#else
  static_cast<void>(WienerFilter_SSE4_1);
#endif
#if DSP_ENABLED_10BPP_SSE4_1(SelfGuidedFilter)
  dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
#else
  static_cast<void>(SelfGuidedFilter_SSE4_1);
#endif
}

}  // namespace

void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else  // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10)
namespace libgav1 {
namespace dsp {

void LoopRestorationInit10bpp_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10