1 // Copyright 2021 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/intrapred_directional.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_TARGETING_SSE4_1
19
20 #include <smmintrin.h>
21
22 #include <algorithm>
23 #include <cassert>
24 #include <cstddef>
25 #include <cstdint>
26 #include <cstring>
27
28 #include "src/dsp/constants.h"
29 #include "src/dsp/dsp.h"
30 #include "src/dsp/x86/common_sse4.h"
31 #include "src/dsp/x86/transpose_sse4.h"
32 #include "src/utils/common.h"
33 #include "src/utils/memory.h"
34
35 namespace libgav1 {
36 namespace dsp {
37 namespace low_bitdepth {
38 namespace {
39
40 //------------------------------------------------------------------------------
41 // 7.11.2.4. Directional intra prediction process
42
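// For reference, a rough scalar sketch of the interpolation the SIMD kernels
// in this section compute (illustrative only; index clamping to |max_base_x|
// and the corner fill are omitted):
//   const int step = 1 << upsample_shift;
//   for (int y = 0; y < height; ++y) {
//     const int top_x = (y + 1) * xstep;
//     const int base = top_x >> (6 - upsample_shift);
//     const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;  // 0..31
//     for (int x = 0; x < width; ++x) {
//       dst[y * stride + x] = (top[base + x * step] * (32 - shift) +
//                              top[base + x * step + 1] * shift + 16) >> 5;
//     }
//   }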
43 // Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
44 // upsampling is ruled out. In addition, the bits masked by 0x3F for
45 // |shift_val| are 0 for all multiples of 64, so the formula
46 // val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
47 // val = top[top_base_x] << 5. Because |top_x| starts at |xstep|, row y reads
48 // from top_base_x = y + 1; hence |top| is offset by 1.
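// For example, with xstep == 64 and y == 2: top_x = 3 * 64, top_base_x = 3 and
// shift_val = 0, so the row is a plain copy starting at top[3], i.e.
// top + offset + 2 in the function below.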
49 inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
50 const uint8_t* const top, const int width,
51 const int height) {
52 ptrdiff_t offset = 1;
53 if (height == 4) {
54 memcpy(dst, top + offset, width);
55 dst += stride;
56 memcpy(dst, top + offset + 1, width);
57 dst += stride;
58 memcpy(dst, top + offset + 2, width);
59 dst += stride;
60 memcpy(dst, top + offset + 3, width);
61 return;
62 }
63 int y = 0;
64 do {
65 memcpy(dst, top + offset, width);
66 dst += stride;
67 memcpy(dst, top + offset + 1, width);
68 dst += stride;
69 memcpy(dst, top + offset + 2, width);
70 dst += stride;
71 memcpy(dst, top + offset + 3, width);
72 dst += stride;
73 memcpy(dst, top + offset + 4, width);
74 dst += stride;
75 memcpy(dst, top + offset + 5, width);
76 dst += stride;
77 memcpy(dst, top + offset + 6, width);
78 dst += stride;
79 memcpy(dst, top + offset + 7, width);
80 dst += stride;
81
82 offset += 8;
83 y += 8;
84 } while (y < height);
85 }
86
87 inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
88 const uint8_t* const top, const int height,
89 const int xstep, const bool upsampled) {
90 const int upsample_shift = static_cast<int>(upsampled);
91 const int scale_bits = 6 - upsample_shift;
92 const __m128i max_shift = _mm_set1_epi8(32);
93 // Downscaling for a weighted average whose weights sum to 32 (max_shift).
94 const int rounding_bits = 5;
95 const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
96 const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
97 const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
98 : _mm_set_epi64x(0, 0x0403030202010100);
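  // The non-upsampled sampler gathers overlapping byte pairs (0,1), (1,2),
  // (2,3), (3,4), one pair per output pixel; the upsampled sampler gathers
  // disjoint pairs (0,1), (2,3), (4,5), (6,7) because upsampled sources
  // advance two samples per output pixel.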
99 // Each 16-bit value here corresponds to a position that may exceed
100 // |max_base_x|. When added to the top_base_x, it is used to mask values
101 // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
102 // not supported for packed integers.
103 const __m128i offsets =
104 _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
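  // In other words, (top_base_x + k) >= |max_base_x| is evaluated below as
  // (top_base_x + k + 1) > |max_base_x|, which _mm_cmpgt_epi16 can express.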
105
106 // All rows from |min_corner_only_y| down will simply use memset. |max_base_x|
107 // is always greater than |height|, so clipping the denominator to 1 is enough
108 // to make the logic work.
109 const int xstep_units = std::max(xstep >> scale_bits, 1);
110 const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
111
112 // Rows before |min_corner_only_y| are handled by the vector loop below.
113 int y = 0;
114 int top_x = xstep;
115
116 for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
117 const int top_base_x = top_x >> scale_bits;
118
119 // Permit negative values of |top_x|.
120 const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
121 const __m128i shift = _mm_set1_epi8(shift_val);
122 const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
123 const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
124 __m128i top_index_vect = _mm_set1_epi16(top_base_x);
125 top_index_vect = _mm_add_epi16(top_index_vect, offsets);
126 const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
127
128 // Load 8 values because we will select the sampled values based on
129 // |upsampled|.
130 const __m128i values = LoadLo8(top + top_base_x);
131 const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
132 const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
133 __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
134 prod = RightShiftWithRounding_U16(prod, rounding_bits);
135 // Replace pixels from invalid range with top-right corner.
136 prod = _mm_blendv_epi8(prod, final_top_val, past_max);
137 Store4(dst, _mm_packus_epi16(prod, prod));
138 }
139
140 // Fill in corner-only rows.
141 for (; y < height; ++y) {
142 memset(dst, top[max_base_x], /* width */ 4);
143 dst += stride;
144 }
145 }
146
147 // 7.11.2.4 (7) angle < 90
148 inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
149 const uint8_t* const top_row,
150 const int width, const int height,
151 const int xstep, const bool upsampled) {
152 const int upsample_shift = static_cast<int>(upsampled);
153 const __m128i sampler =
154 upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
155 : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
156 const int scale_bits = 6 - upsample_shift;
157 const int max_base_x = ((width + height) - 1) << upsample_shift;
158
159 const __m128i max_shift = _mm_set1_epi8(32);
160 // Downscaling for a weighted average whose weights sum to 32 (max_shift).
161 const int rounding_bits = 5;
162 const int base_step = 1 << upsample_shift;
163 const int base_step8 = base_step << 3;
164
165 // All rows from |min_corner_only_y| down will simply use memset. |max_base_x|
166 // is always greater than |height|, so clipping the denominator to 1 is enough
167 // to make the logic work.
168 const int xstep_units = std::max(xstep >> scale_bits, 1);
169 const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
170
171 // Rows up to this y-value can be computed without checking for bounds.
172 const int max_no_corner_y = std::min(
173 LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
174 height);
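  // Derivation: a row needs no bounds check as long as its last sample index,
  // roughly top_base_x + base_step * width, stays below |max_base_x|;
  // substituting top_base_x ~= (y * xstep) >> scale_bits and solving for y
  // gives the bound above.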
175 // No need to check for exceeding |max_base_x| in the first loop.
176 int y = 0;
177 int top_x = xstep;
178 for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
179 int top_base_x = top_x >> scale_bits;
180 // Permit negative values of |top_x|.
181 const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
182 const __m128i shift = _mm_set1_epi8(shift_val);
183 const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
184 const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
185 int x = 0;
186 do {
187 const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
188 __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
189 vals = _mm_maddubs_epi16(vals, shifts);
190 vals = RightShiftWithRounding_U16(vals, rounding_bits);
191 StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
192 top_base_x += base_step8;
193 x += 8;
194 } while (x < width);
195 }
196
197 // Each 16-bit value here corresponds to a position that may exceed
198 // |max_base_x|. When added to the top_base_x, it is used to mask values
199 // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
200 // not supported for packed integers.
201 const __m128i offsets =
202 _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
203
204 const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
205 const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
206 const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
207 for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
208 int top_base_x = top_x >> scale_bits;
209
210 const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
211 const __m128i shift = _mm_set1_epi8(shift_val);
212 const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
213 const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
214 __m128i top_index_vect = _mm_set1_epi16(top_base_x);
215 top_index_vect = _mm_add_epi16(top_index_vect, offsets);
216
217 int x = 0;
218 const int min_corner_only_x =
219 std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
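    // Columns at or beyond ((max_base_x - top_base_x) >> upsample_shift) only
    // read the corner sample; the "+ 7" and "& ~7" round that boundary up to
    // the 8-pixel vector width.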
220 for (; x < min_corner_only_x;
221 x += 8, top_base_x += base_step8,
222 top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
223 const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
224 // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
225 // reading out of bounds. If all indices are past max and we don't need to
226 // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
227 // reset for the next |y|.
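      // The low 32 bits of |past_max| are all ones only when even the first
      // sample index is past |max_base_x| (and, since the indices increase,
      // all of them are); otherwise the AND below leaves |top_base_x|
      // unchanged because it fits in 16 bits.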
228 top_base_x &= ~_mm_cvtsi128_si32(past_max);
229 const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
230 __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
231 vals = _mm_maddubs_epi16(vals, shifts);
232 vals = RightShiftWithRounding_U16(vals, rounding_bits);
233 vals = _mm_blendv_epi8(vals, final_top_val, past_max);
234 StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
235 }
236 // Corner-only section of the row.
237 memset(dest + x, top_row[max_base_x], width - x);
238 }
239 // Fill in corner-only rows.
240 for (; y < height; ++y) {
241 memset(dest, top_row[max_base_x], width);
242 dest += stride;
243 }
244 }
245
246 // 7.11.2.4 (7) angle < 90
247 inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
248 const uint8_t* const top_row,
249 const int width, const int height,
250 const int xstep, const bool upsampled) {
251 const int upsample_shift = static_cast<int>(upsampled);
252 if (xstep == 64) {
253 DirectionalZone1_Step64(dest, stride, top_row, width, height);
254 return;
255 }
256 if (width == 4) {
257 DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
258 return;
259 }
260 if (width >= 32) {
261 DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
262 upsampled);
263 return;
264 }
265 const __m128i sampler =
266 upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
267 : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
268 const int scale_bits = 6 - upsample_shift;
269 const int max_base_x = ((width + height) - 1) << upsample_shift;
270
271 const __m128i max_shift = _mm_set1_epi8(32);
272 // Downscaling for a weighted average whose weights sum to 32 (max_shift).
273 const int rounding_bits = 5;
274 const int base_step = 1 << upsample_shift;
275 const int base_step8 = base_step << 3;
276
277 // No need to check for exceeding |max_base_x| in the loops.
278 if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
279 int top_x = xstep;
280 int y = 0;
281 do {
282 int top_base_x = top_x >> scale_bits;
283 // Permit negative values of |top_x|.
284 const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
285 const __m128i shift = _mm_set1_epi8(shift_val);
286 const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
287 const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
288 int x = 0;
289 do {
290 const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
291 __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
292 vals = _mm_maddubs_epi16(vals, shifts);
293 vals = RightShiftWithRounding_U16(vals, rounding_bits);
294 StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
295 top_base_x += base_step8;
296 x += 8;
297 } while (x < width);
298 dest += stride;
299 top_x += xstep;
300 } while (++y < height);
301 return;
302 }
303
304 // Each 16-bit value here corresponds to a position that may exceed
305 // |max_base_x|. When added to the top_base_x, it is used to mask values
306 // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
307 // not supported for packed integers.
308 const __m128i offsets =
309 _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
310
311 const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
312 const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
313 const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
314 int top_x = xstep;
315 int y = 0;
316 do {
317 int top_base_x = top_x >> scale_bits;
318
319 if (top_base_x >= max_base_x) {
320 for (int i = y; i < height; ++i) {
321 memset(dest, top_row[max_base_x], width);
322 dest += stride;
323 }
324 return;
325 }
326
327 const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
328 const __m128i shift = _mm_set1_epi8(shift_val);
329 const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
330 const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
331 __m128i top_index_vect = _mm_set1_epi16(top_base_x);
332 top_index_vect = _mm_add_epi16(top_index_vect, offsets);
333
334 int x = 0;
335 for (; x < width - 8;
336 x += 8, top_base_x += base_step8,
337 top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
338 const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
339 // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
340 // reading out of bounds. If all indices are past max and we don't need to
341 // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
342 // reset for the next |y|.
343 top_base_x &= ~_mm_cvtsi128_si32(past_max);
344 const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
345 __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
346 vals = _mm_maddubs_epi16(vals, shifts);
347 vals = RightShiftWithRounding_U16(vals, rounding_bits);
348 vals = _mm_blendv_epi8(vals, final_top_val, past_max);
349 StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
350 }
351 const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
352 __m128i vals;
353 if (upsampled) {
354 vals = LoadUnaligned16(top_row + top_base_x);
355 } else {
356 const __m128i top_vals = LoadLo8(top_row + top_base_x);
357 vals = _mm_shuffle_epi8(top_vals, sampler);
358 vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
359 }
360 vals = _mm_maddubs_epi16(vals, shifts);
361 vals = RightShiftWithRounding_U16(vals, rounding_bits);
362 vals = _mm_blendv_epi8(vals, final_top_val, past_max);
363 StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
364 dest += stride;
365 top_x += xstep;
366 } while (++y < height);
367 }
368
369 void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
370 const void* const top_row,
371 const int width, const int height,
372 const int xstep,
373 const bool upsampled_top) {
374 const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
375 auto* dst = static_cast<uint8_t*>(dest);
376 DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
377 upsampled_top);
378 }
379
380 template <bool upsampled>
381 inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
382 const uint8_t* const left_column,
383 const int base_left_y, const int ystep) {
384 // For use in the non-upsampled case.
385 const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
386 const int upsample_shift = static_cast<int>(upsampled);
387 const int scale_bits = 6 - upsample_shift;
388 const __m128i max_shift = _mm_set1_epi8(32);
389 // Downscaling for a weighted average whose weights sum to 32 (max_shift).
390 const int rounding_bits = 5;
391
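  // Each x iteration computes one 4-pixel column predicted from |left_column|;
  // the 4x4 transpose below converts those columns into rows for storage.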
392 __m128i result_block[4];
393 for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
394 const int left_base_y = left_y >> scale_bits;
395 const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
396 const __m128i shift = _mm_set1_epi8(shift_val);
397 const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
398 const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
399 __m128i vals;
400 if (upsampled) {
401 vals = LoadLo8(left_column + left_base_y);
402 } else {
403 const __m128i top_vals = LoadLo8(left_column + left_base_y);
404 vals = _mm_shuffle_epi8(top_vals, sampler);
405 }
406 vals = _mm_maddubs_epi16(vals, shifts);
407 vals = RightShiftWithRounding_U16(vals, rounding_bits);
408 result_block[x] = _mm_packus_epi16(vals, vals);
409 }
410 const __m128i result = Transpose4x4_U8(result_block);
411 // This is result_row0.
412 Store4(dest, result);
413 dest += stride;
414 const int result_row1 = _mm_extract_epi32(result, 1);
415 memcpy(dest, &result_row1, sizeof(result_row1));
416 dest += stride;
417 const int result_row2 = _mm_extract_epi32(result, 2);
418 memcpy(dest, &result_row2, sizeof(result_row2));
419 dest += stride;
420 const int result_row3 = _mm_extract_epi32(result, 3);
421 memcpy(dest, &result_row3, sizeof(result_row3));
422 }
423
424 template <bool upsampled, int height>
425 inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
426 const uint8_t* const left_column,
427 const int base_left_y, const int ystep) {
428 // For use in the non-upsampled case.
429 const __m128i sampler =
430 _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
431 const int upsample_shift = static_cast<int>(upsampled);
432 const int scale_bits = 6 - upsample_shift;
433 const __m128i max_shift = _mm_set1_epi8(32);
434 // Downscaling for a weighted average whose weights sum to 32 (max_shift).
435 const int rounding_bits = 5;
436
437 __m128i result_block[8];
438 for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
439 const int left_base_y = left_y >> scale_bits;
440 const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
441 const __m128i shift = _mm_set1_epi8(shift_val);
442 const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
443 const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
444 __m128i vals;
445 if (upsampled) {
446 vals = LoadUnaligned16(left_column + left_base_y);
447 } else {
448 const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
449 vals = _mm_shuffle_epi8(top_vals, sampler);
450 }
451 vals = _mm_maddubs_epi16(vals, shifts);
452 result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
453 }
454 Transpose8x8_U16(result_block, result_block);
455 for (int y = 0; y < height; ++y) {
456 StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
457 dest += stride;
458 }
459 }
460
461 // 7.11.2.4 (9) angle > 180
462 void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
463 const void* const left_column,
464 const int width, const int height,
465 const int ystep,
466 const bool upsampled) {
467 const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
468 auto* dst = static_cast<uint8_t*>(dest);
469 const int upsample_shift = static_cast<int>(upsampled);
470 if (width == 4 || height == 4) {
471 const ptrdiff_t stride4 = stride << 2;
472 if (upsampled) {
473 int left_y = ystep;
474 int x = 0;
475 do {
476 uint8_t* dst_x = dst + x;
477 int y = 0;
478 do {
479 DirectionalZone3_4x4<true>(
480 dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
481 dst_x += stride4;
482 y += 4;
483 } while (y < height);
484 left_y += ystep << 2;
485 x += 4;
486 } while (x < width);
487 } else {
488 int left_y = ystep;
489 int x = 0;
490 do {
491 uint8_t* dst_x = dst + x;
492 int y = 0;
493 do {
494 DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
495 ystep);
496 dst_x += stride4;
497 y += 4;
498 } while (y < height);
499 left_y += ystep << 2;
500 x += 4;
501 } while (x < width);
502 }
503 return;
504 }
505
506 const ptrdiff_t stride8 = stride << 3;
507 if (upsampled) {
508 int left_y = ystep;
509 int x = 0;
510 do {
511 uint8_t* dst_x = dst + x;
512 int y = 0;
513 do {
514 DirectionalZone3_8xH<true, 8>(
515 dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
516 dst_x += stride8;
517 y += 8;
518 } while (y < height);
519 left_y += ystep << 3;
520 x += 8;
521 } while (x < width);
522 } else {
523 int left_y = ystep;
524 int x = 0;
525 do {
526 uint8_t* dst_x = dst + x;
527 int y = 0;
528 do {
529 DirectionalZone3_8xH<false, 8>(
530 dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
531 dst_x += stride8;
532 y += 8;
533 } while (y < height);
534 left_y += ystep << 3;
535 x += 8;
536 } while (x < width);
537 }
538 }
539
540 //------------------------------------------------------------------------------
541 // Directional Zone 2 Functions
542 // 7.11.2.4 (8)
543
544 // DirectionalBlend* selectively overwrites the values written by
545 // DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
546 // row.
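// |use_left| marks destination lanes whose x index is still left of the
// per-row zone bound; those lanes keep the left-predicted values already in
// |dest|, and the remaining lanes are overwritten with the top-predicted
// values.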
547 template <int y_selector>
548 inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
549 const __m128i& dest_index_vect,
550 const __m128i& vals,
551 const __m128i& zone_bounds) {
552 const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
553 const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
554 const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
555 const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
556 Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
557 }
558
559 inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
560 const __m128i& dest_index_vect,
561 const __m128i& vals,
562 const __m128i& zone_bounds,
563 const __m128i& bounds_selector) {
564 const __m128i max_dest_x_vect =
565 _mm_shuffle_epi8(zone_bounds, bounds_selector);
566 const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
567 const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
568 const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
569 StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
570 }
571
572 constexpr int kDirectionalWeightBits = 5;
573 // |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
574 // |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
575 // shift) and shift. Shift is guaranteed to be between 0 and 32.
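// _mm_maddubs_epi16 multiplies each unsigned source byte by the corresponding
// signed weight byte and sums adjacent pairs, so each 16-bit lane ends up
// holding pair_lo * (32 - shift) + pair_hi * shift for the sampled byte pair.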
576 inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
577 const __m128i& shifts,
578 const __m128i& sampler) {
579 const __m128i src_vals = LoadUnaligned16(source);
580 __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
581 vals = _mm_maddubs_epi16(vals, shifts);
582 return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
583 }
584
585 // Because the source values "move backwards" as the row index increases, the
586 // indices derived from ystep are generally negative. This is accommodated by
587 // making sure the relative indices are within [-15, 0] when the function is
588 // called, and sliding them into the inclusive range [0, 15], relative to a
589 // lower base address.
590 constexpr int kPositiveIndexOffset = 15;
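// For example, a relative index of -3 becomes 12 after the offset is added,
// and because the source pointer is lowered by 15 bytes the shuffle still
// reads the same sample.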
591
592 template <bool upsampled>
593 inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
594 uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
595 __m128i left_y) {
596 const int upsample_shift = static_cast<int>(upsampled);
597 const int scale_bits = 6 - upsample_shift;
598 const __m128i max_shifts = _mm_set1_epi8(32);
599 const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
600 const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
601 const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
602 // |left_column| and the sampler are both offset by 15 so the indices are
603 // always nonnegative.
604 const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
605 for (int y = 0; y < 4; dst += stride, ++y) {
606 __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
607 offset_y = _mm_packs_epi16(offset_y, offset_y);
608
609 const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
610 __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
611 // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
612 // can work as shuffle indices. Some values may be out of bounds, but their
613 // pred results will be masked over by top prediction.
614 sampler = _mm_add_epi8(sampler, positive_offset);
615
616 __m128i shifts = _mm_srli_epi16(
617 _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
618 shifts = _mm_packus_epi16(shifts, shifts);
619 const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
620 shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
621 const __m128i vals = DirectionalZone2FromSource_SSE4_1(
622 left_column + (y << upsample_shift), shifts, sampler);
623 Store4(dst, _mm_packus_epi16(vals, vals));
624 }
625 }
626
627 template <bool upsampled>
628 inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
629 uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
630 __m128i left_y) {
631 const int upsample_shift = static_cast<int>(upsampled);
632 const int scale_bits = 6 - upsample_shift;
633 const __m128i max_shifts = _mm_set1_epi8(32);
634 const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
635 const __m128i index_increment = _mm_set1_epi8(1);
636 const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
637 for (int y = 0; y < 8; dst += stride, ++y) {
638 __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
639 offset_y = _mm_packs_epi16(offset_y, offset_y);
640 const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
641
642 // Offset the relative index because ystep is negative in Zone 2 and shuffle
643 // indices must be nonnegative.
644 __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
645 sampler = _mm_add_epi8(sampler, denegation);
646
647 __m128i shifts = _mm_srli_epi16(
648 _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
649 shifts = _mm_packus_epi16(shifts, shifts);
650 const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
651 shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
652
653 // The specification adds (y << 6) to left_y, which is subject to
654 // upsampling, but this would put sampler indices out of the 0-15 range.
655 // Offsetting the source address by (y << upsample_shift) instead is equivalent.
656 const __m128i vals = DirectionalZone2FromSource_SSE4_1(
657 left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
658 sampler);
659 StoreLo8(dst, _mm_packus_epi16(vals, vals));
660 }
661 }
662
663 // |zone_bounds| is an epi16 of the relative x index at which base >= -(1 <<
664 // upsampled_top), for each row. When there are 4 values, they can be duplicated
665 // with a non-register shuffle mask.
666 // |shifts| is one pair of weights that applies throughout a given row.
667 template <bool upsampled_top>
668 inline void DirectionalZone1Blend_4x4(
669 uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
670 __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
671 const __m128i& dest_index_x, int top_x, const int xstep) {
672 const int upsample_shift = static_cast<int>(upsampled_top);
673 const int scale_bits_x = 6 - upsample_shift;
674 top_x -= xstep;
675
676 int top_base_x = (top_x >> scale_bits_x);
677 const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
678 top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
679 DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
680 top_x -= xstep;
681 dest += stride;
682
683 top_base_x = (top_x >> scale_bits_x);
684 const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
685 top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
686 DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
687 top_x -= xstep;
688 dest += stride;
689
690 top_base_x = (top_x >> scale_bits_x);
691 const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
692 top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
693 DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
694 top_x -= xstep;
695 dest += stride;
696
697 top_base_x = (top_x >> scale_bits_x);
698 const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
699 top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
700 DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
701 }
702
703 template <bool upsampled_top, int height>
704 inline void DirectionalZone1Blend_8xH(
705 uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
706 __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
707 const __m128i& dest_index_x, int top_x, const int xstep) {
708 const int upsample_shift = static_cast<int>(upsampled_top);
709 const int scale_bits_x = 6 - upsample_shift;
710
711 __m128i y_selector = _mm_set1_epi32(0x01000100);
712 const __m128i index_increment = _mm_set1_epi32(0x02020202);
713 for (int y = 0; y < height; ++y,
714 y_selector = _mm_add_epi8(y_selector, index_increment),
715 dest += stride) {
716 top_x -= xstep;
717 const int top_base_x = top_x >> scale_bits_x;
718 const __m128i vals = DirectionalZone2FromSource_SSE4_1(
719 top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
720 DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
721 }
722 }
723
724 template <bool shuffle_left_column, bool upsampled_left, bool upsampled_top>
725 inline void DirectionalZone2_8xH(
726 uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
727 const uint8_t* LIBGAV1_RESTRICT const top_row,
728 const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
729 const int xstep, const int ystep, const int x, const int left_offset,
730 const __m128i& xstep_for_shift, const __m128i& xstep_bounds_base,
731 const __m128i& left_y) {
732 const int upsample_left_shift = static_cast<int>(upsampled_left);
733 const int upsample_top_shift = static_cast<int>(upsampled_top);
734
735 // Loop incrementers for moving by block (8x8). This function also handles
736 // blocks of height 4; those are computed in a single pass, so these variables
737 // go unused in that case.
738 const ptrdiff_t stride8 = stride << 3;
739 const int xstep8 = xstep << 3;
740 const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
741
742 // Cover 8x4 case.
743 const int min_height = (height == 4) ? 4 : 8;
744
745 // The first stage, before the first y-loop, covers blocks that are only
746 // computed from the top row. The second stage, the next y-loop, covers blocks
747 // that have a mixture of values computed from top or left. The final stage
748 // covers blocks that are only computed from the left.
749 uint8_t* dst_x = dst + x;
750
751 // Round down to the nearest multiple of 8 (or 4, if height is 4).
752 const int max_top_only_y =
753 std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1);
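  // A column x is predicted purely from |top_row| for rows where the zone
  // boundary has not yet reached it, i.e. roughly while y * xstep < (x + 1) << 6,
  // which is what the division above expresses.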
754 DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
755 max_top_only_y, -xstep, upsampled_top);
756 DirectionalZone1_4xH(dst_x + 4, stride,
757 top_row + ((x + 4) << upsample_top_shift),
758 max_top_only_y, -xstep, upsampled_top);
759 if (max_top_only_y == height) return;
760
761 const __m128i max_shift = _mm_set1_epi8(32);
762 const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
763 const __m128i dest_index_x =
764 _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
765 const __m128i sampler_top =
766 upsampled_top
767 ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
768 : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
769 int y = max_top_only_y;
770 dst_x += stride * y;
771 const int xstep_y = xstep * y;
772 const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
773 // All rows from |min_left_only_y| down for this set of columns only need
774 // |left_column| to compute.
775 const int min_left_only_y =
776 Align(std::min(((x + 8) << 6) / xstep, height), 8);
777
778 __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
779 __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
780 int top_x = -xstep_y;
781
782 const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
783 for (; y < min_left_only_y;
784 y += 8, dst_x += stride8,
785 xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
786 xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
787 top_x -= xstep8) {
788 // Pick up from the last y-value, using the fast shuffle-based left prediction
789 // when |shuffle_left_column| permits, otherwise the ~10% slower but secure method.
790 if (shuffle_left_column) {
791 DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
792 dst_x, stride,
793 left_column + ((left_offset + y) << upsample_left_shift), left_y);
794 } else {
795 DirectionalZone3_8xH<upsampled_left, 8>(
796 dst_x, stride,
797 left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
798 -ystep);
799 }
800
801 __m128i shifts = _mm_srli_epi16(
802 _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
803 shift_mask),
804 1);
805 shifts = _mm_packus_epi16(shifts, shifts);
806 __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
807 shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
808 __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
809 DirectionalZone1Blend_8xH<upsampled_top, 8>(
810 dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
811 xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
812 }
813 // Loop over y for left_only rows.
814 for (; y < height; y += 8, dst_x += stride8) {
815 DirectionalZone3_8xH<upsampled_left, 8>(
816 dst_x, stride, left_column + ((left_offset + y) << upsample_left_shift),
817 base_left_y, -ystep);
818 }
819 }
820
821 // 7.11.2.4 (8) 90 < angle < 180
822 // The strategy for this function is to know how many blocks can be processed
823 // with just pixels from |top_ptr|, then handle mixed blocks, then handle only
824 // blocks that take from |left_ptr|. Additionally, a fast index-shuffle
825 // approach is used for pred values from |left_column| in sections that permit
826 // it.
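// In outline: columns at or beyond min_top_only_x = min((height * xstep) >> 6,
// width) never cross the zone boundary and fall through to
// DirectionalZone1_4xH; columns left of that are processed 8 at a time, each
// split vertically into a top-only band, a mixed band and a left-only band by
// DirectionalZone2_8xH.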
827 template <bool upsampled_left, bool upsampled_top>
828 inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
829 const uint8_t* const top_row,
830 const uint8_t* const left_column,
831 const int width, const int height,
832 const int xstep, const int ystep) {
833 auto* dst = static_cast<uint8_t*>(dest);
834 const int upsample_top_shift = static_cast<int>(upsampled_top);
835 // All columns from |min_top_only_x| to the right will only need |top_row|
836 // to compute. This assumes minimum |xstep| is 3.
837 const int min_top_only_x = std::min((height * xstep) >> 6, width);
838
839 // Accumulate xstep across 8 rows.
840 const __m128i xstep_dup = _mm_set1_epi16(-xstep);
841 const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
842 const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
843 // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1
844 const __m128i scaled_one = _mm_set1_epi16(-64);
845 __m128i xstep_bounds_base =
846 (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
847 : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
848
849 const int left_base_increment = ystep >> 6;
850 const int ystep_remainder = ystep & 0x3F;
851 const int ystep8 = ystep << 3;
852 const int left_base_increment8 = ystep8 >> 6;
853 const int ystep_remainder8 = ystep8 & 0x3F;
854 const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
855
856 // If the 64 scaling is regarded as a decimal point, the first value of the
857 // left_y vector omits the portion which is covered under the left_column
858 // offset. Following values need the full ystep as a relative offset.
859 const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
860 const __m128i ystep_dup = _mm_set1_epi16(-ystep);
861 const __m128i dest_index_x =
862 _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
863 __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
864 left_y = _mm_add_epi16(ystep_init, left_y);
865
866 // Analysis finds that, for most angles (ystep < 132), all segments that use
867 // both top_row and left_column can compute from left_column using byte
868 // shuffles from a single vector. For steeper angles, the shuffle is also
869 // fully reliable when x >= 32.
870 const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
871 const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
872 const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
873 int x = 0;
874
875 for (int left_offset = -left_base_increment; x < min_shuffle_x;
876 x += 8,
877 xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
878 // Watch left_y because it can still get big.
879 left_y = _mm_add_epi16(left_y, increment_left8),
880 left_offset -= left_base_increment8) {
881 DirectionalZone2_8xH<false, upsampled_left, upsampled_top>(
882 dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
883 xstep_for_shift, xstep_bounds_base, left_y);
884 }
885 for (int left_offset = -left_base_increment; x < min_top_only_x;
886 x += 8,
887 xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
888 // Watch left_y because it can still get big.
889 left_y = _mm_add_epi16(left_y, increment_left8),
890 left_offset -= left_base_increment8) {
891 DirectionalZone2_8xH<true, upsampled_left, upsampled_top>(
892 dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
893 xstep_for_shift, xstep_bounds_base, left_y);
894 }
895 for (; x < width; x += 4) {
896 DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
897 height, -xstep, upsampled_top);
898 }
899 }
900
901 template <bool upsampled_left, bool upsampled_top>
902 inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
903 const uint8_t* const top_row,
904 const uint8_t* const left_column,
905 const int width, const int height,
906 const int xstep, const int ystep) {
907 auto* dst = static_cast<uint8_t*>(dest);
908 const int upsample_left_shift = static_cast<int>(upsampled_left);
909 const int upsample_top_shift = static_cast<int>(upsampled_top);
910 const __m128i max_shift = _mm_set1_epi8(32);
911 const ptrdiff_t stride4 = stride << 2;
912 const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
913 const __m128i sampler_top =
914 upsampled_top
915 ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
916 : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
917 // All columns from |min_top_only_x| to the right will only need |top_row| to
918 // compute.
919 assert(xstep >= 3);
920 const int min_top_only_x = std::min((height * xstep) >> 6, width);
921
922 const int xstep4 = xstep << 2;
923 const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
924 const __m128i xstep_dup = _mm_set1_epi16(-xstep);
925 const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
926 __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
927 const __m128i scaled_one = _mm_set1_epi16(-64);
928 // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1
929 __m128i xstep_bounds_base =
930 (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
931 : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
932
933 const int left_base_increment = ystep >> 6;
934 const int ystep_remainder = ystep & 0x3F;
935 const int ystep4 = ystep << 2;
936 const int left_base_increment4 = ystep4 >> 6;
937 // This is guaranteed to be less than 64, but accumulation may bring it past
938 // 64 for higher x values.
939 const int ystep_remainder4 = ystep4 & 0x3F;
940 const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
941 const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
942
943 // If the 64 scaling is regarded as a decimal point, the first value of the
944 // left_y vector omits the portion which will go into the left_column offset.
945 // Following values need the full ystep as a relative offset.
946 const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
947 const __m128i ystep_dup = _mm_set1_epi16(-ystep);
948 __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
949 left_y = _mm_add_epi16(ystep_init, left_y);
950 const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
951
952 int x = 0;
953 // Loop over x for columns with a mixture of sources.
954 for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
955 xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
956 left_y = _mm_add_epi16(left_y, increment_left4),
957 left_offset -= left_base_increment4) {
958 uint8_t* dst_x = dst + x;
959
960 // Round down to the nearest multiple of 4.
961 const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3;
962 DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
963 max_top_only_y, -xstep, upsampled_top);
964 int y = max_top_only_y;
965 dst_x += stride * y;
966 const int xstep_y = xstep * y;
967 const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
968 // All rows from |min_left_only_y| down for this set of columns only need
969 // |left_column| to compute. Rounded up to the nearest multiple of 4.
970 const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
971
972 __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
973 __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
974 int top_x = -xstep_y;
975
976 // Loop over y for mixed rows.
977 for (; y < min_left_only_y;
978 y += 4, dst_x += stride4,
979 xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
980 xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
981 top_x -= xstep4) {
982 DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
983 dst_x, stride,
984 left_column + ((left_offset + y) * (1 << upsample_left_shift)),
985 left_y);
986
987 __m128i shifts = _mm_srli_epi16(
988 _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
989 shift_mask),
990 1);
991 shifts = _mm_packus_epi16(shifts, shifts);
992 const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
993 shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
994 const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
995 DirectionalZone1Blend_4x4<upsampled_top>(
996 dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
997 xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
998 }
999 // Loop over y for left-only rows, if any.
1000 for (; y < height; y += 4, dst_x += stride4) {
1001 DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
1002 dst_x, stride,
1003 left_column + ((left_offset + y) << upsample_left_shift), left_y);
1004 }
1005 }
1006 // Loop over top-only columns, if any.
1007 for (; x < width; x += 4) {
1008 DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
1009 height, -xstep, upsampled_top);
1010 }
1011 }
1012
1013 void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
1014 const void* const top_row,
1015 const void* const left_column,
1016 const int width, const int height,
1017 const int xstep, const int ystep,
1018 const bool upsampled_top,
1019 const bool upsampled_left) {
1020 // Increasing the negative buffer for this function allows more rows to be
1021 // processed at a time without branching in an inner loop to check the base.
1022 uint8_t top_buffer[288];
1023 uint8_t left_buffer[288];
1024 memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
1025 memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
1026 #if LIBGAV1_MSAN
1027 memset(top_buffer, 0x33, 128);
1028 memset(left_buffer, 0x44, 128);
1029 #endif
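// The pointers below alias the caller's arrays: top_ptr[i] and left_ptr[i]
// read the same values as top_row[i] and left_column[i] for i in [-16, 143],
// while indices down to -144 remain addressable (and, under MSAN, initialized)
// padding.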
1030 const uint8_t* top_ptr = top_buffer + 144;
1031 const uint8_t* left_ptr = left_buffer + 144;
1032 if (width == 4 || height == 4) {
1033 if (upsampled_left) {
1034 if (upsampled_top) {
1035 DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
1036 width, height, xstep, ystep);
1037 } else {
1038 DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
1039 width, height, xstep, ystep);
1040 }
1041 } else {
1042 if (upsampled_top) {
1043 DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
1044 width, height, xstep, ystep);
1045 } else {
1046 DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
1047 width, height, xstep, ystep);
1048 }
1049 }
1050 return;
1051 }
1052 if (upsampled_left) {
1053 if (upsampled_top) {
1054 DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
1055 width, height, xstep, ystep);
1056 } else {
1057 DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
1058 width, height, xstep, ystep);
1059 }
1060 } else {
1061 if (upsampled_top) {
1062 DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
1063 width, height, xstep, ystep);
1064 } else {
1065 DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
1066 width, height, xstep, ystep);
1067 }
1068 }
1069 }
1070
1071 void Init8bpp() {
1072 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
1073 assert(dsp != nullptr);
1074 static_cast<void>(dsp);
1075 #if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
1076 dsp->directional_intra_predictor_zone1 =
1077 DirectionalIntraPredictorZone1_SSE4_1;
1078 #endif
1079 #if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
1080 dsp->directional_intra_predictor_zone2 =
1081 DirectionalIntraPredictorZone2_SSE4_1;
1082 #endif
1083 #if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
1084 dsp->directional_intra_predictor_zone3 =
1085 DirectionalIntraPredictorZone3_SSE4_1;
1086 #endif
1087 }
1088
1089 } // namespace
1090 } // namespace low_bitdepth
1091
1092 //------------------------------------------------------------------------------
1093 #if LIBGAV1_MAX_BITDEPTH >= 10
1094 namespace high_bitdepth {
1095 namespace {
1096
1097 //------------------------------------------------------------------------------
1098 // 7.11.2.4. Directional intra prediction process
1099
1100 // Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
1101 // upsampling is ruled out. In addition, the bits masked by 0x3F for
1102 // |shift_val| are 0 for all multiples of 64, so the formula
1103 // val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
1104 // val = top[top_base_x] << 5. Because |top_x| starts at |xstep|, row y reads
1105 // from top_base_x = y + 1; hence |top| is offset by 1.
1106 inline void DirectionalZone1_Step64(uint16_t* dst, ptrdiff_t stride,
1107 const uint16_t* const top, const int width,
1108 const int height) {
1109 ptrdiff_t offset = 1;
1110 if (height == 4) {
1111 memcpy(dst, top + offset, width * sizeof(dst[0]));
1112 dst += stride;
1113 memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
1114 dst += stride;
1115 memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
1116 dst += stride;
1117 memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
1118 return;
1119 }
1120 int y = height;
1121 do {
1122 memcpy(dst, top + offset, width * sizeof(dst[0]));
1123 dst += stride;
1124 memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
1125 dst += stride;
1126 memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
1127 dst += stride;
1128 memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
1129 dst += stride;
1130 memcpy(dst, top + offset + 4, width * sizeof(dst[0]));
1131 dst += stride;
1132 memcpy(dst, top + offset + 5, width * sizeof(dst[0]));
1133 dst += stride;
1134 memcpy(dst, top + offset + 6, width * sizeof(dst[0]));
1135 dst += stride;
1136 memcpy(dst, top + offset + 7, width * sizeof(dst[0]));
1137 dst += stride;
1138
1139 offset += 8;
1140 y -= 8;
1141 } while (y != 0);
1142 }
1143
1144 // Produce a weighted average whose weights sum to 32.
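// Unlike the 8-bit path, which can use _mm_maddubs_epi16, these 10-bit samples
// need 16-bit lanes, so the weighting uses _mm_mullo_epi16 followed by
// _mm_hadd_epi16 to sum adjacent products.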
1145 inline __m128i CombineTopVals4(const __m128i& top_vals, const __m128i& sampler,
1146 const __m128i& shifts,
1147 const __m128i& top_indices,
1148 const __m128i& final_top_val,
1149 const __m128i& border_index) {
1150 const __m128i sampled_values = _mm_shuffle_epi8(top_vals, sampler);
1151 __m128i prod = _mm_mullo_epi16(sampled_values, shifts);
1152 prod = _mm_hadd_epi16(prod, prod);
1153 const __m128i result = RightShiftWithRounding_U16(prod, 5 /*log2(32)*/);
1154
1155 const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
1156 // Replace pixels from invalid range with top-right corner.
1157 return _mm_blendv_epi8(result, final_top_val, past_max);
1158 }
1159
1160 // When width is 4, only one load operation is needed per iteration. We also
1161 // avoid extra loop precomputations that cause too much overhead.
1162 inline void DirectionalZone1_4xH(uint16_t* dst, ptrdiff_t stride,
1163 const uint16_t* const top, const int height,
1164 const int xstep, const bool upsampled,
1165 const __m128i& sampler) {
1166 const int upsample_shift = static_cast<int>(upsampled);
1167 const int index_scale_bits = 6 - upsample_shift;
1168 const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
1169 const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
1170 const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
1171
1172 // Each 16-bit value here corresponds to a position that may exceed
1173 // |max_base_x|. When added to the top_base_x, it is used to mask values
1174 // that pass the end of |top|. Starting from 1 to simulate "cmpge" because
1175 // only cmpgt is available.
1176 const __m128i offsets =
1177 _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
1178
1179 // All rows from |min_corner_only_y| down will simply use Memset.
1180 // |max_base_x| is always greater than |height|, so clipping the denominator
1181 // to 1 is enough to make the logic work.
1182 const int xstep_units = std::max(xstep >> index_scale_bits, 1);
1183 const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
1184
1185 int y = 0;
1186 int top_x = xstep;
1187 const __m128i max_shift = _mm_set1_epi16(32);
1188
1189 for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
1190 const int top_base_x = top_x >> index_scale_bits;
1191
1192 // Permit negative values of |top_x|.
1193 const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1194 const __m128i shift = _mm_set1_epi16(shift_val);
1195 const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
1196 const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
1197 __m128i top_index_vect = _mm_set1_epi16(top_base_x);
1198 top_index_vect = _mm_add_epi16(top_index_vect, offsets);
1199
1200 // Load 8 values because we will select the sampled values based on
1201 // |upsampled|.
1202 const __m128i values = LoadUnaligned16(top + top_base_x);
1203 const __m128i pred =
1204 CombineTopVals4(values, sampler, shifts, top_index_vect, final_top_val,
1205 max_base_x_vect);
1206 StoreLo8(dst, pred);
1207 }
1208
1209 // Fill in corner-only rows.
1210 for (; y < height; ++y) {
1211 Memset(dst, top[max_base_x], /* width */ 4);
1212 dst += stride;
1213 }
1214 }
1215
1216 // General purpose combine function.
1217 // |check_border| means the final source value has to be duplicated into the
1218 // result. This simplifies the loop structures that use precomputed boundaries
1219 // to identify sections where it is safe to compute without checking for the
1220 // right border.
1221 template <bool check_border>
1222 inline __m128i CombineTopVals(
1223 const __m128i& top_vals_0, const __m128i& top_vals_1,
1224 const __m128i& sampler, const __m128i& shifts,
1225 const __m128i& top_indices = _mm_setzero_si128(),
1226 const __m128i& final_top_val = _mm_setzero_si128(),
1227 const __m128i& border_index = _mm_setzero_si128()) {
1228 constexpr int scale_int_bits = 5;
1229 const __m128i sampled_values_0 = _mm_shuffle_epi8(top_vals_0, sampler);
1230 const __m128i sampled_values_1 = _mm_shuffle_epi8(top_vals_1, sampler);
1231 const __m128i prod_0 = _mm_mullo_epi16(sampled_values_0, shifts);
1232 const __m128i prod_1 = _mm_mullo_epi16(sampled_values_1, shifts);
1233 const __m128i combined = _mm_hadd_epi16(prod_0, prod_1);
1234 const __m128i result = RightShiftWithRounding_U16(combined, scale_int_bits);
1235 if (check_border) {
1236 const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
1237 // Replace pixels from invalid range with top-right corner.
1238 return _mm_blendv_epi8(result, final_top_val, past_max);
1239 }
1240 return result;
1241 }
1242
1243 // 7.11.2.4 (7) angle < 90
1244 inline void DirectionalZone1_Large(uint16_t* dest, ptrdiff_t stride,
1245 const uint16_t* const top_row,
1246 const int width, const int height,
1247 const int xstep, const bool upsampled,
1248 const __m128i& sampler) {
1249 const int upsample_shift = static_cast<int>(upsampled);
1250 const int index_scale_bits = 6 - upsample_shift;
1251 const int max_base_x = ((width + height) - 1) << upsample_shift;
1252
1253 const __m128i max_shift = _mm_set1_epi16(32);
1254 const int base_step = 1 << upsample_shift;
1255 const int base_step8 = base_step << 3;
1256
1257   // All rows from |min_corner_only_y| down will simply use Memset.
1258   // |max_base_x| is always greater than |height|, so clipping the denominator
1259   // to 1 is enough to make the logic work.
1260 const int xstep_units = std::max(xstep >> index_scale_bits, 1);
1261 const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
1262
1263 // Rows up to this y-value can be computed without checking for bounds.
1264 const int max_no_corner_y = std::min(
1265 LeftShift((max_base_x - (base_step * width)), index_scale_bits) / xstep,
1266 height);
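  // Illustration with hypothetical values: for a 32x32 block, xstep == 128
  // and no upsampling, |max_base_x| is 63 and |max_no_corner_y| is
  // ((63 - 32) << 6) / 128 = 15, so rows 0..14 never reference a top index
  // beyond |max_base_x| and can skip the border blend entirely.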
1267 // No need to check for exceeding |max_base_x| in the first loop.
1268 int y = 0;
1269 int top_x = xstep;
1270 for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
1271 int top_base_x = top_x >> index_scale_bits;
1272 // Permit negative values of |top_x|.
1273 const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1274 const __m128i shift = _mm_set1_epi16(shift_val);
1275 const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
1276 const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
1277 int x = 0;
1278 do {
1279 const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
1280 const __m128i top_vals_1 =
1281 LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
1282
1283 const __m128i pred =
1284 CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
1285
1286 StoreUnaligned16(dest + x, pred);
1287 top_base_x += base_step8;
1288 x += 8;
1289 } while (x < width);
1290 }
1291
1292 // Each 16-bit value here corresponds to a position that may exceed
1293 // |max_base_x|. When added to |top_base_x|, it is used to mask values
1294 // that pass the end of the |top| buffer. Starting from 1 to simulate "cmpge"
1295 // which is not supported for packed integers.
1296 const __m128i offsets =
1297 _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
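  // Lane k of |top_index_vect| below therefore holds top_base_x + k + 1, so
  // the signed greater-than test in CombineTopVals<true> fires exactly when
  // top_base_x + k >= max_base_x, i.e. when that lane's sample pair would
  // read past top_row[max_base_x].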
1298
1299 const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
1300 const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
1301 const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
1302 for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
1303 int top_base_x = top_x >> index_scale_bits;
1304
1305 const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1306 const __m128i shift = _mm_set1_epi16(shift_val);
1307 const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
1308 const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
1309 __m128i top_index_vect = _mm_set1_epi16(top_base_x);
1310 top_index_vect = _mm_add_epi16(top_index_vect, offsets);
1311
1312 int x = 0;
1313 const int min_corner_only_x =
1314 std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
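    // Illustration with hypothetical values: if max_base_x == 63,
    // top_base_x == 40, width == 32 and no upsampling, then
    // min_corner_only_x = min(32, 23 + 7) & ~7 = 24, so x in [0, 24) uses the
    // blended vector loop and the remaining pixels of the row are set to the
    // corner value below.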
1315 for (; x < min_corner_only_x;
1316 x += 8, top_base_x += base_step8,
1317 top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
1318 const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
1319 const __m128i top_vals_1 =
1320 LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
1321 const __m128i pred =
1322 CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
1323 top_index_vect, final_top_val, max_base_x_vect);
1324 StoreUnaligned16(dest + x, pred);
1325 }
1326 // Corner-only section of the row.
1327 Memset(dest + x, top_row[max_base_x], width - x);
1328 }
1329 // Fill in corner-only rows.
1330 for (; y < height; ++y) {
1331 Memset(dest, top_row[max_base_x], width);
1332 dest += stride;
1333 }
1334 }
1335
1336 // 7.11.2.4 (7) angle < 90
1337 inline void DirectionalIntraPredictorZone1_SSE4_1(
1338 void* dest_ptr, ptrdiff_t stride, const void* const top_ptr,
1339 const int width, const int height, const int xstep, const bool upsampled) {
1340 const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
1341 auto* dest = static_cast<uint16_t*>(dest_ptr);
1342 stride /= sizeof(uint16_t);
1343 const int upsample_shift = static_cast<int>(upsampled);
1344 if (xstep == 64) {
1345 DirectionalZone1_Step64(dest, stride, top_row, width, height);
1346 return;
1347 }
1348 // Each base pixel paired with its following pixel, for hadd purposes.
1349 const __m128i adjacency_shuffler = _mm_set_epi16(
1350 0x0908, 0x0706, 0x0706, 0x0504, 0x0504, 0x0302, 0x0302, 0x0100);
1351 // This is equivalent to not shuffling at all.
1352 const __m128i identity_shuffler = _mm_set_epi16(
1353 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
1354   // This represents a trade-off between code size and speed. When |upsampled|
1355   // is true, no shuffle is strictly necessary, but skipping it would require
1356   // either an in-loop branch or a second copy of the main function body.
1357 const __m128i sampler = upsampled ? identity_shuffler : adjacency_shuffler;
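  // With 16-bit pixels, |adjacency_shuffler| rearranges the 8 loaded values
  // {p0..p7} into the pairs {p0,p1, p1,p2, p2,p3, p3,p4}; after multiplying
  // by |shifts|, a single phaddw then yields four weighted averages per load.
  // Upsampled input already stores each interpolation pair contiguously, so
  // |identity_shuffler| leaves {p0..p7} as-is and phaddw pairs them directly.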
1358 if (width == 4) {
1359 DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled,
1360 sampler);
1361 return;
1362 }
1363 if (width >= 32) {
1364 DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
1365 upsampled, sampler);
1366 return;
1367 }
1368 const int index_scale_bits = 6 - upsample_shift;
1369 const int max_base_x = ((width + height) - 1) << upsample_shift;
1370
1371 const __m128i max_shift = _mm_set1_epi16(32);
1372 const int base_step = 1 << upsample_shift;
1373 const int base_step8 = base_step << 3;
1374
1375 // No need to check for exceeding |max_base_x| in the loops.
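  // The last row reaches top_base_x <= (xstep * height) >> index_scale_bits
  // and each row spans at most base_step * width further entries, so the test
  // below guarantees no row indexes past |max_base_x|. Illustration with
  // hypothetical values: width == height == 8, xstep == 50, no upsampling
  // gives (50 * 8 >> 6) + 8 = 14 < 15 == max_base_x.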
1376 if (((xstep * height) >> index_scale_bits) + base_step * width < max_base_x) {
1377 int top_x = xstep;
1378 int y = height;
1379 do {
1380 int top_base_x = top_x >> index_scale_bits;
1381 // Permit negative values of |top_x|.
1382 const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1383 const __m128i shift = _mm_set1_epi16(shift_val);
1384 const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
1385 const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
1386 int x = 0;
1387 do {
1388 const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
1389 const __m128i top_vals_1 =
1390 LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
1391 const __m128i pred =
1392 CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
1393 StoreUnaligned16(dest + x, pred);
1394 top_base_x += base_step8;
1395 x += 8;
1396 } while (x < width);
1397 dest += stride;
1398 top_x += xstep;
1399 } while (--y != 0);
1400 return;
1401 }
1402
1403 // General case. Blocks with width less than 32 do not benefit from x-wise
1404   // loop splitting, but do benefit from using Memset on appropriate rows.
1405
1406 // Each 16-bit value here corresponds to a position that may exceed
1407 // |max_base_x|. When added to the top_base_x, it is used to mask values
1408 // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
1409 // not supported for packed integers.
1410 const __m128i offsets =
1411 _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
1412
1413 const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
1414 const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
1415 const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
1416
1417   // All rows from |min_corner_only_y| down will simply use Memset.
1418 // |max_base_x| is always greater than |height|, so clipping the denominator
1419 // to 1 is enough to make the logic work.
1420 const int xstep_units = std::max(xstep >> index_scale_bits, 1);
1421 const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
1422
1423 int top_x = xstep;
1424 int y = 0;
1425 for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
1426 int top_base_x = top_x >> index_scale_bits;
1427
1428 const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1429 const __m128i shift = _mm_set1_epi16(shift_val);
1430 const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
1431 const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
1432 __m128i top_index_vect = _mm_set1_epi16(top_base_x);
1433 top_index_vect = _mm_add_epi16(top_index_vect, offsets);
1434
1435 for (int x = 0; x < width; x += 8, top_base_x += base_step8,
1436 top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
1437 const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
1438 const __m128i top_vals_1 =
1439 LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
1440 const __m128i pred =
1441 CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
1442 top_index_vect, final_top_val, max_base_x_vect);
1443 StoreUnaligned16(dest + x, pred);
1444 }
1445 }
1446
1447 // Fill in corner-only rows.
1448 for (; y < height; ++y) {
1449 Memset(dest, top_row[max_base_x], width);
1450 dest += stride;
1451 }
1452 }
1453
1454 void Init10bpp() {
1455 Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
1456 assert(dsp != nullptr);
1457 static_cast<void>(dsp);
1458 #if DSP_ENABLED_10BPP_SSE4_1(DirectionalIntraPredictorZone1)
1459 dsp->directional_intra_predictor_zone1 =
1460 DirectionalIntraPredictorZone1_SSE4_1;
1461 #endif
1462 }
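// Callers reach this implementation through the dsp table rather than by
// name; roughly (sketch, assuming the usual libgav1 dsp lookup):
//   const Dsp* const dsp = GetDspTable(10);
//   dsp->directional_intra_predictor_zone1(dest, stride, top_row, width,
//                                          height, xstep, upsampled);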
1463
1464 } // namespace
1465 } // namespace high_bitdepth
1466
1467 #endif // LIBGAV1_MAX_BITDEPTH >= 10
1468
1469 void IntraPredDirectionalInit_SSE4_1() {
1470 low_bitdepth::Init8bpp();
1471 #if LIBGAV1_MAX_BITDEPTH >= 10
1472 high_bitdepth::Init10bpp();
1473 #endif
1474 }
1475
1476 } // namespace dsp
1477 } // namespace libgav1
1478
1479 #else // !LIBGAV1_TARGETING_SSE4_1
1480 namespace libgav1 {
1481 namespace dsp {
1482
1483 void IntraPredDirectionalInit_SSE4_1() {}
1484
1485 } // namespace dsp
1486 } // namespace libgav1
1487 #endif // LIBGAV1_TARGETING_SSE4_1
1488