xref: /aosp_15_r20/external/libgav1/src/dsp/x86/intrapred_directional_sse4.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1 // Copyright 2021 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/intrapred_directional.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_TARGETING_SSE4_1
19 
20 #include <smmintrin.h>
21 
22 #include <algorithm>
23 #include <cassert>
24 #include <cstddef>
25 #include <cstdint>
26 #include <cstring>
27 
28 #include "src/dsp/constants.h"
29 #include "src/dsp/dsp.h"
30 #include "src/dsp/x86/common_sse4.h"
31 #include "src/dsp/x86/transpose_sse4.h"
32 #include "src/utils/common.h"
33 #include "src/utils/memory.h"
34 
35 namespace libgav1 {
36 namespace dsp {
37 namespace low_bitdepth {
38 namespace {
39 
40 //------------------------------------------------------------------------------
41 // 7.11.2.4. Directional intra prediction process
42 
43 // Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
44 // upsampling is ruled out. In addition, the bits masked by 0x3F for
45 // |shift_val| are 0 for all multiples of 64, so the formula
46 // val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
47 // val = top[top_base_x] << 5. Since |top_base_x| evaluates to y + 1 for row
48 // y, each output row is a copy of |top| offset by 1.
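// For example, row 0 copies top[1..width], row 1 copies top[2..width + 1],
// and so on, each row shifting one further pixel into |top|.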
49 inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
50                                     const uint8_t* const top, const int width,
51                                     const int height) {
52   ptrdiff_t offset = 1;
53   if (height == 4) {
54     memcpy(dst, top + offset, width);
55     dst += stride;
56     memcpy(dst, top + offset + 1, width);
57     dst += stride;
58     memcpy(dst, top + offset + 2, width);
59     dst += stride;
60     memcpy(dst, top + offset + 3, width);
61     return;
62   }
63   int y = 0;
64   do {
65     memcpy(dst, top + offset, width);
66     dst += stride;
67     memcpy(dst, top + offset + 1, width);
68     dst += stride;
69     memcpy(dst, top + offset + 2, width);
70     dst += stride;
71     memcpy(dst, top + offset + 3, width);
72     dst += stride;
73     memcpy(dst, top + offset + 4, width);
74     dst += stride;
75     memcpy(dst, top + offset + 5, width);
76     dst += stride;
77     memcpy(dst, top + offset + 6, width);
78     dst += stride;
79     memcpy(dst, top + offset + 7, width);
80     dst += stride;
81 
82     offset += 8;
83     y += 8;
84   } while (y < height);
85 }
86 
87 inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
88                                  const uint8_t* const top, const int height,
89                                  const int xstep, const bool upsampled) {
90   const int upsample_shift = static_cast<int>(upsampled);
91   const int scale_bits = 6 - upsample_shift;
92   const __m128i max_shift = _mm_set1_epi8(32);
93   // Downscaling for a weighted average whose weights sum to 32 (max_shift).
94   const int rounding_bits = 5;
95   const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
96   const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
97   const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
98                                     : _mm_set_epi64x(0, 0x0403030202010100);
99   // Each 16-bit value here corresponds to a position that may exceed
100   // |max_base_x|. When added to the top_base_x, it is used to mask values
101   // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
102   // not supported for packed integers.
103   const __m128i offsets =
104       _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
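  // For example, if |top_base_x| is 20 and |max_base_x| is 24, the offset
  // indices are 21..28, so the cmpgt against 24 flags lanes 4..7: exactly the
  // output positions whose base index (20 + lane) is >= 24.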
105 
106   // All rows from |min_corner_only_y| down will simply use memcpy. |max_base_x|
107 // is always greater than |height|, so clipping the denominator to 1 is
108 // enough to make the logic work.
109   const int xstep_units = std::max(xstep >> scale_bits, 1);
110   const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
111 
112   // Rows up to this y-value can be computed without checking for bounds.
113   int y = 0;
114   int top_x = xstep;
115 
116   for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
117     const int top_base_x = top_x >> scale_bits;
118 
119     // Permit negative values of |top_x|.
120     const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
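    // For example, top_x == 96 with no upsampling gives
    // shift_val == (96 & 0x3F) >> 1 == 16, i.e. an even 16/32 blend of the
    // two candidate pixels.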
121     const __m128i shift = _mm_set1_epi8(shift_val);
122     const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
123     const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
124     __m128i top_index_vect = _mm_set1_epi16(top_base_x);
125     top_index_vect = _mm_add_epi16(top_index_vect, offsets);
126     const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
127 
128     // Load 8 values because we will select the sampled values based on
129     // |upsampled|.
130     const __m128i values = LoadLo8(top + top_base_x);
131     const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
132     const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
133     __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
134     prod = RightShiftWithRounding_U16(prod, rounding_bits);
135     // Replace pixels from invalid range with top-right corner.
136     prod = _mm_blendv_epi8(prod, final_top_val, past_max);
137     Store4(dst, _mm_packus_epi16(prod, prod));
138   }
139 
140   // Fill in corner-only rows.
141   for (; y < height; ++y) {
142     memset(dst, top[max_base_x], /* width */ 4);
143     dst += stride;
144   }
145 }
146 
147 // 7.11.2.4 (7) angle < 90
148 inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
149                                    const uint8_t* const top_row,
150                                    const int width, const int height,
151                                    const int xstep, const bool upsampled) {
152   const int upsample_shift = static_cast<int>(upsampled);
153   const __m128i sampler =
154       upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
155                 : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
156   const int scale_bits = 6 - upsample_shift;
157   const int max_base_x = ((width + height) - 1) << upsample_shift;
158 
159   const __m128i max_shift = _mm_set1_epi8(32);
160   // Downscaling for a weighted average whose weights sum to 32 (max_shift).
161   const int rounding_bits = 5;
162   const int base_step = 1 << upsample_shift;
163   const int base_step8 = base_step << 3;
164 
165   // All rows from |min_corner_only_y| down will simply use memcpy. |max_base_x|
166 // is always greater than |height|, so clipping the denominator to 1 is
167 // enough to make the logic work.
168   const int xstep_units = std::max(xstep >> scale_bits, 1);
169   const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
170 
171   // Rows up to this y-value can be computed without checking for bounds.
172   const int max_no_corner_y = std::min(
173       LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
174       height);
175   // No need to check for exceeding |max_base_x| in the first loop.
176   int y = 0;
177   int top_x = xstep;
178   for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
179     int top_base_x = top_x >> scale_bits;
180     // Permit negative values of |top_x|.
181     const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
182     const __m128i shift = _mm_set1_epi8(shift_val);
183     const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
184     const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
185     int x = 0;
186     do {
187       const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
188       __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
189       vals = _mm_maddubs_epi16(vals, shifts);
190       vals = RightShiftWithRounding_U16(vals, rounding_bits);
191       StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
192       top_base_x += base_step8;
193       x += 8;
194     } while (x < width);
195   }
196 
197   // Each 16-bit value here corresponds to a position that may exceed
198   // |max_base_x|. When added to the top_base_x, it is used to mask values
199   // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
200   // not supported for packed integers.
201   const __m128i offsets =
202       _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
203 
204   const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
205   const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
206   const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
207   for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
208     int top_base_x = top_x >> scale_bits;
209 
210     const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
211     const __m128i shift = _mm_set1_epi8(shift_val);
212     const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
213     const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
214     __m128i top_index_vect = _mm_set1_epi16(top_base_x);
215     top_index_vect = _mm_add_epi16(top_index_vect, offsets);
216 
217     int x = 0;
218     const int min_corner_only_x =
219         std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
220     for (; x < min_corner_only_x;
221          x += 8, top_base_x += base_step8,
222          top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
223       const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
224       // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
225       // reading out of bounds. If all indices are past max and we don't need to
226       // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
227       // reset for the next |y|.
228       top_base_x &= ~_mm_cvtsi128_si32(past_max);
229       const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
230       __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
231       vals = _mm_maddubs_epi16(vals, shifts);
232       vals = RightShiftWithRounding_U16(vals, rounding_bits);
233       vals = _mm_blendv_epi8(vals, final_top_val, past_max);
234       StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
235     }
236     // Corner-only section of the row.
237     memset(dest + x, top_row[max_base_x], width - x);
238   }
239   // Fill in corner-only rows.
240   for (; y < height; ++y) {
241     memset(dest, top_row[max_base_x], width);
242     dest += stride;
243   }
244 }
245 
246 // 7.11.2.4 (7) angle < 90
247 inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
248                                     const uint8_t* const top_row,
249                                     const int width, const int height,
250                                     const int xstep, const bool upsampled) {
251   const int upsample_shift = static_cast<int>(upsampled);
252   if (xstep == 64) {
253     DirectionalZone1_Step64(dest, stride, top_row, width, height);
254     return;
255   }
256   if (width == 4) {
257     DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
258     return;
259   }
260   if (width >= 32) {
261     DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
262                            upsampled);
263     return;
264   }
265   const __m128i sampler =
266       upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
267                 : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
268   const int scale_bits = 6 - upsample_shift;
269   const int max_base_x = ((width + height) - 1) << upsample_shift;
270 
271   const __m128i max_shift = _mm_set1_epi8(32);
272   // Downscaling for a weighted average whose weights sum to 32 (max_shift).
273   const int rounding_bits = 5;
274   const int base_step = 1 << upsample_shift;
275   const int base_step8 = base_step << 3;
276 
277   // No need to check for exceeding |max_base_x| in the loops.
278   if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
279     int top_x = xstep;
280     int y = 0;
281     do {
282       int top_base_x = top_x >> scale_bits;
283       // Permit negative values of |top_x|.
284       const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
285       const __m128i shift = _mm_set1_epi8(shift_val);
286       const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
287       const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
288       int x = 0;
289       do {
290         const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
291         __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
292         vals = _mm_maddubs_epi16(vals, shifts);
293         vals = RightShiftWithRounding_U16(vals, rounding_bits);
294         StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
295         top_base_x += base_step8;
296         x += 8;
297       } while (x < width);
298       dest += stride;
299       top_x += xstep;
300     } while (++y < height);
301     return;
302   }
303 
304   // Each 16-bit value here corresponds to a position that may exceed
305   // |max_base_x|. When added to the top_base_x, it is used to mask values
306   // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
307   // not supported for packed integers.
308   const __m128i offsets =
309       _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
310 
311   const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
312   const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
313   const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
314   int top_x = xstep;
315   int y = 0;
316   do {
317     int top_base_x = top_x >> scale_bits;
318 
319     if (top_base_x >= max_base_x) {
320       for (int i = y; i < height; ++i) {
321         memset(dest, top_row[max_base_x], width);
322         dest += stride;
323       }
324       return;
325     }
326 
327     const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
328     const __m128i shift = _mm_set1_epi8(shift_val);
329     const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
330     const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
331     __m128i top_index_vect = _mm_set1_epi16(top_base_x);
332     top_index_vect = _mm_add_epi16(top_index_vect, offsets);
333 
334     int x = 0;
335     for (; x < width - 8;
336          x += 8, top_base_x += base_step8,
337          top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
338       const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
339       // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
340       // reading out of bounds. If all indices are past max and we don't need to
341       // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
342       // reset for the next |y|.
343       top_base_x &= ~_mm_cvtsi128_si32(past_max);
344       const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
345       __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
346       vals = _mm_maddubs_epi16(vals, shifts);
347       vals = RightShiftWithRounding_U16(vals, rounding_bits);
348       vals = _mm_blendv_epi8(vals, final_top_val, past_max);
349       StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
350     }
351     const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
352     __m128i vals;
353     if (upsampled) {
354       vals = LoadUnaligned16(top_row + top_base_x);
355     } else {
356       const __m128i top_vals = LoadLo8(top_row + top_base_x);
357       vals = _mm_shuffle_epi8(top_vals, sampler);
358       vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
359     }
360     vals = _mm_maddubs_epi16(vals, shifts);
361     vals = RightShiftWithRounding_U16(vals, rounding_bits);
362     vals = _mm_blendv_epi8(vals, final_top_val, past_max);
363     StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
364     dest += stride;
365     top_x += xstep;
366   } while (++y < height);
367 }
368 
369 void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
370                                            const void* const top_row,
371                                            const int width, const int height,
372                                            const int xstep,
373                                            const bool upsampled_top) {
374   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
375   auto* dst = static_cast<uint8_t*>(dest);
376   DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
377                           upsampled_top);
378 }
379 
380 template <bool upsampled>
381 inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
382                                  const uint8_t* const left_column,
383                                  const int base_left_y, const int ystep) {
384   // For use in the non-upsampled case.
385   const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
386   const int upsample_shift = static_cast<int>(upsampled);
387   const int scale_bits = 6 - upsample_shift;
388   const __m128i max_shift = _mm_set1_epi8(32);
389   // Downscaling for a weighted average whose weights sum to 32 (max_shift).
390   const int rounding_bits = 5;
391 
392   __m128i result_block[4];
393   for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
394     const int left_base_y = left_y >> scale_bits;
395     const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
396     const __m128i shift = _mm_set1_epi8(shift_val);
397     const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
398     const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
399     __m128i vals;
400     if (upsampled) {
401       vals = LoadLo8(left_column + left_base_y);
402     } else {
403       const __m128i top_vals = LoadLo8(left_column + left_base_y);
404       vals = _mm_shuffle_epi8(top_vals, sampler);
405     }
406     vals = _mm_maddubs_epi16(vals, shifts);
407     vals = RightShiftWithRounding_U16(vals, rounding_bits);
408     result_block[x] = _mm_packus_epi16(vals, vals);
409   }
410   const __m128i result = Transpose4x4_U8(result_block);
411   // This is result_row0.
412   Store4(dest, result);
413   dest += stride;
414   const int result_row1 = _mm_extract_epi32(result, 1);
415   memcpy(dest, &result_row1, sizeof(result_row1));
416   dest += stride;
417   const int result_row2 = _mm_extract_epi32(result, 2);
418   memcpy(dest, &result_row2, sizeof(result_row2));
419   dest += stride;
420   const int result_row3 = _mm_extract_epi32(result, 3);
421   memcpy(dest, &result_row3, sizeof(result_row3));
422 }
423 
424 template <bool upsampled, int height>
425 inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
426                                  const uint8_t* const left_column,
427                                  const int base_left_y, const int ystep) {
428   // For use in the non-upsampled case.
429   const __m128i sampler =
430       _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
431   const int upsample_shift = static_cast<int>(upsampled);
432   const int scale_bits = 6 - upsample_shift;
433   const __m128i max_shift = _mm_set1_epi8(32);
434   // Downscaling for a weighted average whose weights sum to 32 (max_shift).
435   const int rounding_bits = 5;
436 
437   __m128i result_block[8];
438   for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
439     const int left_base_y = left_y >> scale_bits;
440     const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
441     const __m128i shift = _mm_set1_epi8(shift_val);
442     const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
443     const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
444     __m128i vals;
445     if (upsampled) {
446       vals = LoadUnaligned16(left_column + left_base_y);
447     } else {
448       const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
449       vals = _mm_shuffle_epi8(top_vals, sampler);
450     }
451     vals = _mm_maddubs_epi16(vals, shifts);
452     result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
453   }
454   Transpose8x8_U16(result_block, result_block);
455   for (int y = 0; y < height; ++y) {
456     StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
457     dest += stride;
458   }
459 }
460 
461 // 7.11.2.4 (9) angle > 180
462 void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
463                                            const void* const left_column,
464                                            const int width, const int height,
465                                            const int ystep,
466                                            const bool upsampled) {
467   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
468   auto* dst = static_cast<uint8_t*>(dest);
469   const int upsample_shift = static_cast<int>(upsampled);
470   if (width == 4 || height == 4) {
471     const ptrdiff_t stride4 = stride << 2;
472     if (upsampled) {
473       int left_y = ystep;
474       int x = 0;
475       do {
476         uint8_t* dst_x = dst + x;
477         int y = 0;
478         do {
479           DirectionalZone3_4x4<true>(
480               dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
481           dst_x += stride4;
482           y += 4;
483         } while (y < height);
484         left_y += ystep << 2;
485         x += 4;
486       } while (x < width);
487     } else {
488       int left_y = ystep;
489       int x = 0;
490       do {
491         uint8_t* dst_x = dst + x;
492         int y = 0;
493         do {
494           DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
495                                       ystep);
496           dst_x += stride4;
497           y += 4;
498         } while (y < height);
499         left_y += ystep << 2;
500         x += 4;
501       } while (x < width);
502     }
503     return;
504   }
505 
506   const ptrdiff_t stride8 = stride << 3;
507   if (upsampled) {
508     int left_y = ystep;
509     int x = 0;
510     do {
511       uint8_t* dst_x = dst + x;
512       int y = 0;
513       do {
514         DirectionalZone3_8xH<true, 8>(
515             dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
516         dst_x += stride8;
517         y += 8;
518       } while (y < height);
519       left_y += ystep << 3;
520       x += 8;
521     } while (x < width);
522   } else {
523     int left_y = ystep;
524     int x = 0;
525     do {
526       uint8_t* dst_x = dst + x;
527       int y = 0;
528       do {
529         DirectionalZone3_8xH<false, 8>(
530             dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
531         dst_x += stride8;
532         y += 8;
533       } while (y < height);
534       left_y += ystep << 3;
535       x += 8;
536     } while (x < width);
537   }
538 }
539 
540 //------------------------------------------------------------------------------
541 // Directional Zone 2 Functions
542 // 7.11.2.4 (8)
543 
544 // DirectionalBlend* selectively overwrites the values written by
545 // DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
546 // row.
547 template <int y_selector>
548 inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
549                                      const __m128i& dest_index_vect,
550                                      const __m128i& vals,
551                                      const __m128i& zone_bounds) {
552   const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
553   const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
554   const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
555   const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
556   Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
557 }
558 
559 inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
560                                      const __m128i& dest_index_vect,
561                                      const __m128i& vals,
562                                      const __m128i& zone_bounds,
563                                      const __m128i& bounds_selector) {
564   const __m128i max_dest_x_vect =
565       _mm_shuffle_epi8(zone_bounds, bounds_selector);
566   const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
567   const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
568   const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
569   StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
570 }
571 
572 constexpr int kDirectionalWeightBits = 5;
573 // |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
574 // |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
575 // shift) and shift. Shift is guaranteed to be between 0 and 32.
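// Each 16-bit output is source[i]*(32 - shift) + source[i + 1]*shift for the
// pair of adjacent bytes selected by |sampler|, computed with the byte-wise
// multiply-add of _mm_maddubs_epi16.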
576 inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
577                                                  const __m128i& shifts,
578                                                  const __m128i& sampler) {
579   const __m128i src_vals = LoadUnaligned16(source);
580   __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
581   vals = _mm_maddubs_epi16(vals, shifts);
582   return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
583 }
584 
585 // Because the source values "move backwards" as the row index increases, the
586 // indices derived from ystep are generally negative. This is accommodated by
587 // making sure the relative indices are within [-15, 0] when the function is
588 // called, and sliding them into the inclusive range [0, 15], relative to a
589 // lower base address.
590 constexpr int kPositiveIndexOffset = 15;
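// For example, a relative index of -3 becomes 12 after adding the offset, and
// the load address is lowered by 15 bytes to compensate, so the shuffle still
// reads the intended |left_column| pixel.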
591 
592 template <bool upsampled>
593 inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
594     uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
595     __m128i left_y) {
596   const int upsample_shift = static_cast<int>(upsampled);
597   const int scale_bits = 6 - upsample_shift;
598   const __m128i max_shifts = _mm_set1_epi8(32);
599   const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
600   const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
601   const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
602   // Left_column and sampler are both offset by 15 so the indices are always
603   // positive.
604   const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
605   for (int y = 0; y < 4; dst += stride, ++y) {
606     __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
607     offset_y = _mm_packs_epi16(offset_y, offset_y);
608 
609     const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
610     __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
611     // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
612     // can work as shuffle indices. Some values may be out of bounds, but their
613     // pred results will be masked over by top prediction.
614     sampler = _mm_add_epi8(sampler, positive_offset);
615 
616     __m128i shifts = _mm_srli_epi16(
617         _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
618     shifts = _mm_packus_epi16(shifts, shifts);
619     const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
620     shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
621     const __m128i vals = DirectionalZone2FromSource_SSE4_1(
622         left_column + (y << upsample_shift), shifts, sampler);
623     Store4(dst, _mm_packus_epi16(vals, vals));
624   }
625 }
626 
627 template <bool upsampled>
628 inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
629     uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
630     __m128i left_y) {
631   const int upsample_shift = static_cast<int>(upsampled);
632   const int scale_bits = 6 - upsample_shift;
633   const __m128i max_shifts = _mm_set1_epi8(32);
634   const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
635   const __m128i index_increment = _mm_set1_epi8(1);
636   const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
637   for (int y = 0; y < 8; dst += stride, ++y) {
638     __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
639     offset_y = _mm_packs_epi16(offset_y, offset_y);
640     const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
641 
642     // Offset the relative index because ystep is negative in Zone 2 and shuffle
643     // indices must be nonnegative.
644     __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
645     sampler = _mm_add_epi8(sampler, denegation);
646 
647     __m128i shifts = _mm_srli_epi16(
648         _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
649     shifts = _mm_packus_epi16(shifts, shifts);
650     const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
651     shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
652 
653     // The specification adds (y << 6) to left_y, which is subject to
654     // upsampling, but this puts sampler indices out of the 0-15 range. It is
655     // equivalent to offset the source address by (y << upsample_shift) instead.
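    // In other words, (left_y + (y << 6)) >> scale_bits equals
    // (left_y >> scale_bits) + (y << upsample_shift), because (y << 6) has no
    // bits below |scale_bits|.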
656     const __m128i vals = DirectionalZone2FromSource_SSE4_1(
657         left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
658         sampler);
659     StoreLo8(dst, _mm_packus_epi16(vals, vals));
660   }
661 }
662 
663 // |zone_bounds| is an epi16 of the relative x index at which base >= -(1 <<
664 // upsampled_top), for each row. When there are 4 values, they can be duplicated
665 // with a non-register shuffle mask.
666 // |shifts| is one pair of weights that applies throughout a given row.
667 template <bool upsampled_top>
668 inline void DirectionalZone1Blend_4x4(
669     uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
670     __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
671     const __m128i& dest_index_x, int top_x, const int xstep) {
672   const int upsample_shift = static_cast<int>(upsampled_top);
673   const int scale_bits_x = 6 - upsample_shift;
674   top_x -= xstep;
675 
676   int top_base_x = (top_x >> scale_bits_x);
677   const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
678       top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
679   DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
680   top_x -= xstep;
681   dest += stride;
682 
683   top_base_x = (top_x >> scale_bits_x);
684   const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
685       top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
686   DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
687   top_x -= xstep;
688   dest += stride;
689 
690   top_base_x = (top_x >> scale_bits_x);
691   const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
692       top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
693   DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
694   top_x -= xstep;
695   dest += stride;
696 
697   top_base_x = (top_x >> scale_bits_x);
698   const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
699       top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
700   DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
701 }
702 
703 template <bool upsampled_top, int height>
704 inline void DirectionalZone1Blend_8xH(
705     uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
706     __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
707     const __m128i& dest_index_x, int top_x, const int xstep) {
708   const int upsample_shift = static_cast<int>(upsampled_top);
709   const int scale_bits_x = 6 - upsample_shift;
710 
711   __m128i y_selector = _mm_set1_epi32(0x01000100);
712   const __m128i index_increment = _mm_set1_epi32(0x02020202);
713   for (int y = 0; y < height; ++y,
714            y_selector = _mm_add_epi8(y_selector, index_increment),
715            dest += stride) {
716     top_x -= xstep;
717     const int top_base_x = top_x >> scale_bits_x;
718     const __m128i vals = DirectionalZone2FromSource_SSE4_1(
719         top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
720     DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
721   }
722 }
723 
724 template <bool shuffle_left_column, bool upsampled_left, bool upsampled_top>
725 inline void DirectionalZone2_8xH(
726     uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
727     const uint8_t* LIBGAV1_RESTRICT const top_row,
728     const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
729     const int xstep, const int ystep, const int x, const int left_offset,
730     const __m128i& xstep_for_shift, const __m128i& xstep_bounds_base,
731     const __m128i& left_y) {
732   const int upsample_left_shift = static_cast<int>(upsampled_left);
733   const int upsample_top_shift = static_cast<int>(upsampled_top);
734 
735   // Loop incrementers for moving by block (8x8). This function handles blocks
736   // with height 4 as well; those are computed in a single pass, so these
737   // variables go unused in that case.
738   const ptrdiff_t stride8 = stride << 3;
739   const int xstep8 = xstep << 3;
740   const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
741 
742   // Cover 8x4 case.
743   const int min_height = (height == 4) ? 4 : 8;
744 
745   // The first stage, before the first y-loop, covers blocks that are only
746   // computed from the top row. The second stage, comprising two y-loops, covers
747   // blocks that have a mixture of values computed from top or left. The final
748   // stage covers blocks that are only computed from the left.
749   uint8_t* dst_x = dst + x;
750 
751   // Round down to the nearest multiple of 8 (or 4, if height is 4).
752   const int max_top_only_y =
753       std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1);
754   DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
755                        max_top_only_y, -xstep, upsampled_top);
756   DirectionalZone1_4xH(dst_x + 4, stride,
757                        top_row + ((x + 4) << upsample_top_shift),
758                        max_top_only_y, -xstep, upsampled_top);
759   if (max_top_only_y == height) return;
760 
761   const __m128i max_shift = _mm_set1_epi8(32);
762   const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
763   const __m128i dest_index_x =
764       _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
765   const __m128i sampler_top =
766       upsampled_top
767           ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
768           : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
769   int y = max_top_only_y;
770   dst_x += stride * y;
771   const int xstep_y = xstep * y;
772   const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
773   // All rows from |min_left_only_y| down for this set of columns only need
774   // |left_column| to compute.
775   const int min_left_only_y =
776       Align(std::min(((x + 8) << 6) / xstep, height), 8);
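  // For example, at x == 8 with xstep == 64 and height == 32, rows y >= 16 of
  // these eight columns are computed entirely from |left_column|.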
777 
778   __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
779   __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
780   int top_x = -xstep_y;
781 
782   const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
783   for (; y < min_left_only_y;
784        y += 8, dst_x += stride8,
785        xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
786        xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
787        top_x -= xstep8) {
788     // Pick up from the last y-value, using the 10% slower but secure method for
789     // left prediction.
790     if (shuffle_left_column) {
791       DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
792           dst_x, stride,
793           left_column + ((left_offset + y) << upsample_left_shift), left_y);
794     } else {
795       DirectionalZone3_8xH<upsampled_left, 8>(
796           dst_x, stride,
797           left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
798           -ystep);
799     }
800 
801     __m128i shifts = _mm_srli_epi16(
802         _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
803                       shift_mask),
804         1);
805     shifts = _mm_packus_epi16(shifts, shifts);
806     __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
807     shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
808     __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
809     DirectionalZone1Blend_8xH<upsampled_top, 8>(
810         dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
811         xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
812   }
813   // Loop over y for left_only rows.
814   for (; y < height; y += 8, dst_x += stride8) {
815     DirectionalZone3_8xH<upsampled_left, 8>(
816         dst_x, stride, left_column + ((left_offset + y) << upsample_left_shift),
817         base_left_y, -ystep);
818   }
819 }
820 
821 // 7.11.2.4 (8) 90 < angle < 180
822 // The strategy for this function is to know how many blocks can be processed
823 // with just pixels from |top_ptr|, then handle mixed blocks, then handle only
824 // blocks that take from |left_ptr|. Additionally, a fast index-shuffle
825 // approach is used for pred values from |left_column| in sections that permit
826 // it.
827 template <bool upsampled_left, bool upsampled_top>
828 inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
829                                     const uint8_t* const top_row,
830                                     const uint8_t* const left_column,
831                                     const int width, const int height,
832                                     const int xstep, const int ystep) {
833   auto* dst = static_cast<uint8_t*>(dest);
834   const int upsample_top_shift = static_cast<int>(upsampled_top);
835   // All columns from |min_top_only_x| to the right will only need |top_row|
836   // to compute. This assumes minimum |xstep| is 3.
837   const int min_top_only_x = std::min((height * xstep) >> 6, width);
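  // For example, with xstep == 64 (a 45 degree angle) and height == 16,
  // min_top_only_x is 16, so columns 16 and up never reference |left_column|.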
838 
839   // Accumulate xstep across 8 rows.
840   const __m128i xstep_dup = _mm_set1_epi16(-xstep);
841   const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
842   const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
843   // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
844   const __m128i scaled_one = _mm_set1_epi16(-64);
845   __m128i xstep_bounds_base =
846       (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
847                     : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
848 
849   const int left_base_increment = ystep >> 6;
850   const int ystep_remainder = ystep & 0x3F;
851   const int ystep8 = ystep << 3;
852   const int left_base_increment8 = ystep8 >> 6;
853   const int ystep_remainder8 = ystep8 & 0x3F;
854   const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
855 
856   // If the 64 scaling is regarded as a decimal point, the first value of the
857   // left_y vector omits the portion which is covered under the left_column
858   // offset. Following values need the full ystep as a relative offset.
859   const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
860   const __m128i ystep_dup = _mm_set1_epi16(-ystep);
861   const __m128i dest_index_x =
862       _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
863   __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
864   left_y = _mm_add_epi16(ystep_init, left_y);
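  // For example, with ystep == 100: |left_base_increment| is 1,
  // |ystep_remainder| is 36, and |left_y| holds -36, -136, -236, ... so each
  // successive column steps another full ystep to the left in 1/64 units.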
865 
866   // Analysis finds that, for most angles (ystep < 132), all segments that use
867   // both top_row and left_column can compute from left_column using byte
868   // shuffles from a single vector. For steeper angles, the shuffle is also
869   // fully reliable when x >= 32.
870   const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
871   const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
872   const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
873   int x = 0;
874 
875   for (int left_offset = -left_base_increment; x < min_shuffle_x;
876        x += 8,
877            xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
878            // Watch left_y because it can still get big.
879        left_y = _mm_add_epi16(left_y, increment_left8),
880            left_offset -= left_base_increment8) {
881     DirectionalZone2_8xH<false, upsampled_left, upsampled_top>(
882         dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
883         xstep_for_shift, xstep_bounds_base, left_y);
884   }
885   for (int left_offset = -left_base_increment; x < min_top_only_x;
886        x += 8,
887            xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
888            // Watch left_y because it can still get big.
889        left_y = _mm_add_epi16(left_y, increment_left8),
890            left_offset -= left_base_increment8) {
891     DirectionalZone2_8xH<true, upsampled_left, upsampled_top>(
892         dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
893         xstep_for_shift, xstep_bounds_base, left_y);
894   }
895   for (; x < width; x += 4) {
896     DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
897                          height, -xstep, upsampled_top);
898   }
899 }
900 
901 template <bool upsampled_left, bool upsampled_top>
902 inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
903                                       const uint8_t* const top_row,
904                                       const uint8_t* const left_column,
905                                       const int width, const int height,
906                                       const int xstep, const int ystep) {
907   auto* dst = static_cast<uint8_t*>(dest);
908   const int upsample_left_shift = static_cast<int>(upsampled_left);
909   const int upsample_top_shift = static_cast<int>(upsampled_top);
910   const __m128i max_shift = _mm_set1_epi8(32);
911   const ptrdiff_t stride4 = stride << 2;
912   const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
913   const __m128i sampler_top =
914       upsampled_top
915           ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
916           : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
917   // All columns from |min_top_only_x| to the right will only need |top_row| to
918   // compute.
919   assert(xstep >= 3);
920   const int min_top_only_x = std::min((height * xstep) >> 6, width);
921 
922   const int xstep4 = xstep << 2;
923   const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
924   const __m128i xstep_dup = _mm_set1_epi16(-xstep);
925   const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
926   __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
927   const __m128i scaled_one = _mm_set1_epi16(-64);
928   // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
929   __m128i xstep_bounds_base =
930       (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
931                     : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
932 
933   const int left_base_increment = ystep >> 6;
934   const int ystep_remainder = ystep & 0x3F;
935   const int ystep4 = ystep << 2;
936   const int left_base_increment4 = ystep4 >> 6;
937   // This is guaranteed to be less than 64, but accumulation may bring it past
938   // 64 for higher x values.
939   const int ystep_remainder4 = ystep4 & 0x3F;
940   const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
941   const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
942 
943   // If the 64 scaling is regarded as a decimal point, the first value of the
944   // left_y vector omits the portion which will go into the left_column offset.
945   // Following values need the full ystep as a relative offset.
946   const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
947   const __m128i ystep_dup = _mm_set1_epi16(-ystep);
948   __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
949   left_y = _mm_add_epi16(ystep_init, left_y);
950   const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
951 
952   int x = 0;
953   // Loop over x for columns with a mixture of sources.
954   for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
955            xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
956            left_y = _mm_add_epi16(left_y, increment_left4),
957            left_offset -= left_base_increment4) {
958     uint8_t* dst_x = dst + x;
959 
960     // Round down to the nearest multiple of 4.
961     const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3;
962     DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
963                          max_top_only_y, -xstep, upsampled_top);
964     int y = max_top_only_y;
965     dst_x += stride * y;
966     const int xstep_y = xstep * y;
967     const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
968     // All rows from |min_left_only_y| down for this set of columns only need
969     // |left_column| to compute. Rounded up to the nearest multiple of 4.
970     const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
971 
972     __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
973     __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
974     int top_x = -xstep_y;
975 
976     // Loop over y for mixed rows.
977     for (; y < min_left_only_y;
978          y += 4, dst_x += stride4,
979          xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
980          xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
981          top_x -= xstep4) {
982       DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
983           dst_x, stride,
984           left_column + ((left_offset + y) * (1 << upsample_left_shift)),
985           left_y);
986 
987       __m128i shifts = _mm_srli_epi16(
988           _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
989                         shift_mask),
990           1);
991       shifts = _mm_packus_epi16(shifts, shifts);
992       const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
993       shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
994       const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
995       DirectionalZone1Blend_4x4<upsampled_top>(
996           dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
997           xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
998     }
999     // Loop over y for left-only rows, if any.
1000     for (; y < height; y += 4, dst_x += stride4) {
1001       DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
1002           dst_x, stride,
1003           left_column + ((left_offset + y) << upsample_left_shift), left_y);
1004     }
1005   }
1006   // Loop over top-only columns, if any.
1007   for (; x < width; x += 4) {
1008     DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
1009                          height, -xstep, upsampled_top);
1010   }
1011 }
1012 
1013 void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
1014                                            const void* const top_row,
1015                                            const void* const left_column,
1016                                            const int width, const int height,
1017                                            const int xstep, const int ystep,
1018                                            const bool upsampled_top,
1019                                            const bool upsampled_left) {
1020   // Increasing the negative buffer for this function allows more rows to be
1021   // processed at a time without branching in an inner loop to check the base.
1022   uint8_t top_buffer[288];
1023   uint8_t left_buffer[288];
1024   memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
1025   memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
1026 #if LIBGAV1_MSAN
1027   memset(top_buffer, 0x33, 128);
1028   memset(left_buffer, 0x44, 128);
1029 #endif
1030   const uint8_t* top_ptr = top_buffer + 144;
1031   const uint8_t* left_ptr = left_buffer + 144;
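  // Buffer layout: bytes [0, 127] are extra negative headroom (filled with a
  // marker value under MSAN), bytes [128, 287] hold a copy starting 16 pixels
  // before the source, so |top_ptr| and |left_ptr| at offset 144 line up with
  // the original top_row[0] and left_column[0].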
1032   if (width == 4 || height == 4) {
1033     if (upsampled_left) {
1034       if (upsampled_top) {
1035         DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
1036                                               width, height, xstep, ystep);
1037       } else {
1038         DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
1039                                                width, height, xstep, ystep);
1040       }
1041     } else {
1042       if (upsampled_top) {
1043         DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
1044                                                width, height, xstep, ystep);
1045       } else {
1046         DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
1047                                                 width, height, xstep, ystep);
1048       }
1049     }
1050     return;
1051   }
1052   if (upsampled_left) {
1053     if (upsampled_top) {
1054       DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
1055                                           width, height, xstep, ystep);
1056     } else {
1057       DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
1058                                            width, height, xstep, ystep);
1059     }
1060   } else {
1061     if (upsampled_top) {
1062       DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
1063                                            width, height, xstep, ystep);
1064     } else {
1065       DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
1066                                             width, height, xstep, ystep);
1067     }
1068   }
1069 }
1070 
1071 void Init8bpp() {
1072   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
1073   assert(dsp != nullptr);
1074   static_cast<void>(dsp);
1075 #if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
1076   dsp->directional_intra_predictor_zone1 =
1077       DirectionalIntraPredictorZone1_SSE4_1;
1078 #endif
1079 #if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
1080   dsp->directional_intra_predictor_zone2 =
1081       DirectionalIntraPredictorZone2_SSE4_1;
1082 #endif
1083 #if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
1084   dsp->directional_intra_predictor_zone3 =
1085       DirectionalIntraPredictorZone3_SSE4_1;
1086 #endif
1087 }
1088 
1089 }  // namespace
1090 }  // namespace low_bitdepth
1091 
1092 //------------------------------------------------------------------------------
1093 #if LIBGAV1_MAX_BITDEPTH >= 10
1094 namespace high_bitdepth {
1095 namespace {
1096 
1097 //------------------------------------------------------------------------------
1098 // 7.11.2.4. Directional intra prediction process
1099 
1100 // Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
1101 // upsampling is ruled out. In addition, the bits masked by 0x3F for
1102 // |shift_val| are 0 for all multiples of 64, so the formula
1103 // val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
1104 // val = top[top_base_x] << 5. Since |top_base_x| evaluates to y + 1 for row
1105 // y, each output row is a copy of |top| offset by 1.
1106 inline void DirectionalZone1_Step64(uint16_t* dst, ptrdiff_t stride,
1107                                     const uint16_t* const top, const int width,
1108                                     const int height) {
1109   ptrdiff_t offset = 1;
1110   if (height == 4) {
1111     memcpy(dst, top + offset, width * sizeof(dst[0]));
1112     dst += stride;
1113     memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
1114     dst += stride;
1115     memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
1116     dst += stride;
1117     memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
1118     return;
1119   }
1120   int y = height;
1121   do {
1122     memcpy(dst, top + offset, width * sizeof(dst[0]));
1123     dst += stride;
1124     memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
1125     dst += stride;
1126     memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
1127     dst += stride;
1128     memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
1129     dst += stride;
1130     memcpy(dst, top + offset + 4, width * sizeof(dst[0]));
1131     dst += stride;
1132     memcpy(dst, top + offset + 5, width * sizeof(dst[0]));
1133     dst += stride;
1134     memcpy(dst, top + offset + 6, width * sizeof(dst[0]));
1135     dst += stride;
1136     memcpy(dst, top + offset + 7, width * sizeof(dst[0]));
1137     dst += stride;
1138 
1139     offset += 8;
1140     y -= 8;
1141   } while (y != 0);
1142 }
1143 
1144 // Produce a weighted average whose weights sum to 32.
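// A scalar sketch of what this helper computes for each of the four output
// pixels, assuming |shifts| interleaves (32 - shift, shift) as the callers
// below build it, and assuming the adjacent-pair sampler (the upsampled
// sampler pairs even/odd samples instead):
//   pred[i] = RightShiftWithRounding(top[base + i] * (32 - shift) +
//                                    top[base + i + 1] * shift, 5);
// Lanes whose entry in |top_indices| exceeds |border_index| are then replaced
// with |final_top_val|.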
1145 inline __m128i CombineTopVals4(const __m128i& top_vals, const __m128i& sampler,
1146                                const __m128i& shifts,
1147                                const __m128i& top_indices,
1148                                const __m128i& final_top_val,
1149                                const __m128i& border_index) {
1150   const __m128i sampled_values = _mm_shuffle_epi8(top_vals, sampler);
1151   __m128i prod = _mm_mullo_epi16(sampled_values, shifts);
1152   prod = _mm_hadd_epi16(prod, prod);
1153   const __m128i result = RightShiftWithRounding_U16(prod, 5 /*log2(32)*/);
1154 
1155   const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
1156   // Replace pixels from invalid range with top-right corner.
1157   return _mm_blendv_epi8(result, final_top_val, past_max);
1158 }
1159 
1160 // When width is 4, only one load operation is needed per iteration. We also
1161 // skip the extra loop precomputations, whose overhead is not worthwhile here.
1162 inline void DirectionalZone1_4xH(uint16_t* dst, ptrdiff_t stride,
1163                                  const uint16_t* const top, const int height,
1164                                  const int xstep, const bool upsampled,
1165                                  const __m128i& sampler) {
1166   const int upsample_shift = static_cast<int>(upsampled);
1167   const int index_scale_bits = 6 - upsample_shift;
1168   const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
1169   const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
1170   const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
1171 
1172   // Each 16-bit value here corresponds to a position that may exceed
1173   // |max_base_x|. When added to |top_base_x|, it is used to mask values
1174   // that pass the end of |top|. Starting from 1 to simulate "cmpge" because
1175   // only cmpgt is available.
1176   const __m128i offsets =
1177       _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
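  // Illustrative: with |top_base_x| == b, lane i of |top_index_vect| below
  // holds b + i + 1, so the cmpgt against |max_base_x| behaves like
  // (b + i) >= max_base_x, i.e. a "cmpge" on the actual pixel index.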
1178 
1179   // All rows from |min_corner_only_y| down will simply use Memset.
1180   // |max_base_x| is always greater than |height|, so clipping the denominator
1181   // to 1 is enough to make the logic work.
1182   const int xstep_units = std::max(xstep >> index_scale_bits, 1);
1183   const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
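  // Illustrative example: height == 16, xstep == 128, no upsampling gives
  // max_base_x == 19 and xstep_units == 2, so min_corner_only_y == 9 and rows
  // 9..15 read nothing but top[19]; they are filled with Memset below.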
1184 
1185   int y = 0;
1186   int top_x = xstep;
1187   const __m128i max_shift = _mm_set1_epi16(32);
1188 
1189   for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
1190     const int top_base_x = top_x >> index_scale_bits;
1191 
1192     // Permit negative values of |top_x|.
1193     const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1194     const __m128i shift = _mm_set1_epi16(shift_val);
1195     const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
1196     const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
1197     __m128i top_index_vect = _mm_set1_epi16(top_base_x);
1198     top_index_vect = _mm_add_epi16(top_index_vect, offsets);
1199 
1200     // Load 8 values because we will select the sampled values based on
1201     // |upsampled|.
1202     const __m128i values = LoadUnaligned16(top + top_base_x);
1203     const __m128i pred =
1204         CombineTopVals4(values, sampler, shifts, top_index_vect, final_top_val,
1205                         max_base_x_vect);
1206     StoreLo8(dst, pred);
1207   }
1208 
1209   // Fill in corner-only rows.
1210   for (; y < height; ++y) {
1211     Memset(dst, top[max_base_x], /* width */ 4);
1212     dst += stride;
1213   }
1214 }
1215 
1216 // General purpose combine function.
1217 // |check_border| means the final source value has to be duplicated into the
1218 // result. This simplifies the loop structures that use precomputed boundaries
1219 // to identify sections where it is safe to compute without checking for the
1220 // right border.
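// When |check_border| is false, the last three arguments keep their zero
// defaults and are ignored; callers use that variant only where a precomputed
// bound guarantees no lane can reach |border_index|.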
1221 template <bool check_border>
1222 inline __m128i CombineTopVals(
1223     const __m128i& top_vals_0, const __m128i& top_vals_1,
1224     const __m128i& sampler, const __m128i& shifts,
1225     const __m128i& top_indices = _mm_setzero_si128(),
1226     const __m128i& final_top_val = _mm_setzero_si128(),
1227     const __m128i& border_index = _mm_setzero_si128()) {
1228   constexpr int scale_int_bits = 5;
1229   const __m128i sampled_values_0 = _mm_shuffle_epi8(top_vals_0, sampler);
1230   const __m128i sampled_values_1 = _mm_shuffle_epi8(top_vals_1, sampler);
1231   const __m128i prod_0 = _mm_mullo_epi16(sampled_values_0, shifts);
1232   const __m128i prod_1 = _mm_mullo_epi16(sampled_values_1, shifts);
1233   const __m128i combined = _mm_hadd_epi16(prod_0, prod_1);
1234   const __m128i result = RightShiftWithRounding_U16(combined, scale_int_bits);
1235   if (check_border) {
1236     const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
1237     // Replace pixels from invalid range with top-right corner.
1238     return _mm_blendv_epi8(result, final_top_val, past_max);
1239   }
1240   return result;
1241 }
1242 
1243 // 7.11.2.4 (7) angle < 90
1244 inline void DirectionalZone1_Large(uint16_t* dest, ptrdiff_t stride,
1245                                    const uint16_t* const top_row,
1246                                    const int width, const int height,
1247                                    const int xstep, const bool upsampled,
1248                                    const __m128i& sampler) {
1249   const int upsample_shift = static_cast<int>(upsampled);
1250   const int index_scale_bits = 6 - upsample_shift;
1251   const int max_base_x = ((width + height) - 1) << upsample_shift;
1252 
1253   const __m128i max_shift = _mm_set1_epi16(32);
1254   const int base_step = 1 << upsample_shift;
1255   const int base_step8 = base_step << 3;
1256 
1257   // All rows from |min_corner_only_y| down will simply use Memset.
1258   // |max_base_x| is always greater than |height|, so clipping the denominator
1259   // to 1 is enough to make the logic work.
1260   const int xstep_units = std::max(xstep >> index_scale_bits, 1);
1261   const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
1262 
1263   // Rows up to this y-value can be computed without checking for bounds.
1264   const int max_no_corner_y = std::min(
1265       LeftShift((max_base_x - (base_step * width)), index_scale_bits) / xstep,
1266       height);
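  // The rows are effectively processed in three phases:
  //   [0, max_no_corner_y): CombineTopVals<false>, no border handling needed.
  //   [max_no_corner_y, min_corner_only_y): CombineTopVals<true> plus a
  //     trailing Memset for the corner-only tail of each row.
  //   [min_corner_only_y, height): the whole row is Memset to top[max_base_x].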
1267   // No need to check for exceeding |max_base_x| in the first loop.
1268   int y = 0;
1269   int top_x = xstep;
1270   for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
1271     int top_base_x = top_x >> index_scale_bits;
1272     // Permit negative values of |top_x|.
1273     const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1274     const __m128i shift = _mm_set1_epi16(shift_val);
1275     const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
1276     const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
1277     int x = 0;
1278     do {
1279       const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
1280       const __m128i top_vals_1 =
1281           LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
1282 
1283       const __m128i pred =
1284           CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
1285 
1286       StoreUnaligned16(dest + x, pred);
1287       top_base_x += base_step8;
1288       x += 8;
1289     } while (x < width);
1290   }
1291 
1292   // Each 16-bit value here corresponds to a position that may exceed
1293   // |max_base_x|. When added to |top_base_x|, it is used to mask values
1294   // that pass the end of the |top| buffer. Starting from 1 to simulate "cmpge"
1295   // which is not supported for packed integers.
1296   const __m128i offsets =
1297       _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
1298 
1299   const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
1300   const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
1301   const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
1302   for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
1303     int top_base_x = top_x >> index_scale_bits;
1304 
1305     const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1306     const __m128i shift = _mm_set1_epi16(shift_val);
1307     const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
1308     const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
1309     __m128i top_index_vect = _mm_set1_epi16(top_base_x);
1310     top_index_vect = _mm_add_epi16(top_index_vect, offsets);
1311 
1312     int x = 0;
1313     const int min_corner_only_x =
1314         std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
1315     for (; x < min_corner_only_x;
1316          x += 8, top_base_x += base_step8,
1317          top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
1318       const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
1319       const __m128i top_vals_1 =
1320           LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
1321       const __m128i pred =
1322           CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
1323                                top_index_vect, final_top_val, max_base_x_vect);
1324       StoreUnaligned16(dest + x, pred);
1325     }
1326     // Corner-only section of the row.
1327     Memset(dest + x, top_row[max_base_x], width - x);
1328   }
1329   // Fill in corner-only rows.
1330   for (; y < height; ++y) {
1331     Memset(dest, top_row[max_base_x], width);
1332     dest += stride;
1333   }
1334 }
1335 
1336 // 7.11.2.4 (7) angle < 90
1337 inline void DirectionalIntraPredictorZone1_SSE4_1(
1338     void* dest_ptr, ptrdiff_t stride, const void* const top_ptr,
1339     const int width, const int height, const int xstep, const bool upsampled) {
1340   const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
1341   auto* dest = static_cast<uint16_t*>(dest_ptr);
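  // |stride| is passed in bytes; convert it to uint16_t units for the pointer
  // arithmetic below.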
1342   stride /= sizeof(uint16_t);
1343   const int upsample_shift = static_cast<int>(upsampled);
1344   if (xstep == 64) {
1345     DirectionalZone1_Step64(dest, stride, top_row, width, height);
1346     return;
1347   }
1348   // Each base pixel paired with its following pixel, for hadd purposes.
1349   const __m128i adjacency_shuffler = _mm_set_epi16(
1350       0x0908, 0x0706, 0x0706, 0x0504, 0x0504, 0x0302, 0x0302, 0x0100);
1351   // This is equivalent to not shuffling at all.
1352   const __m128i identity_shuffler = _mm_set_epi16(
1353       0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
1354   // This represents a trade-off between code size and speed. When upsampled
1355   // is true, no shuffle is necessary. But to avoid in-loop branching, we
1356   // would need 2 copies of the main function body.
1357   const __m128i sampler = upsampled ? identity_shuffler : adjacency_shuffler;
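  // Illustrative: for 16-bit pixels t0..t7 in one load, |adjacency_shuffler|
  // rearranges the lanes to t0, t1, t1, t2, t2, t3, t3, t4, so the multiply by
  // (32 - shift, shift, ...) followed by _mm_hadd_epi16 produces
  // t0*(32-shift) + t1*shift, t1*(32-shift) + t2*shift, and so on.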
1358   if (width == 4) {
1359     DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled,
1360                          sampler);
1361     return;
1362   }
1363   if (width >= 32) {
1364     DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
1365                            upsampled, sampler);
1366     return;
1367   }
1368   const int index_scale_bits = 6 - upsample_shift;
1369   const int max_base_x = ((width + height) - 1) << upsample_shift;
1370 
1371   const __m128i max_shift = _mm_set1_epi16(32);
1372   const int base_step = 1 << upsample_shift;
1373   const int base_step8 = base_step << 3;
1374 
1375   // No need to check for exceeding |max_base_x| in the loops.
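  // Illustrative bound: the furthest index the block can touch is roughly the
  // last row's top_base_x, about (xstep * height) >> index_scale_bits, plus
  // the base_step * width positions read across that row. If that sum stays
  // below |max_base_x|, every CombineTopVals<false> call below is in range.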
1376   if (((xstep * height) >> index_scale_bits) + base_step * width < max_base_x) {
1377     int top_x = xstep;
1378     int y = height;
1379     do {
1380       int top_base_x = top_x >> index_scale_bits;
1381       // Permit negative values of |top_x|.
1382       const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1383       const __m128i shift = _mm_set1_epi16(shift_val);
1384       const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
1385       const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
1386       int x = 0;
1387       do {
1388         const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
1389         const __m128i top_vals_1 =
1390             LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
1391         const __m128i pred =
1392             CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
1393         StoreUnaligned16(dest + x, pred);
1394         top_base_x += base_step8;
1395         x += 8;
1396       } while (x < width);
1397       dest += stride;
1398       top_x += xstep;
1399     } while (--y != 0);
1400     return;
1401   }
1402 
1403   // General case. Blocks with width less than 32 do not benefit from x-wise
1404   // loop splitting, but do benefit from using Memset on appropriate rows.
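  // For example (illustrative), a 16x64 block with a large xstep reaches
  // top[max_base_x] within a few rows, so most rows reduce to a single Memset,
  // while the x loop is too short for the extra splitting to pay off.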
1405 
1406   // Each 16-bit value here corresponds to a position that may exceed
1407   // |max_base_x|. When added to |top_base_x|, it is used to mask values
1408   // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
1409   // not supported for packed integers.
1410   const __m128i offsets =
1411       _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
1412 
1413   const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
1414   const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
1415   const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
1416 
1417   // All rows from |min_corner_only_y| down will simply use Memset.
1418   // |max_base_x| is always greater than |height|, so clipping the denominator
1419   // to 1 is enough to make the logic work.
1420   const int xstep_units = std::max(xstep >> index_scale_bits, 1);
1421   const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
1422 
1423   int top_x = xstep;
1424   int y = 0;
1425   for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
1426     int top_base_x = top_x >> index_scale_bits;
1427 
1428     const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1429     const __m128i shift = _mm_set1_epi16(shift_val);
1430     const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
1431     const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
1432     __m128i top_index_vect = _mm_set1_epi16(top_base_x);
1433     top_index_vect = _mm_add_epi16(top_index_vect, offsets);
1434 
1435     for (int x = 0; x < width; x += 8, top_base_x += base_step8,
1436              top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
1437       const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
1438       const __m128i top_vals_1 =
1439           LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
1440       const __m128i pred =
1441           CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
1442                                top_index_vect, final_top_val, max_base_x_vect);
1443       StoreUnaligned16(dest + x, pred);
1444     }
1445   }
1446 
1447   // Fill in corner-only rows.
1448   for (; y < height; ++y) {
1449     Memset(dest, top_row[max_base_x], width);
1450     dest += stride;
1451   }
1452 }
1453 
1454 void Init10bpp() {
1455   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
1456   assert(dsp != nullptr);
1457   static_cast<void>(dsp);
1458 #if DSP_ENABLED_10BPP_SSE4_1(DirectionalIntraPredictorZone1)
1459   dsp->directional_intra_predictor_zone1 =
1460       DirectionalIntraPredictorZone1_SSE4_1;
1461 #endif
1462 }
1463 
1464 }  // namespace
1465 }  // namespace high_bitdepth
1466 
1467 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
1468 
1469 void IntraPredDirectionalInit_SSE4_1() {
1470   low_bitdepth::Init8bpp();
1471 #if LIBGAV1_MAX_BITDEPTH >= 10
1472   high_bitdepth::Init10bpp();
1473 #endif
1474 }
1475 
1476 }  // namespace dsp
1477 }  // namespace libgav1
1478 
1479 #else   // !LIBGAV1_TARGETING_SSE4_1
1480 namespace libgav1 {
1481 namespace dsp {
1482 
1483 void IntraPredDirectionalInit_SSE4_1() {}
1484 
1485 }  // namespace dsp
1486 }  // namespace libgav1
1487 #endif  // LIBGAV1_TARGETING_SSE4_1
1488