1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <immintrin.h>
13 
14 #include "config/av1_rtcd.h"
15 #include "aom_dsp/x86/intrapred_x86.h"
16 #include "aom_dsp/x86/intrapred_utils.h"
17 #include "aom_dsp/x86/lpf_common_sse2.h"
18 
19 static inline __m256i dc_sum_64(const uint8_t *ref) {
20   const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
21   const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
22   const __m256i zero = _mm256_setzero_si256();
23   __m256i y0 = _mm256_sad_epu8(x0, zero);
24   __m256i y1 = _mm256_sad_epu8(x1, zero);
25   y0 = _mm256_add_epi64(y0, y1);
26   __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
27   y0 = _mm256_add_epi64(u0, y0);
28   u0 = _mm256_unpackhi_epi64(y0, y0);
29   return _mm256_add_epi16(y0, u0);
30 }
31 
32 static inline __m256i dc_sum_32(const uint8_t *ref) {
33   const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
34   const __m256i zero = _mm256_setzero_si256();
35   __m256i y = _mm256_sad_epu8(x, zero);
36   __m256i u = _mm256_permute2x128_si256(y, y, 1);
37   y = _mm256_add_epi64(u, y);
38   u = _mm256_unpackhi_epi64(y, y);
39   return _mm256_add_epi16(y, u);
40 }
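// For reference, dc_sum_64()/dc_sum_32() above are SIMD reductions of a
// border row; a scalar sketch of the same computation (illustrative only,
// not part of the build) is:
//   int sum = 0;
//   for (int i = 0; i < width; ++i) sum += ref[i];  // width is 32 or 64
// _mm256_sad_epu8() against zero yields one partial sum per 8 bytes, and the
// permute/unpackhi/add steps fold those partials into the low 64-bit lane.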
41 
42 static inline void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
43                                   ptrdiff_t stride) {
44   for (int i = 0; i < height; ++i) {
45     _mm256_storeu_si256((__m256i *)dst, *r);
46     dst += stride;
47   }
48 }
49 
50 static inline void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
51                                     int height, uint8_t *dst,
52                                     ptrdiff_t stride) {
53   for (int i = 0; i < height; ++i) {
54     _mm256_storeu_si256((__m256i *)dst, *r0);
55     _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
56     dst += stride;
57   }
58 }
59 
60 static inline void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
61                                   ptrdiff_t stride) {
62   for (int i = 0; i < height; ++i) {
63     _mm256_storeu_si256((__m256i *)dst, *r);
64     _mm256_storeu_si256((__m256i *)(dst + 32), *r);
65     dst += stride;
66   }
67 }
68 
69 #if CONFIG_AV1_HIGHBITDEPTH
70 static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
71   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
72   { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
73   { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
74   { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
75   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
76   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
77   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
78   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
79 };
80 
81 static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
82   { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },
83   { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
84   { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
85   { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
86 };
87 
88 static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
89   { 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
90     2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
91   { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
92     0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
93   { 0, 1, 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25,
94     0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
95   { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
96     0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
97   { 0, 1, 0, 1, 0, 1, 0, 1, 8,  9,  12, 13, 16, 17, 20, 21,
98     0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
99   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
100     0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
101   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
102     0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
103   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
104     0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
105 };
106 
107 static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
108   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
109   { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
110   { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
111   { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
112   { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
113   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
114   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115     0 },
116   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
117     0, 0 },
118   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
119     0, 0, 0, 0 },
120   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
121     0, 0, 0, 0, 0, 0 },
122   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
123     0xffff, 0, 0, 0, 0, 0, 0 },
124   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
125     0xffff, 0xffff, 0, 0, 0, 0, 0 },
126   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
127     0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
128   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
129     0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
130   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
131     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
132   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
133     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
134   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
135     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
136 };
137 
138 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
139 static inline void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
140   __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
141 
142   r0 = _mm_unpacklo_epi16(x[0], x[1]);
143   r1 = _mm_unpacklo_epi16(x[2], x[3]);
144   r2 = _mm_unpacklo_epi16(x[4], x[5]);
145   r3 = _mm_unpacklo_epi16(x[6], x[7]);
146 
147   r4 = _mm_unpacklo_epi16(x[8], x[9]);
148   r5 = _mm_unpacklo_epi16(x[10], x[11]);
149   r6 = _mm_unpacklo_epi16(x[12], x[13]);
150   r7 = _mm_unpacklo_epi16(x[14], x[15]);
151 
152   r8 = _mm_unpacklo_epi32(r0, r1);
153   r9 = _mm_unpackhi_epi32(r0, r1);
154   r10 = _mm_unpacklo_epi32(r2, r3);
155   r11 = _mm_unpackhi_epi32(r2, r3);
156 
157   r12 = _mm_unpacklo_epi32(r4, r5);
158   r13 = _mm_unpackhi_epi32(r4, r5);
159   r14 = _mm_unpacklo_epi32(r6, r7);
160   r15 = _mm_unpackhi_epi32(r6, r7);
161 
162   r0 = _mm_unpacklo_epi64(r8, r9);
163   r1 = _mm_unpackhi_epi64(r8, r9);
164   r2 = _mm_unpacklo_epi64(r10, r11);
165   r3 = _mm_unpackhi_epi64(r10, r11);
166 
167   r4 = _mm_unpacklo_epi64(r12, r13);
168   r5 = _mm_unpackhi_epi64(r12, r13);
169   r6 = _mm_unpacklo_epi64(r14, r15);
170   r7 = _mm_unpackhi_epi64(r14, r15);
171 
172   d[0] = _mm_unpacklo_epi64(r0, r2);
173   d[1] = _mm_unpacklo_epi64(r4, r6);
174   d[2] = _mm_unpacklo_epi64(r1, r3);
175   d[3] = _mm_unpacklo_epi64(r5, r7);
176 
177   d[4] = _mm_unpackhi_epi64(r0, r2);
178   d[5] = _mm_unpackhi_epi64(r4, r6);
179   d[6] = _mm_unpackhi_epi64(r1, r3);
180   d[7] = _mm_unpackhi_epi64(r5, r7);
181 }
182 
183 static inline void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
184   __m256i w0, w1, w2, w3, ww0, ww1;
185 
186   w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
187   w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
188   w2 = _mm256_unpackhi_epi16(x[0], x[1]);  // 40 50 41 51 42 52 43 53
189   w3 = _mm256_unpackhi_epi16(x[2], x[3]);  // 60 70 61 71 62 72 63 73
190 
191   ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
192   ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
193 
194   d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
195   d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
196 
197   ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
198   ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
199 
200   d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
201   d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
202 }
203 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
204 
205 static inline void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
206   __m256i w0, w1, w2, w3, ww0, ww1;
207 
208   w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
209   w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
210   w2 = _mm256_unpacklo_epi16(x[4], x[5]);  // 40 50 41 51 42 52 43 53
211   w3 = _mm256_unpacklo_epi16(x[6], x[7]);  // 60 70 61 71 62 72 63 73
212 
213   ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
214   ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
215 
216   d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
217   d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
218 
219   ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
220   ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
221 
222   d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
223   d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
224 
225   w0 = _mm256_unpackhi_epi16(x[0], x[1]);  // 04 14 05 15 06 16 07 17
226   w1 = _mm256_unpackhi_epi16(x[2], x[3]);  // 24 34 25 35 26 36 27 37
227   w2 = _mm256_unpackhi_epi16(x[4], x[5]);  // 44 54 45 55 46 56 47 57
228   w3 = _mm256_unpackhi_epi16(x[6], x[7]);  // 64 74 65 75 66 76 67 77
229 
230   ww0 = _mm256_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
231   ww1 = _mm256_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
232 
233   d[4] = _mm256_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
234   d[5] = _mm256_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
235 
236   ww0 = _mm256_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
237   ww1 = _mm256_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
238 
239   d[6] = _mm256_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
240   d[7] = _mm256_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
241 }
242 
243 static inline void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
244   __m256i w0, w1, w2, w3, ww0, ww1;
245   __m256i dd[16];
246   w0 = _mm256_unpacklo_epi16(x[0], x[1]);
247   w1 = _mm256_unpacklo_epi16(x[2], x[3]);
248   w2 = _mm256_unpacklo_epi16(x[4], x[5]);
249   w3 = _mm256_unpacklo_epi16(x[6], x[7]);
250 
251   ww0 = _mm256_unpacklo_epi32(w0, w1);  //
252   ww1 = _mm256_unpacklo_epi32(w2, w3);  //
253 
254   dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
255   dd[1] = _mm256_unpackhi_epi64(ww0, ww1);
256 
257   ww0 = _mm256_unpackhi_epi32(w0, w1);  //
258   ww1 = _mm256_unpackhi_epi32(w2, w3);  //
259 
260   dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
261   dd[3] = _mm256_unpackhi_epi64(ww0, ww1);
262 
263   w0 = _mm256_unpackhi_epi16(x[0], x[1]);
264   w1 = _mm256_unpackhi_epi16(x[2], x[3]);
265   w2 = _mm256_unpackhi_epi16(x[4], x[5]);
266   w3 = _mm256_unpackhi_epi16(x[6], x[7]);
267 
268   ww0 = _mm256_unpacklo_epi32(w0, w1);  //
269   ww1 = _mm256_unpacklo_epi32(w2, w3);  //
270 
271   dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
272   dd[5] = _mm256_unpackhi_epi64(ww0, ww1);
273 
274   ww0 = _mm256_unpackhi_epi32(w0, w1);  //
275   ww1 = _mm256_unpackhi_epi32(w2, w3);  //
276 
277   dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
278   dd[7] = _mm256_unpackhi_epi64(ww0, ww1);
279 
280   w0 = _mm256_unpacklo_epi16(x[8], x[9]);
281   w1 = _mm256_unpacklo_epi16(x[10], x[11]);
282   w2 = _mm256_unpacklo_epi16(x[12], x[13]);
283   w3 = _mm256_unpacklo_epi16(x[14], x[15]);
284 
285   ww0 = _mm256_unpacklo_epi32(w0, w1);
286   ww1 = _mm256_unpacklo_epi32(w2, w3);
287 
288   dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
289   dd[9] = _mm256_unpackhi_epi64(ww0, ww1);
290 
291   ww0 = _mm256_unpackhi_epi32(w0, w1);
292   ww1 = _mm256_unpackhi_epi32(w2, w3);
293 
294   dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
295   dd[11] = _mm256_unpackhi_epi64(ww0, ww1);
296 
297   w0 = _mm256_unpackhi_epi16(x[8], x[9]);
298   w1 = _mm256_unpackhi_epi16(x[10], x[11]);
299   w2 = _mm256_unpackhi_epi16(x[12], x[13]);
300   w3 = _mm256_unpackhi_epi16(x[14], x[15]);
301 
302   ww0 = _mm256_unpacklo_epi32(w0, w1);
303   ww1 = _mm256_unpacklo_epi32(w2, w3);
304 
305   dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
306   dd[13] = _mm256_unpackhi_epi64(ww0, ww1);
307 
308   ww0 = _mm256_unpackhi_epi32(w0, w1);
309   ww1 = _mm256_unpackhi_epi32(w2, w3);
310 
311   dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
312   dd[15] = _mm256_unpackhi_epi64(ww0, ww1);
313 
314   for (int i = 0; i < 8; i++) {
315     d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
316     d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
317                                        _mm256_extracti128_si256(dd[i], 1), 0);
318   }
319 }
320 #endif  // CONFIG_AV1_HIGHBITDEPTH
321 
322 void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
323                                  const uint8_t *above, const uint8_t *left) {
324   const __m256i sum_above = dc_sum_32(above);
325   __m256i sum_left = dc_sum_32(left);
326   sum_left = _mm256_add_epi16(sum_left, sum_above);
327   const __m256i thirtytwo = _mm256_set1_epi16(32);
328   sum_left = _mm256_add_epi16(sum_left, thirtytwo);
329   sum_left = _mm256_srai_epi16(sum_left, 6);
330   const __m256i zero = _mm256_setzero_si256();
331   __m256i row = _mm256_shuffle_epi8(sum_left, zero);
332   row_store_32xh(&row, 32, dst, stride);
333 }
334 
335 void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
336                                      const uint8_t *above,
337                                      const uint8_t *left) {
338   __m256i sum = dc_sum_32(above);
339   (void)left;
340 
341   const __m256i sixteen = _mm256_set1_epi16(16);
342   sum = _mm256_add_epi16(sum, sixteen);
343   sum = _mm256_srai_epi16(sum, 5);
344   const __m256i zero = _mm256_setzero_si256();
345   __m256i row = _mm256_shuffle_epi8(sum, zero);
346   row_store_32xh(&row, 32, dst, stride);
347 }
348 
349 void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
350                                       const uint8_t *above,
351                                       const uint8_t *left) {
352   __m256i sum = dc_sum_32(left);
353   (void)above;
354 
355   const __m256i sixteen = _mm256_set1_epi16(16);
356   sum = _mm256_add_epi16(sum, sixteen);
357   sum = _mm256_srai_epi16(sum, 5);
358   const __m256i zero = _mm256_setzero_si256();
359   __m256i row = _mm256_shuffle_epi8(sum, zero);
360   row_store_32xh(&row, 32, dst, stride);
361 }
362 
363 void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
364                                      const uint8_t *above,
365                                      const uint8_t *left) {
366   (void)above;
367   (void)left;
368   const __m256i row = _mm256_set1_epi8((int8_t)0x80);
369   row_store_32xh(&row, 32, dst, stride);
370 }
371 
372 void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
373                                 const uint8_t *above, const uint8_t *left) {
374   const __m256i row = _mm256_loadu_si256((const __m256i *)above);
375   (void)left;
376   row_store_32xh(&row, 32, dst, stride);
377 }
378 
379 // There are 32 rows in total. This function does lines
380 // 0,1,2,3 and 16,17,18,19. The next call does
381 // 4,5,6,7 and 20,21,22,23, so four calls
382 // cover all 32 rows.
383 static inline void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
384                                         ptrdiff_t stride) {
385   __m256i t[4];
386   __m256i m = _mm256_setzero_si256();
387   const __m256i inc = _mm256_set1_epi8(4);
388   int i;
389 
390   for (i = 0; i < 4; i++) {
391     t[i] = _mm256_shuffle_epi8(*row, m);
392     __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
393     __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
394     _mm256_storeu_si256((__m256i *)dst, r0);
395     _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
396     dst += stride;
397     m = _mm256_add_epi8(m, inc);
398   }
399 }
400 
401 void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
402                                 const uint8_t *above, const uint8_t *left) {
403   (void)above;
404   const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
405 
406   __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
407 
408   __m256i v = _mm256_unpacklo_epi8(u, u);
409   h_predictor_32x8line(&v, dst, stride);
410   dst += stride << 2;
411 
412   v = _mm256_unpackhi_epi8(u, u);
413   h_predictor_32x8line(&v, dst, stride);
414   dst += stride << 2;
415 
416   u = _mm256_unpackhi_epi8(left_col, left_col);
417 
418   v = _mm256_unpacklo_epi8(u, u);
419   h_predictor_32x8line(&v, dst, stride);
420   dst += stride << 2;
421 
422   v = _mm256_unpackhi_epi8(u, u);
423   h_predictor_32x8line(&v, dst, stride);
424 }
425 
426 // -----------------------------------------------------------------------------
427 // Rectangle
428 void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
429                                  const uint8_t *above, const uint8_t *left) {
430   const __m128i top_sum = dc_sum_32_sse2(above);
431   __m128i left_sum = dc_sum_16_sse2(left);
432   left_sum = _mm_add_epi16(top_sum, left_sum);
433   uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
434   sum += 24;
435   sum /= 48;
436   const __m256i row = _mm256_set1_epi8((int8_t)sum);
437   row_store_32xh(&row, 16, dst, stride);
438 }
439 
440 void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
441                                  const uint8_t *above, const uint8_t *left) {
442   const __m256i sum_above = dc_sum_32(above);
443   __m256i sum_left = dc_sum_64(left);
444   sum_left = _mm256_add_epi16(sum_left, sum_above);
445   uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
446   sum += 48;
447   sum /= 96;
448   const __m256i row = _mm256_set1_epi8((int8_t)sum);
449   row_store_32xh(&row, 64, dst, stride);
450 }
451 
452 void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
453                                  const uint8_t *above, const uint8_t *left) {
454   const __m256i sum_above = dc_sum_64(above);
455   __m256i sum_left = dc_sum_64(left);
456   sum_left = _mm256_add_epi16(sum_left, sum_above);
457   uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
458   sum += 64;
459   sum /= 128;
460   const __m256i row = _mm256_set1_epi8((int8_t)sum);
461   row_store_64xh(&row, 64, dst, stride);
462 }
463 
464 void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
465                                  const uint8_t *above, const uint8_t *left) {
466   const __m256i sum_above = dc_sum_64(above);
467   __m256i sum_left = dc_sum_32(left);
468   sum_left = _mm256_add_epi16(sum_left, sum_above);
469   uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
470   sum += 48;
471   sum /= 96;
472   const __m256i row = _mm256_set1_epi8((int8_t)sum);
473   row_store_64xh(&row, 32, dst, stride);
474 }
475 
476 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
477 void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
478                                  const uint8_t *above, const uint8_t *left) {
479   const __m256i sum_above = dc_sum_64(above);
480   __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
481   sum_left = _mm256_add_epi16(sum_left, sum_above);
482   uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
483   sum += 40;
484   sum /= 80;
485   const __m256i row = _mm256_set1_epi8((int8_t)sum);
486   row_store_64xh(&row, 16, dst, stride);
487 }
488 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
489 
490 void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
491                                      const uint8_t *above,
492                                      const uint8_t *left) {
493   __m256i sum = dc_sum_32(above);
494   (void)left;
495 
496   const __m256i sixteen = _mm256_set1_epi16(16);
497   sum = _mm256_add_epi16(sum, sixteen);
498   sum = _mm256_srai_epi16(sum, 5);
499   const __m256i zero = _mm256_setzero_si256();
500   __m256i row = _mm256_shuffle_epi8(sum, zero);
501   row_store_32xh(&row, 16, dst, stride);
502 }
503 
504 void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
505                                      const uint8_t *above,
506                                      const uint8_t *left) {
507   __m256i sum = dc_sum_32(above);
508   (void)left;
509 
510   const __m256i sixteen = _mm256_set1_epi16(16);
511   sum = _mm256_add_epi16(sum, sixteen);
512   sum = _mm256_srai_epi16(sum, 5);
513   const __m256i zero = _mm256_setzero_si256();
514   __m256i row = _mm256_shuffle_epi8(sum, zero);
515   row_store_32xh(&row, 64, dst, stride);
516 }
517 
518 void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
519                                      const uint8_t *above,
520                                      const uint8_t *left) {
521   __m256i sum = dc_sum_64(above);
522   (void)left;
523 
524   const __m256i thirtytwo = _mm256_set1_epi16(32);
525   sum = _mm256_add_epi16(sum, thirtytwo);
526   sum = _mm256_srai_epi16(sum, 6);
527   const __m256i zero = _mm256_setzero_si256();
528   __m256i row = _mm256_shuffle_epi8(sum, zero);
529   row_store_64xh(&row, 64, dst, stride);
530 }
531 
532 void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
533                                      const uint8_t *above,
534                                      const uint8_t *left) {
535   __m256i sum = dc_sum_64(above);
536   (void)left;
537 
538   const __m256i thirtytwo = _mm256_set1_epi16(32);
539   sum = _mm256_add_epi16(sum, thirtytwo);
540   sum = _mm256_srai_epi16(sum, 6);
541   const __m256i zero = _mm256_setzero_si256();
542   __m256i row = _mm256_shuffle_epi8(sum, zero);
543   row_store_64xh(&row, 32, dst, stride);
544 }
545 
546 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
547 void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
548                                      const uint8_t *above,
549                                      const uint8_t *left) {
550   __m256i sum = dc_sum_64(above);
551   (void)left;
552 
553   const __m256i thirtytwo = _mm256_set1_epi16(32);
554   sum = _mm256_add_epi16(sum, thirtytwo);
555   sum = _mm256_srai_epi16(sum, 6);
556   const __m256i zero = _mm256_setzero_si256();
557   __m256i row = _mm256_shuffle_epi8(sum, zero);
558   row_store_64xh(&row, 16, dst, stride);
559 }
560 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
561 
562 void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
563                                       const uint8_t *above,
564                                       const uint8_t *left) {
565   __m128i sum = dc_sum_16_sse2(left);
566   (void)above;
567 
568   const __m128i eight = _mm_set1_epi16(8);
569   sum = _mm_add_epi16(sum, eight);
570   sum = _mm_srai_epi16(sum, 4);
571   const __m128i zero = _mm_setzero_si128();
572   const __m128i r = _mm_shuffle_epi8(sum, zero);
573   const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
574   row_store_32xh(&row, 16, dst, stride);
575 }
576 
577 void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
578                                       const uint8_t *above,
579                                       const uint8_t *left) {
580   __m256i sum = dc_sum_64(left);
581   (void)above;
582 
583   const __m256i thirtytwo = _mm256_set1_epi16(32);
584   sum = _mm256_add_epi16(sum, thirtytwo);
585   sum = _mm256_srai_epi16(sum, 6);
586   const __m256i zero = _mm256_setzero_si256();
587   __m256i row = _mm256_shuffle_epi8(sum, zero);
588   row_store_32xh(&row, 64, dst, stride);
589 }
590 
591 void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
592                                       const uint8_t *above,
593                                       const uint8_t *left) {
594   __m256i sum = dc_sum_64(left);
595   (void)above;
596 
597   const __m256i thirtytwo = _mm256_set1_epi16(32);
598   sum = _mm256_add_epi16(sum, thirtytwo);
599   sum = _mm256_srai_epi16(sum, 6);
600   const __m256i zero = _mm256_setzero_si256();
601   __m256i row = _mm256_shuffle_epi8(sum, zero);
602   row_store_64xh(&row, 64, dst, stride);
603 }
604 
605 void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
606                                       const uint8_t *above,
607                                       const uint8_t *left) {
608   __m256i sum = dc_sum_32(left);
609   (void)above;
610 
611   const __m256i sixteen = _mm256_set1_epi16(16);
612   sum = _mm256_add_epi16(sum, sixteen);
613   sum = _mm256_srai_epi16(sum, 5);
614   const __m256i zero = _mm256_setzero_si256();
615   __m256i row = _mm256_shuffle_epi8(sum, zero);
616   row_store_64xh(&row, 32, dst, stride);
617 }
618 
619 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
620 void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
621                                       const uint8_t *above,
622                                       const uint8_t *left) {
623   __m128i sum = dc_sum_16_sse2(left);
624   (void)above;
625 
626   const __m128i eight = _mm_set1_epi16(8);
627   sum = _mm_add_epi16(sum, eight);
628   sum = _mm_srai_epi16(sum, 4);
629   const __m128i zero = _mm_setzero_si128();
630   const __m128i r = _mm_shuffle_epi8(sum, zero);
631   const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
632   row_store_64xh(&row, 16, dst, stride);
633 }
634 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
635 
636 void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
637                                      const uint8_t *above,
638                                      const uint8_t *left) {
639   (void)above;
640   (void)left;
641   const __m256i row = _mm256_set1_epi8((int8_t)0x80);
642   row_store_32xh(&row, 16, dst, stride);
643 }
644 
645 void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
646                                      const uint8_t *above,
647                                      const uint8_t *left) {
648   (void)above;
649   (void)left;
650   const __m256i row = _mm256_set1_epi8((int8_t)0x80);
651   row_store_32xh(&row, 64, dst, stride);
652 }
653 
654 void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
655                                      const uint8_t *above,
656                                      const uint8_t *left) {
657   (void)above;
658   (void)left;
659   const __m256i row = _mm256_set1_epi8((int8_t)0x80);
660   row_store_64xh(&row, 64, dst, stride);
661 }
662 
663 void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
664                                      const uint8_t *above,
665                                      const uint8_t *left) {
666   (void)above;
667   (void)left;
668   const __m256i row = _mm256_set1_epi8((int8_t)0x80);
669   row_store_64xh(&row, 32, dst, stride);
670 }
671 
672 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
673 void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
674                                      const uint8_t *above,
675                                      const uint8_t *left) {
676   (void)above;
677   (void)left;
678   const __m256i row = _mm256_set1_epi8((int8_t)0x80);
679   row_store_64xh(&row, 16, dst, stride);
680 }
681 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
682 
683 void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
684                                 const uint8_t *above, const uint8_t *left) {
685   const __m256i row = _mm256_loadu_si256((const __m256i *)above);
686   (void)left;
687   row_store_32xh(&row, 16, dst, stride);
688 }
689 
690 void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
691                                 const uint8_t *above, const uint8_t *left) {
692   const __m256i row = _mm256_loadu_si256((const __m256i *)above);
693   (void)left;
694   row_store_32xh(&row, 64, dst, stride);
695 }
696 
697 void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
698                                 const uint8_t *above, const uint8_t *left) {
699   const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
700   const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
701   (void)left;
702   row_store_32x2xh(&row0, &row1, 64, dst, stride);
703 }
704 
705 void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
706                                 const uint8_t *above, const uint8_t *left) {
707   const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
708   const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
709   (void)left;
710   row_store_32x2xh(&row0, &row1, 32, dst, stride);
711 }
712 
713 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
714 void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
715                                 const uint8_t *above, const uint8_t *left) {
716   const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
717   const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
718   (void)left;
719   row_store_32x2xh(&row0, &row1, 16, dst, stride);
720 }
721 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
722 
723 // -----------------------------------------------------------------------------
724 // PAETH_PRED
725 
726 // Return 16 16-bit pixels in one row (__m256i)
727 static inline __m256i paeth_pred(const __m256i *left, const __m256i *top,
728                                  const __m256i *topleft) {
729   const __m256i base =
730       _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
731 
732   __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
733   __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
734   __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
735 
736   __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
737   mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
738   __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
739 
740   pl = _mm256_andnot_si256(mask1, *left);
741 
742   ptl = _mm256_and_si256(mask2, *topleft);
743   pt = _mm256_andnot_si256(mask2, *top);
744   pt = _mm256_or_si256(pt, ptl);
745   pt = _mm256_and_si256(mask1, pt);
746 
747   return _mm256_or_si256(pt, pl);
748 }
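// Scalar model of the selection implemented above (illustrative sketch): with
// base = left + top - topleft, the prediction is whichever of left, top and
// topleft is closest to base, ties resolved in that order:
//   const int base = left + top - topleft;
//   const int pl = abs(base - left), pt = abs(base - top);
//   const int ptl = abs(base - topleft);
//   pred = (pl <= pt && pl <= ptl) ? left : (pt <= ptl) ? top : topleft;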
749 
750 // Return 16 8-bit pixels in one row (__m128i)
751 static inline __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
752                                       const __m256i *topleft) {
753   const __m256i p0 = paeth_pred(left, top, topleft);
754   const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
755   const __m256i p = _mm256_packus_epi16(p0, p1);
756   return _mm256_castsi256_si128(p);
757 }
758 
759 static inline __m256i get_top_vector(const uint8_t *above) {
760   const __m128i x = _mm_load_si128((const __m128i *)above);
761   const __m128i zero = _mm_setzero_si128();
762   const __m128i t0 = _mm_unpacklo_epi8(x, zero);
763   const __m128i t1 = _mm_unpackhi_epi8(x, zero);
764   return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
765 }
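// get_top_vector() widens the 16 bytes of the above row to 16-bit lanes: the
// first 8 pixels land in the low 128-bit half and the next 8 in the high half,
// matching the 16x16-bit layout that paeth_pred() operates on.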
766 
767 void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
768                                    const uint8_t *above, const uint8_t *left) {
769   __m128i x = _mm_loadl_epi64((const __m128i *)left);
770   const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
771   const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
772   __m256i rep = _mm256_set1_epi16((short)0x8000);
773   const __m256i one = _mm256_set1_epi16(1);
774   const __m256i top = get_top_vector(above);
775 
776   int i;
777   for (i = 0; i < 8; ++i) {
778     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
779     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
780 
781     _mm_store_si128((__m128i *)dst, row);
782     dst += stride;
783     rep = _mm256_add_epi16(rep, one);
784   }
785 }
786 
787 static inline __m256i get_left_vector(const uint8_t *left) {
788   const __m128i x = _mm_load_si128((const __m128i *)left);
789   return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
790 }
791 
792 void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
793                                     const uint8_t *above, const uint8_t *left) {
794   const __m256i l = get_left_vector(left);
795   const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
796   __m256i rep = _mm256_set1_epi16((short)0x8000);
797   const __m256i one = _mm256_set1_epi16(1);
798   const __m256i top = get_top_vector(above);
799 
800   int i;
801   for (i = 0; i < 16; ++i) {
802     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
803     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
804 
805     _mm_store_si128((__m128i *)dst, row);
806     dst += stride;
807     rep = _mm256_add_epi16(rep, one);
808   }
809 }
810 
811 void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
812                                     const uint8_t *above, const uint8_t *left) {
813   __m256i l = get_left_vector(left);
814   const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
815   __m256i rep = _mm256_set1_epi16((short)0x8000);
816   const __m256i one = _mm256_set1_epi16(1);
817   const __m256i top = get_top_vector(above);
818 
819   int i;
820   for (i = 0; i < 16; ++i) {
821     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
822     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
823 
824     _mm_store_si128((__m128i *)dst, row);
825     dst += stride;
826     rep = _mm256_add_epi16(rep, one);
827   }
828 
829   l = get_left_vector(left + 16);
830   rep = _mm256_set1_epi16((short)0x8000);
831   for (i = 0; i < 16; ++i) {
832     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
833     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
834 
835     _mm_store_si128((__m128i *)dst, row);
836     dst += stride;
837     rep = _mm256_add_epi16(rep, one);
838   }
839 }
840 
841 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
842 void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
843                                     const uint8_t *above, const uint8_t *left) {
844   const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
845   const __m256i one = _mm256_set1_epi16(1);
846   const __m256i top = get_top_vector(above);
847 
848   for (int j = 0; j < 4; ++j) {
849     const __m256i l = get_left_vector(left + j * 16);
850     __m256i rep = _mm256_set1_epi16((short)0x8000);
851     for (int i = 0; i < 16; ++i) {
852       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
853       const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
854 
855       _mm_store_si128((__m128i *)dst, row);
856       dst += stride;
857       rep = _mm256_add_epi16(rep, one);
858     }
859   }
860 }
861 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
862 
863 // Return 32 8-bit pixels in one row (__m256i)
864 static inline __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
865                                       const __m256i *top1,
866                                       const __m256i *topleft) {
867   __m256i p0 = paeth_pred(left, top0, topleft);
868   __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
869   const __m256i x0 = _mm256_packus_epi16(p0, p1);
870 
871   p0 = paeth_pred(left, top1, topleft);
872   p1 = _mm256_permute4x64_epi64(p0, 0xe);
873   const __m256i x1 = _mm256_packus_epi16(p0, p1);
874 
875   return _mm256_permute2x128_si256(x0, x1, 0x20);
876 }
877 
878 void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
879                                     const uint8_t *above, const uint8_t *left) {
880   const __m256i l = get_left_vector(left);
881   const __m256i t0 = get_top_vector(above);
882   const __m256i t1 = get_top_vector(above + 16);
883   const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
884   __m256i rep = _mm256_set1_epi16((short)0x8000);
885   const __m256i one = _mm256_set1_epi16(1);
886 
887   int i;
888   for (i = 0; i < 16; ++i) {
889     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
890 
891     const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
892 
893     _mm256_storeu_si256((__m256i *)dst, r);
894 
895     dst += stride;
896     rep = _mm256_add_epi16(rep, one);
897   }
898 }
899 
900 void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
901                                     const uint8_t *above, const uint8_t *left) {
902   __m256i l = get_left_vector(left);
903   const __m256i t0 = get_top_vector(above);
904   const __m256i t1 = get_top_vector(above + 16);
905   const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
906   __m256i rep = _mm256_set1_epi16((short)0x8000);
907   const __m256i one = _mm256_set1_epi16(1);
908 
909   int i;
910   for (i = 0; i < 16; ++i) {
911     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
912 
913     const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
914     const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
915 
916     _mm_store_si128((__m128i *)dst, r0);
917     _mm_store_si128((__m128i *)(dst + 16), r1);
918 
919     dst += stride;
920     rep = _mm256_add_epi16(rep, one);
921   }
922 
923   l = get_left_vector(left + 16);
924   rep = _mm256_set1_epi16((short)0x8000);
925   for (i = 0; i < 16; ++i) {
926     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
927 
928     const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
929     const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
930 
931     _mm_store_si128((__m128i *)dst, r0);
932     _mm_store_si128((__m128i *)(dst + 16), r1);
933 
934     dst += stride;
935     rep = _mm256_add_epi16(rep, one);
936   }
937 }
938 
939 void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
940                                     const uint8_t *above, const uint8_t *left) {
941   const __m256i t0 = get_top_vector(above);
942   const __m256i t1 = get_top_vector(above + 16);
943   const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
944   const __m256i one = _mm256_set1_epi16(1);
945 
946   int i, j;
947   for (j = 0; j < 4; ++j) {
948     const __m256i l = get_left_vector(left + j * 16);
949     __m256i rep = _mm256_set1_epi16((short)0x8000);
950     for (i = 0; i < 16; ++i) {
951       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
952 
953       const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
954       const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
955 
956       _mm_store_si128((__m128i *)dst, r0);
957       _mm_store_si128((__m128i *)(dst + 16), r1);
958 
959       dst += stride;
960       rep = _mm256_add_epi16(rep, one);
961     }
962   }
963 }
964 
965 void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
966                                     const uint8_t *above, const uint8_t *left) {
967   const __m256i t0 = get_top_vector(above);
968   const __m256i t1 = get_top_vector(above + 16);
969   const __m256i t2 = get_top_vector(above + 32);
970   const __m256i t3 = get_top_vector(above + 48);
971   const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
972   const __m256i one = _mm256_set1_epi16(1);
973 
974   int i, j;
975   for (j = 0; j < 2; ++j) {
976     const __m256i l = get_left_vector(left + j * 16);
977     __m256i rep = _mm256_set1_epi16((short)0x8000);
978     for (i = 0; i < 16; ++i) {
979       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
980 
981       const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
982       const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
983       const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
984       const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
985 
986       _mm_store_si128((__m128i *)dst, r0);
987       _mm_store_si128((__m128i *)(dst + 16), r1);
988       _mm_store_si128((__m128i *)(dst + 32), r2);
989       _mm_store_si128((__m128i *)(dst + 48), r3);
990 
991       dst += stride;
992       rep = _mm256_add_epi16(rep, one);
993     }
994   }
995 }
996 
997 void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
998                                     const uint8_t *above, const uint8_t *left) {
999   const __m256i t0 = get_top_vector(above);
1000   const __m256i t1 = get_top_vector(above + 16);
1001   const __m256i t2 = get_top_vector(above + 32);
1002   const __m256i t3 = get_top_vector(above + 48);
1003   const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
1004   const __m256i one = _mm256_set1_epi16(1);
1005 
1006   int i, j;
1007   for (j = 0; j < 4; ++j) {
1008     const __m256i l = get_left_vector(left + j * 16);
1009     __m256i rep = _mm256_set1_epi16((short)0x8000);
1010     for (i = 0; i < 16; ++i) {
1011       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
1012 
1013       const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
1014       const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
1015       const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1016       const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1017 
1018       _mm_store_si128((__m128i *)dst, r0);
1019       _mm_store_si128((__m128i *)(dst + 16), r1);
1020       _mm_store_si128((__m128i *)(dst + 32), r2);
1021       _mm_store_si128((__m128i *)(dst + 48), r3);
1022 
1023       dst += stride;
1024       rep = _mm256_add_epi16(rep, one);
1025     }
1026   }
1027 }
1028 
1029 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1030 void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
1031                                     const uint8_t *above, const uint8_t *left) {
1032   const __m256i t0 = get_top_vector(above);
1033   const __m256i t1 = get_top_vector(above + 16);
1034   const __m256i t2 = get_top_vector(above + 32);
1035   const __m256i t3 = get_top_vector(above + 48);
1036   const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
1037   const __m256i one = _mm256_set1_epi16(1);
1038 
1039   int i;
1040   const __m256i l = get_left_vector(left);
1041   __m256i rep = _mm256_set1_epi16((short)0x8000);
1042   for (i = 0; i < 16; ++i) {
1043     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
1044 
1045     const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
1046     const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
1047     const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1048     const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1049 
1050     _mm_store_si128((__m128i *)dst, r0);
1051     _mm_store_si128((__m128i *)(dst + 16), r1);
1052     _mm_store_si128((__m128i *)(dst + 32), r2);
1053     _mm_store_si128((__m128i *)(dst + 48), r3);
1054 
1055     dst += stride;
1056     rep = _mm256_add_epi16(rep, one);
1057   }
1058 }
1059 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1060 
1061 #if CONFIG_AV1_HIGHBITDEPTH
1062 
1063 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
1064     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1065   const int frac_bits = 6 - upsample_above;
1066   const int max_base_x = ((N + 4) - 1) << upsample_above;
1067 
1068   assert(dx > 0);
1069   // pre-filter above pixels
1070   // store in temp buffers:
1071   //   above[x] * 32 + 16
1072   //   above[x+1] - above[x]
1073   // final pixels will be calculated as:
1074   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1075   __m256i a0, a1, a32, a16;
1076   __m256i diff, c3f;
1077   __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1078   __m128i a0_128, a1_128;
1079   a16 = _mm256_set1_epi16(16);
1080   a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1081   max_base_x128 = _mm_set1_epi16(max_base_x);
1082   c3f = _mm256_set1_epi16(0x3f);
1083 
1084   int x = dx;
1085   for (int r = 0; r < N; r++) {
1086     __m256i b, res, shift;
1087     __m128i res1;
1088 
1089     int base = x >> frac_bits;
1090     if (base >= max_base_x) {
1091       for (int i = r; i < N; ++i) {
1092         dst[i] = a_mbase_x;  // save 4 values
1093       }
1094       return;
1095     }
1096 
1097     a0_128 = _mm_loadu_si128((__m128i *)(above + base));
1098     a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1099 
1100     if (upsample_above) {
1101       a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
1102       a1_128 = _mm_srli_si128(a0_128, 8);
1103 
1104       base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
1105                                    base + 10, base + 12, base + 14);
1106       shift = _mm256_srli_epi16(
1107           _mm256_and_si256(
1108               _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
1109               _mm256_set1_epi16(0x3f)),
1110           1);
1111     } else {
1112       base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
1113                                    base + 5, base + 6, base + 7);
1114       shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1115     }
1116     a0 = _mm256_castsi128_si256(a0_128);
1117     a1 = _mm256_castsi128_si256(a1_128);
1118     diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1119     a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1120     a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1121 
1122     b = _mm256_mullo_epi16(diff, shift);
1123     res = _mm256_add_epi16(a32, b);
1124     res = _mm256_srli_epi16(res, 5);
1125     res1 = _mm256_castsi256_si128(res);
1126 
1127     mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
1128     dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1129     x += dx;
1130   }
1131 }
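// Per-pixel scalar model of the Z1 interpolation done above (illustrative
// sketch; shift is the 5-bit fractional position derived from x & 0x3f):
//   if (base < max_base_x)
//     dst = (above[base] * 32 + 16 +
//            (above[base + 1] - above[base]) * shift) >> 5;
//   else
//     dst = above[max_base_x];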
1132 
1133 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
1134     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1135   const int frac_bits = 6 - upsample_above;
1136   const int max_base_x = ((N + 4) - 1) << upsample_above;
1137 
1138   assert(dx > 0);
1139   // pre-filter above pixels
1140   // store in temp buffers:
1141   //   above[x] * 32 + 16
1142   //   above[x+1] - above[x]
1143   // final pixels will be calculated as:
1144   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1145   __m256i a0, a1, a32, a16;
1146   __m256i diff;
1147   __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1148 
1149   a16 = _mm256_set1_epi32(16);
1150   a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1151   max_base_x128 = _mm_set1_epi32(max_base_x);
1152 
1153   int x = dx;
1154   for (int r = 0; r < N; r++) {
1155     __m256i b, res, shift;
1156     __m128i res1;
1157 
1158     int base = x >> frac_bits;
1159     if (base >= max_base_x) {
1160       for (int i = r; i < N; ++i) {
1161         dst[i] = a_mbase_x;  // save 4 values
1162       }
1163       return;
1164     }
1165 
1166     a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1167     a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1168 
1169     if (upsample_above) {
1170       a0 = _mm256_permutevar8x32_epi32(
1171           a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1172       a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1173       base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
1174       shift = _mm256_srli_epi32(
1175           _mm256_and_si256(
1176               _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1177               _mm256_set1_epi32(0x3f)),
1178           1);
1179     } else {
1180       base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
1181       shift = _mm256_srli_epi32(
1182           _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1183     }
1184 
1185     diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1186     a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1187     a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1188 
1189     b = _mm256_mullo_epi32(diff, shift);
1190     res = _mm256_add_epi32(a32, b);
1191     res = _mm256_srli_epi32(res, 5);
1192 
1193     res1 = _mm256_castsi256_si128(res);
1194     res1 = _mm_packus_epi32(res1, res1);
1195 
1196     mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
1197     mask128 = _mm_packs_epi32(mask128, mask128);  // goto 16 bit
1198     dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1199     x += dx;
1200   }
1201 }
1202 
1203 static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
1204                                              ptrdiff_t stride,
1205                                              const uint16_t *above,
1206                                              int upsample_above, int dx,
1207                                              int bd) {
1208   __m128i dstvec[16];
1209   if (bd < 12) {
1210     highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
1211                                               dx);
1212   } else {
1213     highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
1214                                                     upsample_above, dx);
1215   }
1216   for (int i = 0; i < N; i++) {
1217     _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
1218   }
1219 }
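// Note on the bd < 12 split above: with 8- or 10-bit samples the intermediate
// above[x] * 32 + 16 + diff * shift stays within an unsigned 16-bit range, so
// the 16-bit-lane path is sufficient; 12-bit input can overflow 16 bits, hence
// the 32-bit variant.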
1220 
1221 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
1222     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1223   const int frac_bits = 6 - upsample_above;
1224   const int max_base_x = ((8 + N) - 1) << upsample_above;
1225 
1226   assert(dx > 0);
1227   // pre-filter above pixels
1228   // store in temp buffers:
1229   //   above[x] * 32 + 16
1230   //   above[x+1] - above[x]
1231   // final pixels will be calculated as:
1232   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1233   __m256i a0, a1, a0_1, a1_1, a32, a16;
1234   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1235 
1236   a16 = _mm256_set1_epi32(16);
1237   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1238   max_base_x256 = _mm256_set1_epi32(max_base_x);
1239 
1240   int x = dx;
1241   for (int r = 0; r < N; r++) {
1242     __m256i b, res, res1, shift;
1243 
1244     int base = x >> frac_bits;
1245     if (base >= max_base_x) {
1246       for (int i = r; i < N; ++i) {
1247         dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
1248       }
1249       return;
1250     }
1251 
1252     a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1253     a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1254 
1255     if (upsample_above) {
1256       a0 = _mm256_permutevar8x32_epi32(
1257           a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1258       a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1259 
1260       a0_1 =
1261           _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1262       a0_1 = _mm256_permutevar8x32_epi32(
1263           a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1264       a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
1265 
1266       a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
1267       a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
1268       base_inc256 =
1269           _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
1270                             base + 10, base + 12, base + 14);
1271       shift = _mm256_srli_epi32(
1272           _mm256_and_si256(
1273               _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1274               _mm256_set1_epi32(0x3f)),
1275           1);
1276     } else {
1277       base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
1278                                       base + 4, base + 5, base + 6, base + 7);
1279       shift = _mm256_srli_epi32(
1280           _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1281     }
1282 
1283     diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1284     a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1285     a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1286 
1287     b = _mm256_mullo_epi32(diff, shift);
1288     res = _mm256_add_epi32(a32, b);
1289     res = _mm256_srli_epi32(res, 5);
1290 
1291     res1 = _mm256_packus_epi32(
1292         res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
1293 
1294     mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
1295     mask256 = _mm256_packs_epi32(
1296         mask256, _mm256_castsi128_si256(
1297                      _mm256_extracti128_si256(mask256, 1)));  // pack to 16 bit
1298     res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1299     dst[r] = _mm256_castsi256_si128(res1);
1300     x += dx;
1301   }
1302 }
1303 
highbd_dr_prediction_z1_8xN_internal_avx2(int N,__m128i * dst,const uint16_t * above,int upsample_above,int dx)1304 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
1305     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1306   const int frac_bits = 6 - upsample_above;
1307   const int max_base_x = ((8 + N) - 1) << upsample_above;
1308 
1309   assert(dx > 0);
1310   // pre-filter above pixels
1311   // store in temp buffers:
1312   //   above[x] * 32 + 16
1313   //   above[x+1] - above[x]
1314   // final pixels will be calculated as:
1315   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1316   __m256i a0, a1, a32, a16, c3f;
1317   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1318   __m128i a0_x128, a1_x128;
1319 
1320   a16 = _mm256_set1_epi16(16);
1321   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1322   max_base_x256 = _mm256_set1_epi16(max_base_x);
1323   c3f = _mm256_set1_epi16(0x3f);
1324 
1325   int x = dx;
1326   for (int r = 0; r < N; r++) {
1327     __m256i b, res, res1, shift;
1328 
1329     int base = x >> frac_bits;
1330     if (base >= max_base_x) {
1331       for (int i = r; i < N; ++i) {
1332         dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
1333       }
1334       return;
1335     }
1336 
1337     a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
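    // With an upsampled edge, output column k interpolates between
    // above[base + 2 * k] and above[base + 2 * k + 1], so the contiguous
    // samples loaded here are de-interleaved into even (a0) and odd (a1)
    // halves via HighbdEvenOddMaskx, and the per-column base advances by 2.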
1338     if (upsample_above) {
1339       __m128i mask, atmp0, atmp1, atmp2, atmp3;
1340       a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
1341       atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1342       atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1343       atmp2 =
1344           _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1345       atmp3 =
1346           _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1347       mask =
1348           _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
1349       a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
1350       mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
1351                             _mm_set1_epi8(15));
1352       a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
1353 
1354       base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
1355                                       base + 8, base + 10, base + 12, base + 14,
1356                                       0, 0, 0, 0, 0, 0, 0, 0);
1357       shift = _mm256_srli_epi16(
1358           _mm256_and_si256(
1359               _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
1360           1);
1361     } else {
1362       a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1363       base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1364                                       base + 4, base + 5, base + 6, base + 7, 0,
1365                                       0, 0, 0, 0, 0, 0, 0);
1366       shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1367     }
1368     a0 = _mm256_castsi128_si256(a0_x128);
1369     a1 = _mm256_castsi128_si256(a1_x128);
1370 
1371     diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1372     a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1373     a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1374 
1375     b = _mm256_mullo_epi16(diff, shift);
1376     res = _mm256_add_epi16(a32, b);
1377     res = _mm256_srli_epi16(res, 5);
1378 
1379     mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1380     res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1381     dst[r] = _mm256_castsi256_si128(res1);
1382     x += dx;
1383   }
1384 }
1385 
highbd_dr_prediction_z1_8xN_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,int upsample_above,int dx,int bd)1386 static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
1387                                              ptrdiff_t stride,
1388                                              const uint16_t *above,
1389                                              int upsample_above, int dx,
1390                                              int bd) {
1391   __m128i dstvec[32];
1392   if (bd < 12) {
1393     highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
1394                                               dx);
1395   } else {
1396     highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
1397                                                     upsample_above, dx);
1398   }
1399   for (int i = 0; i < N; i++) {
1400     _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
1401   }
1402 }
1403 
highbd_dr_prediction_32bit_z1_16xN_internal_avx2(int N,__m256i * dstvec,const uint16_t * above,int upsample_above,int dx)1404 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
1405     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1406   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1407   (void)upsample_above;
1408   const int frac_bits = 6;
1409   const int max_base_x = ((16 + N) - 1);
1410 
1411   // pre-filter above pixels
1412   // store in temp buffers:
1413   //   above[x] * 32 + 16
1414   //   above[x+1] - above[x]
1415   // final pixels will be calculated as:
1416   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1417   __m256i a0, a0_1, a1, a1_1, a32, a16;
1418   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1419 
1420   a16 = _mm256_set1_epi32(16);
1421   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1422   max_base_x256 = _mm256_set1_epi16(max_base_x);
1423 
1424   int x = dx;
1425   for (int r = 0; r < N; r++) {
1426     __m256i b, res[2], res1;
1427 
1428     int base = x >> frac_bits;
1429     if (base >= max_base_x) {
1430       for (int i = r; i < N; ++i) {
1431         dstvec[i] = a_mbase_x;  // save 16 values
1432       }
1433       return;
1434     }
1435     __m256i shift = _mm256_srli_epi32(
1436         _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1437 
1438     a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1439     a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1440 
1441     diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1442     a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1443     a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1444     b = _mm256_mullo_epi32(diff, shift);
1445 
1446     res[0] = _mm256_add_epi32(a32, b);
1447     res[0] = _mm256_srli_epi32(res[0], 5);
1448     res[0] = _mm256_packus_epi32(
1449         res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1450 
1451     int mdif = max_base_x - base;
1452     if (mdif > 8) {
1453       a0_1 =
1454           _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1455       a1_1 =
1456           _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
1457 
1458       diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1459       a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1460       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1461       b = _mm256_mullo_epi32(diff, shift);
1462 
1463       res[1] = _mm256_add_epi32(a32, b);
1464       res[1] = _mm256_srli_epi32(res[1], 5);
1465       res[1] = _mm256_packus_epi32(
1466           res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1467     } else {
1468       res[1] = a_mbase_x;
1469     }
1470     res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1471                                    1);  // 16 16bit values
1472 
1473     base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1474                                     base + 4, base + 5, base + 6, base + 7,
1475                                     base + 8, base + 9, base + 10, base + 11,
1476                                     base + 12, base + 13, base + 14, base + 15);
1477     mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1478     dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1479     x += dx;
1480   }
1481 }
1482 
highbd_dr_prediction_z1_16xN_internal_avx2(int N,__m256i * dstvec,const uint16_t * above,int upsample_above,int dx)1483 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
1484     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1485   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1486   (void)upsample_above;
1487   const int frac_bits = 6;
1488   const int max_base_x = ((16 + N) - 1);
1489 
1490   // pre-filter above pixels
1491   // store in temp buffers:
1492   //   above[x] * 32 + 16
1493   //   above[x+1] - above[x]
1494   // final pixels will be calculated as:
1495   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1496   __m256i a0, a1, a32, a16, c3f;
1497   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1498 
1499   a16 = _mm256_set1_epi16(16);
1500   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1501   max_base_x256 = _mm256_set1_epi16(max_base_x);
1502   c3f = _mm256_set1_epi16(0x3f);
1503 
1504   int x = dx;
1505   for (int r = 0; r < N; r++) {
1506     __m256i b, res;
1507 
1508     int base = x >> frac_bits;
1509     if (base >= max_base_x) {
1510       for (int i = r; i < N; ++i) {
1511         dstvec[i] = a_mbase_x;  // save 16 values
1512       }
1513       return;
1514     }
1515     __m256i shift =
1516         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1517 
1518     a0 = _mm256_loadu_si256((__m256i *)(above + base));
1519     a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
1520 
1521     diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1522     a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1523     a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1524     b = _mm256_mullo_epi16(diff, shift);
1525 
1526     res = _mm256_add_epi16(a32, b);
1527     res = _mm256_srli_epi16(res, 5);  // 16 16bit values
1528 
1529     base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1530                                     base + 4, base + 5, base + 6, base + 7,
1531                                     base + 8, base + 9, base + 10, base + 11,
1532                                     base + 12, base + 13, base + 14, base + 15);
1533     mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1534     dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1535     x += dx;
1536   }
1537 }
1538 
highbd_dr_prediction_z1_16xN_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,int upsample_above,int dx,int bd)1539 static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
1540                                               ptrdiff_t stride,
1541                                               const uint16_t *above,
1542                                               int upsample_above, int dx,
1543                                               int bd) {
1544   __m256i dstvec[64];
1545   if (bd < 12) {
1546     highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
1547                                                dx);
1548   } else {
1549     highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
1550                                                      upsample_above, dx);
1551   }
1552   for (int i = 0; i < N; i++) {
1553     _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1554   }
1555 }
1556 
highbd_dr_prediction_32bit_z1_32xN_internal_avx2(int N,__m256i * dstvec,const uint16_t * above,int upsample_above,int dx)1557 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
1558     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1559   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1560   (void)upsample_above;
1561   const int frac_bits = 6;
1562   const int max_base_x = ((32 + N) - 1);
1563 
1564   // pre-filter above pixels
1565   // store in temp buffers:
1566   //   above[x] * 32 + 16
1567   //   above[x+1] - above[x]
1568   // final pixels will be calculated as:
1569   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1570   __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
1571   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1572 
1573   a16 = _mm256_set1_epi32(16);
1574   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1575   max_base_x256 = _mm256_set1_epi16(max_base_x);
1576   c3f = _mm256_set1_epi16(0x3f);
1577 
1578   int x = dx;
1579   for (int r = 0; r < N; r++) {
1580     __m256i b, res[2], res1;
1581 
1582     int base = x >> frac_bits;
1583     if (base >= max_base_x) {
1584       for (int i = r; i < N; ++i) {
1585         dstvec[i] = a_mbase_x;  // save 32 values
1586         dstvec[i + N] = a_mbase_x;
1587       }
1588       return;
1589     }
1590 
1591     __m256i shift =
1592         _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
1593 
1594     for (int j = 0; j < 32; j += 16) {
1595       int mdif = max_base_x - (base + j);
1596       if (mdif <= 0) {
1597         res1 = a_mbase_x;
1598       } else {
1599         a0 = _mm256_cvtepu16_epi32(
1600             _mm_loadu_si128((__m128i *)(above + base + j)));
1601         a1 = _mm256_cvtepu16_epi32(
1602             _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
1603 
1604         diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1605         a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1606         a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1607         b = _mm256_mullo_epi32(diff, shift);
1608 
1609         res[0] = _mm256_add_epi32(a32, b);
1610         res[0] = _mm256_srli_epi32(res[0], 5);
1611         res[0] = _mm256_packus_epi32(
1612             res[0],
1613             _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1614         if (mdif > 8) {
1615           a0_1 = _mm256_cvtepu16_epi32(
1616               _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
1617           a1_1 = _mm256_cvtepu16_epi32(
1618               _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
1619 
1620           diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1621           a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1622           a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1623           b = _mm256_mullo_epi32(diff, shift);
1624 
1625           res[1] = _mm256_add_epi32(a32, b);
1626           res[1] = _mm256_srli_epi32(res[1], 5);
1627           res[1] = _mm256_packus_epi32(
1628               res[1],
1629               _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1630         } else {
1631           res[1] = a_mbase_x;
1632         }
1633         res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1634                                        1);  // 16 16bit values
1635         base_inc256 = _mm256_setr_epi16(
1636             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1637             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1638             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1639             base + j + 13, base + j + 14, base + j + 15);
1640 
1641         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1642         res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1643       }
1644       if (!j) {
1645         dstvec[r] = res1;
1646       } else {
1647         dstvec[r + N] = res1;
1648       }
1649     }
1650     x += dx;
1651   }
1652 }
1653 
highbd_dr_prediction_z1_32xN_internal_avx2(int N,__m256i * dstvec,const uint16_t * above,int upsample_above,int dx)1654 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
1655     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1656   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1657   (void)upsample_above;
1658   const int frac_bits = 6;
1659   const int max_base_x = ((32 + N) - 1);
1660 
1661   // pre-filter above pixels
1662   // store in temp buffers:
1663   //   above[x] * 32 + 16
1664   //   above[x+1] - above[x]
1665   // final pixels will be calculated as:
1666   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1667   __m256i a0, a1, a32, a16, c3f;
1668   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1669 
1670   a16 = _mm256_set1_epi16(16);
1671   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1672   max_base_x256 = _mm256_set1_epi16(max_base_x);
1673   c3f = _mm256_set1_epi16(0x3f);
1674 
1675   int x = dx;
1676   for (int r = 0; r < N; r++) {
1677     __m256i b, res;
1678 
1679     int base = x >> frac_bits;
1680     if (base >= max_base_x) {
1681       for (int i = r; i < N; ++i) {
1682         dstvec[i] = a_mbase_x;  // save 32 values
1683         dstvec[i + N] = a_mbase_x;
1684       }
1685       return;
1686     }
1687 
1688     __m256i shift =
1689         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1690 
1691     for (int j = 0; j < 32; j += 16) {
1692       int mdif = max_base_x - (base + j);
1693       if (mdif <= 0) {
1694         res = a_mbase_x;
1695       } else {
1696         a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1697         a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1698 
1699         diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1700         a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1701         a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1702         b = _mm256_mullo_epi16(diff, shift);
1703 
1704         res = _mm256_add_epi16(a32, b);
1705         res = _mm256_srli_epi16(res, 5);
1706 
1707         base_inc256 = _mm256_setr_epi16(
1708             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1709             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1710             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1711             base + j + 13, base + j + 14, base + j + 15);
1712 
1713         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1714         res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1715       }
1716       if (!j) {
1717         dstvec[r] = res;
1718       } else {
1719         dstvec[r + N] = res;
1720       }
1721     }
1722     x += dx;
1723   }
1724 }
1725 
highbd_dr_prediction_z1_32xN_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,int upsample_above,int dx,int bd)1726 static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
1727                                               ptrdiff_t stride,
1728                                               const uint16_t *above,
1729                                               int upsample_above, int dx,
1730                                               int bd) {
1731   __m256i dstvec[128];
1732   if (bd < 12) {
1733     highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
1734                                                dx);
1735   } else {
1736     highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
1737                                                      upsample_above, dx);
1738   }
1739   for (int i = 0; i < N; i++) {
1740     _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1741     _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
1742   }
1743 }
1744 
highbd_dr_prediction_32bit_z1_64xN_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,int upsample_above,int dx)1745 static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
1746                                                     ptrdiff_t stride,
1747                                                     const uint16_t *above,
1748                                                     int upsample_above,
1749                                                     int dx) {
1750   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1751   (void)upsample_above;
1752   const int frac_bits = 6;
1753   const int max_base_x = ((64 + N) - 1);
1754 
1755   // pre-filter above pixels
1756   // store in temp buffers:
1757   //   above[x] * 32 + 16
1758   //   above[x+1] - above[x]
1759   // final pixels will be calculated as:
1760   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1761   __m256i a0, a0_1, a1, a1_1, a32, a16;
1762   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1763 
1764   a16 = _mm256_set1_epi32(16);
1765   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1766   max_base_x256 = _mm256_set1_epi16(max_base_x);
1767 
1768   int x = dx;
1769   for (int r = 0; r < N; r++, dst += stride) {
1770     __m256i b, res[2], res1;
1771 
1772     int base = x >> frac_bits;
1773     if (base >= max_base_x) {
1774       for (int i = r; i < N; ++i) {
1775         _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
1776         _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1777         _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1778         _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1779         dst += stride;
1780       }
1781       return;
1782     }
1783 
1784     __m256i shift = _mm256_srli_epi32(
1785         _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1786 
1787     __m128i a0_128, a0_1_128, a1_128, a1_1_128;
1788     for (int j = 0; j < 64; j += 16) {
1789       int mdif = max_base_x - (base + j);
1790       if (mdif <= 0) {
1791         _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1792       } else {
1793         a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
1794         a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
1795         a0 = _mm256_cvtepu16_epi32(a0_128);
1796         a1 = _mm256_cvtepu16_epi32(a1_128);
1797 
1798         diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1799         a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1800         a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1801         b = _mm256_mullo_epi32(diff, shift);
1802 
1803         res[0] = _mm256_add_epi32(a32, b);
1804         res[0] = _mm256_srli_epi32(res[0], 5);
1805         res[0] = _mm256_packus_epi32(
1806             res[0],
1807             _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1808         if (mdif > 8) {
1809           a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
1810           a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
1811           a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
1812           a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
1813 
1814           diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1815           a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1816           a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1817           b = _mm256_mullo_epi32(diff, shift);
1818 
1819           res[1] = _mm256_add_epi32(a32, b);
1820           res[1] = _mm256_srli_epi32(res[1], 5);
1821           res[1] = _mm256_packus_epi32(
1822               res[1],
1823               _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1824         } else {
1825           res[1] = a_mbase_x;
1826         }
1827         res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1828                                        1);  // 16 16bit values
1829         base_inc256 = _mm256_setr_epi16(
1830             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1831             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1832             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1833             base + j + 13, base + j + 14, base + j + 15);
1834 
1835         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1836         res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1837         _mm256_storeu_si256((__m256i *)(dst + j), res1);
1838       }
1839     }
1840     x += dx;
1841   }
1842 }
1843 
highbd_dr_prediction_z1_64xN_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,int upsample_above,int dx)1844 static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
1845                                               ptrdiff_t stride,
1846                                               const uint16_t *above,
1847                                               int upsample_above, int dx) {
1848   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1849   (void)upsample_above;
1850   const int frac_bits = 6;
1851   const int max_base_x = ((64 + N) - 1);
1852 
1853   // pre-filter above pixels
1854   // store in temp buffers:
1855   //   above[x] * 32 + 16
1856   //   above[x+1] - above[x]
1857   // final pixels will be calculated as:
1858   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1859   __m256i a0, a1, a32, a16, c3f;
1860   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1861 
1862   a16 = _mm256_set1_epi16(16);
1863   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1864   max_base_x256 = _mm256_set1_epi16(max_base_x);
1865   c3f = _mm256_set1_epi16(0x3f);
1866 
1867   int x = dx;
1868   for (int r = 0; r < N; r++, dst += stride) {
1869     __m256i b, res;
1870 
1871     int base = x >> frac_bits;
1872     if (base >= max_base_x) {
1873       for (int i = r; i < N; ++i) {
1874         _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
1875         _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1876         _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1877         _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1878         dst += stride;
1879       }
1880       return;
1881     }
1882 
1883     __m256i shift =
1884         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1885 
1886     for (int j = 0; j < 64; j += 16) {
1887       int mdif = max_base_x - (base + j);
1888       if (mdif <= 0) {
1889         _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1890       } else {
1891         a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1892         a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1893 
1894         diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1895         a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1896         a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1897         b = _mm256_mullo_epi16(diff, shift);
1898 
1899         res = _mm256_add_epi16(a32, b);
1900         res = _mm256_srli_epi16(res, 5);
1901 
1902         base_inc256 = _mm256_setr_epi16(
1903             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1904             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1905             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1906             base + j + 13, base + j + 14, base + j + 15);
1907 
1908         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1909         res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1910         _mm256_storeu_si256((__m256i *)(dst + j), res);  // 16 16bit values
1911       }
1912     }
1913     x += dx;
1914   }
1915 }
1916 
1917 // Directional prediction, zone 1: 0 < angle < 90
av1_highbd_dr_prediction_z1_avx2(uint16_t * dst,ptrdiff_t stride,int bw,int bh,const uint16_t * above,const uint16_t * left,int upsample_above,int dx,int dy,int bd)1918 void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
1919                                       int bh, const uint16_t *above,
1920                                       const uint16_t *left, int upsample_above,
1921                                       int dx, int dy, int bd) {
1922   (void)left;
1923   (void)dy;
1924 
1925   switch (bw) {
1926     case 4:
1927       highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
1928                                        dx, bd);
1929       break;
1930     case 8:
1931       highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
1932                                        dx, bd);
1933       break;
1934     case 16:
1935       highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
1936                                         dx, bd);
1937       break;
1938     case 32:
1939       highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
1940                                         dx, bd);
1941       break;
1942     case 64:
1943       if (bd < 12) {
1944         highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
1945                                           upsample_above, dx);
1946       } else {
1947         highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
1948                                                 upsample_above, dx);
1949       }
1950       break;
1951     default: break;
1952   }
1953   return;
1954 }
1955 
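/*
 * Minimal usage sketch for the zone 1 entry point above (illustrative only:
 * real callers reach it through the av1_rtcd() dispatch with edge buffers
 * prepared and padded by the intra-prediction setup code; the buffer sizes,
 * sample values and dx chosen below are our own).
 */
static inline void highbd_z1_usage_sketch(void) {
  DECLARE_ALIGNED(32, uint16_t, above[64]);
  DECLARE_ALIGNED(32, uint16_t, left[64]);
  DECLARE_ALIGNED(32, uint16_t, dst[4 * 4]);
  for (int i = 0; i < 64; i++) above[i] = left[i] = 512;  // flat 10-bit edge
  // 4x4 block, no edge upsampling, dx = 64 (one pixel per row, i.e. ~45
  // degrees), dy is unused in zone 1, bd = 10 selects the 16-bit kernels.
  av1_highbd_dr_prediction_z1_avx2(dst, /*stride=*/4, /*bw=*/4, /*bh=*/4,
                                   above, left, /*upsample_above=*/0,
                                   /*dx=*/64, /*dy=*/0, /*bd=*/10);
}
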
highbd_transpose_TX_16X16(const uint16_t * src,ptrdiff_t pitchSrc,uint16_t * dst,ptrdiff_t pitchDst)1956 static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
1957                                       uint16_t *dst, ptrdiff_t pitchDst) {
1958   __m256i r[16];
1959   __m256i d[16];
1960   for (int j = 0; j < 16; j++) {
1961     r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
1962   }
1963   highbd_transpose16x16_avx2(r, d);
1964   for (int j = 0; j < 16; j++) {
1965     _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
1966   }
1967 }
1968 
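// Transposes a block of 16-bit pixels one 16x16 tile at a time; as the loop
// bounds show, both width and height are expected to be multiples of 16.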
highbd_transpose(const uint16_t * src,ptrdiff_t pitchSrc,uint16_t * dst,ptrdiff_t pitchDst,int width,int height)1969 static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
1970                              uint16_t *dst, ptrdiff_t pitchDst, int width,
1971                              int height) {
1972   for (int j = 0; j < height; j += 16)
1973     for (int i = 0; i < width; i += 16)
1974       highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
1975                                 dst + j * pitchDst + i, pitchDst);
1976 }
1977 
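/*
 * Zone 2 (90 < angle < 180): for output row r, columns whose projected x
 * coordinate falls left of min_base_x must be predicted from the left edge
 * instead of the above edge.  The kernels below therefore compute both
 * candidates with the same 5-bit linear interpolation as zone 1 -- resx from
 * above[] and resy from left[] -- and then pick per lane with
 * HighbdBaseMask[base_min_diff], where base_min_diff counts the leading
 * columns that must come from the left edge.
 */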
highbd_dr_prediction_32bit_z2_Nx4_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy)1978 static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
1979     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1980     const uint16_t *left, int upsample_above, int upsample_left, int dx,
1981     int dy) {
1982   const int min_base_x = -(1 << upsample_above);
1983   const int min_base_y = -(1 << upsample_left);
1984   const int frac_bits_x = 6 - upsample_above;
1985   const int frac_bits_y = 6 - upsample_left;
1986 
1987   assert(dx > 0);
1988   // pre-filter above pixels
1989   // store in temp buffers:
1990   //   above[x] * 32 + 16
1991   //   above[x+1] - above[x]
1992   // final pixels will be calculated as:
1993   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1994   __m256i a0_x, a1_x, a32, a16;
1995   __m256i diff;
1996   __m128i c3f, min_base_y128;
1997 
1998   a16 = _mm256_set1_epi32(16);
1999   c3f = _mm_set1_epi32(0x3f);
2000   min_base_y128 = _mm_set1_epi32(min_base_y);
2001 
2002   for (int r = 0; r < N; r++) {
2003     __m256i b, res, shift;
2004     __m128i resx, resy, resxy;
2005     __m128i a0_x128, a1_x128;
2006     int y = r + 1;
2007     int base_x = (-y * dx) >> frac_bits_x;
2008     int base_shift = 0;
2009     if (base_x < (min_base_x - 1)) {
2010       base_shift = (min_base_x - base_x - 1) >> upsample_above;
2011     }
2012     int base_min_diff =
2013         (min_base_x - base_x + upsample_above) >> upsample_above;
2014     if (base_min_diff > 4) {
2015       base_min_diff = 4;
2016     } else {
2017       if (base_min_diff < 0) base_min_diff = 0;
2018     }
2019 
2020     if (base_shift > 3) {
2021       a0_x = _mm256_setzero_si256();
2022       a1_x = _mm256_setzero_si256();
2023       shift = _mm256_setzero_si256();
2024     } else {
2025       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2026       if (upsample_above) {
2027         a0_x128 = _mm_shuffle_epi8(a0_x128,
2028                                    *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2029         a1_x128 = _mm_srli_si128(a0_x128, 8);
2030 
2031         shift = _mm256_castsi128_si256(_mm_srli_epi32(
2032             _mm_and_si128(
2033                 _mm_slli_epi32(
2034                     _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2035                                    (2 << 6) - y * dx, (3 << 6) - y * dx),
2036                     upsample_above),
2037                 c3f),
2038             1));
2039       } else {
2040         a0_x128 =
2041             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2042         a1_x128 = _mm_srli_si128(a0_x128, 2);
2043 
2044         shift = _mm256_castsi128_si256(_mm_srli_epi32(
2045             _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2046                                          (2 << 6) - y * dx, (3 << 6) - y * dx),
2047                           c3f),
2048             1));
2049       }
2050       a0_x = _mm256_cvtepu16_epi32(a0_x128);
2051       a1_x = _mm256_cvtepu16_epi32(a1_x128);
2052     }
2053     // y calc
2054     __m128i a0_y, a1_y, shifty;
2055     if (base_x < min_base_x) {
2056       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2057       DECLARE_ALIGNED(32, int, base_y_c[4]);
2058       r6 = _mm_set1_epi32(r << 6);
2059       dy128 = _mm_set1_epi32(dy);
2060       c1234 = _mm_setr_epi32(1, 2, 3, 4);
2061       y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
2062       base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
2063       mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
2064       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2065       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2066 
2067       a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
2068                             left[base_y_c[2]], left[base_y_c[3]]);
2069       a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2070                             left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
2071 
2072       if (upsample_left) {
2073         shifty = _mm_srli_epi32(
2074             _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
2075       } else {
2076         shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
2077       }
2078       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2079       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2080       shift = _mm256_inserti128_si256(shift, shifty, 1);
2081     }
2082 
2083     diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2084     a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2085     a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2086 
2087     b = _mm256_mullo_epi32(diff, shift);
2088     res = _mm256_add_epi32(a32, b);
2089     res = _mm256_srli_epi32(res, 5);
2090 
2091     resx = _mm256_castsi256_si128(res);
2092     resx = _mm_packus_epi32(resx, resx);
2093 
2094     resy = _mm256_extracti128_si256(res, 1);
2095     resy = _mm_packus_epi32(resy, resy);
2096 
2097     resxy =
2098         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2099     _mm_storel_epi64((__m128i *)(dst), resxy);
2100     dst += stride;
2101   }
2102 }
2103 
highbd_dr_prediction_z2_Nx4_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy)2104 static void highbd_dr_prediction_z2_Nx4_avx2(
2105     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2106     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2107     int dy) {
2108   const int min_base_x = -(1 << upsample_above);
2109   const int min_base_y = -(1 << upsample_left);
2110   const int frac_bits_x = 6 - upsample_above;
2111   const int frac_bits_y = 6 - upsample_left;
2112 
2113   assert(dx > 0);
2114   // pre-filter above pixels
2115   // store in temp buffers:
2116   //   above[x] * 32 + 16
2117   //   above[x+1] - above[x]
2118   // final pixels will be calculated as:
2119   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2120   __m256i a0_x, a1_x, a32, a16;
2121   __m256i diff;
2122   __m128i c3f, min_base_y128;
2123 
2124   a16 = _mm256_set1_epi16(16);
2125   c3f = _mm_set1_epi16(0x3f);
2126   min_base_y128 = _mm_set1_epi16(min_base_y);
2127 
2128   for (int r = 0; r < N; r++) {
2129     __m256i b, res, shift;
2130     __m128i resx, resy, resxy;
2131     __m128i a0_x128, a1_x128;
2132     int y = r + 1;
2133     int base_x = (-y * dx) >> frac_bits_x;
2134     int base_shift = 0;
2135     if (base_x < (min_base_x - 1)) {
2136       base_shift = (min_base_x - base_x - 1) >> upsample_above;
2137     }
2138     int base_min_diff =
2139         (min_base_x - base_x + upsample_above) >> upsample_above;
2140     if (base_min_diff > 4) {
2141       base_min_diff = 4;
2142     } else {
2143       if (base_min_diff < 0) base_min_diff = 0;
2144     }
2145 
2146     if (base_shift > 3) {
2147       a0_x = _mm256_setzero_si256();
2148       a1_x = _mm256_setzero_si256();
2149       shift = _mm256_setzero_si256();
2150     } else {
2151       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2152       if (upsample_above) {
2153         a0_x128 = _mm_shuffle_epi8(a0_x128,
2154                                    *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2155         a1_x128 = _mm_srli_si128(a0_x128, 8);
2156 
2157         shift = _mm256_castsi128_si256(_mm_srli_epi16(
2158             _mm_and_si128(
2159                 _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2160                                               (2 << 6) - y * dx,
2161                                               (3 << 6) - y * dx, 0, 0, 0, 0),
2162                                upsample_above),
2163                 c3f),
2164             1));
2165       } else {
2166         a0_x128 =
2167             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2168         a1_x128 = _mm_srli_si128(a0_x128, 2);
2169 
2170         shift = _mm256_castsi128_si256(_mm_srli_epi16(
2171             _mm_and_si128(
2172                 _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2173                                (3 << 6) - y * dx, 0, 0, 0, 0),
2174                 c3f),
2175             1));
2176       }
2177       a0_x = _mm256_castsi128_si256(a0_x128);
2178       a1_x = _mm256_castsi128_si256(a1_x128);
2179     }
2180     // y calc
2181     __m128i a0_y, a1_y, shifty;
2182     if (base_x < min_base_x) {
2183       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2184       DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2185       r6 = _mm_set1_epi16(r << 6);
2186       dy128 = _mm_set1_epi16(dy);
2187       c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
2188       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2189       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2190       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2191       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2192       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2193 
2194       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2195                             left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
2196       a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2197                             left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
2198                             0, 0);
2199 
2200       if (upsample_left) {
2201         shifty = _mm_srli_epi16(
2202             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
2203       } else {
2204         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2205       }
2206       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2207       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2208       shift = _mm256_inserti128_si256(shift, shifty, 1);
2209     }
2210 
2211     diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2212     a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2213     a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2214 
2215     b = _mm256_mullo_epi16(diff, shift);
2216     res = _mm256_add_epi16(a32, b);
2217     res = _mm256_srli_epi16(res, 5);
2218 
2219     resx = _mm256_castsi256_si128(res);
2220     resy = _mm256_extracti128_si256(res, 1);
2221     resxy =
2222         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2223     _mm_storel_epi64((__m128i *)(dst), resxy);
2224     dst += stride;
2225   }
2226 }
2227 
highbd_dr_prediction_32bit_z2_Nx8_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy)2228 static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
2229     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2230     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2231     int dy) {
2232   const int min_base_x = -(1 << upsample_above);
2233   const int min_base_y = -(1 << upsample_left);
2234   const int frac_bits_x = 6 - upsample_above;
2235   const int frac_bits_y = 6 - upsample_left;
2236 
2237   // pre-filter above pixels
2238   // store in temp buffers:
2239   //   above[x] * 32 + 16
2240   //   above[x+1] - above[x]
2241   // final pixels will be calculated as:
2242   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2243   __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
2244   __m256i diff;
2245   __m128i a0_x128, a1_x128;
2246 
2247   a16 = _mm256_set1_epi32(16);
2248   c3f = _mm256_set1_epi32(0x3f);
2249   min_base_y256 = _mm256_set1_epi32(min_base_y);
2250 
2251   for (int r = 0; r < N; r++) {
2252     __m256i b, res, shift;
2253     __m128i resx, resy, resxy;
2254     int y = r + 1;
2255     int base_x = (-y * dx) >> frac_bits_x;
2256     int base_shift = 0;
2257     if (base_x < (min_base_x - 1)) {
2258       base_shift = (min_base_x - base_x - 1) >> upsample_above;
2259     }
2260     int base_min_diff =
2261         (min_base_x - base_x + upsample_above) >> upsample_above;
2262     if (base_min_diff > 8) {
2263       base_min_diff = 8;
2264     } else {
2265       if (base_min_diff < 0) base_min_diff = 0;
2266     }
2267 
2268     if (base_shift > 7) {
2269       resx = _mm_setzero_si128();
2270     } else {
2271       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2272       if (upsample_above) {
2273         __m128i mask, atmp0, atmp1, atmp2, atmp3;
2274         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2275         atmp0 = _mm_shuffle_epi8(a0_x128,
2276                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2277         atmp1 = _mm_shuffle_epi8(a1_x128,
2278                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2279         atmp2 = _mm_shuffle_epi8(
2280             a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2281         atmp3 = _mm_shuffle_epi8(
2282             a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2283         mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2284                               _mm_set1_epi8(15));
2285         a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2286         mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2287                               _mm_set1_epi8(15));
2288         a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2289         shift = _mm256_srli_epi32(
2290             _mm256_and_si256(
2291                 _mm256_slli_epi32(
2292                     _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
2293                                       (2 << 6) - y * dx, (3 << 6) - y * dx,
2294                                       (4 << 6) - y * dx, (5 << 6) - y * dx,
2295                                       (6 << 6) - y * dx, (7 << 6) - y * dx),
2296                     upsample_above),
2297                 c3f),
2298             1);
2299       } else {
2300         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2301         a0_x128 =
2302             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2303         a1_x128 =
2304             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2305 
2306         shift = _mm256_srli_epi32(
2307             _mm256_and_si256(
2308                 _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2309                                   (3 << 6) - y * dx, (4 << 6) - y * dx,
2310                                   (5 << 6) - y * dx, (6 << 6) - y * dx,
2311                                   (7 << 6) - y * dx),
2312                 c3f),
2313             1);
2314       }
2315       a0_x = _mm256_cvtepu16_epi32(a0_x128);
2316       a1_x = _mm256_cvtepu16_epi32(a1_x128);
2317 
2318       diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2319       a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2320       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2321 
2322       b = _mm256_mullo_epi32(diff, shift);
2323       res = _mm256_add_epi32(a32, b);
2324       res = _mm256_srli_epi32(res, 5);
2325 
2326       resx = _mm256_castsi256_si128(_mm256_packus_epi32(
2327           res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2328     }
2329     // y calc
2330     if (base_x < min_base_x) {
2331       DECLARE_ALIGNED(32, int, base_y_c[8]);
2332       __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
2333       r6 = _mm256_set1_epi32(r << 6);
2334       dy256 = _mm256_set1_epi32(dy);
2335       c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
2336       y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2337       base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2338       mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2339       base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2340       _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2341 
2342       a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2343           left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2344           left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2345           left[base_y_c[6]], left[base_y_c[7]]));
2346       a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2347           left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2348           left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2349           left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2350 
2351       if (upsample_left) {
2352         shift = _mm256_srli_epi32(
2353             _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
2354             1);
2355       } else {
2356         shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2357       }
2358       diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2359       a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2360       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2361 
2362       b = _mm256_mullo_epi32(diff, shift);
2363       res = _mm256_add_epi32(a32, b);
2364       res = _mm256_srli_epi32(res, 5);
2365 
2366       resy = _mm256_castsi256_si128(_mm256_packus_epi32(
2367           res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2368     } else {
2369       resy = resx;
2370     }
2371     resxy =
2372         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2373     _mm_storeu_si128((__m128i *)(dst), resxy);
2374     dst += stride;
2375   }
2376 }
2377 
highbd_dr_prediction_z2_Nx8_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy)2378 static void highbd_dr_prediction_z2_Nx8_avx2(
2379     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2380     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2381     int dy) {
2382   const int min_base_x = -(1 << upsample_above);
2383   const int min_base_y = -(1 << upsample_left);
2384   const int frac_bits_x = 6 - upsample_above;
2385   const int frac_bits_y = 6 - upsample_left;
2386 
2387   // pre-filter above pixels
2388   // store in temp buffers:
2389   //   above[x] * 32 + 16
2390   //   above[x+1] - above[x]
2391   // final pixels will be calculated as:
2392   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2393   __m128i c3f, min_base_y128;
2394   __m256i a0_x, a1_x, diff, a32, a16;
2395   __m128i a0_x128, a1_x128;
2396 
2397   a16 = _mm256_set1_epi16(16);
2398   c3f = _mm_set1_epi16(0x3f);
2399   min_base_y128 = _mm_set1_epi16(min_base_y);
2400 
2401   for (int r = 0; r < N; r++) {
2402     __m256i b, res, shift;
2403     __m128i resx, resy, resxy;
2404     int y = r + 1;
2405     int base_x = (-y * dx) >> frac_bits_x;
2406     int base_shift = 0;
2407     if (base_x < (min_base_x - 1)) {
2408       base_shift = (min_base_x - base_x - 1) >> upsample_above;
2409     }
2410     int base_min_diff =
2411         (min_base_x - base_x + upsample_above) >> upsample_above;
2412     if (base_min_diff > 8) {
2413       base_min_diff = 8;
2414     } else {
2415       if (base_min_diff < 0) base_min_diff = 0;
2416     }
2417 
2418     if (base_shift > 7) {
2419       a0_x = _mm256_setzero_si256();
2420       a1_x = _mm256_setzero_si256();
2421       shift = _mm256_setzero_si256();
2422     } else {
2423       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2424       if (upsample_above) {
2425         __m128i mask, atmp0, atmp1, atmp2, atmp3;
2426         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2427         atmp0 = _mm_shuffle_epi8(a0_x128,
2428                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2429         atmp1 = _mm_shuffle_epi8(a1_x128,
2430                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2431         atmp2 = _mm_shuffle_epi8(
2432             a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2433         atmp3 = _mm_shuffle_epi8(
2434             a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2435         mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2436                               _mm_set1_epi8(15));
2437         a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2438         mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2439                               _mm_set1_epi8(15));
2440         a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2441 
2442         shift = _mm256_castsi128_si256(_mm_srli_epi16(
2443             _mm_and_si128(
2444                 _mm_slli_epi16(
2445                     _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2446                                    (2 << 6) - y * dx, (3 << 6) - y * dx,
2447                                    (4 << 6) - y * dx, (5 << 6) - y * dx,
2448                                    (6 << 6) - y * dx, (7 << 6) - y * dx),
2449                     upsample_above),
2450                 c3f),
2451             1));
2452       } else {
2453         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2454         a0_x128 =
2455             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2456         a1_x128 =
2457             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2458 
2459         shift = _mm256_castsi128_si256(_mm_srli_epi16(
2460             _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2461                                          (2 << 6) - y * dx, (3 << 6) - y * dx,
2462                                          (4 << 6) - y * dx, (5 << 6) - y * dx,
2463                                          (6 << 6) - y * dx, (7 << 6) - y * dx),
2464                           c3f),
2465             1));
2466       }
2467       a0_x = _mm256_castsi128_si256(a0_x128);
2468       a1_x = _mm256_castsi128_si256(a1_x128);
2469     }
2470 
2471     // y calc
2472     __m128i a0_y, a1_y, shifty;
2473     if (base_x < min_base_x) {
2474       DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2475       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2476       r6 = _mm_set1_epi16(r << 6);
2477       dy128 = _mm_set1_epi16(dy);
2478       c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
2479       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2480       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2481       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2482       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2483       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2484 
2485       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2486                             left[base_y_c[2]], left[base_y_c[3]],
2487                             left[base_y_c[4]], left[base_y_c[5]],
2488                             left[base_y_c[6]], left[base_y_c[7]]);
2489       a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2490                             left[base_y_c[2] + 1], left[base_y_c[3] + 1],
2491                             left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2492                             left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
2493 
2494       if (upsample_left) {
2495         shifty = _mm_srli_epi16(
2496             _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
2497       } else {
2498         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2499       }
2500       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2501       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2502       shift = _mm256_inserti128_si256(shift, shifty, 1);
2503     }
2504 
2505     diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2506     a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2507     a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2508 
2509     b = _mm256_mullo_epi16(diff, shift);
2510     res = _mm256_add_epi16(a32, b);
2511     res = _mm256_srli_epi16(res, 5);
2512 
2513     resx = _mm256_castsi256_si128(res);
2514     resy = _mm256_extracti128_si256(res, 1);
2515 
2516     resxy =
2517         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2518     _mm_storeu_si128((__m128i *)(dst), resxy);
2519     dst += stride;
2520   }
2521 }
2522 
2523 static void highbd_dr_prediction_32bit_z2_HxW_avx2(
2524     int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2525     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2526     int dy) {
2527   // here upsample_above and upsample_left are 0 by design of
2528   // av1_use_intra_edge_upsample
2529   const int min_base_x = -1;
2530   const int min_base_y = -1;
2531   (void)upsample_above;
2532   (void)upsample_left;
2533   const int frac_bits_x = 6;
2534   const int frac_bits_y = 6;
2535 
2536   // pre-filter above pixels
2537   // store in temp buffers:
2538   //   above[x] * 32 + 16
2539   //   above[x+1] - above[x]
2540   // final pixels will be calculated as:
2541   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
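  // A scalar model of the math vectorized below, assuming
  // upsample_above == upsample_left == 0 (as noted above).  Kept under #if 0
  // so it is never built; it only documents the per-pixel computation.
#if 0
  for (int r = 0; r < H; ++r) {
    for (int c = 0; c < W; ++c) {
      const int y = r + 1;
      const int x6 = (c << 6) - y * dx;  // projected position in 1/64 pel
      const int base_x = x6 >> frac_bits_x;
      int val;
      if (base_x >= min_base_x) {
        // The pixel projects onto the top reference row.
        const int shift = (x6 & 0x3f) >> 1;
        val = (above[base_x] * 32 + 16 +
               (above[base_x + 1] - above[base_x]) * shift) >> 5;
      } else {
        // The pixel projects onto the left reference column; base_y >= -1
        // holds for every pixel that reaches this branch.
        const int y6 = (r << 6) - (c + 1) * dy;
        const int base_y = y6 >> frac_bits_y;
        const int shift = (y6 & 0x3f) >> 1;
        val = (left[base_y] * 32 + 16 +
               (left[base_y + 1] - left[base_y]) * shift) >> 5;
      }
      dst[r * stride + c] = (uint16_t)val;
    }
  }
#endif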
2542   __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
2543   __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
2544   __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2545   DECLARE_ALIGNED(32, int, base_y_c[16]);
2546 
2547   a16 = _mm256_set1_epi32(16);
2548   c1 = _mm256_srli_epi32(a16, 4);
2549   c8 = _mm256_srli_epi32(a16, 1);
2550   min_base_y256 = _mm256_set1_epi32(min_base_y);
2551   c3f = _mm256_set1_epi32(0x3f);
2552   dy256 = _mm256_set1_epi32(dy);
2553   c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2554   c1234 = _mm256_add_epi32(c0123, c1);
2555 
2556   for (int r = 0; r < H; r++) {
2557     __m256i b, res, shift, ydx;
2558     __m256i resx[2], resy[2];
2559     __m256i resxy, j256, r6;
2560     for (int j = 0; j < W; j += 16) {
2561       j256 = _mm256_set1_epi32(j);
2562       int y = r + 1;
2563       ydx = _mm256_set1_epi32(y * dx);
2564 
2565       int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2566       int base_shift = 0;
2567       if ((base_x) < (min_base_x - 1)) {
2568         base_shift = (min_base_x - base_x - 1);
2569       }
2570       int base_min_diff = (min_base_x - base_x);
2571       if (base_min_diff > 16) {
2572         base_min_diff = 16;
2573       } else if (base_min_diff < 0) {
2574         base_min_diff = 0;
2575       }
2576 
2577       if (base_shift > 7) {
2578         resx[0] = _mm256_setzero_si256();
2579       } else {
2580         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2581         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2582         a0_x128 =
2583             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2584         a1_x128 =
2585             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2586 
2587         a0_x = _mm256_cvtepu16_epi32(a0_x128);
2588         a1_x = _mm256_cvtepu16_epi32(a1_x128);
2589 
2590         r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
2591         shift = _mm256_srli_epi32(
2592             _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2593 
2594         diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2595         a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2596         a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2597 
2598         b = _mm256_mullo_epi32(diff, shift);
2599         res = _mm256_add_epi32(a32, b);
2600         res = _mm256_srli_epi32(res, 5);
2601 
2602         resx[0] = _mm256_packus_epi32(
2603             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2604       }
2605       int base_shift8 = 0;
2606       if ((base_x + 8) < (min_base_x - 1)) {
2607         base_shift8 = (min_base_x - (base_x + 8) - 1);
2608       }
2609       if (base_shift8 > 7) {
2610         resx[1] = _mm256_setzero_si256();
2611       } else {
2612         a0_1_x128 =
2613             _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
2614         a1_1_x128 =
2615             _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
2616         a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2617                                      *(__m128i *)HighbdLoadMaskx[base_shift8]);
2618         a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2619                                      *(__m128i *)HighbdLoadMaskx[base_shift8]);
2620 
2621         a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
2622         a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
2623 
2624         r6 = _mm256_slli_epi32(
2625             _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
2626         shift = _mm256_srli_epi32(
2627             _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2628 
2629         diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
2630         a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
2631         a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
2632         b = _mm256_mullo_epi32(diff, shift);
2633 
2634         resx[1] = _mm256_add_epi32(a32, b);
2635         resx[1] = _mm256_srli_epi32(resx[1], 5);
2636         resx[1] = _mm256_packus_epi32(
2637             resx[1],
2638             _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
2639       }
2640       resx[0] =
2641           _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
2642                                   1);  // 16 16bit values
2643 
2644       // y calc
2645       resy[0] = _mm256_setzero_si256();
2646       if ((base_x < min_base_x)) {
2647         __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
2648         r6 = _mm256_set1_epi32(r << 6);
2649         c256 = _mm256_add_epi32(j256, c1234);
2650         y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2651         base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2652         mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2653         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2654         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2655         c256 = _mm256_add_epi32(c256, c8);
2656         y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2657         base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
2658         mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2659         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2660         _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
2661 
2662         a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2663             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2664             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2665             left[base_y_c[6]], left[base_y_c[7]]));
2666         a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2667             left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2668             left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2669             left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2670 
2671         shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2672 
2673         diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2674         a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2675         a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2676 
2677         b = _mm256_mullo_epi32(diff, shift);
2678         res = _mm256_add_epi32(a32, b);
2679         res = _mm256_srli_epi32(res, 5);
2680 
2681         resy[0] = _mm256_packus_epi32(
2682             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2683 
2684         a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2685             left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
2686             left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
2687             left[base_y_c[14]], left[base_y_c[15]]));
2688         a1_y = _mm256_cvtepu16_epi32(
2689             _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
2690                            left[base_y_c[10] + 1], left[base_y_c[11] + 1],
2691                            left[base_y_c[12] + 1], left[base_y_c[13] + 1],
2692                            left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
2693         shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
2694 
2695         diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2696         a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2697         a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2698 
2699         b = _mm256_mullo_epi32(diff, shift);
2700         res = _mm256_add_epi32(a32, b);
2701         res = _mm256_srli_epi32(res, 5);
2702 
2703         resy[1] = _mm256_packus_epi32(
2704             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2705 
2706         resy[0] =
2707             _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
2708                                     1);  // 16 16bit values
2709       }
2710 
2711       resxy = _mm256_blendv_epi8(resx[0], resy[0],
2712                                  *(__m256i *)HighbdBaseMask[base_min_diff]);
2713       _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2714     }  // for j
2715     dst += stride;
2716   }
2717 }
2718 
2719 static void highbd_dr_prediction_z2_HxW_avx2(
2720     int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2721     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2722     int dy) {
2723   // here upsample_above and upsample_left are 0 by design of
2724   // av1_use_intra_edge_upsample
2725   const int min_base_x = -1;
2726   const int min_base_y = -1;
2727   (void)upsample_above;
2728   (void)upsample_left;
2729   const int frac_bits_x = 6;
2730   const int frac_bits_y = 6;
2731 
2732   // pre-filter above pixels
2733   // store in temp buffers:
2734   //   above[x] * 32 + 16
2735   //   above[x+1] - above[x]
2736   // final pixels will be calculated as:
2737   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
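  // Same flow as highbd_dr_prediction_32bit_z2_HxW_avx2 above (see the scalar
  // model sketched there), but all intermediate math stays in 16 bits, which
  // is sufficient for the bd < 12 inputs routed here.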
2738   __m256i a0_x, a1_x, a32, a16, c3f, c1;
2739   __m256i diff, min_base_y256, dy256, c1234, c0123;
2740   DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
2741 
2742   a16 = _mm256_set1_epi16(16);
2743   c1 = _mm256_srli_epi16(a16, 4);
2744   min_base_y256 = _mm256_set1_epi16(min_base_y);
2745   c3f = _mm256_set1_epi16(0x3f);
2746   dy256 = _mm256_set1_epi16(dy);
2747   c0123 =
2748       _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2749   c1234 = _mm256_add_epi16(c0123, c1);
2750 
2751   for (int r = 0; r < H; r++) {
2752     __m256i b, res, shift;
2753     __m256i resx, resy, ydx;
2754     __m256i resxy, j256, r6;
2755     __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2756     int y = r + 1;
2757     ydx = _mm256_set1_epi16((short)(y * dx));
2758 
2759     for (int j = 0; j < W; j += 16) {
2760       j256 = _mm256_set1_epi16(j);
2761       int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2762       int base_shift = 0;
2763       if ((base_x) < (min_base_x - 1)) {
2764         base_shift = (min_base_x - base_x - 1);
2765       }
2766       int base_min_diff = (min_base_x - base_x);
2767       if (base_min_diff > 16) {
2768         base_min_diff = 16;
2769       } else if (base_min_diff < 0) {
2770         base_min_diff = 0;
2771       }
2772 
2773       if (base_shift < 8) {
2774         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2775         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2776         a0_x128 =
2777             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2778         a1_x128 =
2779             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2780 
2781         a0_x = _mm256_castsi128_si256(a0_x128);
2782         a1_x = _mm256_castsi128_si256(a1_x128);
2783       } else {
2784         a0_x = _mm256_setzero_si256();
2785         a1_x = _mm256_setzero_si256();
2786       }
2787 
2788       int base_shift1 = 0;
2789       if (base_shift > 8) {
2790         base_shift1 = base_shift - 8;
2791       }
2792       if (base_shift1 < 8) {
2793         a0_1_x128 =
2794             _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
2795         a1_1_x128 =
2796             _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
2797         a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2798                                      *(__m128i *)HighbdLoadMaskx[base_shift1]);
2799         a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2800                                      *(__m128i *)HighbdLoadMaskx[base_shift1]);
2801 
2802         a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
2803         a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
2804       }
2805       r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
2806       shift = _mm256_srli_epi16(
2807           _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
2808 
2809       diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2810       a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2811       a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2812 
2813       b = _mm256_mullo_epi16(diff, shift);
2814       res = _mm256_add_epi16(a32, b);
2815       resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values
2816 
2817       // y calc
2818       resy = _mm256_setzero_si256();
2819       __m256i a0_y, a1_y, shifty;
2820       if ((base_x < min_base_x)) {
2821         __m256i c256, y_c256, base_y_c256, mask256, mul16;
2822         r6 = _mm256_set1_epi16(r << 6);
2823         c256 = _mm256_add_epi16(j256, c1234);
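        // min_base_y256 is all ones, so _mm256_srli_epi16(min_base_y256, 1)
        // equals 0x7fff: the unsigned min below clamps c * dy so that the
        // 16-bit subtraction r6 - mul16 cannot wrap into a large positive
        // offset.  Lanes affected by the clamp are discarded by the final
        // blend.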
2824         mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
2825                                  _mm256_srli_epi16(min_base_y256, 1));
2826         y_c256 = _mm256_sub_epi16(r6, mul16);
2827         base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
2828         mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
2829         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2830         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2831 
2832         a0_y = _mm256_setr_epi16(
2833             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2834             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2835             left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2836             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2837             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2838             left[base_y_c[15]]);
2839         base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
2840         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2841 
2842         a1_y = _mm256_setr_epi16(
2843             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2844             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2845             left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2846             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2847             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2848             left[base_y_c[15]]);
2849 
2850         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
2851 
2852         diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
2853         a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
2854         a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2855 
2856         b = _mm256_mullo_epi16(diff, shifty);
2857         res = _mm256_add_epi16(a32, b);
2858         resy = _mm256_srli_epi16(res, 5);
2859       }
2860 
2861       resxy = _mm256_blendv_epi8(resx, resy,
2862                                  *(__m256i *)HighbdBaseMask[base_min_diff]);
2863       _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2864     }  // for j
2865     dst += stride;
2866   }
2867 }
2868 
2869 // Directional prediction, zone 2: 90 < angle < 180
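// bd < 12 selects the 16-bit kernels: for 10-bit samples the interpolation
// a[x] * 32 + 16 + (a[x+1] - a[x]) * shift peaks at 1023 * 32 + 16 = 32752 and
// fits in int16_t, while 12-bit samples (4095 * 32 + 16 = 131056) require the
// 32-bit kernels.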
2870 void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
2871                                       int bh, const uint16_t *above,
2872                                       const uint16_t *left, int upsample_above,
2873                                       int upsample_left, int dx, int dy,
2874                                       int bd) {
2875   (void)bd;
2876   assert(dx > 0);
2877   assert(dy > 0);
2878   switch (bw) {
2879     case 4:
2880       if (bd < 12) {
2881         highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
2882                                          upsample_above, upsample_left, dx, dy);
2883       } else {
2884         highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
2885                                                upsample_above, upsample_left,
2886                                                dx, dy);
2887       }
2888       break;
2889     case 8:
2890       if (bd < 12) {
2891         highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
2892                                          upsample_above, upsample_left, dx, dy);
2893       } else {
2894         highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
2895                                                upsample_above, upsample_left,
2896                                                dx, dy);
2897       }
2898       break;
2899     default:
2900       if (bd < 12) {
2901         highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2902                                          upsample_above, upsample_left, dx, dy);
2903       } else {
2904         highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2905                                                upsample_above, upsample_left,
2906                                                dx, dy);
2907       }
2908       break;
2909   }
2910 }
2911 
2912 // Directional prediction, zone 3: 180 < angle < 270
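// Zone 3 predicts from the left reference column only, so each kernel below
// reuses the corresponding zone-1 ("above"-only) kernel with left as its
// reference and then transposes the result into dst.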
2913 static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
2914                                              const uint16_t *left,
2915                                              int upsample_left, int dy,
2916                                              int bd) {
2917   __m128i dstvec[4], d[4];
2918   if (bd < 12) {
2919     highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
2920                                               dy);
2921   } else {
2922     highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
2923                                                     upsample_left, dy);
2924   }
2925   highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
2926                                    &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
2927   _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
2928   _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
2929   _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
2930   _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
2931   return;
2932 }
2933 
2934 static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
2935                                              const uint16_t *left,
2936                                              int upsample_left, int dy,
2937                                              int bd) {
2938   __m128i dstvec[8], d[8];
2939   if (bd < 12) {
2940     highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
2941                                               dy);
2942   } else {
2943     highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
2944                                                     upsample_left, dy);
2945   }
2946   highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2947                            &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2948                            &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2949                            &d[7]);
2950   for (int i = 0; i < 8; i++) {
2951     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
2952   }
2953 }
2954 
2955 static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
2956                                              const uint16_t *left,
2957                                              int upsample_left, int dy,
2958                                              int bd) {
2959   __m128i dstvec[4], d[8];
2960   if (bd < 12) {
2961     highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
2962                                               dy);
2963   } else {
2964     highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
2965                                                     upsample_left, dy);
2966   }
2967 
2968   highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2969                                &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2970                                &d[7]);
2971   for (int i = 0; i < 8; i++) {
2972     _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
2973   }
2974 }
2975 
2976 static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
2977                                              const uint16_t *left,
2978                                              int upsample_left, int dy,
2979                                              int bd) {
2980   __m128i dstvec[8], d[4];
2981   if (bd < 12) {
2982     highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
2983                                               dy);
2984   } else {
2985     highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
2986                                                     upsample_left, dy);
2987   }
2988 
2989   highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2990                                &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2991                                &d[0], &d[1], &d[2], &d[3]);
2992   _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
2993   _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
2994   _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
2995   _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
2996 }
2997 
2998 static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
2999                                               const uint16_t *left,
3000                                               int upsample_left, int dy,
3001                                               int bd) {
3002   __m256i dstvec[8], d[8];
3003   if (bd < 12) {
3004     highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
3005                                                dy);
3006   } else {
3007     highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
3008                                                      upsample_left, dy);
3009   }
3010   highbd_transpose8x16_16x8_avx2(dstvec, d);
3011   for (int i = 0; i < 8; i++) {
3012     _mm_storeu_si128((__m128i *)(dst + i * stride),
3013                      _mm256_castsi256_si128(d[i]));
3014   }
3015   for (int i = 8; i < 16; i++) {
3016     _mm_storeu_si128((__m128i *)(dst + i * stride),
3017                      _mm256_extracti128_si256(d[i - 8], 1));
3018   }
3019 }
3020 
3021 static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
3022                                               const uint16_t *left,
3023                                               int upsample_left, int dy,
3024                                               int bd) {
3025   __m128i dstvec[16], d[16];
3026   if (bd < 12) {
3027     highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
3028                                               dy);
3029   } else {
3030     highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
3031                                                     upsample_left, dy);
3032   }
3033   for (int i = 0; i < 16; i += 8) {
3034     highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3035                              &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3036                              &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3037                              &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3038                              &d[5 + i], &d[6 + i], &d[7 + i]);
3039   }
3040   for (int i = 0; i < 8; i++) {
3041     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3042     _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3043   }
3044 }
3045 
3046 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3047 static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
3048                                               const uint16_t *left,
3049                                               int upsample_left, int dy,
3050                                               int bd) {
3051   __m256i dstvec[4], d[4], d1;
3052   if (bd < 12) {
3053     highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
3054                                                dy);
3055   } else {
3056     highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
3057                                                      upsample_left, dy);
3058   }
3059   highbd_transpose4x16_avx2(dstvec, d);
3060   for (int i = 0; i < 4; i++) {
3061     _mm_storel_epi64((__m128i *)(dst + i * stride),
3062                      _mm256_castsi256_si128(d[i]));
3063     d1 = _mm256_bsrli_epi128(d[i], 8);
3064     _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
3065                      _mm256_castsi256_si128(d1));
3066     _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
3067                      _mm256_extracti128_si256(d[i], 1));
3068     _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
3069                      _mm256_extracti128_si256(d1, 1));
3070   }
3071 }
3072 
3073 static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
3074                                               const uint16_t *left,
3075                                               int upsample_left, int dy,
3076                                               int bd) {
3077   __m128i dstvec[16], d[8];
3078   if (bd < 12) {
3079     highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
3080                                               dy);
3081   } else {
3082     highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
3083                                                     upsample_left, dy);
3084   }
3085   highbd_transpose16x4_8x8_sse2(dstvec, d);
3086 
3087   _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
3088   _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
3089   _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
3090   _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
3091   _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
3092   _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
3093   _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
3094   _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
3095 }
3096 
3097 static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
3098                                               const uint16_t *left,
3099                                               int upsample_left, int dy,
3100                                               int bd) {
3101   __m256i dstvec[16], d[16];
3102   if (bd < 12) {
3103     highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
3104                                                dy);
3105   } else {
3106     highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
3107                                                      upsample_left, dy);
3108   }
3109 
3110   for (int i = 0; i < 16; i += 8) {
3111     highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3112   }
3113 
3114   for (int i = 0; i < 8; i++) {
3115     _mm_storeu_si128((__m128i *)(dst + i * stride),
3116                      _mm256_castsi256_si128(d[i]));
3117   }
3118   for (int i = 0; i < 8; i++) {
3119     _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3120                      _mm256_extracti128_si256(d[i], 1));
3121   }
3122   for (int i = 8; i < 16; i++) {
3123     _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3124                      _mm256_castsi256_si128(d[i]));
3125   }
3126   for (int i = 8; i < 16; i++) {
3127     _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
3128                      _mm256_extracti128_si256(d[i], 1));
3129   }
3130 }
3131 
3132 static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
3133                                               const uint16_t *left,
3134                                               int upsample_left, int dy,
3135                                               int bd) {
3136   __m128i dstvec[32], d[32];
3137   if (bd < 12) {
3138     highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
3139                                               dy);
3140   } else {
3141     highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
3142                                                     upsample_left, dy);
3143   }
3144 
3145   for (int i = 0; i < 32; i += 8) {
3146     highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3147                              &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3148                              &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3149                              &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3150                              &d[5 + i], &d[6 + i], &d[7 + i]);
3151   }
3152   for (int i = 0; i < 8; i++) {
3153     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3154     _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3155     _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
3156     _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
3157   }
3158 }
3159 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3160 
3161 static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
3162                                                const uint16_t *left,
3163                                                int upsample_left, int dy,
3164                                                int bd) {
3165   __m256i dstvec[16], d[16];
3166   if (bd < 12) {
3167     highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
3168                                                dy);
3169   } else {
3170     highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
3171                                                      upsample_left, dy);
3172   }
3173 
3174   highbd_transpose16x16_avx2(dstvec, d);
3175 
3176   for (int i = 0; i < 16; i++) {
3177     _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
3178   }
3179 }
3180 
3181 static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
3182                                                const uint16_t *left,
3183                                                int upsample_left, int dy,
3184                                                int bd) {
3185   __m256i dstvec[64], d[16];
3186   if (bd < 12) {
3187     highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
3188                                                dy);
3189   } else {
3190     highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
3191                                                      upsample_left, dy);
3192   }
3193   highbd_transpose16x16_avx2(dstvec, d);
3194   for (int j = 0; j < 16; j++) {
3195     _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
3196   }
3197   highbd_transpose16x16_avx2(dstvec + 16, d);
3198   for (int j = 0; j < 16; j++) {
3199     _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
3200   }
3201   highbd_transpose16x16_avx2(dstvec + 32, d);
3202   for (int j = 0; j < 16; j++) {
3203     _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
3204   }
3205   highbd_transpose16x16_avx2(dstvec + 48, d);
3206   for (int j = 0; j < 16; j++) {
3207     _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
3208   }
3209 }
3210 
3211 static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
3212                                                const uint16_t *left,
3213                                                int upsample_left, int dy,
3214                                                int bd) {
3215   DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
3216   if (bd < 12) {
3217     highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
3218   } else {
3219     highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
3220                                             dy);
3221   }
3222   highbd_transpose(dstT, 64, dst, stride, 64, 64);
3223 }
3224 
3225 static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
3226                                                const uint16_t *left,
3227                                                int upsample_left, int dy,
3228                                                int bd) {
3229   __m256i dstvec[32], d[32];
3230   if (bd < 12) {
3231     highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
3232                                                dy);
3233   } else {
3234     highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
3235                                                      upsample_left, dy);
3236   }
3237   for (int i = 0; i < 32; i += 8) {
3238     highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3239   }
3240   // store
3241   for (int j = 0; j < 32; j += 16) {
3242     for (int i = 0; i < 8; i++) {
3243       _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
3244                        _mm256_castsi256_si128(d[(i + j)]));
3245     }
3246     for (int i = 0; i < 8; i++) {
3247       _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
3248                        _mm256_castsi256_si128(d[(i + j) + 8]));
3249     }
3250     for (int i = 8; i < 16; i++) {
3251       _mm256_storeu_si256(
3252           (__m256i *)(dst + (i + j) * stride),
3253           _mm256_inserti128_si256(
3254               d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
3255     }
3256   }
3257 }
3258 
3259 static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
3260                                                const uint16_t *left,
3261                                                int upsample_left, int dy,
3262                                                int bd) {
3263   __m256i dstvec[32], d[16];
3264   if (bd < 12) {
3265     highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
3266                                                dy);
3267   } else {
3268     highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
3269                                                      upsample_left, dy);
3270   }
3271   for (int i = 0; i < 32; i += 16) {
3272     highbd_transpose16x16_avx2((dstvec + i), d);
3273     for (int j = 0; j < 16; j++) {
3274       _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3275     }
3276   }
3277 }
3278 
3279 static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
3280                                                const uint16_t *left,
3281                                                int upsample_left, int dy,
3282                                                int bd) {
3283   uint16_t dstT[64 * 32];
3284   if (bd < 12) {
3285     highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
3286   } else {
3287     highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
3288                                             dy);
3289   }
3290   highbd_transpose(dstT, 64, dst, stride, 32, 64);
3291 }
3292 
3293 static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
3294                                                const uint16_t *left,
3295                                                int upsample_left, int dy,
3296                                                int bd) {
3297   DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
3298   highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
3299   highbd_transpose(dstT, 32, dst, stride, 64, 32);
3300   return;
3301 }
3302 
3303 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3304 static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
3305                                                const uint16_t *left,
3306                                                int upsample_left, int dy,
3307                                                int bd) {
3308   DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
3309   if (bd < 12) {
3310     highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
3311   } else {
3312     highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
3313                                             dy);
3314   }
3315   highbd_transpose(dstT, 64, dst, stride, 16, 64);
3316 }
3317 
3318 static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
3319                                                const uint16_t *left,
3320                                                int upsample_left, int dy,
3321                                                int bd) {
3322   __m256i dstvec[64], d[16];
3323   if (bd < 12) {
3324     highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
3325                                                dy);
3326   } else {
3327     highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
3328                                                      upsample_left, dy);
3329   }
3330   for (int i = 0; i < 64; i += 16) {
3331     highbd_transpose16x16_avx2((dstvec + i), d);
3332     for (int j = 0; j < 16; j++) {
3333       _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3334     }
3335   }
3336 }
3337 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3338 
3339 void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
3340                                       int bh, const uint16_t *above,
3341                                       const uint16_t *left, int upsample_left,
3342                                       int dx, int dy, int bd) {
3343   (void)above;
3344   (void)dx;
3345 
3346   assert(dx == 1);
3347   assert(dy > 0);
3348   if (bw == bh) {
3349     switch (bw) {
3350       case 4:
3351         highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
3352                                          bd);
3353         break;
3354       case 8:
3355         highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
3356                                          bd);
3357         break;
3358       case 16:
3359         highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
3360                                            bd);
3361         break;
3362       case 32:
3363         highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
3364                                            bd);
3365         break;
3366       case 64:
3367         highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
3368                                            bd);
3369         break;
3370     }
3371   } else {
3372     if (bw < bh) {
3373       if (bw + bw == bh) {
3374         switch (bw) {
3375           case 4:
3376             highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
3377                                              dy, bd);
3378             break;
3379           case 8:
3380             highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
3381                                               dy, bd);
3382             break;
3383           case 16:
3384             highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
3385                                                dy, bd);
3386             break;
3387           case 32:
3388             highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
3389                                                dy, bd);
3390             break;
3391         }
3392       } else {
3393         switch (bw) {
3394 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3395           case 4:
3396             highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
3397                                               dy, bd);
3398             break;
3399           case 8:
3400             highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
3401                                               dy, bd);
3402             break;
3403           case 16:
3404             highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
3405                                                dy, bd);
3406             break;
3407 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3408         }
3409       }
3410     } else {
3411       if (bh + bh == bw) {
3412         switch (bh) {
3413           case 4:
3414             highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
3415                                              dy, bd);
3416             break;
3417           case 8:
3418             highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
3419                                               dy, bd);
3420             break;
3421           case 16:
3422             highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
3423                                                dy, bd);
3424             break;
3425           case 32:
3426             highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
3427                                                dy, bd);
3428             break;
3429         }
3430       } else {
3431         switch (bh) {
3432 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3433           case 4:
3434             highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
3435                                               dy, bd);
3436             break;
3437           case 8:
3438             highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
3439                                               dy, bd);
3440             break;
3441           case 16:
3442             highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
3443                                                dy, bd);
3444             break;
3445 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3446         }
3447       }
3448     }
3449   }
3450   return;
3451 }
3452 #endif  // CONFIG_AV1_HIGHBITDEPTH
3453 
3454 // Low bit depth functions
3455 static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
3456   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3457     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3458   { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3459     0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3460   { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3461     0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3462   { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3463     0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3464   { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3465     0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3466   { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3467     0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3468   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3469     0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3470   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3471     0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
3472   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
3473     0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
3474   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
3475     0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
3476   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3477     0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3478     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3479   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3480     0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3481     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3482   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3483     0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3484     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3485   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3486     0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
3487     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3488   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3489     0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
3490     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3491   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3492     0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
3493     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3494   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3495     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
3496     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3497   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3498     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
3499     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3500   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3501     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
3502     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3503   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3504     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
3505     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3506   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3507     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
3508     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3509   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3510     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3511     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3512   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3514     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3515   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3517     0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
3518   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3520     0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
3521   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3523     0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
3524   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3525     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3526     0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
3527   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3528     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3529     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
3530   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3531     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3532     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
3533   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3534     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3535     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
3536   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3537     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3538     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
3539   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3540     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3541     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
3542   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3543     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3544     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
3545 };
3546 
3547 /* clang-format on */
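// BaseMask[k] has 0xff in its first k bytes and 0x00 in the rest.  Blending
// with it keeps the first k interpolated pixels of a row and fills the
// remaining lanes with the replicated last reference sample
// (above[max_base_x]).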
3548 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
3549     int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
3550     int dx) {
3551   const int frac_bits = 6 - upsample_above;
3552   const int max_base_x = ((W + H) - 1) << upsample_above;
3553 
3554   assert(dx > 0);
3555   // pre-filter above pixels
3556   // store in temp buffers:
3557   //   above[x] * 32 + 16
3558   //   above[x+1] - above[x]
3559   // final pixels will be calculated as:
3560   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
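  // A scalar sketch of the interpolation vectorized below: every row r is a
  // two-tap filter along the top reference, and columns whose sample index
  // runs past max_base_x are filled with the replicated last sample.  Kept
  // under #if 0 so it is never built.
#if 0
  for (int r = 0, x = dx; r < W; ++r, x += dx) {
    const int base = x >> frac_bits;
    const int shift = ((x << upsample_above) & 0x3f) >> 1;
    for (int c = 0; c < H; ++c) {
      const int idx = base + (c << upsample_above);
      const int pel = (idx < max_base_x)
                          ? ((above[idx] * 32 + 16 +
                              (above[idx + 1] - above[idx]) * shift) >> 5)
                          : above[max_base_x];
      (void)pel;  // pel is byte c of the packed row dst[r]
    }
  }
#endif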
3561   __m256i a0, a1, a32, a16;
3562   __m256i diff, c3f;
3563   __m128i a_mbase_x;
3564 
3565   a16 = _mm256_set1_epi16(16);
3566   a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
3567   c3f = _mm256_set1_epi16(0x3f);
3568 
3569   int x = dx;
3570   for (int r = 0; r < W; r++) {
3571     __m256i b, res, shift;
3572     __m128i res1, a0_128, a1_128;
3573 
3574     int base = x >> frac_bits;
3575     int base_max_diff = (max_base_x - base) >> upsample_above;
3576     if (base_max_diff <= 0) {
3577       for (int i = r; i < W; ++i) {
3578         dst[i] = a_mbase_x;  // save 4 values
3579       }
3580       return;
3581     }
3582     if (base_max_diff > H) base_max_diff = H;
3583     a0_128 = _mm_loadu_si128((__m128i *)(above + base));
3584     a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
3585 
3586     if (upsample_above) {
3587       a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
3588       a1_128 = _mm_srli_si128(a0_128, 8);
3589 
3590       shift = _mm256_srli_epi16(
3591           _mm256_and_si256(
3592               _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
3593           1);
3594     } else {
3595       shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3596     }
3597     a0 = _mm256_cvtepu8_epi16(a0_128);
3598     a1 = _mm256_cvtepu8_epi16(a1_128);
3599 
3600     diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3601     a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3602     a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3603 
3604     b = _mm256_mullo_epi16(diff, shift);
3605     res = _mm256_add_epi16(a32, b);
3606     res = _mm256_srli_epi16(res, 5);
3607 
3608     res = _mm256_packus_epi16(
3609         res, _mm256_castsi128_si256(
3610                  _mm256_extracti128_si256(res, 1)));  // goto 8 bit
3611     res1 = _mm256_castsi256_si128(res);               // 16 8bit values
3612 
3613     dst[r] =
3614         _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
3615     x += dx;
3616   }
3617 }
3618 
3619 static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3620                                       const uint8_t *above, int upsample_above,
3621                                       int dx) {
3622   __m128i dstvec[16];
3623 
3624   dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
3625   for (int i = 0; i < N; i++) {
3626     *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
3627   }
3628 }
3629 
3630 static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3631                                       const uint8_t *above, int upsample_above,
3632                                       int dx) {
3633   __m128i dstvec[32];
3634 
3635   dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
3636   for (int i = 0; i < N; i++) {
3637     _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
3638   }
3639 }
3640 
3641 static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3642                                        const uint8_t *above, int upsample_above,
3643                                        int dx) {
3644   __m128i dstvec[64];
3645 
3646   dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
3647   for (int i = 0; i < N; i++) {
3648     _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
3649   }
3650 }
3651 
3652 static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
3653     int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
3654   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3655   (void)upsample_above;
3656   const int frac_bits = 6;
3657   const int max_base_x = ((32 + N) - 1);
3658 
3659   // pre-filter above pixels
3660   // store in temp buffers:
3661   //   above[x] * 32 + 16
3662   //   above[x+1] - above[x]
3663   // final pixels will be calculated as:
3664   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3665   __m256i a0, a1, a32, a16;
3666   __m256i a_mbase_x, diff, c3f;
3667 
3668   a16 = _mm256_set1_epi16(16);
3669   a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3670   c3f = _mm256_set1_epi16(0x3f);
3671 
3672   int x = dx;
3673   for (int r = 0; r < N; r++) {
3674     __m256i b, res, res16[2];
3675     __m128i a0_128, a1_128;
3676 
3677     int base = x >> frac_bits;
3678     int base_max_diff = (max_base_x - base);
3679     if (base_max_diff <= 0) {
3680       for (int i = r; i < N; ++i) {
3681         dstvec[i] = a_mbase_x;  // save 32 values
3682       }
3683       return;
3684     }
3685     if (base_max_diff > 32) base_max_diff = 32;
3686     __m256i shift =
3687         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3688 
3689     for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
3690       int mdiff = base_max_diff - j;
3691       if (mdiff <= 0) {
3692         res16[jj] = a_mbase_x;
3693       } else {
3694         a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3695         a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
3696         a0 = _mm256_cvtepu8_epi16(a0_128);
3697         a1 = _mm256_cvtepu8_epi16(a1_128);
3698 
3699         diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3700         a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3701         a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3702         b = _mm256_mullo_epi16(diff, shift);
3703 
3704         res = _mm256_add_epi16(a32, b);
3705         res = _mm256_srli_epi16(res, 5);
3706         res16[jj] = _mm256_packus_epi16(
3707             res, _mm256_castsi128_si256(
3708                      _mm256_extracti128_si256(res, 1)));  // 16 8bit values
3709       }
3710     }
3711     res16[1] =
3712         _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
3713                                 1);  // 32 8bit values
3714 
3715     dstvec[r] = _mm256_blendv_epi8(
3716         a_mbase_x, res16[1],
3717         *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
3718     x += dx;
3719   }
3720 }
3721 
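/* A sketch of the right-edge blend done at the end of each row above: lanes
 * beyond the number of valid pixels fall back to the replicated
 * above[max_base_x] sample. BaseMask[] in the library is a precomputed table
 * of exactly these 0x00/0xff byte patterns; the loop here is illustrative.
 */
static inline __m256i dr_z1_edge_blend_sketch(__m256i computed,
                                              __m256i mbase_x, int valid) {
  // valid is the count of computed lanes to keep, 0..32.
  DECLARE_ALIGNED(32, uint8_t, mask[32]);
  for (int i = 0; i < 32; ++i) mask[i] = (i < valid) ? 0xff : 0x00;
  // blendv picks `computed` where the mask byte's high bit is set.
  return _mm256_blendv_epi8(mbase_x, computed, *(const __m256i *)mask);
}
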
3722 static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3723                                        const uint8_t *above, int upsample_above,
3724                                        int dx) {
3725   __m256i dstvec[64];
3726   dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
3727   for (int i = 0; i < N; i++) {
3728     _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
3729   }
3730 }
3731 
3732 static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3733                                        const uint8_t *above, int upsample_above,
3734                                        int dx) {
3735   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3736   (void)upsample_above;
3737   const int frac_bits = 6;
3738   const int max_base_x = ((64 + N) - 1);
3739 
3740   // pre-filter above pixels
3741   // store in temp buffers:
3742   //   above[x] * 32 + 16
3743   //   above[x+1] - above[x]
3744   // final pixels will be calculated as:
3745   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3746   __m256i a0, a1, a32, a16;
3747   __m256i a_mbase_x, diff, c3f;
3748   __m128i max_base_x128, base_inc128, mask128;
3749 
3750   a16 = _mm256_set1_epi16(16);
3751   a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3752   max_base_x128 = _mm_set1_epi8(max_base_x);
3753   c3f = _mm256_set1_epi16(0x3f);
3754 
3755   int x = dx;
3756   for (int r = 0; r < N; r++, dst += stride) {
3757     __m256i b, res;
3758     int base = x >> frac_bits;
3759     if (base >= max_base_x) {
3760       for (int i = r; i < N; ++i) {
3761         _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
3762         _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
3763         dst += stride;
3764       }
3765       return;
3766     }
3767 
3768     __m256i shift =
3769         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3770 
3771     __m128i a0_128, a1_128, res128;
3772     for (int j = 0; j < 64; j += 16) {
3773       int mdif = max_base_x - (base + j);
3774       if (mdif <= 0) {
3775         _mm_storeu_si128((__m128i *)(dst + j),
3776                          _mm256_castsi256_si128(a_mbase_x));
3777       } else {
3778         a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3779         a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
3780         a0 = _mm256_cvtepu8_epi16(a0_128);
3781         a1 = _mm256_cvtepu8_epi16(a1_128);
3782 
3783         diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3784         a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3785         a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3786         b = _mm256_mullo_epi16(diff, shift);
3787 
3788         res = _mm256_add_epi16(a32, b);
3789         res = _mm256_srli_epi16(res, 5);
3790         res = _mm256_packus_epi16(
3791             res, _mm256_castsi128_si256(
3792                      _mm256_extracti128_si256(res, 1)));  // 16 8bit values
3793 
3794         base_inc128 =
3795             _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
3796                           (int8_t)(base + j + 2), (int8_t)(base + j + 3),
3797                           (int8_t)(base + j + 4), (int8_t)(base + j + 5),
3798                           (int8_t)(base + j + 6), (int8_t)(base + j + 7),
3799                           (int8_t)(base + j + 8), (int8_t)(base + j + 9),
3800                           (int8_t)(base + j + 10), (int8_t)(base + j + 11),
3801                           (int8_t)(base + j + 12), (int8_t)(base + j + 13),
3802                           (int8_t)(base + j + 14), (int8_t)(base + j + 15));
3803 
3804         mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
3805                                  _mm_setzero_si128());
3806         res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
3807                                  _mm256_castsi256_si128(res), mask128);
3808         _mm_storeu_si128((__m128i *)(dst + j), res128);
3809       }
3810     }
3811     x += dx;
3812   }
3813 }
3814 
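/* Sketch of the on-the-fly validity mask the 64-wide kernel above builds for
 * each 16-byte chunk instead of indexing a mask table: a saturating subtract
 * against max_base_x leaves zero exactly in the lanes that must fall back to
 * the replicated above[max_base_x] sample. Illustrative only; relies on
 * max_base_x <= 127, as in the kernel.
 */
static inline __m128i dr_z1_valid_mask16_sketch(int base, int max_base_x) {
  DECLARE_ALIGNED(16, int8_t, idx[16]);
  for (int i = 0; i < 16; ++i) idx[i] = (int8_t)(base + i);
  const __m128i base_inc = _mm_load_si128((const __m128i *)idx);
  const __m128i max128 = _mm_set1_epi8((int8_t)max_base_x);
  // 0xff where base + i < max_base_x (pixel is valid), 0x00 otherwise.
  return _mm_cmpgt_epi8(_mm_subs_epu8(max128, base_inc), _mm_setzero_si128());
}
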
3815 // Directional prediction, zone 1: 0 < angle < 90
3816 void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
3817                                const uint8_t *above, const uint8_t *left,
3818                                int upsample_above, int dx, int dy) {
3819   (void)left;
3820   (void)dy;
3821   switch (bw) {
3822     case 4:
3823       dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
3824       break;
3825     case 8:
3826       dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
3827       break;
3828     case 16:
3829       dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
3830       break;
3831     case 32:
3832       dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
3833       break;
3834     case 64:
3835       dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
3836       break;
3837     default: break;
3838   }
3839   return;
3840 }
3841 
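/* Hypothetical usage sketch of the zone-1 dispatcher above: a 16x16 block
 * predicted purely from the above row, with no edge upsampling. dx = 64 is
 * one pixel of horizontal advance per row (roughly a 45-degree angle); the
 * values here are illustrative, and `above` is assumed to carry the padding
 * the library's edge buffers provide.
 */
static inline void dr_z1_usage_sketch(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above) {
  av1_dr_prediction_z1_avx2(dst, stride, /*bw=*/16, /*bh=*/16, above,
                            /*left=*/NULL, /*upsample_above=*/0, /*dx=*/64,
                            /*dy=*/0);
}
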
3842 static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3843                                       const uint8_t *above, const uint8_t *left,
3844                                       int upsample_above, int upsample_left,
3845                                       int dx, int dy) {
3846   const int min_base_x = -(1 << upsample_above);
3847   const int min_base_y = -(1 << upsample_left);
3848   const int frac_bits_x = 6 - upsample_above;
3849   const int frac_bits_y = 6 - upsample_left;
3850 
3851   assert(dx > 0);
3852   // pre-filter above pixels
3853   // store in temp buffers:
3854   //   above[x] * 32 + 16
3855   //   above[x+1] - above[x]
3856   // final pixels will be calculated as:
3857   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3858   __m128i a0_x, a1_x, a32, a16, diff;
3859   __m128i c3f, min_base_y128, c1234, dy128;
3860 
3861   a16 = _mm_set1_epi16(16);
3862   c3f = _mm_set1_epi16(0x3f);
3863   min_base_y128 = _mm_set1_epi16(min_base_y);
3864   c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
3865   dy128 = _mm_set1_epi16(dy);
3866 
3867   for (int r = 0; r < N; r++) {
3868     __m128i b, res, shift, r6, ydx;
3869     __m128i resx, resy, resxy;
3870     __m128i a0_x128, a1_x128;
3871     int y = r + 1;
3872     int base_x = (-y * dx) >> frac_bits_x;
3873     int base_shift = 0;
3874     if (base_x < (min_base_x - 1)) {
3875       base_shift = (min_base_x - base_x - 1) >> upsample_above;
3876     }
3877     int base_min_diff =
3878         (min_base_x - base_x + upsample_above) >> upsample_above;
3879     if (base_min_diff > 4) {
3880       base_min_diff = 4;
3881     } else {
3882       if (base_min_diff < 0) base_min_diff = 0;
3883     }
3884 
3885     if (base_shift > 3) {
3886       a0_x = _mm_setzero_si128();
3887       a1_x = _mm_setzero_si128();
3888       shift = _mm_setzero_si128();
3889     } else {
3890       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3891       ydx = _mm_set1_epi16(y * dx);
3892       r6 = _mm_slli_epi16(c1234, 6);
3893 
3894       if (upsample_above) {
3895         a0_x128 =
3896             _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
3897         a1_x128 = _mm_srli_si128(a0_x128, 8);
3898 
3899         shift = _mm_srli_epi16(
3900             _mm_and_si128(
3901                 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
3902             1);
3903       } else {
3904         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
3905         a1_x128 = _mm_srli_si128(a0_x128, 1);
3906 
3907         shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
3908       }
3909       a0_x = _mm_cvtepu8_epi16(a0_x128);
3910       a1_x = _mm_cvtepu8_epi16(a1_x128);
3911     }
3912     // y calc
3913     __m128i a0_y, a1_y, shifty;
3914     if (base_x < min_base_x) {
3915       DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
3916       __m128i y_c128, base_y_c128, mask128, c1234_;
3917       c1234_ = _mm_srli_si128(c1234, 2);
3918       r6 = _mm_set1_epi16(r << 6);
3919       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
3920       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
3921       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
3922       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
3923       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3924 
3925       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3926                             left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3927       base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
3928       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3929       a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3930                             left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3931 
3932       if (upsample_left) {
3933         shifty = _mm_srli_epi16(
3934             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
3935       } else {
3936         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
3937       }
3938       a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
3939       a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
3940       shift = _mm_unpacklo_epi64(shift, shifty);
3941     }
3942 
3943     diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
3944     a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
3945     a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
3946 
3947     b = _mm_mullo_epi16(diff, shift);
3948     res = _mm_add_epi16(a32, b);
3949     res = _mm_srli_epi16(res, 5);
3950 
3951     resx = _mm_packus_epi16(res, res);
3952     resy = _mm_srli_si128(resx, 4);
3953 
3954     resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
3955     *(int *)(dst) = _mm_cvtsi128_si32(resxy);
3956     dst += stride;
3957   }
3958 }
3959 
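/* Scalar sketch (no upsampling, illustrative only) of the zone-2 selection
 * rule the Nx4/Nx8/HxW kernels implement with masks and blends: a pixel whose
 * projection onto the above row lands left of min_base_x is predicted from
 * `left` with dy instead, using the same (v * 32 + 16 + diff * shift) >> 5
 * interpolation. above[-1] and left[-1] (the shared top-left sample) are
 * assumed addressable, as in the library's edge layout.
 */
static inline uint8_t dr_z2_pixel_scalar_sketch(const uint8_t *above,
                                                const uint8_t *left, int r,
                                                int c, int dx, int dy) {
  const int x = (c << 6) - (r + 1) * dx;
  const int base_x = x >> 6;
  if (base_x >= -1) {
    const int shift = (x & 0x3f) >> 1;
    const int diff = above[base_x + 1] - above[base_x];
    return (uint8_t)((above[base_x] * 32 + 16 + diff * shift) >> 5);
  }
  const int y = (r << 6) - (c + 1) * dy;
  const int base_y = y >> 6;
  const int shift = (y & 0x3f) >> 1;
  const int diff = left[base_y + 1] - left[base_y];
  return (uint8_t)((left[base_y] * 32 + 16 + diff * shift) >> 5);
}
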
3960 static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3961                                       const uint8_t *above, const uint8_t *left,
3962                                       int upsample_above, int upsample_left,
3963                                       int dx, int dy) {
3964   const int min_base_x = -(1 << upsample_above);
3965   const int min_base_y = -(1 << upsample_left);
3966   const int frac_bits_x = 6 - upsample_above;
3967   const int frac_bits_y = 6 - upsample_left;
3968 
3969   // pre-filter above pixels
3970   // store in temp buffers:
3971   //   above[x] * 32 + 16
3972   //   above[x+1] - above[x]
3973   // final pixels will be calculated as:
3974   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3975   __m256i diff, a32, a16;
3976   __m256i a0_x, a1_x;
3977   __m128i a0_x128, a1_x128, min_base_y128, c3f;
3978   __m128i c1234, dy128;
3979 
3980   a16 = _mm256_set1_epi16(16);
3981   c3f = _mm_set1_epi16(0x3f);
3982   min_base_y128 = _mm_set1_epi16(min_base_y);
3983   dy128 = _mm_set1_epi16(dy);
3984   c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3985 
3986   for (int r = 0; r < N; r++) {
3987     __m256i b, res, shift;
3988     __m128i resx, resy, resxy, r6, ydx;
3989 
3990     int y = r + 1;
3991     int base_x = (-y * dx) >> frac_bits_x;
3992     int base_shift = 0;
3993     if (base_x < (min_base_x - 1)) {
3994       base_shift = (min_base_x - base_x - 1) >> upsample_above;
3995     }
3996     int base_min_diff =
3997         (min_base_x - base_x + upsample_above) >> upsample_above;
3998     if (base_min_diff > 8) {
3999       base_min_diff = 8;
4000     } else {
4001       if (base_min_diff < 0) base_min_diff = 0;
4002     }
4003 
4004     if (base_shift > 7) {
4005       a0_x = _mm256_setzero_si256();
4006       a1_x = _mm256_setzero_si256();
4007       shift = _mm256_setzero_si256();
4008     } else {
4009       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
4010       ydx = _mm_set1_epi16(y * dx);
4011       r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
4012       if (upsample_above) {
4013         a0_x128 =
4014             _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
4015         a1_x128 = _mm_srli_si128(a0_x128, 8);
4016 
4017         shift = _mm256_castsi128_si256(_mm_srli_epi16(
4018             _mm_and_si128(
4019                 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
4020             1));
4021       } else {
4022         a1_x128 = _mm_srli_si128(a0_x128, 1);
4023         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
4024         a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
4025 
4026         shift = _mm256_castsi128_si256(
4027             _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
4028       }
4029       a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
4030       a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
4031     }
4032 
4033     // y calc
4034     __m128i a0_y, a1_y, shifty;
4035     if (base_x < min_base_x) {
4036       DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4037       __m128i y_c128, base_y_c128, mask128;
4038       r6 = _mm_set1_epi16(r << 6);
4039       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
4040       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
4041       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
4042       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
4043       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4044 
4045       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4046                             left[base_y_c[2]], left[base_y_c[3]],
4047                             left[base_y_c[4]], left[base_y_c[5]],
4048                             left[base_y_c[6]], left[base_y_c[7]]);
4049       base_y_c128 = _mm_add_epi16(
4050           base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
4051       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4052 
4053       a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4054                             left[base_y_c[2]], left[base_y_c[3]],
4055                             left[base_y_c[4]], left[base_y_c[5]],
4056                             left[base_y_c[6]], left[base_y_c[7]]);
4057 
4058       if (upsample_left) {
4059         shifty = _mm_srli_epi16(
4060             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
4061       } else {
4062         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
4063       }
4064 
4065       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
4066       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
4067       shift = _mm256_inserti128_si256(shift, shifty, 1);
4068     }
4069 
4070     diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
4071     a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
4072     a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4073 
4074     b = _mm256_mullo_epi16(diff, shift);
4075     res = _mm256_add_epi16(a32, b);
4076     res = _mm256_srli_epi16(res, 5);
4077 
4078     resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
4079                             _mm256_castsi256_si128(res));
4080     resy = _mm256_extracti128_si256(res, 1);
4081     resy = _mm_packus_epi16(resy, resy);
4082 
4083     resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4084     _mm_storel_epi64((__m128i *)(dst), resxy);
4085     dst += stride;
4086   }
4087 }
4088 
4089 static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
4090                                       ptrdiff_t stride, const uint8_t *above,
4091                                       const uint8_t *left, int upsample_above,
4092                                       int upsample_left, int dx, int dy) {
4093   // here upsample_above and upsample_left are 0 by design of
4094   // av1_use_intra_edge_upsample
4095   const int min_base_x = -1;
4096   const int min_base_y = -1;
4097   (void)upsample_above;
4098   (void)upsample_left;
4099   const int frac_bits_x = 6;
4100   const int frac_bits_y = 6;
4101 
4102   __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
4103   __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
4104   __m128i a0_x128, a1_x128;
4105 
4106   DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4107   a16 = _mm256_set1_epi16(16);
4108   c1 = _mm256_srli_epi16(a16, 4);
4109   min_base_y256 = _mm256_set1_epi16(min_base_y);
4110   c3f = _mm256_set1_epi16(0x3f);
4111   dy256 = _mm256_set1_epi16(dy);
4112   c0123 =
4113       _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4114   c1234 = _mm256_add_epi16(c0123, c1);
4115 
4116   for (int r = 0; r < H; r++) {
4117     __m256i b, res, shift, j256, r6, ydx;
4118     __m128i resx, resy;
4119     __m128i resxy;
4120     int y = r + 1;
4121     ydx = _mm256_set1_epi16((int16_t)(y * dx));
4122 
4123     int base_x = (-y * dx) >> frac_bits_x;
4124     for (int j = 0; j < W; j += 16) {
4125       j256 = _mm256_set1_epi16(j);
4126       int base_shift = 0;
4127       if ((base_x + j) < (min_base_x - 1)) {
4128         base_shift = (min_base_x - (base_x + j) - 1);
4129       }
4130       int base_min_diff = (min_base_x - base_x - j);
4131       if (base_min_diff > 16) {
4132         base_min_diff = 16;
4133       } else {
4134         if (base_min_diff < 0) base_min_diff = 0;
4135       }
4136 
4137       if (base_shift < 16) {
4138         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
4139         a1_x128 =
4140             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
4141         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
4142         a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
4143 
4144         a0_x = _mm256_cvtepu8_epi16(a0_x128);
4145         a1_x = _mm256_cvtepu8_epi16(a1_x128);
4146 
4147         r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
4148         shift = _mm256_srli_epi16(
4149             _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
4150 
4151         diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
4152         a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
4153         a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4154 
4155         b = _mm256_mullo_epi16(diff, shift);
4156         res = _mm256_add_epi16(a32, b);
4157         res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
4158         resx = _mm256_castsi256_si128(_mm256_packus_epi16(
4159             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4160       } else {
4161         resx = _mm_setzero_si128();
4162       }
4163 
4164       // y calc
4165       if (base_x < min_base_x) {
4166         __m256i c256, y_c256, base_y_c256, mask256, mul16;
4167         r6 = _mm256_set1_epi16(r << 6);
4168         c256 = _mm256_add_epi16(j256, c1234);
4169         mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
4170                                  _mm256_srli_epi16(min_base_y256, 1));
4171         y_c256 = _mm256_sub_epi16(r6, mul16);
4172 
4173         base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
4174         mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
4175 
4176         base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
4177         int16_t min_y = (int16_t)_mm_extract_epi16(
4178             _mm256_extracti128_si256(base_y_c256, 1), 7);
4179         int16_t max_y =
4180             (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
4181         int16_t offset_diff = max_y - min_y;
4182 
4183         if (offset_diff < 16) {
4184           __m256i min_y256 = _mm256_set1_epi16(min_y);
4185 
4186           __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
4187           __m128i base_y_offset128 =
4188               _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
4189                               _mm256_extracti128_si256(base_y_offset, 1));
4190 
4191           __m128i a0_y128 = _mm_maskload_epi32(
4192               (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
4193           __m128i a1_y128 =
4194               _mm_maskload_epi32((int *)(left + min_y + 1),
4195                                  *(__m128i *)LoadMaskz2[offset_diff / 4]);
4196           a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
4197           a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
4198           a0_y = _mm256_cvtepu8_epi16(a0_y128);
4199           a1_y = _mm256_cvtepu8_epi16(a1_y128);
4200         } else {
4201           base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
4202           _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4203 
4204           a0_y = _mm256_setr_epi16(
4205               left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4206               left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4207               left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4208               left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4209               left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4210               left[base_y_c[15]]);
4211           base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
4212           _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4213 
4214           a1_y = _mm256_setr_epi16(
4215               left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4216               left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4217               left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4218               left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4219               left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4220               left[base_y_c[15]]);
4221         }
4222         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
4223 
4224         diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
4225         a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
4226         a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4227 
4228         b = _mm256_mullo_epi16(diff, shifty);
4229         res = _mm256_add_epi16(a32, b);
4230         res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
4231         resy = _mm256_castsi256_si128(_mm256_packus_epi16(
4232             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4233       } else {
4234         resy = _mm_setzero_si128();
4235       }
4236       resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4237       _mm_storeu_si128((__m128i *)(dst + j), resxy);
4238     }  // for j
4239     dst += stride;
4240   }
4241 }
4242 
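/* Sketch of the small-range gather used in the left-edge branch of the HxW
 * kernel above: when the 16 base_y indices of a row span fewer than 16 bytes,
 * one load of the covered range plus a byte shuffle replaces 16 scalar
 * lookups. This hypothetical helper uses a plain unaligned load where the
 * kernel uses _mm_maskload_epi32 to stay inside the edge buffer.
 */
static inline __m128i dr_z2_gather16_sketch(const uint8_t *buf,
                                            const int16_t idx[16]) {
  int16_t min_idx = idx[0];
  for (int i = 1; i < 16; ++i)
    if (idx[i] < min_idx) min_idx = idx[i];
  DECLARE_ALIGNED(16, uint8_t, offset[16]);
  for (int i = 0; i < 16; ++i) offset[i] = (uint8_t)(idx[i] - min_idx);
  const __m128i bytes = _mm_loadu_si128((const __m128i *)(buf + min_idx));
  // pshufb picks bytes[offset[i]]; all offsets are < 16 by assumption.
  return _mm_shuffle_epi8(bytes, *(const __m128i *)offset);
}
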
4243 // Directional prediction, zone 2: 90 < angle < 180
4244 void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
4245                                const uint8_t *above, const uint8_t *left,
4246                                int upsample_above, int upsample_left, int dx,
4247                                int dy) {
4248   assert(dx > 0);
4249   assert(dy > 0);
4250   switch (bw) {
4251     case 4:
4252       dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
4253                                 upsample_left, dx, dy);
4254       break;
4255     case 8:
4256       dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
4257                                 upsample_left, dx, dy);
4258       break;
4259     default:
4260       dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
4261                                 upsample_above, upsample_left, dx, dy);
4262       break;
4263   }
4264   return;
4265 }
4266 
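/* Hypothetical usage sketch of the zone-2 dispatcher above: a 16x16 block at
 * roughly 135 degrees, where dx and dy are both 64 (one pixel per row and per
 * column). Values are illustrative; the `above` and `left` edges are assumed
 * to include the shared top-left sample at index -1.
 */
static inline void dr_z2_usage_sketch(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  av1_dr_prediction_z2_avx2(dst, stride, /*bw=*/16, /*bh=*/16, above, left,
                            /*upsample_above=*/0, /*upsample_left=*/0,
                            /*dx=*/64, /*dy=*/64);
}
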
4267 // z3 functions
4268 static inline void transpose16x32_avx2(__m256i *x, __m256i *d) {
4269   __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
4270   __m256i w10, w11, w12, w13, w14, w15;
4271 
4272   w0 = _mm256_unpacklo_epi8(x[0], x[1]);
4273   w1 = _mm256_unpacklo_epi8(x[2], x[3]);
4274   w2 = _mm256_unpacklo_epi8(x[4], x[5]);
4275   w3 = _mm256_unpacklo_epi8(x[6], x[7]);
4276 
4277   w8 = _mm256_unpacklo_epi8(x[8], x[9]);
4278   w9 = _mm256_unpacklo_epi8(x[10], x[11]);
4279   w10 = _mm256_unpacklo_epi8(x[12], x[13]);
4280   w11 = _mm256_unpacklo_epi8(x[14], x[15]);
4281 
4282   w4 = _mm256_unpacklo_epi16(w0, w1);
4283   w5 = _mm256_unpacklo_epi16(w2, w3);
4284   w12 = _mm256_unpacklo_epi16(w8, w9);
4285   w13 = _mm256_unpacklo_epi16(w10, w11);
4286 
4287   w6 = _mm256_unpacklo_epi32(w4, w5);
4288   w7 = _mm256_unpackhi_epi32(w4, w5);
4289   w14 = _mm256_unpacklo_epi32(w12, w13);
4290   w15 = _mm256_unpackhi_epi32(w12, w13);
4291 
4292   // Store first 4-line result
4293   d[0] = _mm256_unpacklo_epi64(w6, w14);
4294   d[1] = _mm256_unpackhi_epi64(w6, w14);
4295   d[2] = _mm256_unpacklo_epi64(w7, w15);
4296   d[3] = _mm256_unpackhi_epi64(w7, w15);
4297 
4298   w4 = _mm256_unpackhi_epi16(w0, w1);
4299   w5 = _mm256_unpackhi_epi16(w2, w3);
4300   w12 = _mm256_unpackhi_epi16(w8, w9);
4301   w13 = _mm256_unpackhi_epi16(w10, w11);
4302 
4303   w6 = _mm256_unpacklo_epi32(w4, w5);
4304   w7 = _mm256_unpackhi_epi32(w4, w5);
4305   w14 = _mm256_unpacklo_epi32(w12, w13);
4306   w15 = _mm256_unpackhi_epi32(w12, w13);
4307 
4308   // Store second 4-line result
4309   d[4] = _mm256_unpacklo_epi64(w6, w14);
4310   d[5] = _mm256_unpackhi_epi64(w6, w14);
4311   d[6] = _mm256_unpacklo_epi64(w7, w15);
4312   d[7] = _mm256_unpackhi_epi64(w7, w15);
4313 
4314   // upper half
4315   w0 = _mm256_unpackhi_epi8(x[0], x[1]);
4316   w1 = _mm256_unpackhi_epi8(x[2], x[3]);
4317   w2 = _mm256_unpackhi_epi8(x[4], x[5]);
4318   w3 = _mm256_unpackhi_epi8(x[6], x[7]);
4319 
4320   w8 = _mm256_unpackhi_epi8(x[8], x[9]);
4321   w9 = _mm256_unpackhi_epi8(x[10], x[11]);
4322   w10 = _mm256_unpackhi_epi8(x[12], x[13]);
4323   w11 = _mm256_unpackhi_epi8(x[14], x[15]);
4324 
4325   w4 = _mm256_unpacklo_epi16(w0, w1);
4326   w5 = _mm256_unpacklo_epi16(w2, w3);
4327   w12 = _mm256_unpacklo_epi16(w8, w9);
4328   w13 = _mm256_unpacklo_epi16(w10, w11);
4329 
4330   w6 = _mm256_unpacklo_epi32(w4, w5);
4331   w7 = _mm256_unpackhi_epi32(w4, w5);
4332   w14 = _mm256_unpacklo_epi32(w12, w13);
4333   w15 = _mm256_unpackhi_epi32(w12, w13);
4334 
4335   // Store first 4-line result
4336   d[8] = _mm256_unpacklo_epi64(w6, w14);
4337   d[9] = _mm256_unpackhi_epi64(w6, w14);
4338   d[10] = _mm256_unpacklo_epi64(w7, w15);
4339   d[11] = _mm256_unpackhi_epi64(w7, w15);
4340 
4341   w4 = _mm256_unpackhi_epi16(w0, w1);
4342   w5 = _mm256_unpackhi_epi16(w2, w3);
4343   w12 = _mm256_unpackhi_epi16(w8, w9);
4344   w13 = _mm256_unpackhi_epi16(w10, w11);
4345 
4346   w6 = _mm256_unpacklo_epi32(w4, w5);
4347   w7 = _mm256_unpackhi_epi32(w4, w5);
4348   w14 = _mm256_unpacklo_epi32(w12, w13);
4349   w15 = _mm256_unpackhi_epi32(w12, w13);
4350 
4351   // Store second 4-line result
4352   d[12] = _mm256_unpacklo_epi64(w6, w14);
4353   d[13] = _mm256_unpackhi_epi64(w6, w14);
4354   d[14] = _mm256_unpacklo_epi64(w7, w15);
4355   d[15] = _mm256_unpackhi_epi64(w7, w15);
4356 }
4357 
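/* Minimal sketch of the interleave-based transpose idea behind
 * transpose16x32_avx2 above: two unpack stages turn four 4-byte rows into
 * four 4-byte columns; the full routine applies the same pattern through
 * 8-, 16-, 32- and 64-bit granularities across both 128-bit lanes.
 */
static inline void transpose4x4_u8_sketch(const uint8_t *src,
                                          ptrdiff_t sstride, uint8_t *dst,
                                          ptrdiff_t dstride) {
  const __m128i r0 = _mm_cvtsi32_si128(*(const int *)(src + 0 * sstride));
  const __m128i r1 = _mm_cvtsi32_si128(*(const int *)(src + 1 * sstride));
  const __m128i r2 = _mm_cvtsi32_si128(*(const int *)(src + 2 * sstride));
  const __m128i r3 = _mm_cvtsi32_si128(*(const int *)(src + 3 * sstride));
  const __m128i t01 = _mm_unpacklo_epi8(r0, r1);  // a0 b0 a1 b1 ...
  const __m128i t23 = _mm_unpacklo_epi8(r2, r3);  // c0 d0 c1 d1 ...
  __m128i t = _mm_unpacklo_epi16(t01, t23);  // a0 b0 c0 d0 | a1 b1 c1 d1 | ...
  for (int i = 0; i < 4; ++i) {
    *(int *)(dst + i * dstride) = _mm_cvtsi128_si32(t);
    t = _mm_srli_si128(t, 4);
  }
}
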
4358 static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
4359                                       const uint8_t *left, int upsample_left,
4360                                       int dy) {
4361   __m128i dstvec[4], d[4];
4362 
4363   dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
4364   transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4365                             &d[0], &d[1], &d[2], &d[3]);
4366 
4367   *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
4368   *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
4369   *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
4370   *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
4371   return;
4372 }
4373 
4374 static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
4375                                       const uint8_t *left, int upsample_left,
4376                                       int dy) {
4377   __m128i dstvec[8], d[8];
4378 
4379   dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
4380   transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
4381                     &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
4382                     &d[3]);
4383 
4384   _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4385   _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
4386   _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
4387   _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
4388   _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
4389   _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
4390   _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
4391   _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
4392 }
4393 
4394 static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
4395                                       const uint8_t *left, int upsample_left,
4396                                       int dy) {
4397   __m128i dstvec[4], d[8];
4398 
4399   dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
4400   transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
4401                         &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
4402   for (int i = 0; i < 8; i++) {
4403     *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
4404   }
4405 }
4406 
4407 static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
4408                                       const uint8_t *left, int upsample_left,
4409                                       int dy) {
4410   __m128i dstvec[8], d[4];
4411 
4412   dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
4413   transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4414                         &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
4415                         &d[1], &d[2], &d[3]);
4416   _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4417   _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
4418   _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
4419   _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
4420 }
4421 
4422 static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
4423                                        const uint8_t *left, int upsample_left,
4424                                        int dy) {
4425   __m128i dstvec[8], d[8];
4426 
4427   dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
4428   transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
4429                           dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
4430                           d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
4431   for (int i = 0; i < 8; i++) {
4432     _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
4433     _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
4434                      _mm_srli_si128(d[i], 8));
4435   }
4436 }
4437 
4438 static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
4439                                        const uint8_t *left, int upsample_left,
4440                                        int dy) {
4441   __m128i dstvec[16], d[16];
4442 
4443   dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
4444   transpose16x8_8x16_sse2(
4445       &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4446       &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4447       &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4448       &d[3], &d[4], &d[5], &d[6], &d[7]);
4449 
4450   for (int i = 0; i < 8; i++) {
4451     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4452   }
4453 }
4454 
4455 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4456 static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
4457                                        const uint8_t *left, int upsample_left,
4458                                        int dy) {
4459   __m128i dstvec[4], d[16];
4460 
4461   dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
4462   transpose4x16_sse2(dstvec, d);
4463   for (int i = 0; i < 16; i++) {
4464     *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
4465   }
4466 }
4467 
4468 static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
4469                                        const uint8_t *left, int upsample_left,
4470                                        int dy) {
4471   __m128i dstvec[16], d[8];
4472 
4473   dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
4474   for (int i = 4; i < 8; i++) {
4475     d[i] = _mm_setzero_si128();
4476   }
4477   transpose16x8_8x16_sse2(
4478       &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4479       &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4480       &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4481       &d[3], &d[4], &d[5], &d[6], &d[7]);
4482 
4483   for (int i = 0; i < 4; i++) {
4484     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4485   }
4486 }
4487 
4488 static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
4489                                        const uint8_t *left, int upsample_left,
4490                                        int dy) {
4491   __m256i dstvec[16], d[16];
4492 
4493   dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
4494   for (int i = 8; i < 16; i++) {
4495     dstvec[i] = _mm256_setzero_si256();
4496   }
4497   transpose16x32_avx2(dstvec, d);
4498 
4499   for (int i = 0; i < 16; i++) {
4500     _mm_storel_epi64((__m128i *)(dst + i * stride),
4501                      _mm256_castsi256_si128(d[i]));
4502   }
4503   for (int i = 0; i < 16; i++) {
4504     _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
4505                      _mm256_extracti128_si256(d[i], 1));
4506   }
4507 }
4508 
4509 static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
4510                                        const uint8_t *left, int upsample_left,
4511                                        int dy) {
4512   __m128i dstvec[32], d[16];
4513 
4514   dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
4515 
4516   transpose16x8_8x16_sse2(
4517       &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4518       &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4519       &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4520       &d[3], &d[4], &d[5], &d[6], &d[7]);
4521   transpose16x8_8x16_sse2(
4522       &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
4523       &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
4524       &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
4525       &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
4526       &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
4527       &d[6 + 8], &d[7 + 8]);
4528 
4529   for (int i = 0; i < 8; i++) {
4530     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4531     _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
4532   }
4533 }
4534 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4535 
4536 static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
4537                                         const uint8_t *left, int upsample_left,
4538                                         int dy) {
4539   __m128i dstvec[16], d[16];
4540 
4541   dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
4542   transpose16x16_sse2(dstvec, d);
4543 
4544   for (int i = 0; i < 16; i++) {
4545     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4546   }
4547 }
4548 
4549 static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
4550                                         const uint8_t *left, int upsample_left,
4551                                         int dy) {
4552   __m256i dstvec[32], d[32];
4553 
4554   dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
4555   transpose16x32_avx2(dstvec, d);
4556   transpose16x32_avx2(dstvec + 16, d + 16);
4557   for (int j = 0; j < 16; j++) {
4558     _mm_storeu_si128((__m128i *)(dst + j * stride),
4559                      _mm256_castsi256_si128(d[j]));
4560     _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
4561                      _mm256_castsi256_si128(d[j + 16]));
4562   }
4563   for (int j = 0; j < 16; j++) {
4564     _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
4565                      _mm256_extracti128_si256(d[j], 1));
4566     _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
4567                      _mm256_extracti128_si256(d[j + 16], 1));
4568   }
4569 }
4570 
4571 static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
4572                                         const uint8_t *left, int upsample_left,
4573                                         int dy) {
4574   DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
4575   dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
4576   transpose(dstT, 64, dst, stride, 64, 64);
4577 }
4578 
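/* The zone-3 kernels follow the recipe made explicit by the 64x64 version
 * above: run the zone-1 predictor along `left` (treated as an above row) into
 * a scratch buffer, then transpose the result into dst. A generic,
 * illustrative sketch of that scratch-buffer pattern as used by the
 * 16x64/32x64/64x32/64x64 paths; the smaller kernels instead use the
 * in-register transposes defined earlier.
 */
static inline void dr_z3_via_z1_sketch(uint8_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint8_t *left,
                                       int upsample_left, int dy) {
  DECLARE_ALIGNED(16, uint8_t, tmp[64 * 64]);
  // Predict a bh-wide, bw-tall block from `left`, then flip it into place.
  av1_dr_prediction_z1_avx2(tmp, /*stride=*/bh, /*bw=*/bh, /*bh=*/bw, left,
                            /*left=*/NULL, upsample_left, /*dx=*/dy, /*dy=*/0);
  transpose(tmp, bh, dst, stride, bw, bh);
}
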
4579 static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
4580                                         const uint8_t *left, int upsample_left,
4581                                         int dy) {
4582   __m256i dstvec[16], d[16];
4583 
4584   dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
4585   transpose16x32_avx2(dstvec, d);
4586   // store
4587   for (int j = 0; j < 16; j++) {
4588     _mm_storeu_si128((__m128i *)(dst + j * stride),
4589                      _mm256_castsi256_si128(d[j]));
4590     _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
4591                      _mm256_extracti128_si256(d[j], 1));
4592   }
4593 }
4594 
4595 static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
4596                                         const uint8_t *left, int upsample_left,
4597                                         int dy) {
4598   __m128i dstvec[32], d[16];
4599 
4600   dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
4601   for (int i = 0; i < 32; i += 16) {
4602     transpose16x16_sse2((dstvec + i), d);
4603     for (int j = 0; j < 16; j++) {
4604       _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
4605     }
4606   }
4607 }
4608 
4609 static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
4610                                         const uint8_t *left, int upsample_left,
4611                                         int dy) {
4612   uint8_t dstT[64 * 32];
4613   dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
4614   transpose(dstT, 64, dst, stride, 32, 64);
4615 }
4616 
4617 static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
4618                                         const uint8_t *left, int upsample_left,
4619                                         int dy) {
4620   uint8_t dstT[32 * 64];
4621   dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
4622   transpose(dstT, 32, dst, stride, 64, 32);
4623   return;
4624 }
4625 
4626 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4627 static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
4628                                         const uint8_t *left, int upsample_left,
4629                                         int dy) {
4630   uint8_t dstT[64 * 16];
4631   dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
4632   transpose(dstT, 64, dst, stride, 16, 64);
4633 }
4634 
4635 static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
4636                                         const uint8_t *left, int upsample_left,
4637                                         int dy) {
4638   __m128i dstvec[64], d[16];
4639 
4640   dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
4641   for (int i = 0; i < 64; i += 16) {
4642     transpose16x16_sse2((dstvec + i), d);
4643     for (int j = 0; j < 16; j++) {
4644       _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
4645     }
4646   }
4647 }
4648 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4649 
4650 void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
4651                                const uint8_t *above, const uint8_t *left,
4652                                int upsample_left, int dx, int dy) {
4653   (void)above;
4654   (void)dx;
4655   assert(dx == 1);
4656   assert(dy > 0);
4657 
4658   if (bw == bh) {
4659     switch (bw) {
4660       case 4:
4661         dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
4662         break;
4663       case 8:
4664         dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
4665         break;
4666       case 16:
4667         dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
4668         break;
4669       case 32:
4670         dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
4671         break;
4672       case 64:
4673         dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
4674         break;
4675     }
4676   } else {
4677     if (bw < bh) {
4678       if (bw + bw == bh) {
4679         switch (bw) {
4680           case 4:
4681             dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
4682             break;
4683           case 8:
4684             dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
4685             break;
4686           case 16:
4687             dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
4688             break;
4689           case 32:
4690             dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
4691             break;
4692         }
4693       } else {
4694         switch (bw) {
4695 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4696           case 4:
4697             dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
4698             break;
4699           case 8:
4700             dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
4701             break;
4702           case 16:
4703             dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
4704             break;
4705 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4706         }
4707       }
4708     } else {
4709       if (bh + bh == bw) {
4710         switch (bh) {
4711           case 4:
4712             dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
4713             break;
4714           case 8:
4715             dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
4716             break;
4717           case 16:
4718             dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
4719             break;
4720           case 32:
4721             dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
4722             break;
4723         }
4724       } else {
4725         switch (bh) {
4726 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4727           case 4:
4728             dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
4729             break;
4730           case 8:
4731             dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
4732             break;
4733           case 16:
4734             dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
4735             break;
4736 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4737         }
4738       }
4739     }
4740   }
4741 }
4742