1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <immintrin.h>
13
14 #include "config/av1_rtcd.h"
15 #include "aom_dsp/x86/intrapred_x86.h"
16 #include "aom_dsp/x86/intrapred_utils.h"
17 #include "aom_dsp/x86/lpf_common_sse2.h"
18
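// Horizontal sums of the 64 (or 32) 8-bit edge samples used by the DC
// predictors below. _mm256_sad_epu8 against zero adds each group of 8
// bytes into a 64-bit lane; the permute/unpack folds then leave the full
// sum in the low 16 bits of each 128-bit half, ready for the callers to
// round, shift and broadcast. Conceptually this is just:
//   int sum = 0;
//   for (int i = 0; i < 64; ++i) sum += ref[i];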
static inline __m256i dc_sum_64(const uint8_t *ref) {
20 const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
21 const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
22 const __m256i zero = _mm256_setzero_si256();
23 __m256i y0 = _mm256_sad_epu8(x0, zero);
24 __m256i y1 = _mm256_sad_epu8(x1, zero);
25 y0 = _mm256_add_epi64(y0, y1);
26 __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
27 y0 = _mm256_add_epi64(u0, y0);
28 u0 = _mm256_unpackhi_epi64(y0, y0);
29 return _mm256_add_epi16(y0, u0);
30 }
31
static inline __m256i dc_sum_32(const uint8_t *ref) {
33 const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
34 const __m256i zero = _mm256_setzero_si256();
35 __m256i y = _mm256_sad_epu8(x, zero);
36 __m256i u = _mm256_permute2x128_si256(y, y, 1);
37 y = _mm256_add_epi64(u, y);
38 u = _mm256_unpackhi_epi64(y, y);
39 return _mm256_add_epi16(y, u);
40 }
41
static inline void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
43 ptrdiff_t stride) {
44 for (int i = 0; i < height; ++i) {
45 _mm256_storeu_si256((__m256i *)dst, *r);
46 dst += stride;
47 }
48 }
49
static inline void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
51 int height, uint8_t *dst,
52 ptrdiff_t stride) {
53 for (int i = 0; i < height; ++i) {
54 _mm256_storeu_si256((__m256i *)dst, *r0);
55 _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
56 dst += stride;
57 }
58 }
59
static inline void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
61 ptrdiff_t stride) {
62 for (int i = 0; i < height; ++i) {
63 _mm256_storeu_si256((__m256i *)dst, *r);
64 _mm256_storeu_si256((__m256i *)(dst + 32), *r);
65 dst += stride;
66 }
67 }
68
69 #if CONFIG_AV1_HIGHBITDEPTH
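// Shuffle/blend tables for the high bit-depth directional predictors.
// HighbdLoadMaskx[i] is a pshufb control that shifts the 16-bit elements
// right by i positions and back-fills with copies of element 0, so that
// positions before the first edge sample can be clamped to it.
// HighbdEvenOddMaskx4 / HighbdEvenOddMaskx de-interleave even- and
// odd-indexed 16-bit elements, which is the layout needed when the edge
// has been upsampled. HighbdBaseMask[i] keeps the first i output lanes so
// lanes beyond max_base_x can be blended with the clamped edge value.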
70 static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
71 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
72 { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
73 { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
74 { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
75 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
76 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
77 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
78 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
79 };
80
81 static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
82 { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },
83 { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
84 { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
85 { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
86 };
87
88 static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
89 { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
90 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
91 { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
92 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
93 { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25,
94 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
95 { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
96 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
97 { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21,
98 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
99 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
100 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
101 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
102 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
103 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
104 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
105 };
106
107 static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
108 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
109 { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
110 { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
111 { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
112 { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
113 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
114 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 0 },
116 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
117 0, 0 },
118 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
119 0, 0, 0, 0 },
120 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
121 0, 0, 0, 0, 0, 0 },
122 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
123 0xffff, 0, 0, 0, 0, 0, 0 },
124 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
125 0xffff, 0xffff, 0, 0, 0, 0, 0 },
126 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
127 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
128 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
129 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
130 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
131 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
132 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
133 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
134 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
135 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
136 };
137
138 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static inline void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
140 __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
141
142 r0 = _mm_unpacklo_epi16(x[0], x[1]);
143 r1 = _mm_unpacklo_epi16(x[2], x[3]);
144 r2 = _mm_unpacklo_epi16(x[4], x[5]);
145 r3 = _mm_unpacklo_epi16(x[6], x[7]);
146
147 r4 = _mm_unpacklo_epi16(x[8], x[9]);
148 r5 = _mm_unpacklo_epi16(x[10], x[11]);
149 r6 = _mm_unpacklo_epi16(x[12], x[13]);
150 r7 = _mm_unpacklo_epi16(x[14], x[15]);
151
152 r8 = _mm_unpacklo_epi32(r0, r1);
153 r9 = _mm_unpackhi_epi32(r0, r1);
154 r10 = _mm_unpacklo_epi32(r2, r3);
155 r11 = _mm_unpackhi_epi32(r2, r3);
156
157 r12 = _mm_unpacklo_epi32(r4, r5);
158 r13 = _mm_unpackhi_epi32(r4, r5);
159 r14 = _mm_unpacklo_epi32(r6, r7);
160 r15 = _mm_unpackhi_epi32(r6, r7);
161
162 r0 = _mm_unpacklo_epi64(r8, r9);
163 r1 = _mm_unpackhi_epi64(r8, r9);
164 r2 = _mm_unpacklo_epi64(r10, r11);
165 r3 = _mm_unpackhi_epi64(r10, r11);
166
167 r4 = _mm_unpacklo_epi64(r12, r13);
168 r5 = _mm_unpackhi_epi64(r12, r13);
169 r6 = _mm_unpacklo_epi64(r14, r15);
170 r7 = _mm_unpackhi_epi64(r14, r15);
171
172 d[0] = _mm_unpacklo_epi64(r0, r2);
173 d[1] = _mm_unpacklo_epi64(r4, r6);
174 d[2] = _mm_unpacklo_epi64(r1, r3);
175 d[3] = _mm_unpacklo_epi64(r5, r7);
176
177 d[4] = _mm_unpackhi_epi64(r0, r2);
178 d[5] = _mm_unpackhi_epi64(r4, r6);
179 d[6] = _mm_unpackhi_epi64(r1, r3);
180 d[7] = _mm_unpackhi_epi64(r5, r7);
181 }
182
static inline void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
184 __m256i w0, w1, w2, w3, ww0, ww1;
185
186 w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13
187 w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33
188 w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 40 50 41 51 42 52 43 53
189 w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 60 70 61 71 62 72 63 73
190
191 ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
192 ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
193
194 d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
195 d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
196
197 ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
198 ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
199
200 d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
201 d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
202 }
203 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
204
static inline void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
206 __m256i w0, w1, w2, w3, ww0, ww1;
207
208 w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13
209 w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33
210 w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53
211 w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73
212
213 ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
214 ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
215
216 d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
217 d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
218
219 ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
220 ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
221
222 d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
223 d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
224
225 w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17
226 w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37
227 w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57
228 w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77
229
230 ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
231 ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75
232
233 d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74
234 d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75
235
236 ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
237 ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77
238
239 d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76
240 d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77
241 }
242
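// Full 16x16 transpose of 16-bit elements. The unpack stages below apply
// the same pattern as highbd_transpose8x16_16x8_avx2 twice (rows 0-7 and
// rows 8-15); the final loop then recombines 128-bit halves so that d[i]
// holds transposed row i (input column i) and d[i + 8] holds transposed
// row i + 8 across all 16 columns.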
static inline void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
244 __m256i w0, w1, w2, w3, ww0, ww1;
245 __m256i dd[16];
246 w0 = _mm256_unpacklo_epi16(x[0], x[1]);
247 w1 = _mm256_unpacklo_epi16(x[2], x[3]);
248 w2 = _mm256_unpacklo_epi16(x[4], x[5]);
249 w3 = _mm256_unpacklo_epi16(x[6], x[7]);
250
251 ww0 = _mm256_unpacklo_epi32(w0, w1); //
252 ww1 = _mm256_unpacklo_epi32(w2, w3); //
253
254 dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
255 dd[1] = _mm256_unpackhi_epi64(ww0, ww1);
256
257 ww0 = _mm256_unpackhi_epi32(w0, w1); //
258 ww1 = _mm256_unpackhi_epi32(w2, w3); //
259
260 dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
261 dd[3] = _mm256_unpackhi_epi64(ww0, ww1);
262
263 w0 = _mm256_unpackhi_epi16(x[0], x[1]);
264 w1 = _mm256_unpackhi_epi16(x[2], x[3]);
265 w2 = _mm256_unpackhi_epi16(x[4], x[5]);
266 w3 = _mm256_unpackhi_epi16(x[6], x[7]);
267
268 ww0 = _mm256_unpacklo_epi32(w0, w1); //
269 ww1 = _mm256_unpacklo_epi32(w2, w3); //
270
271 dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
272 dd[5] = _mm256_unpackhi_epi64(ww0, ww1);
273
274 ww0 = _mm256_unpackhi_epi32(w0, w1); //
275 ww1 = _mm256_unpackhi_epi32(w2, w3); //
276
277 dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
278 dd[7] = _mm256_unpackhi_epi64(ww0, ww1);
279
280 w0 = _mm256_unpacklo_epi16(x[8], x[9]);
281 w1 = _mm256_unpacklo_epi16(x[10], x[11]);
282 w2 = _mm256_unpacklo_epi16(x[12], x[13]);
283 w3 = _mm256_unpacklo_epi16(x[14], x[15]);
284
285 ww0 = _mm256_unpacklo_epi32(w0, w1);
286 ww1 = _mm256_unpacklo_epi32(w2, w3);
287
288 dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
289 dd[9] = _mm256_unpackhi_epi64(ww0, ww1);
290
291 ww0 = _mm256_unpackhi_epi32(w0, w1);
292 ww1 = _mm256_unpackhi_epi32(w2, w3);
293
294 dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
295 dd[11] = _mm256_unpackhi_epi64(ww0, ww1);
296
297 w0 = _mm256_unpackhi_epi16(x[8], x[9]);
298 w1 = _mm256_unpackhi_epi16(x[10], x[11]);
299 w2 = _mm256_unpackhi_epi16(x[12], x[13]);
300 w3 = _mm256_unpackhi_epi16(x[14], x[15]);
301
302 ww0 = _mm256_unpacklo_epi32(w0, w1);
303 ww1 = _mm256_unpacklo_epi32(w2, w3);
304
305 dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
306 dd[13] = _mm256_unpackhi_epi64(ww0, ww1);
307
308 ww0 = _mm256_unpackhi_epi32(w0, w1);
309 ww1 = _mm256_unpackhi_epi32(w2, w3);
310
311 dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
312 dd[15] = _mm256_unpackhi_epi64(ww0, ww1);
313
314 for (int i = 0; i < 8; i++) {
315 d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
316 d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
317 _mm256_extracti128_si256(dd[i], 1), 0);
318 }
319 }
320 #endif // CONFIG_AV1_HIGHBITDEPTH
321
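// DC prediction for the square 32x32 block: every output pixel is the
// rounded average of the 32 above and 32 left samples,
//   (sum(above) + sum(left) + 32) >> 6.
// dc_sum_32 leaves the sum in the low word of each 128-bit lane, and
// _mm256_shuffle_epi8 with an all-zero control then replicates byte 0 of
// each lane, i.e. the 8-bit average, across the whole row.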
void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
323 const uint8_t *above, const uint8_t *left) {
324 const __m256i sum_above = dc_sum_32(above);
325 __m256i sum_left = dc_sum_32(left);
326 sum_left = _mm256_add_epi16(sum_left, sum_above);
327 const __m256i thirtytwo = _mm256_set1_epi16(32);
328 sum_left = _mm256_add_epi16(sum_left, thirtytwo);
329 sum_left = _mm256_srai_epi16(sum_left, 6);
330 const __m256i zero = _mm256_setzero_si256();
331 __m256i row = _mm256_shuffle_epi8(sum_left, zero);
332 row_store_32xh(&row, 32, dst, stride);
333 }
334
void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
336 const uint8_t *above,
337 const uint8_t *left) {
338 __m256i sum = dc_sum_32(above);
339 (void)left;
340
341 const __m256i sixteen = _mm256_set1_epi16(16);
342 sum = _mm256_add_epi16(sum, sixteen);
343 sum = _mm256_srai_epi16(sum, 5);
344 const __m256i zero = _mm256_setzero_si256();
345 __m256i row = _mm256_shuffle_epi8(sum, zero);
346 row_store_32xh(&row, 32, dst, stride);
347 }
348
void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
350 const uint8_t *above,
351 const uint8_t *left) {
352 __m256i sum = dc_sum_32(left);
353 (void)above;
354
355 const __m256i sixteen = _mm256_set1_epi16(16);
356 sum = _mm256_add_epi16(sum, sixteen);
357 sum = _mm256_srai_epi16(sum, 5);
358 const __m256i zero = _mm256_setzero_si256();
359 __m256i row = _mm256_shuffle_epi8(sum, zero);
360 row_store_32xh(&row, 32, dst, stride);
361 }
362
void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
364 const uint8_t *above,
365 const uint8_t *left) {
366 (void)above;
367 (void)left;
368 const __m256i row = _mm256_set1_epi8((int8_t)0x80);
369 row_store_32xh(&row, 32, dst, stride);
370 }
371
void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
373 const uint8_t *above, const uint8_t *left) {
374 const __m256i row = _mm256_loadu_si256((const __m256i *)above);
375 (void)left;
376 row_store_32xh(&row, 32, dst, stride);
377 }
378
// There are 32 rows together. Each call of this function fills rows
// 0,1,2,3 and 16,17,18,19; the next call fills rows 4,5,6,7 and
// 20,21,22,23, so four calls cover all 32 rows. The shuffle control
// advances by 4 each iteration because the caller replicates every left
// pixel into 4 consecutive bytes before passing the vector in.
static inline void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
384 ptrdiff_t stride) {
385 __m256i t[4];
386 __m256i m = _mm256_setzero_si256();
387 const __m256i inc = _mm256_set1_epi8(4);
388 int i;
389
390 for (i = 0; i < 4; i++) {
391 t[i] = _mm256_shuffle_epi8(*row, m);
392 __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
393 __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
394 _mm256_storeu_si256((__m256i *)dst, r0);
395 _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
396 dst += stride;
397 m = _mm256_add_epi8(m, inc);
398 }
399 }
400
void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
402 const uint8_t *above, const uint8_t *left) {
403 (void)above;
404 const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
405
406 __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
407
408 __m256i v = _mm256_unpacklo_epi8(u, u);
409 h_predictor_32x8line(&v, dst, stride);
410 dst += stride << 2;
411
412 v = _mm256_unpackhi_epi8(u, u);
413 h_predictor_32x8line(&v, dst, stride);
414 dst += stride << 2;
415
416 u = _mm256_unpackhi_epi8(left_col, left_col);
417
418 v = _mm256_unpacklo_epi8(u, u);
419 h_predictor_32x8line(&v, dst, stride);
420 dst += stride << 2;
421
422 v = _mm256_unpackhi_epi8(u, u);
423 h_predictor_32x8line(&v, dst, stride);
424 }
425
426 // -----------------------------------------------------------------------------
427 // Rectangle
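// For rectangular blocks the divisor W + H is not a power of two, so the
// combined sum is averaged with plain integer math, e.g. for 32x16:
//   dc = (sum(above) + sum(left) + 24) / 48
// The division happens once on a scalar lane and the 8-bit result is then
// broadcast with _mm256_set1_epi8.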
void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
429 const uint8_t *above, const uint8_t *left) {
430 const __m128i top_sum = dc_sum_32_sse2(above);
431 __m128i left_sum = dc_sum_16_sse2(left);
432 left_sum = _mm_add_epi16(top_sum, left_sum);
433 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
434 sum += 24;
435 sum /= 48;
436 const __m256i row = _mm256_set1_epi8((int8_t)sum);
437 row_store_32xh(&row, 16, dst, stride);
438 }
439
void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
441 const uint8_t *above, const uint8_t *left) {
442 const __m256i sum_above = dc_sum_32(above);
443 __m256i sum_left = dc_sum_64(left);
444 sum_left = _mm256_add_epi16(sum_left, sum_above);
445 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
446 sum += 48;
447 sum /= 96;
448 const __m256i row = _mm256_set1_epi8((int8_t)sum);
449 row_store_32xh(&row, 64, dst, stride);
450 }
451
void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
453 const uint8_t *above, const uint8_t *left) {
454 const __m256i sum_above = dc_sum_64(above);
455 __m256i sum_left = dc_sum_64(left);
456 sum_left = _mm256_add_epi16(sum_left, sum_above);
457 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
458 sum += 64;
459 sum /= 128;
460 const __m256i row = _mm256_set1_epi8((int8_t)sum);
461 row_store_64xh(&row, 64, dst, stride);
462 }
463
void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
465 const uint8_t *above, const uint8_t *left) {
466 const __m256i sum_above = dc_sum_64(above);
467 __m256i sum_left = dc_sum_32(left);
468 sum_left = _mm256_add_epi16(sum_left, sum_above);
469 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
470 sum += 48;
471 sum /= 96;
472 const __m256i row = _mm256_set1_epi8((int8_t)sum);
473 row_store_64xh(&row, 32, dst, stride);
474 }
475
476 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
478 const uint8_t *above, const uint8_t *left) {
479 const __m256i sum_above = dc_sum_64(above);
480 __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
481 sum_left = _mm256_add_epi16(sum_left, sum_above);
482 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
483 sum += 40;
484 sum /= 80;
485 const __m256i row = _mm256_set1_epi8((int8_t)sum);
486 row_store_64xh(&row, 16, dst, stride);
487 }
488 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
489
void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
491 const uint8_t *above,
492 const uint8_t *left) {
493 __m256i sum = dc_sum_32(above);
494 (void)left;
495
496 const __m256i sixteen = _mm256_set1_epi16(16);
497 sum = _mm256_add_epi16(sum, sixteen);
498 sum = _mm256_srai_epi16(sum, 5);
499 const __m256i zero = _mm256_setzero_si256();
500 __m256i row = _mm256_shuffle_epi8(sum, zero);
501 row_store_32xh(&row, 16, dst, stride);
502 }
503
void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
505 const uint8_t *above,
506 const uint8_t *left) {
507 __m256i sum = dc_sum_32(above);
508 (void)left;
509
510 const __m256i sixteen = _mm256_set1_epi16(16);
511 sum = _mm256_add_epi16(sum, sixteen);
512 sum = _mm256_srai_epi16(sum, 5);
513 const __m256i zero = _mm256_setzero_si256();
514 __m256i row = _mm256_shuffle_epi8(sum, zero);
515 row_store_32xh(&row, 64, dst, stride);
516 }
517
void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
519 const uint8_t *above,
520 const uint8_t *left) {
521 __m256i sum = dc_sum_64(above);
522 (void)left;
523
524 const __m256i thirtytwo = _mm256_set1_epi16(32);
525 sum = _mm256_add_epi16(sum, thirtytwo);
526 sum = _mm256_srai_epi16(sum, 6);
527 const __m256i zero = _mm256_setzero_si256();
528 __m256i row = _mm256_shuffle_epi8(sum, zero);
529 row_store_64xh(&row, 64, dst, stride);
530 }
531
void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
533 const uint8_t *above,
534 const uint8_t *left) {
535 __m256i sum = dc_sum_64(above);
536 (void)left;
537
538 const __m256i thirtytwo = _mm256_set1_epi16(32);
539 sum = _mm256_add_epi16(sum, thirtytwo);
540 sum = _mm256_srai_epi16(sum, 6);
541 const __m256i zero = _mm256_setzero_si256();
542 __m256i row = _mm256_shuffle_epi8(sum, zero);
543 row_store_64xh(&row, 32, dst, stride);
544 }
545
546 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
548 const uint8_t *above,
549 const uint8_t *left) {
550 __m256i sum = dc_sum_64(above);
551 (void)left;
552
553 const __m256i thirtytwo = _mm256_set1_epi16(32);
554 sum = _mm256_add_epi16(sum, thirtytwo);
555 sum = _mm256_srai_epi16(sum, 6);
556 const __m256i zero = _mm256_setzero_si256();
557 __m256i row = _mm256_shuffle_epi8(sum, zero);
558 row_store_64xh(&row, 16, dst, stride);
559 }
560 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
561
void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
563 const uint8_t *above,
564 const uint8_t *left) {
565 __m128i sum = dc_sum_16_sse2(left);
566 (void)above;
567
568 const __m128i eight = _mm_set1_epi16(8);
569 sum = _mm_add_epi16(sum, eight);
570 sum = _mm_srai_epi16(sum, 4);
571 const __m128i zero = _mm_setzero_si128();
572 const __m128i r = _mm_shuffle_epi8(sum, zero);
573 const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
574 row_store_32xh(&row, 16, dst, stride);
575 }
576
void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
578 const uint8_t *above,
579 const uint8_t *left) {
580 __m256i sum = dc_sum_64(left);
581 (void)above;
582
583 const __m256i thirtytwo = _mm256_set1_epi16(32);
584 sum = _mm256_add_epi16(sum, thirtytwo);
585 sum = _mm256_srai_epi16(sum, 6);
586 const __m256i zero = _mm256_setzero_si256();
587 __m256i row = _mm256_shuffle_epi8(sum, zero);
588 row_store_32xh(&row, 64, dst, stride);
589 }
590
void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
592 const uint8_t *above,
593 const uint8_t *left) {
594 __m256i sum = dc_sum_64(left);
595 (void)above;
596
597 const __m256i thirtytwo = _mm256_set1_epi16(32);
598 sum = _mm256_add_epi16(sum, thirtytwo);
599 sum = _mm256_srai_epi16(sum, 6);
600 const __m256i zero = _mm256_setzero_si256();
601 __m256i row = _mm256_shuffle_epi8(sum, zero);
602 row_store_64xh(&row, 64, dst, stride);
603 }
604
void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
606 const uint8_t *above,
607 const uint8_t *left) {
608 __m256i sum = dc_sum_32(left);
609 (void)above;
610
611 const __m256i sixteen = _mm256_set1_epi16(16);
612 sum = _mm256_add_epi16(sum, sixteen);
613 sum = _mm256_srai_epi16(sum, 5);
614 const __m256i zero = _mm256_setzero_si256();
615 __m256i row = _mm256_shuffle_epi8(sum, zero);
616 row_store_64xh(&row, 32, dst, stride);
617 }
618
619 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
621 const uint8_t *above,
622 const uint8_t *left) {
623 __m128i sum = dc_sum_16_sse2(left);
624 (void)above;
625
626 const __m128i eight = _mm_set1_epi16(8);
627 sum = _mm_add_epi16(sum, eight);
628 sum = _mm_srai_epi16(sum, 4);
629 const __m128i zero = _mm_setzero_si128();
630 const __m128i r = _mm_shuffle_epi8(sum, zero);
631 const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
632 row_store_64xh(&row, 16, dst, stride);
633 }
634 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
635
void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
637 const uint8_t *above,
638 const uint8_t *left) {
639 (void)above;
640 (void)left;
641 const __m256i row = _mm256_set1_epi8((int8_t)0x80);
642 row_store_32xh(&row, 16, dst, stride);
643 }
644
void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
646 const uint8_t *above,
647 const uint8_t *left) {
648 (void)above;
649 (void)left;
650 const __m256i row = _mm256_set1_epi8((int8_t)0x80);
651 row_store_32xh(&row, 64, dst, stride);
652 }
653
void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
655 const uint8_t *above,
656 const uint8_t *left) {
657 (void)above;
658 (void)left;
659 const __m256i row = _mm256_set1_epi8((int8_t)0x80);
660 row_store_64xh(&row, 64, dst, stride);
661 }
662
void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
664 const uint8_t *above,
665 const uint8_t *left) {
666 (void)above;
667 (void)left;
668 const __m256i row = _mm256_set1_epi8((int8_t)0x80);
669 row_store_64xh(&row, 32, dst, stride);
670 }
671
672 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
674 const uint8_t *above,
675 const uint8_t *left) {
676 (void)above;
677 (void)left;
678 const __m256i row = _mm256_set1_epi8((int8_t)0x80);
679 row_store_64xh(&row, 16, dst, stride);
680 }
681 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
682
void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
684 const uint8_t *above, const uint8_t *left) {
685 const __m256i row = _mm256_loadu_si256((const __m256i *)above);
686 (void)left;
687 row_store_32xh(&row, 16, dst, stride);
688 }
689
void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
691 const uint8_t *above, const uint8_t *left) {
692 const __m256i row = _mm256_loadu_si256((const __m256i *)above);
693 (void)left;
694 row_store_32xh(&row, 64, dst, stride);
695 }
696
void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
698 const uint8_t *above, const uint8_t *left) {
699 const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
700 const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
701 (void)left;
702 row_store_32x2xh(&row0, &row1, 64, dst, stride);
703 }
704
void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
706 const uint8_t *above, const uint8_t *left) {
707 const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
708 const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
709 (void)left;
710 row_store_32x2xh(&row0, &row1, 32, dst, stride);
711 }
712
713 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
715 const uint8_t *above, const uint8_t *left) {
716 const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
717 const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
718 (void)left;
719 row_store_32x2xh(&row0, &row1, 16, dst, stride);
720 }
721 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
722
723 // -----------------------------------------------------------------------------
724 // PAETH_PRED
725
726 // Return 16 16-bit pixels in one row (__m256i)
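// The Paeth rule: with base = left + top - topleft, pick whichever of
// left, top, topleft is closest to base, preferring left, then top.
// Branch-free equivalent of:
//   if (pl <= pt && pl <= ptl) return left;
//   else if (pt <= ptl) return top;
//   else return topleft;
// mask1 marks lanes where left loses; mask2 marks lanes where topleft
// beats top.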
static inline __m256i paeth_pred(const __m256i *left, const __m256i *top,
728 const __m256i *topleft) {
729 const __m256i base =
730 _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
731
732 __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
733 __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
734 __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
735
736 __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
737 mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
738 __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
739
740 pl = _mm256_andnot_si256(mask1, *left);
741
742 ptl = _mm256_and_si256(mask2, *topleft);
743 pt = _mm256_andnot_si256(mask2, *top);
744 pt = _mm256_or_si256(pt, ptl);
745 pt = _mm256_and_si256(mask1, pt);
746
747 return _mm256_or_si256(pt, pl);
748 }
749
750 // Return 16 8-bit pixels in one row (__m128i)
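// _mm256_packus_epi16 packs within each 128-bit lane, so the high half of
// p0 is first copied down into p1 (permute4x64 with 0xe moves 64-bit
// lanes 2,3 into positions 0,1); after packing, the low 128 bits hold all
// 16 pixels in order.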
static inline __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
752 const __m256i *topleft) {
753 const __m256i p0 = paeth_pred(left, top, topleft);
754 const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
755 const __m256i p = _mm256_packus_epi16(p0, p1);
756 return _mm256_castsi256_si128(p);
757 }
758
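// Zero-extend the 16 top samples to 16 bits: bytes 0-7 land in the low
// 128-bit lane and bytes 8-15 in the high lane, matching the layout that
// paeth_16x1_pred packs back down.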
static inline __m256i get_top_vector(const uint8_t *above) {
760 const __m128i x = _mm_load_si128((const __m128i *)above);
761 const __m128i zero = _mm_setzero_si128();
762 const __m128i t0 = _mm_unpacklo_epi8(x, zero);
763 const __m128i t1 = _mm_unpackhi_epi8(x, zero);
764 return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
765 }
766
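// The 'rep' constant drives a per-row broadcast of the left pixel: the
// left samples are duplicated into both 128-bit lanes, and 0x8000 per
// 16-bit lane is the byte pattern {0x00, 0x80}, so _mm256_shuffle_epi8
// returns left[0] in the low byte and 0 in the high byte of every lane
// (pshufb outputs 0 when the control byte's MSB is set). Adding 1 each
// row bumps the low control byte, broadcasting zero-extended left[r].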
void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
768 const uint8_t *above, const uint8_t *left) {
769 __m128i x = _mm_loadl_epi64((const __m128i *)left);
770 const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
771 const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
772 __m256i rep = _mm256_set1_epi16((short)0x8000);
773 const __m256i one = _mm256_set1_epi16(1);
774 const __m256i top = get_top_vector(above);
775
776 int i;
777 for (i = 0; i < 8; ++i) {
778 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
779 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
780
781 _mm_store_si128((__m128i *)dst, row);
782 dst += stride;
783 rep = _mm256_add_epi16(rep, one);
784 }
785 }
786
static inline __m256i get_left_vector(const uint8_t *left) {
788 const __m128i x = _mm_load_si128((const __m128i *)left);
789 return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
790 }
791
void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
793 const uint8_t *above, const uint8_t *left) {
794 const __m256i l = get_left_vector(left);
795 const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
796 __m256i rep = _mm256_set1_epi16((short)0x8000);
797 const __m256i one = _mm256_set1_epi16(1);
798 const __m256i top = get_top_vector(above);
799
800 int i;
801 for (i = 0; i < 16; ++i) {
802 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
803 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
804
805 _mm_store_si128((__m128i *)dst, row);
806 dst += stride;
807 rep = _mm256_add_epi16(rep, one);
808 }
809 }
810
void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
812 const uint8_t *above, const uint8_t *left) {
813 __m256i l = get_left_vector(left);
814 const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
815 __m256i rep = _mm256_set1_epi16((short)0x8000);
816 const __m256i one = _mm256_set1_epi16(1);
817 const __m256i top = get_top_vector(above);
818
819 int i;
820 for (i = 0; i < 16; ++i) {
821 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
822 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
823
824 _mm_store_si128((__m128i *)dst, row);
825 dst += stride;
826 rep = _mm256_add_epi16(rep, one);
827 }
828
829 l = get_left_vector(left + 16);
830 rep = _mm256_set1_epi16((short)0x8000);
831 for (i = 0; i < 16; ++i) {
832 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
833 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
834
835 _mm_store_si128((__m128i *)dst, row);
836 dst += stride;
837 rep = _mm256_add_epi16(rep, one);
838 }
839 }
840
841 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
843 const uint8_t *above, const uint8_t *left) {
844 const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
845 const __m256i one = _mm256_set1_epi16(1);
846 const __m256i top = get_top_vector(above);
847
848 for (int j = 0; j < 4; ++j) {
849 const __m256i l = get_left_vector(left + j * 16);
850 __m256i rep = _mm256_set1_epi16((short)0x8000);
851 for (int i = 0; i < 16; ++i) {
852 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
853 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
854
855 _mm_store_si128((__m128i *)dst, row);
856 dst += stride;
857 rep = _mm256_add_epi16(rep, one);
858 }
859 }
860 }
861 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
862
863 // Return 32 8-bit pixels in one row (__m256i)
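// Both 16-pixel halves are produced with the same pack/permute trick as
// paeth_16x1_pred; _mm256_permute2x128_si256(x0, x1, 0x20) then takes the
// low lane of each, giving the 32 packed pixels in order.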
static inline __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
865 const __m256i *top1,
866 const __m256i *topleft) {
867 __m256i p0 = paeth_pred(left, top0, topleft);
868 __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
869 const __m256i x0 = _mm256_packus_epi16(p0, p1);
870
871 p0 = paeth_pred(left, top1, topleft);
872 p1 = _mm256_permute4x64_epi64(p0, 0xe);
873 const __m256i x1 = _mm256_packus_epi16(p0, p1);
874
875 return _mm256_permute2x128_si256(x0, x1, 0x20);
876 }
877
void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
879 const uint8_t *above, const uint8_t *left) {
880 const __m256i l = get_left_vector(left);
881 const __m256i t0 = get_top_vector(above);
882 const __m256i t1 = get_top_vector(above + 16);
883 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
884 __m256i rep = _mm256_set1_epi16((short)0x8000);
885 const __m256i one = _mm256_set1_epi16(1);
886
887 int i;
888 for (i = 0; i < 16; ++i) {
889 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
890
891 const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
892
893 _mm256_storeu_si256((__m256i *)dst, r);
894
895 dst += stride;
896 rep = _mm256_add_epi16(rep, one);
897 }
898 }
899
void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
901 const uint8_t *above, const uint8_t *left) {
902 __m256i l = get_left_vector(left);
903 const __m256i t0 = get_top_vector(above);
904 const __m256i t1 = get_top_vector(above + 16);
905 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
906 __m256i rep = _mm256_set1_epi16((short)0x8000);
907 const __m256i one = _mm256_set1_epi16(1);
908
909 int i;
910 for (i = 0; i < 16; ++i) {
911 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
912
913 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
914 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
915
916 _mm_store_si128((__m128i *)dst, r0);
917 _mm_store_si128((__m128i *)(dst + 16), r1);
918
919 dst += stride;
920 rep = _mm256_add_epi16(rep, one);
921 }
922
923 l = get_left_vector(left + 16);
924 rep = _mm256_set1_epi16((short)0x8000);
925 for (i = 0; i < 16; ++i) {
926 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
927
928 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
929 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
930
931 _mm_store_si128((__m128i *)dst, r0);
932 _mm_store_si128((__m128i *)(dst + 16), r1);
933
934 dst += stride;
935 rep = _mm256_add_epi16(rep, one);
936 }
937 }
938
void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
940 const uint8_t *above, const uint8_t *left) {
941 const __m256i t0 = get_top_vector(above);
942 const __m256i t1 = get_top_vector(above + 16);
943 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
944 const __m256i one = _mm256_set1_epi16(1);
945
946 int i, j;
947 for (j = 0; j < 4; ++j) {
948 const __m256i l = get_left_vector(left + j * 16);
949 __m256i rep = _mm256_set1_epi16((short)0x8000);
950 for (i = 0; i < 16; ++i) {
951 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
952
953 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
954 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
955
956 _mm_store_si128((__m128i *)dst, r0);
957 _mm_store_si128((__m128i *)(dst + 16), r1);
958
959 dst += stride;
960 rep = _mm256_add_epi16(rep, one);
961 }
962 }
963 }
964
void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
966 const uint8_t *above, const uint8_t *left) {
967 const __m256i t0 = get_top_vector(above);
968 const __m256i t1 = get_top_vector(above + 16);
969 const __m256i t2 = get_top_vector(above + 32);
970 const __m256i t3 = get_top_vector(above + 48);
971 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
972 const __m256i one = _mm256_set1_epi16(1);
973
974 int i, j;
975 for (j = 0; j < 2; ++j) {
976 const __m256i l = get_left_vector(left + j * 16);
977 __m256i rep = _mm256_set1_epi16((short)0x8000);
978 for (i = 0; i < 16; ++i) {
979 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
980
981 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
982 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
983 const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
984 const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
985
986 _mm_store_si128((__m128i *)dst, r0);
987 _mm_store_si128((__m128i *)(dst + 16), r1);
988 _mm_store_si128((__m128i *)(dst + 32), r2);
989 _mm_store_si128((__m128i *)(dst + 48), r3);
990
991 dst += stride;
992 rep = _mm256_add_epi16(rep, one);
993 }
994 }
995 }
996
void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
998 const uint8_t *above, const uint8_t *left) {
999 const __m256i t0 = get_top_vector(above);
1000 const __m256i t1 = get_top_vector(above + 16);
1001 const __m256i t2 = get_top_vector(above + 32);
1002 const __m256i t3 = get_top_vector(above + 48);
1003 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
1004 const __m256i one = _mm256_set1_epi16(1);
1005
1006 int i, j;
1007 for (j = 0; j < 4; ++j) {
1008 const __m256i l = get_left_vector(left + j * 16);
1009 __m256i rep = _mm256_set1_epi16((short)0x8000);
1010 for (i = 0; i < 16; ++i) {
1011 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
1012
1013 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
1014 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
1015 const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1016 const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1017
1018 _mm_store_si128((__m128i *)dst, r0);
1019 _mm_store_si128((__m128i *)(dst + 16), r1);
1020 _mm_store_si128((__m128i *)(dst + 32), r2);
1021 _mm_store_si128((__m128i *)(dst + 48), r3);
1022
1023 dst += stride;
1024 rep = _mm256_add_epi16(rep, one);
1025 }
1026 }
1027 }
1028
1029 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
1031 const uint8_t *above, const uint8_t *left) {
1032 const __m256i t0 = get_top_vector(above);
1033 const __m256i t1 = get_top_vector(above + 16);
1034 const __m256i t2 = get_top_vector(above + 32);
1035 const __m256i t3 = get_top_vector(above + 48);
1036 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
1037 const __m256i one = _mm256_set1_epi16(1);
1038
1039 int i;
1040 const __m256i l = get_left_vector(left);
1041 __m256i rep = _mm256_set1_epi16((short)0x8000);
1042 for (i = 0; i < 16; ++i) {
1043 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
1044
1045 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
1046 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
1047 const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1048 const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1049
1050 _mm_store_si128((__m128i *)dst, r0);
1051 _mm_store_si128((__m128i *)(dst + 16), r1);
1052 _mm_store_si128((__m128i *)(dst + 32), r2);
1053 _mm_store_si128((__m128i *)(dst + 48), r3);
1054
1055 dst += stride;
1056 rep = _mm256_add_epi16(rep, one);
1057 }
1058 }
1059 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1060
1061 #if CONFIG_AV1_HIGHBITDEPTH
1062
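// Directional (Z1) prediction, high bit-depth. For output row r the
// reference position along the top edge is x = (r + 1) * dx in 1/64-pel
// units, so
//   base  = x >> (6 - upsample_above)
//   shift = ((x << upsample_above) & 0x3f) >> 1   // 5-bit weight
//   pred  = (above[base] * (32 - shift) + above[base + 1] * shift + 16) >> 5
// which the kernels below rearrange as
//   (above[base] * 32 + 16 + (above[base + 1] - above[base]) * shift) >> 5.
// Positions at or beyond max_base_x are clamped to above[max_base_x].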
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
1064 int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1065 const int frac_bits = 6 - upsample_above;
1066 const int max_base_x = ((N + 4) - 1) << upsample_above;
1067
1068 assert(dx > 0);
1069 // pre-filter above pixels
1070 // store in temp buffers:
1071 // above[x] * 32 + 16
1072 // above[x+1] - above[x]
1073 // final pixels will be calculated as:
1074 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1075 __m256i a0, a1, a32, a16;
1076 __m256i diff, c3f;
1077 __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1078 __m128i a0_128, a1_128;
1079 a16 = _mm256_set1_epi16(16);
1080 a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1081 max_base_x128 = _mm_set1_epi16(max_base_x);
1082 c3f = _mm256_set1_epi16(0x3f);
1083
1084 int x = dx;
1085 for (int r = 0; r < N; r++) {
1086 __m256i b, res, shift;
1087 __m128i res1;
1088
1089 int base = x >> frac_bits;
1090 if (base >= max_base_x) {
1091 for (int i = r; i < N; ++i) {
1092 dst[i] = a_mbase_x; // save 4 values
1093 }
1094 return;
1095 }
1096
1097 a0_128 = _mm_loadu_si128((__m128i *)(above + base));
1098 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1099
1100 if (upsample_above) {
1101 a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
1102 a1_128 = _mm_srli_si128(a0_128, 8);
1103
1104 base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
1105 base + 10, base + 12, base + 14);
1106 shift = _mm256_srli_epi16(
1107 _mm256_and_si256(
1108 _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
1109 _mm256_set1_epi16(0x3f)),
1110 1);
1111 } else {
1112 base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
1113 base + 5, base + 6, base + 7);
1114 shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1115 }
1116 a0 = _mm256_castsi128_si256(a0_128);
1117 a1 = _mm256_castsi128_si256(a1_128);
1118 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
1119 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
1120 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
1121
1122 b = _mm256_mullo_epi16(diff, shift);
1123 res = _mm256_add_epi16(a32, b);
1124 res = _mm256_srli_epi16(res, 5);
1125 res1 = _mm256_castsi256_si128(res);
1126
1127 mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
1128 dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1129 x += dx;
1130 }
1131 }
1132
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
1134 int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1135 const int frac_bits = 6 - upsample_above;
1136 const int max_base_x = ((N + 4) - 1) << upsample_above;
1137
1138 assert(dx > 0);
1139 // pre-filter above pixels
1140 // store in temp buffers:
1141 // above[x] * 32 + 16
1142 // above[x+1] - above[x]
1143 // final pixels will be calculated as:
1144 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1145 __m256i a0, a1, a32, a16;
1146 __m256i diff;
1147 __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1148
1149 a16 = _mm256_set1_epi32(16);
1150 a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1151 max_base_x128 = _mm_set1_epi32(max_base_x);
1152
1153 int x = dx;
1154 for (int r = 0; r < N; r++) {
1155 __m256i b, res, shift;
1156 __m128i res1;
1157
1158 int base = x >> frac_bits;
1159 if (base >= max_base_x) {
1160 for (int i = r; i < N; ++i) {
1161 dst[i] = a_mbase_x; // save 4 values
1162 }
1163 return;
1164 }
1165
1166 a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1167 a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1168
1169 if (upsample_above) {
1170 a0 = _mm256_permutevar8x32_epi32(
1171 a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1172 a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1173 base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
1174 shift = _mm256_srli_epi32(
1175 _mm256_and_si256(
1176 _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1177 _mm256_set1_epi32(0x3f)),
1178 1);
1179 } else {
1180 base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
1181 shift = _mm256_srli_epi32(
1182 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1183 }
1184
1185 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1186 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1187 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1188
1189 b = _mm256_mullo_epi32(diff, shift);
1190 res = _mm256_add_epi32(a32, b);
1191 res = _mm256_srli_epi32(res, 5);
1192
1193 res1 = _mm256_castsi256_si128(res);
1194 res1 = _mm_packus_epi32(res1, res1);
1195
1196 mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
1197 mask128 = _mm_packs_epi32(mask128, mask128); // goto 16 bit
1198 dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1199 x += dx;
1200 }
1201 }
1202
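// The wrappers below pick the 16-bit or the 32-bit kernel: for bd < 12 the
// intermediate above[x] * 32 + 16 + diff * shift stays within 16 bits
// (10-bit samples give at most 1023 * 32 + 16 + 1023 * 31 = 64465), while
// 12-bit input needs 32-bit math since 4095 * 32 already exceeds 65535.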
static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
1204 ptrdiff_t stride,
1205 const uint16_t *above,
1206 int upsample_above, int dx,
1207 int bd) {
1208 __m128i dstvec[16];
1209 if (bd < 12) {
1210 highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
1211 dx);
1212 } else {
1213 highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
1214 upsample_above, dx);
1215 }
1216 for (int i = 0; i < N; i++) {
1217 _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
1218 }
1219 }
1220
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
1222 int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1223 const int frac_bits = 6 - upsample_above;
1224 const int max_base_x = ((8 + N) - 1) << upsample_above;
1225
1226 assert(dx > 0);
1227 // pre-filter above pixels
1228 // store in temp buffers:
1229 // above[x] * 32 + 16
1230 // above[x+1] - above[x]
1231 // final pixels will be calculated as:
1232 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1233 __m256i a0, a1, a0_1, a1_1, a32, a16;
1234 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1235
1236 a16 = _mm256_set1_epi32(16);
1237 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1238 max_base_x256 = _mm256_set1_epi32(max_base_x);
1239
1240 int x = dx;
1241 for (int r = 0; r < N; r++) {
1242 __m256i b, res, res1, shift;
1243
1244 int base = x >> frac_bits;
1245 if (base >= max_base_x) {
1246 for (int i = r; i < N; ++i) {
1247 dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values
1248 }
1249 return;
1250 }
1251
1252 a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1253 a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1254
1255 if (upsample_above) {
1256 a0 = _mm256_permutevar8x32_epi32(
1257 a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1258 a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1259
1260 a0_1 =
1261 _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1262 a0_1 = _mm256_permutevar8x32_epi32(
1263 a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1264 a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
1265
1266 a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
1267 a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
1268 base_inc256 =
1269 _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
1270 base + 10, base + 12, base + 14);
1271 shift = _mm256_srli_epi32(
1272 _mm256_and_si256(
1273 _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1274 _mm256_set1_epi32(0x3f)),
1275 1);
1276 } else {
1277 base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
1278 base + 4, base + 5, base + 6, base + 7);
1279 shift = _mm256_srli_epi32(
1280 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1281 }
1282
1283 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1284 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1285 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1286
1287 b = _mm256_mullo_epi32(diff, shift);
1288 res = _mm256_add_epi32(a32, b);
1289 res = _mm256_srli_epi32(res, 5);
1290
1291 res1 = _mm256_packus_epi32(
1292 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
1293
1294 mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
1295 mask256 = _mm256_packs_epi32(
1296 mask256, _mm256_castsi128_si256(
1297 _mm256_extracti128_si256(mask256, 1))); // goto 16 bit
1298 res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1299 dst[r] = _mm256_castsi256_si128(res1);
1300 x += dx;
1301 }
1302 }
1303
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
1305 int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1306 const int frac_bits = 6 - upsample_above;
1307 const int max_base_x = ((8 + N) - 1) << upsample_above;
1308
1309 assert(dx > 0);
1310 // pre-filter above pixels
1311 // store in temp buffers:
1312 // above[x] * 32 + 16
1313 // above[x+1] - above[x]
1314 // final pixels will be calculated as:
1315 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1316 __m256i a0, a1, a32, a16, c3f;
1317 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1318 __m128i a0_x128, a1_x128;
1319
1320 a16 = _mm256_set1_epi16(16);
1321 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1322 max_base_x256 = _mm256_set1_epi16(max_base_x);
1323 c3f = _mm256_set1_epi16(0x3f);
1324
1325 int x = dx;
1326 for (int r = 0; r < N; r++) {
1327 __m256i b, res, res1, shift;
1328
1329 int base = x >> frac_bits;
1330 if (base >= max_base_x) {
1331 for (int i = r; i < N; ++i) {
1332 dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values
1333 }
1334 return;
1335 }
1336
1337 a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
1338 if (upsample_above) {
1339 __m128i mask, atmp0, atmp1, atmp2, atmp3;
1340 a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
1341 atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1342 atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1343 atmp2 =
1344 _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1345 atmp3 =
1346 _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1347 mask =
1348 _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
1349 a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
1350 mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
1351 _mm_set1_epi8(15));
1352 a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
1353
1354 base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
1355 base + 8, base + 10, base + 12, base + 14,
1356 0, 0, 0, 0, 0, 0, 0, 0);
1357 shift = _mm256_srli_epi16(
1358 _mm256_and_si256(
1359 _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
1360 1);
1361 } else {
1362 a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1363 base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1364 base + 4, base + 5, base + 6, base + 7, 0,
1365 0, 0, 0, 0, 0, 0, 0);
1366 shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1367 }
1368 a0 = _mm256_castsi128_si256(a0_x128);
1369 a1 = _mm256_castsi128_si256(a1_x128);
1370
1371 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
1372 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
1373 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
1374
1375 b = _mm256_mullo_epi16(diff, shift);
1376 res = _mm256_add_epi16(a32, b);
1377 res = _mm256_srli_epi16(res, 5);
1378
1379 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1380 res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1381 dst[r] = _mm256_castsi256_si128(res1);
1382 x += dx;
1383 }
1384 }
1385
1386 static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
1387 ptrdiff_t stride,
1388 const uint16_t *above,
1389 int upsample_above, int dx,
1390 int bd) {
1391 __m128i dstvec[32];
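  // For bd < 12 the interpolation fits in 16-bit lanes (a[x] * 32 + 16 plus
  // the weighted difference stays below 2^16); 12-bit input needs the 32-bit
  // intermediate path.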
1392 if (bd < 12) {
1393 highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
1394 dx);
1395 } else {
1396 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
1397 upsample_above, dx);
1398 }
1399 for (int i = 0; i < N; i++) {
1400 _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
1401 }
1402 }
1403
1404 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
1405 int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1406 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1407 (void)upsample_above;
1408 const int frac_bits = 6;
1409 const int max_base_x = ((16 + N) - 1);
1410
1411 // pre-filter above pixels
1412 // store in temp buffers:
1413 // above[x] * 32 + 16
1414 // above[x+1] - above[x]
1415 // final pixels will be calculated as:
1416 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1417 __m256i a0, a0_1, a1, a1_1, a32, a16;
1418 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1419
1420 a16 = _mm256_set1_epi32(16);
1421 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1422 max_base_x256 = _mm256_set1_epi16(max_base_x);
1423
1424 int x = dx;
1425 for (int r = 0; r < N; r++) {
1426 __m256i b, res[2], res1;
1427
1428 int base = x >> frac_bits;
1429 if (base >= max_base_x) {
1430 for (int i = r; i < N; ++i) {
1431 dstvec[i] = a_mbase_x; // save 16 values
1432 }
1433 return;
1434 }
1435 __m256i shift = _mm256_srli_epi32(
1436 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1437
1438 a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1439 a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1440
1441 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1442 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1443 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1444 b = _mm256_mullo_epi32(diff, shift);
1445
1446 res[0] = _mm256_add_epi32(a32, b);
1447 res[0] = _mm256_srli_epi32(res[0], 5);
1448 res[0] = _mm256_packus_epi32(
1449 res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1450
1451 int mdif = max_base_x - base;
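    // Compute the upper eight lanes only while more than eight edge samples
    // remain past base; otherwise fill them with the replicated last pixel.
    // The mask/blend below clamps any lanes that still run past max_base_x.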
1452 if (mdif > 8) {
1453 a0_1 =
1454 _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1455 a1_1 =
1456 _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
1457
1458 diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
1459 a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
1460 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1461 b = _mm256_mullo_epi32(diff, shift);
1462
1463 res[1] = _mm256_add_epi32(a32, b);
1464 res[1] = _mm256_srli_epi32(res[1], 5);
1465 res[1] = _mm256_packus_epi32(
1466 res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1467 } else {
1468 res[1] = a_mbase_x;
1469 }
1470 res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1471 1); // 16 16-bit values
1472
1473 base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1474 base + 4, base + 5, base + 6, base + 7,
1475 base + 8, base + 9, base + 10, base + 11,
1476 base + 12, base + 13, base + 14, base + 15);
1477 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1478 dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1479 x += dx;
1480 }
1481 }
1482
1483 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
1484 int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1485 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1486 (void)upsample_above;
1487 const int frac_bits = 6;
1488 const int max_base_x = ((16 + N) - 1);
1489
1490 // pre-filter above pixels
1491 // store in temp buffers:
1492 // above[x] * 32 + 16
1493 // above[x+1] - above[x]
1494 // final pixels will be calculated as:
1495 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1496 __m256i a0, a1, a32, a16, c3f;
1497 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1498
1499 a16 = _mm256_set1_epi16(16);
1500 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1501 max_base_x256 = _mm256_set1_epi16(max_base_x);
1502 c3f = _mm256_set1_epi16(0x3f);
1503
1504 int x = dx;
1505 for (int r = 0; r < N; r++) {
1506 __m256i b, res;
1507
1508 int base = x >> frac_bits;
1509 if (base >= max_base_x) {
1510 for (int i = r; i < N; ++i) {
1511 dstvec[i] = a_mbase_x; // save 16 values
1512 }
1513 return;
1514 }
1515 __m256i shift =
1516 _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1517
1518 a0 = _mm256_loadu_si256((__m256i *)(above + base));
1519 a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
1520
1521 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
1522 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
1523 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
1524 b = _mm256_mullo_epi16(diff, shift);
1525
1526 res = _mm256_add_epi16(a32, b);
1527 res = _mm256_srli_epi16(res, 5); // 16 16-bit values
1528
1529 base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1530 base + 4, base + 5, base + 6, base + 7,
1531 base + 8, base + 9, base + 10, base + 11,
1532 base + 12, base + 13, base + 14, base + 15);
1533 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1534 dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1535 x += dx;
1536 }
1537 }
1538
1539 static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
1540 ptrdiff_t stride,
1541 const uint16_t *above,
1542 int upsample_above, int dx,
1543 int bd) {
1544 __m256i dstvec[64];
1545 if (bd < 12) {
1546 highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
1547 dx);
1548 } else {
1549 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
1550 upsample_above, dx);
1551 }
1552 for (int i = 0; i < N; i++) {
1553 _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1554 }
1555 }
1556
1557 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
1558 int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1559 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1560 (void)upsample_above;
1561 const int frac_bits = 6;
1562 const int max_base_x = ((32 + N) - 1);
1563
1564 // pre-filter above pixels
1565 // store in temp buffers:
1566 // above[x] * 32 + 16
1567 // above[x+1] - above[x]
1568 // final pixels will be calculated as:
1569 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1570 __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
1571 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1572
1573 a16 = _mm256_set1_epi32(16);
1574 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1575 max_base_x256 = _mm256_set1_epi16(max_base_x);
1576 c3f = _mm256_set1_epi16(0x3f);
1577
1578 int x = dx;
1579 for (int r = 0; r < N; r++) {
1580 __m256i b, res[2], res1;
1581
1582 int base = x >> frac_bits;
1583 if (base >= max_base_x) {
1584 for (int i = r; i < N; ++i) {
1585 dstvec[i] = a_mbase_x; // save 32 values
1586 dstvec[i + N] = a_mbase_x;
1587 }
1588 return;
1589 }
1590
1591 __m256i shift =
1592 _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
1593
1594 for (int j = 0; j < 32; j += 16) {
1595 int mdif = max_base_x - (base + j);
1596 if (mdif <= 0) {
1597 res1 = a_mbase_x;
1598 } else {
1599 a0 = _mm256_cvtepu16_epi32(
1600 _mm_loadu_si128((__m128i *)(above + base + j)));
1601 a1 = _mm256_cvtepu16_epi32(
1602 _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
1603
1604 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1605 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1606 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1607 b = _mm256_mullo_epi32(diff, shift);
1608
1609 res[0] = _mm256_add_epi32(a32, b);
1610 res[0] = _mm256_srli_epi32(res[0], 5);
1611 res[0] = _mm256_packus_epi32(
1612 res[0],
1613 _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1614 if (mdif > 8) {
1615 a0_1 = _mm256_cvtepu16_epi32(
1616 _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
1617 a1_1 = _mm256_cvtepu16_epi32(
1618 _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
1619
1620 diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
1621 a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
1622 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1623 b = _mm256_mullo_epi32(diff, shift);
1624
1625 res[1] = _mm256_add_epi32(a32, b);
1626 res[1] = _mm256_srli_epi32(res[1], 5);
1627 res[1] = _mm256_packus_epi32(
1628 res[1],
1629 _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1630 } else {
1631 res[1] = a_mbase_x;
1632 }
1633 res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1634 1); // 16 16-bit values
1635 base_inc256 = _mm256_setr_epi16(
1636 base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1637 base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1638 base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1639 base + j + 13, base + j + 14, base + j + 15);
1640
1641 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1642 res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1643 }
1644 if (!j) {
1645 dstvec[r] = res1;
1646 } else {
1647 dstvec[r + N] = res1;
1648 }
1649 }
1650 x += dx;
1651 }
1652 }
1653
1654 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
1655 int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1656 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1657 (void)upsample_above;
1658 const int frac_bits = 6;
1659 const int max_base_x = ((32 + N) - 1);
1660
1661 // pre-filter above pixels
1662 // store in temp buffers:
1663 // above[x] * 32 + 16
1664 // above[x+1] - above[x]
1665 // final pixels will be calculated as:
1666 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1667 __m256i a0, a1, a32, a16, c3f;
1668 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1669
1670 a16 = _mm256_set1_epi16(16);
1671 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1672 max_base_x256 = _mm256_set1_epi16(max_base_x);
1673 c3f = _mm256_set1_epi16(0x3f);
1674
1675 int x = dx;
1676 for (int r = 0; r < N; r++) {
1677 __m256i b, res;
1678
1679 int base = x >> frac_bits;
1680 if (base >= max_base_x) {
1681 for (int i = r; i < N; ++i) {
1682 dstvec[i] = a_mbase_x; // save 32 values
1683 dstvec[i + N] = a_mbase_x;
1684 }
1685 return;
1686 }
1687
1688 __m256i shift =
1689 _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1690
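    // Produce the 32-wide row as two 16-lane halves: j == 0 fills dstvec[r],
    // j == 16 fills dstvec[r + N].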
1691 for (int j = 0; j < 32; j += 16) {
1692 int mdif = max_base_x - (base + j);
1693 if (mdif <= 0) {
1694 res = a_mbase_x;
1695 } else {
1696 a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1697 a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1698
1699 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
1700 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
1701 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
1702 b = _mm256_mullo_epi16(diff, shift);
1703
1704 res = _mm256_add_epi16(a32, b);
1705 res = _mm256_srli_epi16(res, 5);
1706
1707 base_inc256 = _mm256_setr_epi16(
1708 base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1709 base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1710 base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1711 base + j + 13, base + j + 14, base + j + 15);
1712
1713 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1714 res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1715 }
1716 if (!j) {
1717 dstvec[r] = res;
1718 } else {
1719 dstvec[r + N] = res;
1720 }
1721 }
1722 x += dx;
1723 }
1724 }
1725
1726 static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
1727 ptrdiff_t stride,
1728 const uint16_t *above,
1729 int upsample_above, int dx,
1730 int bd) {
1731 __m256i dstvec[128];
1732 if (bd < 12) {
1733 highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
1734 dx);
1735 } else {
1736 highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
1737 upsample_above, dx);
1738 }
1739 for (int i = 0; i < N; i++) {
1740 _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1741 _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
1742 }
1743 }
1744
1745 static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
1746 ptrdiff_t stride,
1747 const uint16_t *above,
1748 int upsample_above,
1749 int dx) {
1750 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1751 (void)upsample_above;
1752 const int frac_bits = 6;
1753 const int max_base_x = ((64 + N) - 1);
1754
1755 // pre-filter above pixels
1756 // store in temp buffers:
1757 // above[x] * 32 + 16
1758 // above[x+1] - above[x]
1759 // final pixels will be calculated as:
1760 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1761 __m256i a0, a0_1, a1, a1_1, a32, a16;
1762 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1763
1764 a16 = _mm256_set1_epi32(16);
1765 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1766 max_base_x256 = _mm256_set1_epi16(max_base_x);
1767
1768 int x = dx;
1769 for (int r = 0; r < N; r++, dst += stride) {
1770 __m256i b, res[2], res1;
1771
1772 int base = x >> frac_bits;
1773 if (base >= max_base_x) {
1774 for (int i = r; i < N; ++i) {
1775 _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 64 values
1776 _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1777 _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1778 _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1779 dst += stride;
1780 }
1781 return;
1782 }
1783
1784 __m256i shift = _mm256_srli_epi32(
1785 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1786
1787 __m128i a0_128, a0_1_128, a1_128, a1_1_128;
1788 for (int j = 0; j < 64; j += 16) {
1789 int mdif = max_base_x - (base + j);
1790 if (mdif <= 0) {
1791 _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1792 } else {
1793 a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
1794 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
1795 a0 = _mm256_cvtepu16_epi32(a0_128);
1796 a1 = _mm256_cvtepu16_epi32(a1_128);
1797
1798 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1799 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1800 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1801 b = _mm256_mullo_epi32(diff, shift);
1802
1803 res[0] = _mm256_add_epi32(a32, b);
1804 res[0] = _mm256_srli_epi32(res[0], 5);
1805 res[0] = _mm256_packus_epi32(
1806 res[0],
1807 _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1808 if (mdif > 8) {
1809 a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
1810 a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
1811 a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
1812 a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
1813
1814 diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
1815 a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
1816 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1817 b = _mm256_mullo_epi32(diff, shift);
1818
1819 res[1] = _mm256_add_epi32(a32, b);
1820 res[1] = _mm256_srli_epi32(res[1], 5);
1821 res[1] = _mm256_packus_epi32(
1822 res[1],
1823 _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1824 } else {
1825 res[1] = a_mbase_x;
1826 }
1827 res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1828 1); // 16 16-bit values
1829 base_inc256 = _mm256_setr_epi16(
1830 base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1831 base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1832 base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1833 base + j + 13, base + j + 14, base + j + 15);
1834
1835 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1836 res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1837 _mm256_storeu_si256((__m256i *)(dst + j), res1);
1838 }
1839 }
1840 x += dx;
1841 }
1842 }
1843
1844 static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
1845 ptrdiff_t stride,
1846 const uint16_t *above,
1847 int upsample_above, int dx) {
1848 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1849 (void)upsample_above;
1850 const int frac_bits = 6;
1851 const int max_base_x = ((64 + N) - 1);
1852
1853 // pre-filter above pixels
1854 // store in temp buffers:
1855 // above[x] * 32 + 16
1856 // above[x+1] - above[x]
1857 // final pixels will be calculated as:
1858 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1859 __m256i a0, a1, a32, a16, c3f;
1860 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1861
1862 a16 = _mm256_set1_epi16(16);
1863 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1864 max_base_x256 = _mm256_set1_epi16(max_base_x);
1865 c3f = _mm256_set1_epi16(0x3f);
1866
1867 int x = dx;
1868 for (int r = 0; r < N; r++, dst += stride) {
1869 __m256i b, res;
1870
1871 int base = x >> frac_bits;
1872 if (base >= max_base_x) {
1873 for (int i = r; i < N; ++i) {
1874 _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 64 values
1875 _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1876 _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1877 _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1878 dst += stride;
1879 }
1880 return;
1881 }
1882
1883 __m256i shift =
1884 _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1885
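    // Each 64-wide row is written straight to dst in four 16-lane chunks;
    // a chunk that lies entirely past max_base_x just stores the replicated
    // last edge pixel.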
1886 for (int j = 0; j < 64; j += 16) {
1887 int mdif = max_base_x - (base + j);
1888 if (mdif <= 0) {
1889 _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1890 } else {
1891 a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1892 a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1893
1894 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
1895 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
1896 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
1897 b = _mm256_mullo_epi16(diff, shift);
1898
1899 res = _mm256_add_epi16(a32, b);
1900 res = _mm256_srli_epi16(res, 5);
1901
1902 base_inc256 = _mm256_setr_epi16(
1903 base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1904 base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1905 base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1906 base + j + 13, base + j + 14, base + j + 15);
1907
1908 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1909 res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1910 _mm256_storeu_si256((__m256i *)(dst + j), res); // 16 16-bit values
1911 }
1912 }
1913 x += dx;
1914 }
1915 }
1916
1917 // Directional prediction, zone 1: 0 < angle < 90
1918 void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
1919 int bh, const uint16_t *above,
1920 const uint16_t *left, int upsample_above,
1921 int dx, int dy, int bd) {
1922 (void)left;
1923 (void)dy;
1924
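  // Dispatch on block width; the bd < 12 test (here for bw == 64, otherwise
  // inside the width-specific wrapper) selects the 16-bit or 32-bit
  // intermediate path.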
1925 switch (bw) {
1926 case 4:
1927 highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
1928 dx, bd);
1929 break;
1930 case 8:
1931 highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
1932 dx, bd);
1933 break;
1934 case 16:
1935 highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
1936 dx, bd);
1937 break;
1938 case 32:
1939 highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
1940 dx, bd);
1941 break;
1942 case 64:
1943 if (bd < 12) {
1944 highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
1945 upsample_above, dx);
1946 } else {
1947 highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
1948 upsample_above, dx);
1949 }
1950 break;
1951 default: break;
1952 }
1953 return;
1954 }
1955
1956 static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
1957 uint16_t *dst, ptrdiff_t pitchDst) {
1958 __m256i r[16];
1959 __m256i d[16];
1960 for (int j = 0; j < 16; j++) {
1961 r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
1962 }
1963 highbd_transpose16x16_avx2(r, d);
1964 for (int j = 0; j < 16; j++) {
1965 _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
1966 }
1967 }
1968
1969 static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
1970 uint16_t *dst, ptrdiff_t pitchDst, int width,
1971 int height) {
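  // Tile the transpose in 16x16 blocks: the block at (row i, col j) of src is
  // transposed into (row j, col i) of dst.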
1972 for (int j = 0; j < height; j += 16)
1973 for (int i = 0; i < width; i += 16)
1974 highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
1975 dst + j * pitchDst + i, pitchDst);
1976 }
1977
1978 static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
1979 int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1980 const uint16_t *left, int upsample_above, int upsample_left, int dx,
1981 int dy) {
1982 const int min_base_x = -(1 << upsample_above);
1983 const int min_base_y = -(1 << upsample_left);
1984 const int frac_bits_x = 6 - upsample_above;
1985 const int frac_bits_y = 6 - upsample_left;
1986
1987 assert(dx > 0);
1988 // pre-filter above pixels
1989 // store in temp buffers:
1990 // above[x] * 32 + 16
1991 // above[x+1] - above[x]
1992 // final pixels will be calculated as:
1993 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1994 __m256i a0_x, a1_x, a32, a16;
1995 __m256i diff;
1996 __m128i c3f, min_base_y128;
1997
1998 a16 = _mm256_set1_epi32(16);
1999 c3f = _mm_set1_epi32(0x3f);
2000 min_base_y128 = _mm_set1_epi32(min_base_y);
2001
2002 for (int r = 0; r < N; r++) {
2003 __m256i b, res, shift;
2004 __m128i resx, resy, resxy;
2005 __m128i a0_x128, a1_x128;
2006 int y = r + 1;
2007 int base_x = (-y * dx) >> frac_bits_x;
2008 int base_shift = 0;
2009 if (base_x < (min_base_x - 1)) {
2010 base_shift = (min_base_x - base_x - 1) >> upsample_above;
2011 }
2012 int base_min_diff =
2013 (min_base_x - base_x + upsample_above) >> upsample_above;
2014 if (base_min_diff > 4) {
2015 base_min_diff = 4;
2016 } else {
2017 if (base_min_diff < 0) base_min_diff = 0;
2018 }
2019
2020 if (base_shift > 3) {
2021 a0_x = _mm256_setzero_si256();
2022 a1_x = _mm256_setzero_si256();
2023 shift = _mm256_setzero_si256();
2024 } else {
2025 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2026 if (upsample_above) {
2027 a0_x128 = _mm_shuffle_epi8(a0_x128,
2028 *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2029 a1_x128 = _mm_srli_si128(a0_x128, 8);
2030
2031 shift = _mm256_castsi128_si256(_mm_srli_epi32(
2032 _mm_and_si128(
2033 _mm_slli_epi32(
2034 _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2035 (2 << 6) - y * dx, (3 << 6) - y * dx),
2036 upsample_above),
2037 c3f),
2038 1));
2039 } else {
2040 a0_x128 =
2041 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2042 a1_x128 = _mm_srli_si128(a0_x128, 2);
2043
2044 shift = _mm256_castsi128_si256(_mm_srli_epi32(
2045 _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2046 (2 << 6) - y * dx, (3 << 6) - y * dx),
2047 c3f),
2048 1));
2049 }
2050 a0_x = _mm256_cvtepu16_epi32(a0_x128);
2051 a1_x = _mm256_cvtepu16_epi32(a1_x128);
2052 }
2053 // y calc
2054 __m128i a0_y, a1_y, shifty;
2055 if (base_x < min_base_x) {
2056 __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2057 DECLARE_ALIGNED(32, int, base_y_c[4]);
2058 r6 = _mm_set1_epi32(r << 6);
2059 dy128 = _mm_set1_epi32(dy);
2060 c1234 = _mm_setr_epi32(1, 2, 3, 4);
2061 y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
2062 base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
2063 mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
2064 base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2065 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2066
2067 a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
2068 left[base_y_c[2]], left[base_y_c[3]]);
2069 a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2070 left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
2071
2072 if (upsample_left) {
2073 shifty = _mm_srli_epi32(
2074 _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
2075 } else {
2076 shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
2077 }
2078 a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2079 a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2080 shift = _mm256_inserti128_si256(shift, shifty, 1);
2081 }
2082
2083 diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
2084 a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
2085 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2086
2087 b = _mm256_mullo_epi32(diff, shift);
2088 res = _mm256_add_epi32(a32, b);
2089 res = _mm256_srli_epi32(res, 5);
2090
2091 resx = _mm256_castsi256_si128(res);
2092 resx = _mm_packus_epi32(resx, resx);
2093
2094 resy = _mm256_extracti128_si256(res, 1);
2095 resy = _mm_packus_epi32(resy, resy);
2096
2097 resxy =
2098 _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2099 _mm_storel_epi64((__m128i *)(dst), resxy);
2100 dst += stride;
2101 }
2102 }
2103
2104 static void highbd_dr_prediction_z2_Nx4_avx2(
2105 int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2106 const uint16_t *left, int upsample_above, int upsample_left, int dx,
2107 int dy) {
2108 const int min_base_x = -(1 << upsample_above);
2109 const int min_base_y = -(1 << upsample_left);
2110 const int frac_bits_x = 6 - upsample_above;
2111 const int frac_bits_y = 6 - upsample_left;
2112
2113 assert(dx > 0);
2114 // pre-filter above pixels
2115 // store in temp buffers:
2116 // above[x] * 32 + 16
2117 // above[x+1] - above[x]
2118 // final pixels will be calculated as:
2119 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2120 __m256i a0_x, a1_x, a32, a16;
2121 __m256i diff;
2122 __m128i c3f, min_base_y128;
2123
2124 a16 = _mm256_set1_epi16(16);
2125 c3f = _mm_set1_epi16(0x3f);
2126 min_base_y128 = _mm_set1_epi16(min_base_y);
2127
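  // Columns whose projected x index falls left of min_base_x are predicted
  // from the left edge instead; resx and resy are blended per lane with
  // HighbdBaseMask[base_min_diff] at the end of each row.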
2128 for (int r = 0; r < N; r++) {
2129 __m256i b, res, shift;
2130 __m128i resx, resy, resxy;
2131 __m128i a0_x128, a1_x128;
2132 int y = r + 1;
2133 int base_x = (-y * dx) >> frac_bits_x;
2134 int base_shift = 0;
2135 if (base_x < (min_base_x - 1)) {
2136 base_shift = (min_base_x - base_x - 1) >> upsample_above;
2137 }
2138 int base_min_diff =
2139 (min_base_x - base_x + upsample_above) >> upsample_above;
2140 if (base_min_diff > 4) {
2141 base_min_diff = 4;
2142 } else {
2143 if (base_min_diff < 0) base_min_diff = 0;
2144 }
2145
2146 if (base_shift > 3) {
2147 a0_x = _mm256_setzero_si256();
2148 a1_x = _mm256_setzero_si256();
2149 shift = _mm256_setzero_si256();
2150 } else {
2151 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2152 if (upsample_above) {
2153 a0_x128 = _mm_shuffle_epi8(a0_x128,
2154 *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2155 a1_x128 = _mm_srli_si128(a0_x128, 8);
2156
2157 shift = _mm256_castsi128_si256(_mm_srli_epi16(
2158 _mm_and_si128(
2159 _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2160 (2 << 6) - y * dx,
2161 (3 << 6) - y * dx, 0, 0, 0, 0),
2162 upsample_above),
2163 c3f),
2164 1));
2165 } else {
2166 a0_x128 =
2167 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2168 a1_x128 = _mm_srli_si128(a0_x128, 2);
2169
2170 shift = _mm256_castsi128_si256(_mm_srli_epi16(
2171 _mm_and_si128(
2172 _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2173 (3 << 6) - y * dx, 0, 0, 0, 0),
2174 c3f),
2175 1));
2176 }
2177 a0_x = _mm256_castsi128_si256(a0_x128);
2178 a1_x = _mm256_castsi128_si256(a1_x128);
2179 }
2180 // y calc
2181 __m128i a0_y, a1_y, shifty;
2182 if (base_x < min_base_x) {
2183 __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2184 DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2185 r6 = _mm_set1_epi16(r << 6);
2186 dy128 = _mm_set1_epi16(dy);
2187 c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
2188 y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2189 base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2190 mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2191 base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2192 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2193
2194 a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2195 left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
2196 a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2197 left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
2198 0, 0);
2199
2200 if (upsample_left) {
2201 shifty = _mm_srli_epi16(
2202 _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
2203 } else {
2204 shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2205 }
2206 a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2207 a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2208 shift = _mm256_inserti128_si256(shift, shifty, 1);
2209 }
2210
2211 diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
2212 a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
2213 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
2214
2215 b = _mm256_mullo_epi16(diff, shift);
2216 res = _mm256_add_epi16(a32, b);
2217 res = _mm256_srli_epi16(res, 5);
2218
2219 resx = _mm256_castsi256_si128(res);
2220 resy = _mm256_extracti128_si256(res, 1);
2221 resxy =
2222 _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2223 _mm_storel_epi64((__m128i *)(dst), resxy);
2224 dst += stride;
2225 }
2226 }
2227
2228 static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
2229 int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2230 const uint16_t *left, int upsample_above, int upsample_left, int dx,
2231 int dy) {
2232 const int min_base_x = -(1 << upsample_above);
2233 const int min_base_y = -(1 << upsample_left);
2234 const int frac_bits_x = 6 - upsample_above;
2235 const int frac_bits_y = 6 - upsample_left;
2236
2237 // pre-filter above pixels
2238 // store in temp buffers:
2239 // above[x] * 32 + 16
2240 // above[x+1] - above[x]
2241 // final pixels will be calculated as:
2242 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2243 __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
2244 __m256i diff;
2245 __m128i a0_x128, a1_x128;
2246
2247 a16 = _mm256_set1_epi32(16);
2248 c3f = _mm256_set1_epi32(0x3f);
2249 min_base_y256 = _mm256_set1_epi32(min_base_y);
2250
2251 for (int r = 0; r < N; r++) {
2252 __m256i b, res, shift;
2253 __m128i resx, resy, resxy;
2254 int y = r + 1;
2255 int base_x = (-y * dx) >> frac_bits_x;
2256 int base_shift = 0;
2257 if (base_x < (min_base_x - 1)) {
2258 base_shift = (min_base_x - base_x - 1) >> upsample_above;
2259 }
2260 int base_min_diff =
2261 (min_base_x - base_x + upsample_above) >> upsample_above;
2262 if (base_min_diff > 8) {
2263 base_min_diff = 8;
2264 } else {
2265 if (base_min_diff < 0) base_min_diff = 0;
2266 }
2267
2268 if (base_shift > 7) {
2269 resx = _mm_setzero_si128();
2270 } else {
2271 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2272 if (upsample_above) {
2273 __m128i mask, atmp0, atmp1, atmp2, atmp3;
2274 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2275 atmp0 = _mm_shuffle_epi8(a0_x128,
2276 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2277 atmp1 = _mm_shuffle_epi8(a1_x128,
2278 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2279 atmp2 = _mm_shuffle_epi8(
2280 a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2281 atmp3 = _mm_shuffle_epi8(
2282 a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2283 mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2284 _mm_set1_epi8(15));
2285 a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2286 mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2287 _mm_set1_epi8(15));
2288 a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2289 shift = _mm256_srli_epi32(
2290 _mm256_and_si256(
2291 _mm256_slli_epi32(
2292 _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
2293 (2 << 6) - y * dx, (3 << 6) - y * dx,
2294 (4 << 6) - y * dx, (5 << 6) - y * dx,
2295 (6 << 6) - y * dx, (7 << 6) - y * dx),
2296 upsample_above),
2297 c3f),
2298 1);
2299 } else {
2300 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2301 a0_x128 =
2302 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2303 a1_x128 =
2304 _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2305
2306 shift = _mm256_srli_epi32(
2307 _mm256_and_si256(
2308 _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2309 (3 << 6) - y * dx, (4 << 6) - y * dx,
2310 (5 << 6) - y * dx, (6 << 6) - y * dx,
2311 (7 << 6) - y * dx),
2312 c3f),
2313 1);
2314 }
2315 a0_x = _mm256_cvtepu16_epi32(a0_x128);
2316 a1_x = _mm256_cvtepu16_epi32(a1_x128);
2317
2318 diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
2319 a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
2320 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2321
2322 b = _mm256_mullo_epi32(diff, shift);
2323 res = _mm256_add_epi32(a32, b);
2324 res = _mm256_srli_epi32(res, 5);
2325
2326 resx = _mm256_castsi256_si128(_mm256_packus_epi32(
2327 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2328 }
2329 // y calc
2330 if (base_x < min_base_x) {
2331 DECLARE_ALIGNED(32, int, base_y_c[8]);
2332 __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
2333 r6 = _mm256_set1_epi32(r << 6);
2334 dy256 = _mm256_set1_epi32(dy);
2335 c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
2336 y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2337 base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2338 mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2339 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2340 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2341
2342 a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2343 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2344 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2345 left[base_y_c[6]], left[base_y_c[7]]));
2346 a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2347 left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2348 left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2349 left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2350
2351 if (upsample_left) {
2352 shift = _mm256_srli_epi32(
2353 _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
2354 1);
2355 } else {
2356 shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2357 }
2358 diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
2359 a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
2360 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2361
2362 b = _mm256_mullo_epi32(diff, shift);
2363 res = _mm256_add_epi32(a32, b);
2364 res = _mm256_srli_epi32(res, 5);
2365
2366 resy = _mm256_castsi256_si128(_mm256_packus_epi32(
2367 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2368 } else {
2369 resy = resx;
2370 }
2371 resxy =
2372 _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2373 _mm_storeu_si128((__m128i *)(dst), resxy);
2374 dst += stride;
2375 }
2376 }
2377
2378 static void highbd_dr_prediction_z2_Nx8_avx2(
2379 int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2380 const uint16_t *left, int upsample_above, int upsample_left, int dx,
2381 int dy) {
2382 const int min_base_x = -(1 << upsample_above);
2383 const int min_base_y = -(1 << upsample_left);
2384 const int frac_bits_x = 6 - upsample_above;
2385 const int frac_bits_y = 6 - upsample_left;
2386
2387 // pre-filter above pixels
2388 // store in temp buffers:
2389 // above[x] * 32 + 16
2390 // above[x+1] - above[x]
2391 // final pixels will be calculated as:
2392 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2393 __m128i c3f, min_base_y128;
2394 __m256i a0_x, a1_x, diff, a32, a16;
2395 __m128i a0_x128, a1_x128;
2396
2397 a16 = _mm256_set1_epi16(16);
2398 c3f = _mm_set1_epi16(0x3f);
2399 min_base_y128 = _mm_set1_epi16(min_base_y);
2400
2401 for (int r = 0; r < N; r++) {
2402 __m256i b, res, shift;
2403 __m128i resx, resy, resxy;
2404 int y = r + 1;
2405 int base_x = (-y * dx) >> frac_bits_x;
2406 int base_shift = 0;
2407 if (base_x < (min_base_x - 1)) {
2408 base_shift = (min_base_x - base_x - 1) >> upsample_above;
2409 }
2410 int base_min_diff =
2411 (min_base_x - base_x + upsample_above) >> upsample_above;
2412 if (base_min_diff > 8) {
2413 base_min_diff = 8;
2414 } else {
2415 if (base_min_diff < 0) base_min_diff = 0;
2416 }
2417
2418 if (base_shift > 7) {
2419 a0_x = _mm256_setzero_si256();
2420 a1_x = _mm256_setzero_si256();
2421 shift = _mm256_setzero_si256();
2422 } else {
2423 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2424 if (upsample_above) {
2425 __m128i mask, atmp0, atmp1, atmp2, atmp3;
2426 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2427 atmp0 = _mm_shuffle_epi8(a0_x128,
2428 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2429 atmp1 = _mm_shuffle_epi8(a1_x128,
2430 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2431 atmp2 = _mm_shuffle_epi8(
2432 a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2433 atmp3 = _mm_shuffle_epi8(
2434 a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2435 mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2436 _mm_set1_epi8(15));
2437 a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2438 mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2439 _mm_set1_epi8(15));
2440 a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2441
2442 shift = _mm256_castsi128_si256(_mm_srli_epi16(
2443 _mm_and_si128(
2444 _mm_slli_epi16(
2445 _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2446 (2 << 6) - y * dx, (3 << 6) - y * dx,
2447 (4 << 6) - y * dx, (5 << 6) - y * dx,
2448 (6 << 6) - y * dx, (7 << 6) - y * dx),
2449 upsample_above),
2450 c3f),
2451 1));
2452 } else {
2453 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2454 a0_x128 =
2455 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2456 a1_x128 =
2457 _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2458
2459 shift = _mm256_castsi128_si256(_mm_srli_epi16(
2460 _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2461 (2 << 6) - y * dx, (3 << 6) - y * dx,
2462 (4 << 6) - y * dx, (5 << 6) - y * dx,
2463 (6 << 6) - y * dx, (7 << 6) - y * dx),
2464 c3f),
2465 1));
2466 }
2467 a0_x = _mm256_castsi128_si256(a0_x128);
2468 a1_x = _mm256_castsi128_si256(a1_x128);
2469 }
2470
2471 // y calc
2472 __m128i a0_y, a1_y, shifty;
2473 if (base_x < min_base_x) {
2474 DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2475 __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2476 r6 = _mm_set1_epi16(r << 6);
2477 dy128 = _mm_set1_epi16(dy);
2478 c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
2479 y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2480 base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2481 mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2482 base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2483 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2484
2485 a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2486 left[base_y_c[2]], left[base_y_c[3]],
2487 left[base_y_c[4]], left[base_y_c[5]],
2488 left[base_y_c[6]], left[base_y_c[7]]);
2489 a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2490 left[base_y_c[2] + 1], left[base_y_c[3] + 1],
2491 left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2492 left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
2493
2494 if (upsample_left) {
2495 shifty = _mm_srli_epi16(
2496 _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
2497 } else {
2498 shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2499 }
2500 a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2501 a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2502 shift = _mm256_inserti128_si256(shift, shifty, 1);
2503 }
2504
2505 diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
2506 a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
2507 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
2508
2509 b = _mm256_mullo_epi16(diff, shift);
2510 res = _mm256_add_epi16(a32, b);
2511 res = _mm256_srli_epi16(res, 5);
2512
2513 resx = _mm256_castsi256_si128(res);
2514 resy = _mm256_extracti128_si256(res, 1);
2515
2516 resxy =
2517 _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2518 _mm_storeu_si128((__m128i *)(dst), resxy);
2519 dst += stride;
2520 }
2521 }
2522
2523 static void highbd_dr_prediction_32bit_z2_HxW_avx2(
2524 int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2525 const uint16_t *left, int upsample_above, int upsample_left, int dx,
2526 int dy) {
2527 // here upsample_above and upsample_left are 0 by design of
2528 // av1_use_intra_edge_upsample
2529 const int min_base_x = -1;
2530 const int min_base_y = -1;
2531 (void)upsample_above;
2532 (void)upsample_left;
2533 const int frac_bits_x = 6;
2534 const int frac_bits_y = 6;
2535
2536 // pre-filter above pixels
2537 // store in temp buffers:
2538 // above[x] * 32 + 16
2539 // above[x+1] - above[x]
2540 // final pixels will be calculated as:
2541 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2542 __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
2543 __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
2544 __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2545 DECLARE_ALIGNED(32, int, base_y_c[16]);
2546
2547 a16 = _mm256_set1_epi32(16);
2548 c1 = _mm256_srli_epi32(a16, 4);
2549 c8 = _mm256_srli_epi32(a16, 1);
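  // c1 == 1 and c8 == 8, derived from the 16 constant already in a register.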
2550 min_base_y256 = _mm256_set1_epi32(min_base_y);
2551 c3f = _mm256_set1_epi32(0x3f);
2552 dy256 = _mm256_set1_epi32(dy);
2553 c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2554 c1234 = _mm256_add_epi32(c0123, c1);
2555
2556 for (int r = 0; r < H; r++) {
2557 __m256i b, res, shift, ydx;
2558 __m256i resx[2], resy[2];
2559 __m256i resxy, j256, r6;
2560 for (int j = 0; j < W; j += 16) {
2561 j256 = _mm256_set1_epi32(j);
2562 int y = r + 1;
2563 ydx = _mm256_set1_epi32(y * dx);
2564
2565 int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2566 int base_shift = 0;
2567 if ((base_x) < (min_base_x - 1)) {
2568 base_shift = (min_base_x - base_x - 1);
2569 }
2570 int base_min_diff = (min_base_x - base_x);
2571 if (base_min_diff > 16) {
2572 base_min_diff = 16;
2573 } else {
2574 if (base_min_diff < 0) base_min_diff = 0;
2575 }
2576
2577 if (base_shift > 7) {
2578 resx[0] = _mm256_setzero_si256();
2579 } else {
2580 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2581 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2582 a0_x128 =
2583 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2584 a1_x128 =
2585 _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2586
2587 a0_x = _mm256_cvtepu16_epi32(a0_x128);
2588 a1_x = _mm256_cvtepu16_epi32(a1_x128);
2589
2590 r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
2591 shift = _mm256_srli_epi32(
2592 _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2593
2594 diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
2595 a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
2596 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2597
2598 b = _mm256_mullo_epi32(diff, shift);
2599 res = _mm256_add_epi32(a32, b);
2600 res = _mm256_srli_epi32(res, 5);
2601
2602 resx[0] = _mm256_packus_epi32(
2603 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2604 }
2605 int base_shift8 = 0;
2606 if ((base_x + 8) < (min_base_x - 1)) {
2607 base_shift8 = (min_base_x - (base_x + 8) - 1);
2608 }
2609 if (base_shift8 > 7) {
2610 resx[1] = _mm256_setzero_si256();
2611 } else {
2612 a0_1_x128 =
2613 _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
2614 a1_1_x128 =
2615 _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
2616 a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2617 *(__m128i *)HighbdLoadMaskx[base_shift8]);
2618 a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2619 *(__m128i *)HighbdLoadMaskx[base_shift8]);
2620
2621 a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
2622 a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
2623
2624 r6 = _mm256_slli_epi32(
2625 _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
2626 shift = _mm256_srli_epi32(
2627 _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2628
2629 diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x]
2630 a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32
2631 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2632 b = _mm256_mullo_epi32(diff, shift);
2633
2634 resx[1] = _mm256_add_epi32(a32, b);
2635 resx[1] = _mm256_srli_epi32(resx[1], 5);
2636 resx[1] = _mm256_packus_epi32(
2637 resx[1],
2638 _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
2639 }
2640 resx[0] =
2641 _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
2642 1); // 16 16-bit values
2643
2644 // y calc
2645 resy[0] = _mm256_setzero_si256();
2646 if ((base_x < min_base_x)) {
2647 __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
2648 r6 = _mm256_set1_epi32(r << 6);
2649 c256 = _mm256_add_epi32(j256, c1234);
2650 y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2651 base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2652 mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2653 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2654 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2655 c256 = _mm256_add_epi32(c256, c8);
2656 y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2657 base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
2658 mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2659 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2660 _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
2661
2662 a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2663 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2664 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2665 left[base_y_c[6]], left[base_y_c[7]]));
2666 a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2667 left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2668 left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2669 left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2670
2671 shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2672
2673 diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
2674 a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
2675 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2676
2677 b = _mm256_mullo_epi32(diff, shift);
2678 res = _mm256_add_epi32(a32, b);
2679 res = _mm256_srli_epi32(res, 5);
2680
2681 resy[0] = _mm256_packus_epi32(
2682 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2683
2684 a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2685 left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
2686 left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
2687 left[base_y_c[14]], left[base_y_c[15]]));
2688 a1_y = _mm256_cvtepu16_epi32(
2689 _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
2690 left[base_y_c[10] + 1], left[base_y_c[11] + 1],
2691 left[base_y_c[12] + 1], left[base_y_c[13] + 1],
2692 left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
2693 shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
2694
2695 diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
2696 a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
2697 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2698
2699 b = _mm256_mullo_epi32(diff, shift);
2700 res = _mm256_add_epi32(a32, b);
2701 res = _mm256_srli_epi32(res, 5);
2702
2703 resy[1] = _mm256_packus_epi32(
2704 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2705
2706 resy[0] =
2707 _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
2708 1); // 16 16-bit values
2709 }
2710
2711 resxy = _mm256_blendv_epi8(resx[0], resy[0],
2712 *(__m256i *)HighbdBaseMask[base_min_diff]);
2713 _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2714 } // for j
2715 dst += stride;
2716 }
2717 }
2718
2719 static void highbd_dr_prediction_z2_HxW_avx2(
2720 int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2721 const uint16_t *left, int upsample_above, int upsample_left, int dx,
2722 int dy) {
2723 // here upsample_above and upsample_left are 0 by design of
2724 // av1_use_intra_edge_upsample
2725 const int min_base_x = -1;
2726 const int min_base_y = -1;
2727 (void)upsample_above;
2728 (void)upsample_left;
2729 const int frac_bits_x = 6;
2730 const int frac_bits_y = 6;
2731
2732 // pre-filter above pixels
2733 // store in temp buffers:
2734 // above[x] * 32 + 16
2735 // above[x+1] - above[x]
2736 // final pixels will be calculated as:
2737 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2738 __m256i a0_x, a1_x, a32, a16, c3f, c1;
2739 __m256i diff, min_base_y256, dy256, c1234, c0123;
2740 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
2741
2742 a16 = _mm256_set1_epi16(16);
2743 c1 = _mm256_srli_epi16(a16, 4);
2744 min_base_y256 = _mm256_set1_epi16(min_base_y);
2745 c3f = _mm256_set1_epi16(0x3f);
2746 dy256 = _mm256_set1_epi16(dy);
2747 c0123 =
2748 _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2749 c1234 = _mm256_add_epi16(c0123, c1);
2750
2751 for (int r = 0; r < H; r++) {
2752 __m256i b, res, shift;
2753 __m256i resx, resy, ydx;
2754 __m256i resxy, j256, r6;
2755 __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2756 int y = r + 1;
2757 ydx = _mm256_set1_epi16((short)(y * dx));
2758
2759 for (int j = 0; j < W; j += 16) {
2760 j256 = _mm256_set1_epi16(j);
2761 int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2762 int base_shift = 0;
2763 if ((base_x) < (min_base_x - 1)) {
2764 base_shift = (min_base_x - base_x - 1);
2765 }
2766 int base_min_diff = (min_base_x - base_x);
2767 if (base_min_diff > 16) {
2768 base_min_diff = 16;
2769 } else {
2770 if (base_min_diff < 0) base_min_diff = 0;
2771 }
2772
2773 if (base_shift < 8) {
2774 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2775 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2776 a0_x128 =
2777 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2778 a1_x128 =
2779 _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2780
2781 a0_x = _mm256_castsi128_si256(a0_x128);
2782 a1_x = _mm256_castsi128_si256(a1_x128);
2783 } else {
2784 a0_x = _mm256_setzero_si256();
2785 a1_x = _mm256_setzero_si256();
2786 }
2787
2788 int base_shift1 = 0;
2789 if (base_shift > 8) {
2790 base_shift1 = base_shift - 8;
2791 }
2792 if (base_shift1 < 8) {
2793 a0_1_x128 =
2794 _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
2795 a1_1_x128 =
2796 _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
2797 a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2798 *(__m128i *)HighbdLoadMaskx[base_shift1]);
2799 a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2800 *(__m128i *)HighbdLoadMaskx[base_shift1]);
2801
2802 a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
2803 a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
2804 }
2805 r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
2806 shift = _mm256_srli_epi16(
2807 _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
2808
2809 diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
2810 a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
2811 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
2812
2813 b = _mm256_mullo_epi16(diff, shift);
2814 res = _mm256_add_epi16(a32, b);
2815 resx = _mm256_srli_epi16(res, 5); // 16 16-bit values
2816
2817 // y calc
2818 resy = _mm256_setzero_si256();
2819 __m256i a0_y, a1_y, shifty;
2820 if ((base_x < min_base_x)) {
2821 __m256i c256, y_c256, base_y_c256, mask256, mul16;
2822 r6 = _mm256_set1_epi16(r << 6);
2823 c256 = _mm256_add_epi16(j256, c1234);
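        // min_base_y256 is all ones in every 16-bit lane, so the shift below
        // yields 0x7fff; clamping c * dy to that keeps the unsigned product
        // from wrapping before the subtraction.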
2824 mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
2825 _mm256_srli_epi16(min_base_y256, 1));
2826 y_c256 = _mm256_sub_epi16(r6, mul16);
2827 base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
2828 mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
2829 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2830 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2831
2832 a0_y = _mm256_setr_epi16(
2833 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2834 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2835 left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2836 left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2837 left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2838 left[base_y_c[15]]);
2839 base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
2840 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2841
2842 a1_y = _mm256_setr_epi16(
2843 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2844 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2845 left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2846 left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2847 left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2848 left[base_y_c[15]]);
2849
2850 shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
2851
2852 diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
2853 a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
2854 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
2855
2856 b = _mm256_mullo_epi16(diff, shifty);
2857 res = _mm256_add_epi16(a32, b);
2858 resy = _mm256_srli_epi16(res, 5);
2859 }
2860
2861 resxy = _mm256_blendv_epi8(resx, resy,
2862 *(__m256i *)HighbdBaseMask[base_min_diff]);
2863 _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2864 } // for j
2865 dst += stride;
2866 }
2867 }
2868
2869 // Directional prediction, zone 2: 90 < angle < 180
2870 void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
2871 int bh, const uint16_t *above,
2872 const uint16_t *left, int upsample_above,
2873 int upsample_left, int dx, int dy,
2874 int bd) {
2875 (void)bd;
2876 assert(dx > 0);
2877 assert(dy > 0);
2878 switch (bw) {
2879 case 4:
2880 if (bd < 12) {
2881 highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
2882 upsample_above, upsample_left, dx, dy);
2883 } else {
2884 highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
2885 upsample_above, upsample_left,
2886 dx, dy);
2887 }
2888 break;
2889 case 8:
2890 if (bd < 12) {
2891 highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
2892 upsample_above, upsample_left, dx, dy);
2893 } else {
2894 highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
2895 upsample_above, upsample_left,
2896 dx, dy);
2897 }
2898 break;
2899 default:
2900 if (bd < 12) {
2901 highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2902 upsample_above, upsample_left, dx, dy);
2903 } else {
2904 highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2905 upsample_above, upsample_left,
2906 dx, dy);
2907 }
2908 break;
2909 }
2910 }
2911
2912 // Directional prediction, zone 3 functions
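// Zone 3 (180 < angle < 270) predicts from the left edge only. Each helper
// below reuses the corresponding zone-1 kernel on left[] and then transposes
// the result into the destination block.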
2913 static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
2914 const uint16_t *left,
2915 int upsample_left, int dy,
2916 int bd) {
2917 __m128i dstvec[4], d[4];
2918 if (bd < 12) {
2919 highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
2920 dy);
2921 } else {
2922 highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
2923 upsample_left, dy);
2924 }
2925 highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
2926 &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
2927 _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
2928 _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
2929 _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
2930 _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
2931 return;
2932 }
2933
2934 static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
2935 const uint16_t *left,
2936 int upsample_left, int dy,
2937 int bd) {
2938 __m128i dstvec[8], d[8];
2939 if (bd < 12) {
2940 highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
2941 dy);
2942 } else {
2943 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
2944 upsample_left, dy);
2945 }
2946 highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2947 &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2948 &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2949 &d[7]);
2950 for (int i = 0; i < 8; i++) {
2951 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
2952 }
2953 }
2954
2955 static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
2956 const uint16_t *left,
2957 int upsample_left, int dy,
2958 int bd) {
2959 __m128i dstvec[4], d[8];
2960 if (bd < 12) {
2961 highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
2962 dy);
2963 } else {
2964 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
2965 upsample_left, dy);
2966 }
2967
2968 highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2969 &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2970 &d[7]);
2971 for (int i = 0; i < 8; i++) {
2972 _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
2973 }
2974 }
2975
2976 static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
2977 const uint16_t *left,
2978 int upsample_left, int dy,
2979 int bd) {
2980 __m128i dstvec[8], d[4];
2981 if (bd < 12) {
2982 highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
2983 dy);
2984 } else {
2985 highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
2986 upsample_left, dy);
2987 }
2988
2989 highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2990 &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2991 &d[0], &d[1], &d[2], &d[3]);
2992 _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
2993 _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
2994 _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
2995 _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
2996 }
2997
2998 static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
2999 const uint16_t *left,
3000 int upsample_left, int dy,
3001 int bd) {
3002 __m256i dstvec[8], d[8];
3003 if (bd < 12) {
3004 highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
3005 dy);
3006 } else {
3007 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
3008 upsample_left, dy);
3009 }
3010 highbd_transpose8x16_16x8_avx2(dstvec, d);
3011 for (int i = 0; i < 8; i++) {
3012 _mm_storeu_si128((__m128i *)(dst + i * stride),
3013 _mm256_castsi256_si128(d[i]));
3014 }
3015 for (int i = 8; i < 16; i++) {
3016 _mm_storeu_si128((__m128i *)(dst + i * stride),
3017 _mm256_extracti128_si256(d[i - 8], 1));
3018 }
3019 }
3020
3021 static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
3022 const uint16_t *left,
3023 int upsample_left, int dy,
3024 int bd) {
3025 __m128i dstvec[16], d[16];
3026 if (bd < 12) {
3027 highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
3028 dy);
3029 } else {
3030 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
3031 upsample_left, dy);
3032 }
3033 for (int i = 0; i < 16; i += 8) {
3034 highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3035 &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3036 &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3037 &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3038 &d[5 + i], &d[6 + i], &d[7 + i]);
3039 }
3040 for (int i = 0; i < 8; i++) {
3041 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3042 _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3043 }
3044 }
3045
3046 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3047 static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
3048 const uint16_t *left,
3049 int upsample_left, int dy,
3050 int bd) {
3051 __m256i dstvec[4], d[4], d1;
3052 if (bd < 12) {
3053 highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
3054 dy);
3055 } else {
3056 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
3057 upsample_left, dy);
3058 }
3059 highbd_transpose4x16_avx2(dstvec, d);
3060 for (int i = 0; i < 4; i++) {
3061 _mm_storel_epi64((__m128i *)(dst + i * stride),
3062 _mm256_castsi256_si128(d[i]));
3063 d1 = _mm256_bsrli_epi128(d[i], 8);
3064 _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
3065 _mm256_castsi256_si128(d1));
3066 _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
3067 _mm256_extracti128_si256(d[i], 1));
3068 _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
3069 _mm256_extracti128_si256(d1, 1));
3070 }
3071 }
3072
3073 static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
3074 const uint16_t *left,
3075 int upsample_left, int dy,
3076 int bd) {
3077 __m128i dstvec[16], d[8];
3078 if (bd < 12) {
3079 highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
3080 dy);
3081 } else {
3082 highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
3083 upsample_left, dy);
3084 }
3085 highbd_transpose16x4_8x8_sse2(dstvec, d);
3086
3087 _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
3088 _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
3089 _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
3090 _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
3091 _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
3092 _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
3093 _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
3094 _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
3095 }
3096
3097 static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
3098 const uint16_t *left,
3099 int upsample_left, int dy,
3100 int bd) {
3101 __m256i dstvec[16], d[16];
3102 if (bd < 12) {
3103 highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
3104 dy);
3105 } else {
3106 highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
3107 upsample_left, dy);
3108 }
3109
3110 for (int i = 0; i < 16; i += 8) {
3111 highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3112 }
3113
3114 for (int i = 0; i < 8; i++) {
3115 _mm_storeu_si128((__m128i *)(dst + i * stride),
3116 _mm256_castsi256_si128(d[i]));
3117 }
3118 for (int i = 0; i < 8; i++) {
3119 _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3120 _mm256_extracti128_si256(d[i], 1));
3121 }
3122 for (int i = 8; i < 16; i++) {
3123 _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3124 _mm256_castsi256_si128(d[i]));
3125 }
3126 for (int i = 8; i < 16; i++) {
3127 _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
3128 _mm256_extracti128_si256(d[i], 1));
3129 }
3130 }
3131
3132 static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
3133 const uint16_t *left,
3134 int upsample_left, int dy,
3135 int bd) {
3136 __m128i dstvec[32], d[32];
3137 if (bd < 12) {
3138 highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
3139 dy);
3140 } else {
3141 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
3142 upsample_left, dy);
3143 }
3144
3145 for (int i = 0; i < 32; i += 8) {
3146 highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3147 &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3148 &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3149 &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3150 &d[5 + i], &d[6 + i], &d[7 + i]);
3151 }
3152 for (int i = 0; i < 8; i++) {
3153 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3154 _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3155 _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
3156 _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
3157 }
3158 }
3159 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3160
3161 static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
3162 const uint16_t *left,
3163 int upsample_left, int dy,
3164 int bd) {
3165 __m256i dstvec[16], d[16];
3166 if (bd < 12) {
3167 highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
3168 dy);
3169 } else {
3170 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
3171 upsample_left, dy);
3172 }
3173
3174 highbd_transpose16x16_avx2(dstvec, d);
3175
3176 for (int i = 0; i < 16; i++) {
3177 _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
3178 }
3179 }
3180
3181 static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
3182 const uint16_t *left,
3183 int upsample_left, int dy,
3184 int bd) {
3185 __m256i dstvec[64], d[16];
3186 if (bd < 12) {
3187 highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
3188 dy);
3189 } else {
3190 highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
3191 upsample_left, dy);
3192 }
3193 highbd_transpose16x16_avx2(dstvec, d);
3194 for (int j = 0; j < 16; j++) {
3195 _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
3196 }
3197 highbd_transpose16x16_avx2(dstvec + 16, d);
3198 for (int j = 0; j < 16; j++) {
3199 _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
3200 }
3201 highbd_transpose16x16_avx2(dstvec + 32, d);
3202 for (int j = 0; j < 16; j++) {
3203 _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
3204 }
3205 highbd_transpose16x16_avx2(dstvec + 48, d);
3206 for (int j = 0; j < 16; j++) {
3207 _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
3208 }
3209 }
3210
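// For the largest blocks the prediction is written into an aligned temporary
// buffer by the zone-1 kernel and then transposed into dst in one pass.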
3211 static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
3212 const uint16_t *left,
3213 int upsample_left, int dy,
3214 int bd) {
3215 DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
3216 if (bd < 12) {
3217 highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
3218 } else {
3219 highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
3220 dy);
3221 }
3222 highbd_transpose(dstT, 64, dst, stride, 64, 64);
3223 }
3224
3225 static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
3226 const uint16_t *left,
3227 int upsample_left, int dy,
3228 int bd) {
3229 __m256i dstvec[32], d[32];
3230 if (bd < 12) {
3231 highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
3232 dy);
3233 } else {
3234 highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
3235 upsample_left, dy);
3236 }
3237 for (int i = 0; i < 32; i += 8) {
3238 highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3239 }
3240 // store
3241 for (int j = 0; j < 32; j += 16) {
3242 for (int i = 0; i < 8; i++) {
3243 _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
3244 _mm256_castsi256_si128(d[(i + j)]));
3245 }
3246 for (int i = 0; i < 8; i++) {
3247 _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
3248 _mm256_castsi256_si128(d[(i + j) + 8]));
3249 }
3250 for (int i = 8; i < 16; i++) {
3251 _mm256_storeu_si256(
3252 (__m256i *)(dst + (i + j) * stride),
3253 _mm256_inserti128_si256(
3254 d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
3255 }
3256 }
3257 }
3258
3259 static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
3260 const uint16_t *left,
3261 int upsample_left, int dy,
3262 int bd) {
3263 __m256i dstvec[32], d[16];
3264 if (bd < 12) {
3265 highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
3266 dy);
3267 } else {
3268 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
3269 upsample_left, dy);
3270 }
3271 for (int i = 0; i < 32; i += 16) {
3272 highbd_transpose16x16_avx2((dstvec + i), d);
3273 for (int j = 0; j < 16; j++) {
3274 _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3275 }
3276 }
3277 }
3278
3279 static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
3280 const uint16_t *left,
3281 int upsample_left, int dy,
3282 int bd) {
3283 uint16_t dstT[64 * 32];
3284 if (bd < 12) {
3285 highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
3286 } else {
3287 highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
3288 dy);
3289 }
3290 highbd_transpose(dstT, 64, dst, stride, 32, 64);
3291 }
3292
3293 static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
3294 const uint16_t *left,
3295 int upsample_left, int dy,
3296 int bd) {
3297 DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
3298 highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
3299 highbd_transpose(dstT, 32, dst, stride, 64, 32);
3300 return;
3301 }
3302
3303 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3304 static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
3305 const uint16_t *left,
3306 int upsample_left, int dy,
3307 int bd) {
3308 DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
3309 if (bd < 12) {
3310 highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
3311 } else {
3312 highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
3313 dy);
3314 }
3315 highbd_transpose(dstT, 64, dst, stride, 16, 64);
3316 }
3317
3318 static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
3319 const uint16_t *left,
3320 int upsample_left, int dy,
3321 int bd) {
3322 __m256i dstvec[64], d[16];
3323 if (bd < 12) {
3324 highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
3325 dy);
3326 } else {
3327 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
3328 upsample_left, dy);
3329 }
3330 for (int i = 0; i < 64; i += 16) {
3331 highbd_transpose16x16_avx2((dstvec + i), d);
3332 for (int j = 0; j < 16; j++) {
3333 _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3334 }
3335 }
3336 }
3337 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3338
3339 void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
3340 int bh, const uint16_t *above,
3341 const uint16_t *left, int upsample_left,
3342 int dx, int dy, int bd) {
3343 (void)above;
3344 (void)dx;
3345
3346 assert(dx == 1);
3347 assert(dy > 0);
3348 if (bw == bh) {
3349 switch (bw) {
3350 case 4:
3351 highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
3352 bd);
3353 break;
3354 case 8:
3355 highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
3356 bd);
3357 break;
3358 case 16:
3359 highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
3360 bd);
3361 break;
3362 case 32:
3363 highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
3364 bd);
3365 break;
3366 case 64:
3367 highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
3368 bd);
3369 break;
3370 }
3371 } else {
3372 if (bw < bh) {
3373 if (bw + bw == bh) {
3374 switch (bw) {
3375 case 4:
3376 highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
3377 dy, bd);
3378 break;
3379 case 8:
3380 highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
3381 dy, bd);
3382 break;
3383 case 16:
3384 highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
3385 dy, bd);
3386 break;
3387 case 32:
3388 highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
3389 dy, bd);
3390 break;
3391 }
3392 } else {
3393 switch (bw) {
3394 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3395 case 4:
3396 highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
3397 dy, bd);
3398 break;
3399 case 8:
3400 highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
3401 dy, bd);
3402 break;
3403 case 16:
3404 highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
3405 dy, bd);
3406 break;
3407 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3408 }
3409 }
3410 } else {
3411 if (bh + bh == bw) {
3412 switch (bh) {
3413 case 4:
3414 highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
3415 dy, bd);
3416 break;
3417 case 8:
3418 highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
3419 dy, bd);
3420 break;
3421 case 16:
3422 highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
3423 dy, bd);
3424 break;
3425 case 32:
3426 highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
3427 dy, bd);
3428 break;
3429 }
3430 } else {
3431 switch (bh) {
3432 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3433 case 4:
3434 highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
3435 dy, bd);
3436 break;
3437 case 8:
3438 highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
3439 dy, bd);
3440 break;
3441 case 16:
3442 highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
3443 dy, bd);
3444 break;
3445 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3446 }
3447 }
3448 }
3449 }
3450 return;
3451 }
3452 #endif // CONFIG_AV1_HIGHBITDEPTH
3453
3454 // Low bit depth functions
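// BaseMask[k] has k leading 0xff bytes followed by zeros. Blending with it
// keeps the first k interpolated pixels of a row and replicates the
// above[max_base_x] pixel into the remainder.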
3455 static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
3456 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3457 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3458 { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3459 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3460 { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3461 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3462 { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3463 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3464 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3465 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3466 { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3467 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3468 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3469 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3470 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3471 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3472 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
3473 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3474 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
3475 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3476 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3477 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3478 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3479 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3480 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3481 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3482 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3483 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3484 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3485 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3486 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3487 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3488 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3489 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
3490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3491 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3492 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
3493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3494 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3495 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
3496 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3497 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3498 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
3499 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3500 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3501 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
3502 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3503 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3504 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
3505 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3506 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3507 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
3508 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3509 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3511 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3512 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3514 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3515 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3517 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3518 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3520 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
3521 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3523 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
3524 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3526 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
3527 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3529 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
3530 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3532 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
3533 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3534 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3535 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
3536 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3537 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3538 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
3539 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3540 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3541 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
3542 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3543 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3544 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
3545 };
3546
3547 /* clang-format on */
3548 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
3549 int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
3550 int dx) {
3551 const int frac_bits = 6 - upsample_above;
3552 const int max_base_x = ((W + H) - 1) << upsample_above;
3553
3554 assert(dx > 0);
3555 // pre-filter above pixels
3556 // store in temp buffers:
3557 // above[x] * 32 + 16
3558 // above[x+1] - above[x]
3559 // final pixels will be calculated as:
3560 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
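  // Roughly equivalent scalar form for one 16-pixel row (illustrative only,
  // ignoring the upsampled-edge shuffle; `row` stands in for the vector
  // written to dst[r]):
  //   shift = (x & 0x3f) >> 1;
  //   for (int c = 0; c < 16; ++c)
  //     row[c] = (uint8_t)((above[base + c] * (32 - shift) +
  //                         above[base + c + 1] * shift + 16) >> 5);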
3561 __m256i a0, a1, a32, a16;
3562 __m256i diff, c3f;
3563 __m128i a_mbase_x;
3564
3565 a16 = _mm256_set1_epi16(16);
3566 a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
3567 c3f = _mm256_set1_epi16(0x3f);
3568
3569 int x = dx;
3570 for (int r = 0; r < W; r++) {
3571 __m256i b, res, shift;
3572 __m128i res1, a0_128, a1_128;
3573
3574 int base = x >> frac_bits;
3575 int base_max_diff = (max_base_x - base) >> upsample_above;
3576 if (base_max_diff <= 0) {
3577 for (int i = r; i < W; ++i) {
3578         dst[i] = a_mbase_x;  // fill remaining rows with above[max_base_x]
3579 }
3580 return;
3581 }
3582 if (base_max_diff > H) base_max_diff = H;
3583 a0_128 = _mm_loadu_si128((__m128i *)(above + base));
3584 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
3585
3586 if (upsample_above) {
3587 a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
3588 a1_128 = _mm_srli_si128(a0_128, 8);
3589
3590 shift = _mm256_srli_epi16(
3591 _mm256_and_si256(
3592 _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
3593 1);
3594 } else {
3595 shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3596 }
3597 a0 = _mm256_cvtepu8_epi16(a0_128);
3598 a1 = _mm256_cvtepu8_epi16(a1_128);
3599
3600 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
3601 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
3602 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
3603
3604 b = _mm256_mullo_epi16(diff, shift);
3605 res = _mm256_add_epi16(a32, b);
3606 res = _mm256_srli_epi16(res, 5);
3607
3608 res = _mm256_packus_epi16(
3609 res, _mm256_castsi128_si256(
3610 _mm256_extracti128_si256(res, 1))); // goto 8 bit
3611 res1 = _mm256_castsi256_si128(res); // 16 8bit values
3612
3613 dst[r] =
3614 _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
3615 x += dx;
3616 }
3617 }
3618
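// The 4xN/8xN/16xN wrappers run the kernel above once per block and store the
// low 4, 8, or 16 bytes of each row vector.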
3619 static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3620 const uint8_t *above, int upsample_above,
3621 int dx) {
3622 __m128i dstvec[16];
3623
3624 dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
3625 for (int i = 0; i < N; i++) {
3626 *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
3627 }
3628 }
3629
3630 static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3631 const uint8_t *above, int upsample_above,
3632 int dx) {
3633 __m128i dstvec[32];
3634
3635 dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
3636 for (int i = 0; i < N; i++) {
3637 _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
3638 }
3639 }
3640
3641 static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3642 const uint8_t *above, int upsample_above,
3643 int dx) {
3644 __m128i dstvec[64];
3645
3646 dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
3647 for (int i = 0; i < N; i++) {
3648 _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
3649 }
3650 }
3651
3652 static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
3653 int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
3654 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3655 (void)upsample_above;
3656 const int frac_bits = 6;
3657 const int max_base_x = ((32 + N) - 1);
3658
3659 // pre-filter above pixels
3660 // store in temp buffers:
3661 // above[x] * 32 + 16
3662 // above[x+1] - above[x]
3663 // final pixels will be calculated as:
3664 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3665 __m256i a0, a1, a32, a16;
3666 __m256i a_mbase_x, diff, c3f;
3667
3668 a16 = _mm256_set1_epi16(16);
3669 a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3670 c3f = _mm256_set1_epi16(0x3f);
3671
3672 int x = dx;
3673 for (int r = 0; r < N; r++) {
3674 __m256i b, res, res16[2];
3675 __m128i a0_128, a1_128;
3676
3677 int base = x >> frac_bits;
3678 int base_max_diff = (max_base_x - base);
3679 if (base_max_diff <= 0) {
3680 for (int i = r; i < N; ++i) {
3681 dstvec[i] = a_mbase_x; // save 32 values
3682 }
3683 return;
3684 }
3685 if (base_max_diff > 32) base_max_diff = 32;
3686 __m256i shift =
3687 _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3688
3689 for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
3690 int mdiff = base_max_diff - j;
3691 if (mdiff <= 0) {
3692 res16[jj] = a_mbase_x;
3693 } else {
3694 a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3695 a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
3696 a0 = _mm256_cvtepu8_epi16(a0_128);
3697 a1 = _mm256_cvtepu8_epi16(a1_128);
3698
3699 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
3700 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
3701 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
3702 b = _mm256_mullo_epi16(diff, shift);
3703
3704 res = _mm256_add_epi16(a32, b);
3705 res = _mm256_srli_epi16(res, 5);
3706 res16[jj] = _mm256_packus_epi16(
3707 res, _mm256_castsi128_si256(
3708 _mm256_extracti128_si256(res, 1))); // 16 8bit values
3709 }
3710 }
3711 res16[1] =
3712 _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
3713 1); // 32 8bit values
3714
3715 dstvec[r] = _mm256_blendv_epi8(
3716 a_mbase_x, res16[1],
3717 *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values
3718 x += dx;
3719 }
3720 }
3721
3722 static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3723 const uint8_t *above, int upsample_above,
3724 int dx) {
3725 __m256i dstvec[64];
3726 dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
3727 for (int i = 0; i < N; i++) {
3728 _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
3729 }
3730 }
3731
3732 static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3733 const uint8_t *above, int upsample_above,
3734 int dx) {
3735 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3736 (void)upsample_above;
3737 const int frac_bits = 6;
3738 const int max_base_x = ((64 + N) - 1);
3739
3740 // pre-filter above pixels
3741 // store in temp buffers:
3742 // above[x] * 32 + 16
3743 // above[x+1] - above[x]
3744 // final pixels will be calculated as:
3745 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3746 __m256i a0, a1, a32, a16;
3747 __m256i a_mbase_x, diff, c3f;
3748 __m128i max_base_x128, base_inc128, mask128;
3749
3750 a16 = _mm256_set1_epi16(16);
3751 a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3752   max_base_x128 = _mm_set1_epi8((int8_t)max_base_x);
3753 c3f = _mm256_set1_epi16(0x3f);
3754
3755 int x = dx;
3756 for (int r = 0; r < N; r++, dst += stride) {
3757 __m256i b, res;
3758 int base = x >> frac_bits;
3759 if (base >= max_base_x) {
3760 for (int i = r; i < N; ++i) {
3761 _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values
3762 _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
3763 dst += stride;
3764 }
3765 return;
3766 }
3767
3768 __m256i shift =
3769 _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3770
3771 __m128i a0_128, a1_128, res128;
3772 for (int j = 0; j < 64; j += 16) {
3773 int mdif = max_base_x - (base + j);
3774 if (mdif <= 0) {
3775 _mm_storeu_si128((__m128i *)(dst + j),
3776 _mm256_castsi256_si128(a_mbase_x));
3777 } else {
3778 a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3779 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
3780 a0 = _mm256_cvtepu8_epi16(a0_128);
3781 a1 = _mm256_cvtepu8_epi16(a1_128);
3782
3783 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
3784 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
3785 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
3786 b = _mm256_mullo_epi16(diff, shift);
3787
3788 res = _mm256_add_epi16(a32, b);
3789 res = _mm256_srli_epi16(res, 5);
3790 res = _mm256_packus_epi16(
3791 res, _mm256_castsi128_si256(
3792 _mm256_extracti128_si256(res, 1))); // 16 8bit values
3793
3794 base_inc128 =
3795 _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
3796 (int8_t)(base + j + 2), (int8_t)(base + j + 3),
3797 (int8_t)(base + j + 4), (int8_t)(base + j + 5),
3798 (int8_t)(base + j + 6), (int8_t)(base + j + 7),
3799 (int8_t)(base + j + 8), (int8_t)(base + j + 9),
3800 (int8_t)(base + j + 10), (int8_t)(base + j + 11),
3801 (int8_t)(base + j + 12), (int8_t)(base + j + 13),
3802 (int8_t)(base + j + 14), (int8_t)(base + j + 15));
3803
3804 mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
3805 _mm_setzero_si128());
3806 res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
3807 _mm256_castsi256_si128(res), mask128);
3808 _mm_storeu_si128((__m128i *)(dst + j), res128);
3809 }
3810 }
3811 x += dx;
3812 }
3813 }
3814
3815 // Directional prediction, zone 1: 0 < angle < 90
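// Zone 1 predicts every row from the above edge only: row r starts at
// base = ((r + 1) * dx) >> (6 - upsample_above) and blends neighbouring
// above[] pixels with the fractional position as the weight.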
3816 void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
3817 const uint8_t *above, const uint8_t *left,
3818 int upsample_above, int dx, int dy) {
3819 (void)left;
3820 (void)dy;
3821 switch (bw) {
3822 case 4:
3823 dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
3824 break;
3825 case 8:
3826 dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
3827 break;
3828 case 16:
3829 dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
3830 break;
3831 case 32:
3832 dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
3833 break;
3834 case 64:
3835 dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
3836 break;
3837 default: break;
3838 }
3839 return;
3840 }
3841
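// Zone-2 rows mix both edges: pixels whose above-row index is still valid
// (base_x >= min_base_x) are interpolated from above[], the leading
// base_min_diff pixels are interpolated from left[], and BaseMask selects
// between the two results per byte.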
3842 static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3843 const uint8_t *above, const uint8_t *left,
3844 int upsample_above, int upsample_left,
3845 int dx, int dy) {
3846 const int min_base_x = -(1 << upsample_above);
3847 const int min_base_y = -(1 << upsample_left);
3848 const int frac_bits_x = 6 - upsample_above;
3849 const int frac_bits_y = 6 - upsample_left;
3850
3851 assert(dx > 0);
3852 // pre-filter above pixels
3853 // store in temp buffers:
3854 // above[x] * 32 + 16
3855 // above[x+1] - above[x]
3856 // final pixels will be calculated as:
3857 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3858 __m128i a0_x, a1_x, a32, a16, diff;
3859 __m128i c3f, min_base_y128, c1234, dy128;
3860
3861 a16 = _mm_set1_epi16(16);
3862 c3f = _mm_set1_epi16(0x3f);
3863 min_base_y128 = _mm_set1_epi16(min_base_y);
3864 c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
3865 dy128 = _mm_set1_epi16(dy);
3866
3867 for (int r = 0; r < N; r++) {
3868 __m128i b, res, shift, r6, ydx;
3869 __m128i resx, resy, resxy;
3870 __m128i a0_x128, a1_x128;
3871 int y = r + 1;
3872 int base_x = (-y * dx) >> frac_bits_x;
3873 int base_shift = 0;
3874 if (base_x < (min_base_x - 1)) {
3875 base_shift = (min_base_x - base_x - 1) >> upsample_above;
3876 }
3877 int base_min_diff =
3878 (min_base_x - base_x + upsample_above) >> upsample_above;
3879 if (base_min_diff > 4) {
3880 base_min_diff = 4;
3881 } else {
3882 if (base_min_diff < 0) base_min_diff = 0;
3883 }
3884
3885 if (base_shift > 3) {
3886 a0_x = _mm_setzero_si128();
3887 a1_x = _mm_setzero_si128();
3888 shift = _mm_setzero_si128();
3889 } else {
3890 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3891 ydx = _mm_set1_epi16(y * dx);
3892 r6 = _mm_slli_epi16(c1234, 6);
3893
3894 if (upsample_above) {
3895 a0_x128 =
3896 _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
3897 a1_x128 = _mm_srli_si128(a0_x128, 8);
3898
3899 shift = _mm_srli_epi16(
3900 _mm_and_si128(
3901 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
3902 1);
3903 } else {
3904 a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
3905 a1_x128 = _mm_srli_si128(a0_x128, 1);
3906
3907 shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
3908 }
3909 a0_x = _mm_cvtepu8_epi16(a0_x128);
3910 a1_x = _mm_cvtepu8_epi16(a1_x128);
3911 }
3912 // y calc
3913 __m128i a0_y, a1_y, shifty;
3914 if (base_x < min_base_x) {
3915 DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
3916 __m128i y_c128, base_y_c128, mask128, c1234_;
3917 c1234_ = _mm_srli_si128(c1234, 2);
3918 r6 = _mm_set1_epi16(r << 6);
3919 y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
3920 base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
3921 mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
3922 base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
3923 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3924
3925 a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3926 left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3927       base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));  // +1
3928 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3929 a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3930 left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3931
3932 if (upsample_left) {
3933 shifty = _mm_srli_epi16(
3934 _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
3935 } else {
3936 shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
3937 }
3938 a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
3939 a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
3940 shift = _mm_unpacklo_epi64(shift, shifty);
3941 }
3942
3943 diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
3944 a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
3945 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
3946
3947 b = _mm_mullo_epi16(diff, shift);
3948 res = _mm_add_epi16(a32, b);
3949 res = _mm_srli_epi16(res, 5);
3950
3951 resx = _mm_packus_epi16(res, res);
3952 resy = _mm_srli_si128(resx, 4);
3953
3954 resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
3955 *(int *)(dst) = _mm_cvtsi128_si32(resxy);
3956 dst += stride;
3957 }
3958 }
3959
3960 static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3961 const uint8_t *above, const uint8_t *left,
3962 int upsample_above, int upsample_left,
3963 int dx, int dy) {
3964 const int min_base_x = -(1 << upsample_above);
3965 const int min_base_y = -(1 << upsample_left);
3966 const int frac_bits_x = 6 - upsample_above;
3967 const int frac_bits_y = 6 - upsample_left;
3968
3969 // pre-filter above pixels
3970 // store in temp buffers:
3971 // above[x] * 32 + 16
3972 // above[x+1] - above[x]
3973 // final pixels will be calculated as:
3974 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3975 __m256i diff, a32, a16;
3976 __m256i a0_x, a1_x;
3977 __m128i a0_x128, a1_x128, min_base_y128, c3f;
3978 __m128i c1234, dy128;
3979
3980 a16 = _mm256_set1_epi16(16);
3981 c3f = _mm_set1_epi16(0x3f);
3982 min_base_y128 = _mm_set1_epi16(min_base_y);
3983 dy128 = _mm_set1_epi16(dy);
3984 c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3985
3986 for (int r = 0; r < N; r++) {
3987 __m256i b, res, shift;
3988 __m128i resx, resy, resxy, r6, ydx;
3989
3990 int y = r + 1;
3991 int base_x = (-y * dx) >> frac_bits_x;
3992 int base_shift = 0;
3993 if (base_x < (min_base_x - 1)) {
3994 base_shift = (min_base_x - base_x - 1) >> upsample_above;
3995 }
3996 int base_min_diff =
3997 (min_base_x - base_x + upsample_above) >> upsample_above;
3998 if (base_min_diff > 8) {
3999 base_min_diff = 8;
4000 } else {
4001 if (base_min_diff < 0) base_min_diff = 0;
4002 }
4003
4004 if (base_shift > 7) {
4005 a0_x = _mm256_setzero_si256();
4006 a1_x = _mm256_setzero_si256();
4007 shift = _mm256_setzero_si256();
4008 } else {
4009 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
4010 ydx = _mm_set1_epi16(y * dx);
4011 r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
4012 if (upsample_above) {
4013 a0_x128 =
4014 _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
4015 a1_x128 = _mm_srli_si128(a0_x128, 8);
4016
4017 shift = _mm256_castsi128_si256(_mm_srli_epi16(
4018 _mm_and_si128(
4019 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
4020 1));
4021 } else {
4022 a1_x128 = _mm_srli_si128(a0_x128, 1);
4023 a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
4024 a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
4025
4026 shift = _mm256_castsi128_si256(
4027 _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
4028 }
4029 a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
4030 a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
4031 }
4032
4033 // y calc
4034 __m128i a0_y, a1_y, shifty;
4035 if (base_x < min_base_x) {
4036 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4037 __m128i y_c128, base_y_c128, mask128;
4038 r6 = _mm_set1_epi16(r << 6);
4039 y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
4040 base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
4041 mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
4042 base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
4043 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4044
4045 a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4046 left[base_y_c[2]], left[base_y_c[3]],
4047 left[base_y_c[4]], left[base_y_c[5]],
4048 left[base_y_c[6]], left[base_y_c[7]]);
4049 base_y_c128 = _mm_add_epi16(
4050 base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
4051 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4052
4053 a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4054 left[base_y_c[2]], left[base_y_c[3]],
4055 left[base_y_c[4]], left[base_y_c[5]],
4056 left[base_y_c[6]], left[base_y_c[7]]);
4057
4058 if (upsample_left) {
4059 shifty = _mm_srli_epi16(
4060 _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
4061 } else {
4062 shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
4063 }
4064
4065 a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
4066 a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
4067 shift = _mm256_inserti128_si256(shift, shifty, 1);
4068 }
4069
4070 diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
4071 a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
4072 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
4073
4074 b = _mm256_mullo_epi16(diff, shift);
4075 res = _mm256_add_epi16(a32, b);
4076 res = _mm256_srli_epi16(res, 5);
4077
4078 resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
4079 _mm256_castsi256_si128(res));
4080 resy = _mm256_extracti128_si256(res, 1);
4081 resy = _mm_packus_epi16(resy, resy);
4082
4083 resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4084 _mm_storel_epi64((__m128i *)(dst), resxy);
4085 dst += stride;
4086 }
4087 }
4088
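// Wider blocks are handled 16 columns at a time. When the 16 left-edge
// indices needed for a row span fewer than 16 entries (offset_diff < 16),
// the samples are fetched with one masked load plus a byte shuffle; otherwise
// the indices are spilled to base_y_c[] and gathered one by one.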
4089 static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
4090 ptrdiff_t stride, const uint8_t *above,
4091 const uint8_t *left, int upsample_above,
4092 int upsample_left, int dx, int dy) {
4093 // here upsample_above and upsample_left are 0 by design of
4094 // av1_use_intra_edge_upsample
4095 const int min_base_x = -1;
4096 const int min_base_y = -1;
4097 (void)upsample_above;
4098 (void)upsample_left;
4099 const int frac_bits_x = 6;
4100 const int frac_bits_y = 6;
4101
4102 __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
4103 __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
4104 __m128i a0_x128, a1_x128;
4105
4106 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4107 a16 = _mm256_set1_epi16(16);
4108 c1 = _mm256_srli_epi16(a16, 4);
4109 min_base_y256 = _mm256_set1_epi16(min_base_y);
4110 c3f = _mm256_set1_epi16(0x3f);
4111 dy256 = _mm256_set1_epi16(dy);
4112 c0123 =
4113 _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4114 c1234 = _mm256_add_epi16(c0123, c1);
4115
4116 for (int r = 0; r < H; r++) {
4117 __m256i b, res, shift, j256, r6, ydx;
4118 __m128i resx, resy;
4119 __m128i resxy;
4120 int y = r + 1;
4121 ydx = _mm256_set1_epi16((int16_t)(y * dx));
4122
4123 int base_x = (-y * dx) >> frac_bits_x;
4124 for (int j = 0; j < W; j += 16) {
4125 j256 = _mm256_set1_epi16(j);
4126 int base_shift = 0;
4127 if ((base_x + j) < (min_base_x - 1)) {
4128 base_shift = (min_base_x - (base_x + j) - 1);
4129 }
4130 int base_min_diff = (min_base_x - base_x - j);
4131 if (base_min_diff > 16) {
4132 base_min_diff = 16;
4133 } else {
4134 if (base_min_diff < 0) base_min_diff = 0;
4135 }
4136
4137 if (base_shift < 16) {
4138 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
4139 a1_x128 =
4140 _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
4141 a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
4142 a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
4143
4144 a0_x = _mm256_cvtepu8_epi16(a0_x128);
4145 a1_x = _mm256_cvtepu8_epi16(a1_x128);
4146
4147 r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
4148 shift = _mm256_srli_epi16(
4149 _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
4150
4151 diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
4152 a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
4153 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
4154
4155 b = _mm256_mullo_epi16(diff, shift);
4156 res = _mm256_add_epi16(a32, b);
4157 res = _mm256_srli_epi16(res, 5); // 16 16-bit values
4158 resx = _mm256_castsi256_si128(_mm256_packus_epi16(
4159 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4160 } else {
4161 resx = _mm_setzero_si128();
4162 }
4163
4164 // y calc
4165 if (base_x < min_base_x) {
4166 __m256i c256, y_c256, base_y_c256, mask256, mul16;
4167 r6 = _mm256_set1_epi16(r << 6);
4168 c256 = _mm256_add_epi16(j256, c1234);
4169 mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
4170 _mm256_srli_epi16(min_base_y256, 1));
4171 y_c256 = _mm256_sub_epi16(r6, mul16);
4172
4173 base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
4174 mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
4175
4176 base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
4177 int16_t min_y = (int16_t)_mm_extract_epi16(
4178 _mm256_extracti128_si256(base_y_c256, 1), 7);
4179 int16_t max_y =
4180 (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
4181 int16_t offset_diff = max_y - min_y;
4182
4183 if (offset_diff < 16) {
4184 __m256i min_y256 = _mm256_set1_epi16(min_y);
4185
4186 __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
4187 __m128i base_y_offset128 =
4188 _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
4189 _mm256_extracti128_si256(base_y_offset, 1));
4190
4191 __m128i a0_y128 = _mm_maskload_epi32(
4192 (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
4193 __m128i a1_y128 =
4194 _mm_maskload_epi32((int *)(left + min_y + 1),
4195 *(__m128i *)LoadMaskz2[offset_diff / 4]);
4196 a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
4197 a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
4198 a0_y = _mm256_cvtepu8_epi16(a0_y128);
4199 a1_y = _mm256_cvtepu8_epi16(a1_y128);
4200 } else {
4201 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
4202 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4203
4204 a0_y = _mm256_setr_epi16(
4205 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4206 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4207 left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4208 left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4209 left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4210 left[base_y_c[15]]);
4211 base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
4212 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4213
4214 a1_y = _mm256_setr_epi16(
4215 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4216 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4217 left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4218 left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4219 left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4220 left[base_y_c[15]]);
4221 }
4222 shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
4223
4224 diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
4225 a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
4226 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
4227
4228 b = _mm256_mullo_epi16(diff, shifty);
4229 res = _mm256_add_epi16(a32, b);
4230 res = _mm256_srli_epi16(res, 5); // 16 16-bit values
4231 resy = _mm256_castsi256_si128(_mm256_packus_epi16(
4232 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4233 } else {
4234 resy = _mm_setzero_si128();
4235 }
4236 resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4237 _mm_storeu_si128((__m128i *)(dst + j), resxy);
4238 } // for j
4239 dst += stride;
4240 }
4241 }
4242
4243 // Directional prediction, zone 2: 90 < angle < 180
4244 void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
4245 const uint8_t *above, const uint8_t *left,
4246 int upsample_above, int upsample_left, int dx,
4247 int dy) {
4248 assert(dx > 0);
4249 assert(dy > 0);
4250 switch (bw) {
4251 case 4:
4252 dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
4253 upsample_left, dx, dy);
4254 break;
4255 case 8:
4256 dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
4257 upsample_left, dx, dy);
4258 break;
4259 default:
4260 dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
4261 upsample_above, upsample_left, dx, dy);
4262 break;
4263 }
4264 return;
4265 }
4266
4267 // z3 functions
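// Low bit depth zone 3: as in the high bit depth path, each block is produced
// by running the zone-1 kernel over left[] and transposing the result.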
4268 static inline void transpose16x32_avx2(__m256i *x, __m256i *d) {
4269 __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
4270 __m256i w10, w11, w12, w13, w14, w15;
4271
4272 w0 = _mm256_unpacklo_epi8(x[0], x[1]);
4273 w1 = _mm256_unpacklo_epi8(x[2], x[3]);
4274 w2 = _mm256_unpacklo_epi8(x[4], x[5]);
4275 w3 = _mm256_unpacklo_epi8(x[6], x[7]);
4276
4277 w8 = _mm256_unpacklo_epi8(x[8], x[9]);
4278 w9 = _mm256_unpacklo_epi8(x[10], x[11]);
4279 w10 = _mm256_unpacklo_epi8(x[12], x[13]);
4280 w11 = _mm256_unpacklo_epi8(x[14], x[15]);
4281
4282 w4 = _mm256_unpacklo_epi16(w0, w1);
4283 w5 = _mm256_unpacklo_epi16(w2, w3);
4284 w12 = _mm256_unpacklo_epi16(w8, w9);
4285 w13 = _mm256_unpacklo_epi16(w10, w11);
4286
4287 w6 = _mm256_unpacklo_epi32(w4, w5);
4288 w7 = _mm256_unpackhi_epi32(w4, w5);
4289 w14 = _mm256_unpacklo_epi32(w12, w13);
4290 w15 = _mm256_unpackhi_epi32(w12, w13);
4291
4292 // Store first 4-line result
4293 d[0] = _mm256_unpacklo_epi64(w6, w14);
4294 d[1] = _mm256_unpackhi_epi64(w6, w14);
4295 d[2] = _mm256_unpacklo_epi64(w7, w15);
4296 d[3] = _mm256_unpackhi_epi64(w7, w15);
4297
4298 w4 = _mm256_unpackhi_epi16(w0, w1);
4299 w5 = _mm256_unpackhi_epi16(w2, w3);
4300 w12 = _mm256_unpackhi_epi16(w8, w9);
4301 w13 = _mm256_unpackhi_epi16(w10, w11);
4302
4303 w6 = _mm256_unpacklo_epi32(w4, w5);
4304 w7 = _mm256_unpackhi_epi32(w4, w5);
4305 w14 = _mm256_unpacklo_epi32(w12, w13);
4306 w15 = _mm256_unpackhi_epi32(w12, w13);
4307
4308 // Store second 4-line result
4309 d[4] = _mm256_unpacklo_epi64(w6, w14);
4310 d[5] = _mm256_unpackhi_epi64(w6, w14);
4311 d[6] = _mm256_unpacklo_epi64(w7, w15);
4312 d[7] = _mm256_unpackhi_epi64(w7, w15);
4313
4314 // upper half
4315 w0 = _mm256_unpackhi_epi8(x[0], x[1]);
4316 w1 = _mm256_unpackhi_epi8(x[2], x[3]);
4317 w2 = _mm256_unpackhi_epi8(x[4], x[5]);
4318 w3 = _mm256_unpackhi_epi8(x[6], x[7]);
4319
4320 w8 = _mm256_unpackhi_epi8(x[8], x[9]);
4321 w9 = _mm256_unpackhi_epi8(x[10], x[11]);
4322 w10 = _mm256_unpackhi_epi8(x[12], x[13]);
4323 w11 = _mm256_unpackhi_epi8(x[14], x[15]);
4324
4325 w4 = _mm256_unpacklo_epi16(w0, w1);
4326 w5 = _mm256_unpacklo_epi16(w2, w3);
4327 w12 = _mm256_unpacklo_epi16(w8, w9);
4328 w13 = _mm256_unpacklo_epi16(w10, w11);
4329
4330 w6 = _mm256_unpacklo_epi32(w4, w5);
4331 w7 = _mm256_unpackhi_epi32(w4, w5);
4332 w14 = _mm256_unpacklo_epi32(w12, w13);
4333 w15 = _mm256_unpackhi_epi32(w12, w13);
4334
4335 // Store first 4-line result
4336 d[8] = _mm256_unpacklo_epi64(w6, w14);
4337 d[9] = _mm256_unpackhi_epi64(w6, w14);
4338 d[10] = _mm256_unpacklo_epi64(w7, w15);
4339 d[11] = _mm256_unpackhi_epi64(w7, w15);
4340
4341 w4 = _mm256_unpackhi_epi16(w0, w1);
4342 w5 = _mm256_unpackhi_epi16(w2, w3);
4343 w12 = _mm256_unpackhi_epi16(w8, w9);
4344 w13 = _mm256_unpackhi_epi16(w10, w11);
4345
4346 w6 = _mm256_unpacklo_epi32(w4, w5);
4347 w7 = _mm256_unpackhi_epi32(w4, w5);
4348 w14 = _mm256_unpacklo_epi32(w12, w13);
4349 w15 = _mm256_unpackhi_epi32(w12, w13);
4350
4351 // Store second 4-line result
4352 d[12] = _mm256_unpacklo_epi64(w6, w14);
4353 d[13] = _mm256_unpackhi_epi64(w6, w14);
4354 d[14] = _mm256_unpacklo_epi64(w7, w15);
4355 d[15] = _mm256_unpackhi_epi64(w7, w15);
4356 }
4357
4358 static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
4359 const uint8_t *left, int upsample_left,
4360 int dy) {
4361 __m128i dstvec[4], d[4];
4362
4363 dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
4364 transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4365 &d[0], &d[1], &d[2], &d[3]);
4366
4367 *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
4368 *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
4369 *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
4370 *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
4371 return;
4372 }
4373
dr_prediction_z3_8x8_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)4374 static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
4375 const uint8_t *left, int upsample_left,
4376 int dy) {
4377 __m128i dstvec[8], d[8];
4378
4379 dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
4380 transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
4381 &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
4382 &d[3]);
4383
4384 _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4385 _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
4386 _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
4387 _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
4388 _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
4389 _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
4390 _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
4391 _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
4392 }
4393
dr_prediction_z3_4x8_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)4394 static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
4395 const uint8_t *left, int upsample_left,
4396 int dy) {
4397 __m128i dstvec[4], d[8];
4398
4399 dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
4400 transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
4401 &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
4402 for (int i = 0; i < 8; i++) {
4403 *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
4404 }
4405 }
4406
dr_prediction_z3_8x4_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)4407 static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
4408 const uint8_t *left, int upsample_left,
4409 int dy) {
4410 __m128i dstvec[8], d[4];
4411
4412 dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
4413 transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4414 &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
4415 &d[1], &d[2], &d[3]);
4416 _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4417 _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
4418 _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
4419 _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
4420 }
4421
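// The 8x16 transpose below packs two 8-byte output rows per 128-bit register:
// row i comes from the low half of d[i] and row i + 8 from the high half,
// hence the _mm_srli_si128(d[i], 8) before the second store.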
static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[8], d[8];

  dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
  for (int i = 0; i < 8; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
                     _mm_srli_si128(d[i], 8));
  }
}

static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[16], d[16];

  dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);

  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[4], d[16];

  dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
  transpose4x16_sse2(dstvec, d);
  for (int i = 0; i < 16; i++) {
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
  }
}

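// 16x4: the z1 helper fills 16 vectors with only 4 valid bytes each, so the
// shared 16x8 transpose is reused and just the first 4 transposed rows are
// written out.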
static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[16], d[8];

  dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
  for (int i = 4; i < 8; i++) {
    d[i] = _mm_setzero_si128();
  }
  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);

  for (int i = 0; i < 4; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

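// 8x32: only 8 of the 16 inputs of the 16x32 transpose carry z1 data; the
// remaining registers are zeroed so the shared transpose can be reused
// without reading uninitialized values. Rows i and i + 16 of the output come
// from the two 128-bit lanes of d[i], and only their low 8 bytes are stored.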
static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m256i dstvec[16], d[16];

  dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
  for (int i = 8; i < 16; i++) {
    dstvec[i] = _mm256_setzero_si256();
  }
  transpose16x32_avx2(dstvec, d);

  for (int i = 0; i < 16; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride),
                     _mm256_castsi256_si128(d[i]));
  }
  for (int i = 0; i < 16; i++) {
    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
                     _mm256_extracti128_si256(d[i], 1));
  }
}

static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[32], d[16];

  dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);

  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);
  transpose16x8_8x16_sse2(
      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
      &d[6 + 8], &d[7 + 8]);

  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[16], d[16];

  dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
  transpose16x16_sse2(dstvec, d);

  for (int i = 0; i < 16; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m256i dstvec[32], d[32];

  dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
  transpose16x32_avx2(dstvec, d);
  transpose16x32_avx2(dstvec + 16, d + 16);
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * stride),
                     _mm256_castsi256_si128(d[j]));
    _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
                     _mm256_castsi256_si128(d[j + 16]));
  }
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
                     _mm256_extracti128_si256(d[j], 1));
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
                     _mm256_extracti128_si256(d[j + 16], 1));
  }
}

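// For the largest blocks the z1 prediction is written to an on-stack scratch
// buffer and then moved into dst with the generic byte transpose() helper,
// rather than keeping the whole block in registers.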
static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
  dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 64, 64);
}

static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m256i dstvec[16], d[16];

  dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
  transpose16x32_avx2(dstvec, d);
  // store
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * stride),
                     _mm256_castsi256_si128(d[j]));
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
                     _mm256_extracti128_si256(d[j], 1));
  }
}

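// 32x16 (and 64x16 below) are handled as a row of independent 16x16 tiles:
// each tile is transposed with transpose16x16_sse2 and stored at column
// offset i.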
static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[32], d[16];

  dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 32; i += 16) {
    transpose16x16_sse2((dstvec + i), d);
    for (int j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    }
  }
}

static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8_t dstT[64 * 32];
  dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 32, 64);
}

static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8_t dstT[32 * 64];
  dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
  transpose(dstT, 32, dst, stride, 64, 32);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8_t dstT[64 * 16];
  dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 16, 64);
}

static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[64], d[16];

  dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 64; i += 16) {
    transpose16x16_sse2((dstvec + i), d);
    for (int j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    }
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

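// Dispatch on block shape: square sizes, 2:1 rectangles (bw + bw == bh or
// bh + bh == bw) and, when the extended partitions are compiled in
// (!CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER), the remaining 4:1
// rectangles. `above` and `dx` are unused for z3; dx is asserted to be 1.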
void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_left, int dx, int dy) {
  (void)above;
  (void)dx;
  assert(dx == 1);
  assert(dy > 0);

  if (bw == bh) {
    switch (bw) {
      case 4:
        dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 8:
        dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 16:
        dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 32:
        dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 64:
        dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
        break;
    }
  } else {
    if (bw < bh) {
      if (bw + bw == bh) {
        switch (bw) {
          case 4:
            dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
            break;
        }
      } else {
        switch (bw) {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
          case 4:
            dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
            break;
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
        }
      }
    } else {
      if (bh + bh == bw) {
        switch (bh) {
          case 4:
            dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
            break;
        }
      } else {
        switch (bh) {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
          case 4:
            dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
            break;
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
        }
      }
    }
  }
}
