1 /*
2 * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <emmintrin.h> // SSE2
13 #include <smmintrin.h> /* SSE4.1 */
14
15 #include "config/av1_rtcd.h"
16 #include "aom_dsp/x86/intrapred_x86.h"
17 #include "aom_dsp/x86/intrapred_utils.h"
18 #include "aom_dsp/x86/lpf_common_sse2.h"
19
20 // Low bit depth functions
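// Mask[0][n] has 0xff in its first min(n, 16) bytes, Mask[1][n] in its first
// max(n - 16, 0) bytes. The tables are used with _mm_blendv_epi8() to keep
// the first n computed pixels of a row and replace the remainder with the
// replicated last 'above' sample.
/* clang-format off */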
21 static DECLARE_ALIGNED(16, uint8_t, Mask[2][33][16]) = {
22 { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
23 { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
24 { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
25 { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
26 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
27 { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
28 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
29 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
30 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
31 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
32 0 },
33 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
34 0 },
35 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
36 0, 0 },
37 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
38 0, 0, 0 },
39 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
40 0xff, 0, 0, 0 },
41 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
42 0xff, 0xff, 0, 0 },
43 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
44 0xff, 0xff, 0xff, 0 },
45 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
46 0xff, 0xff, 0xff, 0xff },
47 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
48 0xff, 0xff, 0xff, 0xff },
49 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
50 0xff, 0xff, 0xff, 0xff },
51 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
52 0xff, 0xff, 0xff, 0xff },
53 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
54 0xff, 0xff, 0xff, 0xff },
55 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
56 0xff, 0xff, 0xff, 0xff },
57 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
58 0xff, 0xff, 0xff, 0xff },
59 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
60 0xff, 0xff, 0xff, 0xff },
61 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
62 0xff, 0xff, 0xff, 0xff },
63 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
64 0xff, 0xff, 0xff, 0xff },
65 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
66 0xff, 0xff, 0xff, 0xff },
67 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68 0xff, 0xff, 0xff, 0xff },
69 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70 0xff, 0xff, 0xff, 0xff },
71 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72 0xff, 0xff, 0xff, 0xff },
73 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74 0xff, 0xff, 0xff, 0xff },
75 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
76 0xff, 0xff, 0xff, 0xff },
77 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78 0xff, 0xff, 0xff, 0xff } },
79 {
80 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
81 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
82 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
83 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
84 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
85 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
86 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
87 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
88 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
89 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
90 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
91 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
92 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
93 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
94 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
95 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
96 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
97 { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
98 { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
99 { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
100 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
101 { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
102 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
103 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
104 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
105 0 },
106 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
107 0 },
108 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
109 0, 0 },
110 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
111 0, 0, 0 },
112 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
113 0, 0, 0, 0 },
114 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
115 0xff, 0, 0, 0 },
116 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
117 0xff, 0xff, 0, 0 },
118 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
119 0xff, 0xff, 0xff, 0 },
120 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
121 0xff, 0xff, 0xff, 0xff },
122 },
123 };
124
125 /* clang-format on */
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1(
127 int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
128 int dx) {
129 const int frac_bits = 6 - upsample_above;
130 const int max_base_x = ((W + H) - 1) << upsample_above;
131
132 assert(dx > 0);
133 // pre-filter above pixels
134 // store in temp buffers:
135 // above[x] * 32 + 16
136 // above[x+1] - above[x]
137 // final pixels will be calculated as:
138 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
139 __m128i a0, a1, a32, a16;
140 __m128i diff, c3f;
141 __m128i a_mbase_x;
142
143 a16 = _mm_set1_epi16(16);
144 a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
145 c3f = _mm_set1_epi16(0x3f);
146
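  // Each iteration of the loop below produces one row of (up to) 16 predicted
  // pixels; once the sampling position passes max_base_x, every remaining row
  // is filled with the replicated last 'above' sample.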
147 int x = dx;
148 for (int r = 0; r < W; r++) {
149 __m128i b, res, res1, shift;
150 __m128i a0_above, a1_above;
151
152 int base = x >> frac_bits;
153 int base_max_diff = (max_base_x - base) >> upsample_above;
154 if (base_max_diff <= 0) {
155 for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;  // fill remaining rows with the last sample
157 }
158 return;
159 }
160 if (base_max_diff > H) base_max_diff = H;
161 a0_above = _mm_loadu_si128((__m128i *)(above + base));
162 a1_above = _mm_loadu_si128((__m128i *)(above + base + 1));
163
164 if (upsample_above) {
165 a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[0]);
166 a1_above = _mm_srli_si128(a0_above, 8);
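      // The even/odd shuffle packs the samples at even offsets from 'base'
      // into the low 8 bytes and those at odd offsets into the high 8 bytes,
      // so a1_above (the +1 neighbours) is simply the high half of a0_above.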
167
168 shift = _mm_srli_epi16(
169 _mm_and_si128(_mm_slli_epi16(_mm_set1_epi16(x), upsample_above), c3f),
170 1);
171 } else {
172 shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
173 }
174 // lower half
175 a0 = _mm_cvtepu8_epi16(a0_above);
176 a1 = _mm_cvtepu8_epi16(a1_above);
177
178 diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
179 a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
180 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
181
182 b = _mm_mullo_epi16(diff, shift);
183 res = _mm_add_epi16(a32, b);
184 res = _mm_srli_epi16(res, 5);
185
    // upper half
187 a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
188 a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
189
190 diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
191 a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
192 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
193
194 b = _mm_mullo_epi16(diff, shift);
195 res1 = _mm_add_epi16(a32, b);
196 res1 = _mm_srli_epi16(res1, 5);
197
198 res = _mm_packus_epi16(res, res1);
199
200 dst[r] =
201 _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]);
202 x += dx;
203 }
204 }
205
static void dr_prediction_z1_4xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
207 const uint8_t *above,
208 int upsample_above, int dx) {
209 __m128i dstvec[16];
210
211 dr_prediction_z1_HxW_internal_sse4_1(4, N, dstvec, above, upsample_above, dx);
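  // Each dstvec[] entry holds a full 16-byte result, but only its low 4 bytes
  // are meaningful for a 4-wide block, so a 32-bit store per row suffices.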
212 for (int i = 0; i < N; i++) {
213 *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
214 }
215 }
216
static void dr_prediction_z1_8xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
218 const uint8_t *above,
219 int upsample_above, int dx) {
220 __m128i dstvec[32];
221
222 dr_prediction_z1_HxW_internal_sse4_1(8, N, dstvec, above, upsample_above, dx);
223 for (int i = 0; i < N; i++) {
224 _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
225 }
226 }
227
static void dr_prediction_z1_16xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
229 const uint8_t *above,
230 int upsample_above, int dx) {
231 __m128i dstvec[64];
232
233 dr_prediction_z1_HxW_internal_sse4_1(16, N, dstvec, above, upsample_above,
234 dx);
235 for (int i = 0; i < N; i++) {
236 _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
237 }
238 }
239
static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1(
241 int N, __m128i *dstvec, __m128i *dstvec_h, const uint8_t *above,
242 int upsample_above, int dx) {
243 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
244 (void)upsample_above;
245 const int frac_bits = 6;
246 const int max_base_x = ((32 + N) - 1);
247
248 // pre-filter above pixels
249 // store in temp buffers:
250 // above[x] * 32 + 16
251 // above[x+1] - above[x]
252 // final pixels will be calculated as:
253 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
254 __m128i a0, a1, a32, a16;
255 __m128i a_mbase_x, diff, c3f;
256
257 a16 = _mm_set1_epi16(16);
258 a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
259 c3f = _mm_set1_epi16(0x3f);
260
261 int x = dx;
262 for (int r = 0; r < N; r++) {
263 __m128i b, res, res1, res16[2];
264 __m128i a0_above, a1_above;
265
266 int base = x >> frac_bits;
267 int base_max_diff = (max_base_x - base);
268 if (base_max_diff <= 0) {
269 for (int i = r; i < N; ++i) {
270 dstvec[i] = a_mbase_x; // save 32 values
271 dstvec_h[i] = a_mbase_x;
272 }
273 return;
274 }
275 if (base_max_diff > 32) base_max_diff = 32;
276 __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
277
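    // Build the 32-pixel row in two 16-byte halves; a half that starts beyond
    // max_base_x is filled with the replicated last sample, otherwise it is
    // interpolated and blended against that sample below.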
278 for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
279 int mdiff = base_max_diff - j;
280 if (mdiff <= 0) {
281 res16[jj] = a_mbase_x;
282 } else {
283 a0_above = _mm_loadu_si128((__m128i *)(above + base + j));
284 a1_above = _mm_loadu_si128((__m128i *)(above + base + j + 1));
285
286 // lower half
287 a0 = _mm_cvtepu8_epi16(a0_above);
288 a1 = _mm_cvtepu8_epi16(a1_above);
289
290 diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
291 a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
292 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
293 b = _mm_mullo_epi16(diff, shift);
294
295 res = _mm_add_epi16(a32, b);
296 res = _mm_srli_epi16(res, 5);
297
        // upper half
299 a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
300 a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
301
302 diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
303 a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
304 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
305
306 b = _mm_mullo_epi16(diff, shift);
307 res1 = _mm_add_epi16(a32, b);
308 res1 = _mm_srli_epi16(res1, 5);
309
310 res16[jj] = _mm_packus_epi16(res, res1); // 16 8bit values
311 }
312 }
313
314 dstvec[r] =
315 _mm_blendv_epi8(a_mbase_x, res16[0],
316 *(__m128i *)Mask[0][base_max_diff]); // 16 8bit values
317
318 dstvec_h[r] =
319 _mm_blendv_epi8(a_mbase_x, res16[1],
320 *(__m128i *)Mask[1][base_max_diff]); // 16 8bit values
321 x += dx;
322 }
323 }
324
static void dr_prediction_z1_32xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
326 const uint8_t *above,
327 int upsample_above, int dx) {
328 __m128i dstvec[64], dstvec_h[64];
329 dr_prediction_z1_32xN_internal_sse4_1(N, dstvec, dstvec_h, above,
330 upsample_above, dx);
331 for (int i = 0; i < N; i++) {
332 _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
333 _mm_storeu_si128((__m128i *)(dst + stride * i + 16), dstvec_h[i]);
334 }
335 }
336
static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
338 const uint8_t *above,
339 int upsample_above, int dx) {
340 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
341 (void)upsample_above;
342 const int frac_bits = 6;
343 const int max_base_x = ((64 + N) - 1);
344
345 // pre-filter above pixels
346 // store in temp buffers:
347 // above[x] * 32 + 16
348 // above[x+1] - above[x]
349 // final pixels will be calculated as:
350 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
351 __m128i a0, a1, a32, a16;
352 __m128i a_mbase_x, diff, c3f;
353 __m128i max_base, base_inc, mask;
354
355 a16 = _mm_set1_epi16(16);
356 a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
357 max_base = _mm_set1_epi8(max_base_x);
358 c3f = _mm_set1_epi16(0x3f);
359
360 int x = dx;
361 for (int r = 0; r < N; r++, dst += stride) {
362 __m128i b, res, res1;
363 int base = x >> frac_bits;
364 if (base >= max_base_x) {
365 for (int i = r; i < N; ++i) {
        _mm_storeu_si128((__m128i *)dst, a_mbase_x);  // fill the 64-byte row
367 _mm_storeu_si128((__m128i *)(dst + 16), a_mbase_x);
368 _mm_storeu_si128((__m128i *)(dst + 32), a_mbase_x);
369 _mm_storeu_si128((__m128i *)(dst + 48), a_mbase_x);
370 dst += stride;
371 }
372 return;
373 }
374
375 __m128i shift =
376 _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); // 8 element
377
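    // Each row is written in four 16-byte chunks. A chunk lying entirely
    // beyond max_base_x stores the replicated last sample; partial chunks are
    // blended with a per-byte (base_inc < max_base_x) mask.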
378 __m128i a0_above, a1_above, res_val;
379 for (int j = 0; j < 64; j += 16) {
380 int mdif = max_base_x - (base + j);
381 if (mdif <= 0) {
382 _mm_storeu_si128((__m128i *)(dst + j), a_mbase_x);
383 } else {
384 a0_above =
385 _mm_loadu_si128((__m128i *)(above + base + j)); // load 16 element
386 a1_above = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
387
388 // lower half
389 a0 = _mm_cvtepu8_epi16(a0_above);
390 a1 = _mm_cvtepu8_epi16(a1_above);
391
392 diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
393 a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
394 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
395 b = _mm_mullo_epi16(diff, shift);
396
397 res = _mm_add_epi16(a32, b);
398 res = _mm_srli_epi16(res, 5);
399
        // upper half
401 a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
402 a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
403
404 diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
405 a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
406 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
407
408 b = _mm_mullo_epi16(diff, shift);
409 res1 = _mm_add_epi16(a32, b);
410 res1 = _mm_srli_epi16(res1, 5);
411
412 res = _mm_packus_epi16(res, res1); // 16 8bit values
413
414 base_inc =
415 _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
416 (int8_t)(base + j + 2), (int8_t)(base + j + 3),
417 (int8_t)(base + j + 4), (int8_t)(base + j + 5),
418 (int8_t)(base + j + 6), (int8_t)(base + j + 7),
419 (int8_t)(base + j + 8), (int8_t)(base + j + 9),
420 (int8_t)(base + j + 10), (int8_t)(base + j + 11),
421 (int8_t)(base + j + 12), (int8_t)(base + j + 13),
422 (int8_t)(base + j + 14), (int8_t)(base + j + 15));
423
424 mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc),
425 _mm_setzero_si128());
426 res_val = _mm_blendv_epi8(a_mbase_x, res, mask);
427 _mm_storeu_si128((__m128i *)(dst + j), res_val);
428 }
429 }
430 x += dx;
431 }
432 }
433
434 // Directional prediction, zone 1: 0 < angle < 90
void av1_dr_prediction_z1_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
436 const uint8_t *above, const uint8_t *left,
437 int upsample_above, int dx, int dy) {
438 (void)left;
439 (void)dy;
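  // The last above[] sample the z1 kernels use is at index
  // ((bw + bh) - 1) << upsample_above; predictions past it replicate that
  // sample.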
440 switch (bw) {
441 case 4:
442 dr_prediction_z1_4xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
443 break;
444 case 8:
445 dr_prediction_z1_8xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
446 break;
447 case 16:
448 dr_prediction_z1_16xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
449 break;
450 case 32:
451 dr_prediction_z1_32xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
452 break;
453 case 64:
454 dr_prediction_z1_64xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
455 break;
456 default: assert(0 && "Invalid block size");
457 }
458 return;
459 }
460
static void dr_prediction_z2_Nx4_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
462 const uint8_t *above,
463 const uint8_t *left, int upsample_above,
464 int upsample_left, int dx, int dy) {
465 const int min_base_x = -(1 << upsample_above);
466 const int min_base_y = -(1 << upsample_left);
467 const int frac_bits_x = 6 - upsample_above;
468 const int frac_bits_y = 6 - upsample_left;
469
470 assert(dx > 0);
471 // pre-filter above pixels
472 // store in temp buffers:
473 // above[x] * 32 + 16
474 // above[x+1] - above[x]
475 // final pixels will be calculated as:
476 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
477 __m128i a0_x, a1_x, a32, diff;
478
479 const __m128i c3f = _mm_set1_epi16(0x3f);
480 const __m128i min_y_base = _mm_set1_epi16(min_base_y);
481 const __m128i c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
482 const __m128i dy_reg = _mm_set1_epi16(dy);
483 const __m128i a16 = _mm_set1_epi16(16);
484
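  // For each row, up to 4 pixels come from 'above' (the x part). Once base_x
  // falls below min_base_x the corresponding pixels are taken from 'left'
  // (the y part); both parts are packed into one register so a single
  // interpolation covers them, and Mask[0][base_min_diff] picks the bytes
  // that must come from the left edge.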
485 for (int r = 0; r < N; r++) {
486 __m128i b, res, shift, r6, ydx;
487 __m128i resx, resy, resxy;
488 __m128i a0_above, a1_above;
489 int y = r + 1;
490 int base_x = (-y * dx) >> frac_bits_x;
491 int base_shift = 0;
492 if (base_x < (min_base_x - 1)) {
493 base_shift = (min_base_x - base_x - 1) >> upsample_above;
494 }
495 int base_min_diff =
496 (min_base_x - base_x + upsample_above) >> upsample_above;
497 if (base_min_diff > 4) {
498 base_min_diff = 4;
499 } else {
500 if (base_min_diff < 0) base_min_diff = 0;
501 }
502
503 if (base_shift > 3) {
504 a0_x = _mm_setzero_si128();
505 a1_x = _mm_setzero_si128();
506 shift = _mm_setzero_si128();
507 } else {
508 a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
509 ydx = _mm_set1_epi16(y * dx);
510 r6 = _mm_slli_epi16(c1234, 6);
511
512 if (upsample_above) {
513 a0_above =
514 _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
515 a1_above = _mm_srli_si128(a0_above, 8);
516
517 shift = _mm_srli_epi16(
518 _mm_and_si128(
519 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
520 1);
521 } else {
522 a0_above =
523 _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
524 a1_above = _mm_srli_si128(a0_above, 1);
525
526 shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
527 }
528 a0_x = _mm_cvtepu8_epi16(a0_above);
529 a1_x = _mm_cvtepu8_epi16(a1_above);
530 }
531 // y calc
532 __m128i a0_y, a1_y, shifty;
533 if (base_x < min_base_x) {
534 DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
535 __m128i y_c, base_y_c_reg, mask, c1234_;
536 c1234_ = _mm_srli_si128(c1234, 2);
537 r6 = _mm_set1_epi16(r << 6);
538 y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy_reg));
539 base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
540 mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
541 base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
542 _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
543
544 a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
545 left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
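      // a16 >> 4 is the constant 1: advance the indices to base_y + 1 to
      // fetch the second interpolation tap.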
546 base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
547 _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
548 a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
549 left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
550
551 if (upsample_left) {
552 shifty = _mm_srli_epi16(
553 _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
554 } else {
555 shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
556 }
557 a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
558 a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
559 shift = _mm_unpacklo_epi64(shift, shifty);
560 }
561
562 diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
563 a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
564 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
565
566 b = _mm_mullo_epi16(diff, shift);
567 res = _mm_add_epi16(a32, b);
568 res = _mm_srli_epi16(res, 5);
569
570 resx = _mm_packus_epi16(res, res);
571 resy = _mm_srli_si128(resx, 4);
572
573 resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
574 *(int *)(dst) = _mm_cvtsi128_si32(resxy);
575 dst += stride;
576 }
577 }
578
static void dr_prediction_z2_Nx8_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
580 const uint8_t *above,
581 const uint8_t *left, int upsample_above,
582 int upsample_left, int dx, int dy) {
583 const int min_base_x = -(1 << upsample_above);
584 const int min_base_y = -(1 << upsample_left);
585 const int frac_bits_x = 6 - upsample_above;
586 const int frac_bits_y = 6 - upsample_left;
587
588 // pre-filter above pixels
589 // store in temp buffers:
590 // above[x] * 32 + 16
591 // above[x+1] - above[x]
592 // final pixels will be calculated as:
593 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
594 __m128i diff, a32;
595 __m128i a0_x, a1_x, a0_y, a1_y;
596 __m128i a0_above, a1_above;
597
598 const __m128i a16 = _mm_set1_epi16(16);
599 const __m128i c3f = _mm_set1_epi16(0x3f);
600 const __m128i min_y_base = _mm_set1_epi16(min_base_y);
601 const __m128i dy_reg = _mm_set1_epi16(dy);
602 const __m128i c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
603
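  // Same scheme as the Nx4 case, but with 8 pixels per row: the x part is
  // interpolated from 'above' and, for rows crossing the left edge, an 8-lane
  // y part is interpolated from 'left' and blended in with
  // Mask[0][base_min_diff].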
604 for (int r = 0; r < N; r++) {
605 __m128i b, res, res1, shift;
606 __m128i resx, resy, resxy, r6, ydx;
607
608 int y = r + 1;
609 int base_x = (-y * dx) >> frac_bits_x;
610 int base_shift = 0;
611 if (base_x < (min_base_x - 1)) {
612 base_shift = (min_base_x - base_x - 1) >> upsample_above;
613 }
614 int base_min_diff =
615 (min_base_x - base_x + upsample_above) >> upsample_above;
616 if (base_min_diff > 8) {
617 base_min_diff = 8;
618 } else {
619 if (base_min_diff < 0) base_min_diff = 0;
620 }
621
622 if (base_shift > 7) {
623 resx = _mm_setzero_si128();
624 } else {
625 a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
626 ydx = _mm_set1_epi16(y * dx);
627 r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
628 if (upsample_above) {
629 a0_above =
630 _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
631 a1_above = _mm_srli_si128(a0_above, 8);
632
633 shift = _mm_srli_epi16(
634 _mm_and_si128(
635 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
636 1);
637 } else {
638 a1_above = _mm_srli_si128(a0_above, 1);
639 a0_above =
640 _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
641 a1_above =
642 _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);
643
644 shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
645 }
646 a0_x = _mm_cvtepu8_epi16(a0_above);
647 a1_x = _mm_cvtepu8_epi16(a1_above);
648
649 diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
650 a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
651 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
652
653 b = _mm_mullo_epi16(diff, shift);
654 res = _mm_add_epi16(a32, b);
655 res = _mm_srli_epi16(res, 5);
656 resx = _mm_packus_epi16(res, res);
657 }
658
659 // y calc
660 if (base_x < min_base_x) {
661 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
662 __m128i y_c, base_y_c_reg, mask;
663 r6 = _mm_set1_epi16(r << 6);
664 y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy_reg));
665 base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
666 mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
667 base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
668 _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
669
670 a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
671 left[base_y_c[2]], left[base_y_c[3]],
672 left[base_y_c[4]], left[base_y_c[5]],
673 left[base_y_c[6]], left[base_y_c[7]]);
674 base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
675 _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
676
677 a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
678 left[base_y_c[2]], left[base_y_c[3]],
679 left[base_y_c[4]], left[base_y_c[5]],
680 left[base_y_c[6]], left[base_y_c[7]]);
681
682 if (upsample_left) {
683 shift = _mm_srli_epi16(
684 _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
685 } else {
686 shift = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
687 }
688
689 diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
690 a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32
691 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
692
693 b = _mm_mullo_epi16(diff, shift);
694 res1 = _mm_add_epi16(a32, b);
695 res1 = _mm_srli_epi16(res1, 5);
696
697 resy = _mm_packus_epi16(res1, res1);
698 resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
699 _mm_storel_epi64((__m128i *)dst, resxy);
700 } else {
701 _mm_storel_epi64((__m128i *)dst, resx);
702 }
703
704 dst += stride;
705 }
706 }
707
static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst,
709 ptrdiff_t stride, const uint8_t *above,
710 const uint8_t *left, int upsample_above,
711 int upsample_left, int dx, int dy) {
712 // here upsample_above and upsample_left are 0 by design of
713 // av1_use_intra_edge_upsample
714 const int min_base_x = -1;
715 const int min_base_y = -1;
716 (void)upsample_above;
717 (void)upsample_left;
718 const int frac_bits_x = 6;
719 const int frac_bits_y = 6;
720
721 __m128i a0_x, a1_x, a0_y, a1_y, a0_y_h, a1_y_h, a32;
722 __m128i diff, shifty, shifty_h;
723 __m128i a0_above, a1_above;
724
725 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
726 const __m128i a16 = _mm_set1_epi16(16);
727 const __m128i c1 = _mm_srli_epi16(a16, 4);
728 const __m128i min_y_base = _mm_set1_epi16(min_base_y);
729 const __m128i c3f = _mm_set1_epi16(0x3f);
730 const __m128i dy256 = _mm_set1_epi16(dy);
731 const __m128i c0123 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
732 const __m128i c0123_h = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
733 const __m128i c1234 = _mm_add_epi16(c0123, c1);
734 const __m128i c1234_h = _mm_add_epi16(c0123_h, c1);
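  // c0123/c0123_h are the column offsets of the low/high 8 lanes of each
  // 16-pixel chunk, c1 is the constant 1 (a16 >> 4), and c1234/c1234_h are
  // those offsets plus one, used when stepping along the left edge.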
735
736 for (int r = 0; r < H; r++) {
737 __m128i b, res, res1, shift, reg_j, r6, ydx;
738 __m128i resx, resy;
739 __m128i resxy;
740 int y = r + 1;
741 ydx = _mm_set1_epi16((int16_t)(y * dx));
742
743 int base_x = (-y * dx) >> frac_bits_x;
744 for (int j = 0; j < W; j += 16) {
745 reg_j = _mm_set1_epi16(j);
746 int base_shift = 0;
747 if ((base_x + j) < (min_base_x - 1)) {
748 base_shift = (min_base_x - (base_x + j) - 1);
749 }
750 int base_min_diff = (min_base_x - base_x - j);
751 if (base_min_diff > 16) {
752 base_min_diff = 16;
753 } else {
754 if (base_min_diff < 0) base_min_diff = 0;
755 }
756
757 if (base_shift < 16) {
758 a0_above =
759 _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
760 a1_above =
761 _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
762 a0_above =
763 _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
764 a1_above =
765 _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);
766
767 a0_x = _mm_cvtepu8_epi16(a0_above);
768 a1_x = _mm_cvtepu8_epi16(a1_above);
769
770 r6 = _mm_slli_epi16(_mm_add_epi16(c0123, reg_j), 6);
771 shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
772
773 diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
774 a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
775 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
776
777 b = _mm_mullo_epi16(diff, shift);
778 res = _mm_add_epi16(a32, b);
779 res = _mm_srli_epi16(res, 5); // 16 16-bit values
780
781 a0_x = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
782 a1_x = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
783
784 r6 = _mm_slli_epi16(_mm_add_epi16(c0123_h, reg_j), 6);
785 shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
786
787 diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
788 a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
789 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
790
791 b = _mm_mullo_epi16(diff, shift);
792 res1 = _mm_add_epi16(a32, b);
793 res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values
794
795 resx = _mm_packus_epi16(res, res1);
796 } else {
797 resx = _mm_setzero_si128();
798 }
799
800 // y calc
801 if (base_x < min_base_x) {
802 __m128i c_reg, c_reg_h, y_reg, y_reg_h, base_y, base_y_h;
803 __m128i mask, mask_h, mul16, mul16_h;
804 r6 = _mm_set1_epi16(r << 6);
805 c_reg = _mm_add_epi16(reg_j, c1234);
806 c_reg_h = _mm_add_epi16(reg_j, c1234_h);
807 mul16 = _mm_min_epu16(_mm_mullo_epi16(c_reg, dy256),
808 _mm_srli_epi16(min_y_base, 1));
809 mul16_h = _mm_min_epu16(_mm_mullo_epi16(c_reg_h, dy256),
810 _mm_srli_epi16(min_y_base, 1));
811 y_reg = _mm_sub_epi16(r6, mul16);
812 y_reg_h = _mm_sub_epi16(r6, mul16_h);
813
814 base_y = _mm_srai_epi16(y_reg, frac_bits_y);
815 base_y_h = _mm_srai_epi16(y_reg_h, frac_bits_y);
816 mask = _mm_cmpgt_epi16(min_y_base, base_y);
817 mask_h = _mm_cmpgt_epi16(min_y_base, base_y_h);
818
819 base_y = _mm_blendv_epi8(base_y, min_y_base, mask);
820 base_y_h = _mm_blendv_epi8(base_y_h, min_y_base, mask_h);
821 int16_t min_y = (int16_t)_mm_extract_epi16(base_y_h, 7);
822 int16_t max_y = (int16_t)_mm_extract_epi16(base_y, 0);
823 int16_t offset_diff = max_y - min_y;
824
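        // If the 16 base_y indices span fewer than 16 distinct values, load
        // the required left[] samples once and gather them with pshufb;
        // otherwise fall back to per-lane scalar loads from left[].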
825 if (offset_diff < 16) {
826 __m128i min_y_reg = _mm_set1_epi16(min_y);
827
828 __m128i base_y_offset = _mm_sub_epi16(base_y, min_y_reg);
829 __m128i base_y_offset_h = _mm_sub_epi16(base_y_h, min_y_reg);
830 __m128i y_offset = _mm_packs_epi16(base_y_offset, base_y_offset_h);
831
832 __m128i a0_mask = _mm_loadu_si128((__m128i *)(left + min_y));
833 __m128i a1_mask = _mm_loadu_si128((__m128i *)(left + min_y + 1));
834 __m128i LoadMask =
835 _mm_loadu_si128((__m128i *)(LoadMaskz2[offset_diff / 4]));
836
837 a0_mask = _mm_and_si128(a0_mask, LoadMask);
838 a1_mask = _mm_and_si128(a1_mask, LoadMask);
839
840 a0_mask = _mm_shuffle_epi8(a0_mask, y_offset);
841 a1_mask = _mm_shuffle_epi8(a1_mask, y_offset);
842 a0_y = _mm_cvtepu8_epi16(a0_mask);
843 a1_y = _mm_cvtepu8_epi16(a1_mask);
844 a0_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a0_mask, 8));
845 a1_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a1_mask, 8));
846 } else {
847 base_y = _mm_andnot_si128(mask, base_y);
848 base_y_h = _mm_andnot_si128(mask_h, base_y_h);
849 _mm_store_si128((__m128i *)base_y_c, base_y);
850 _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);
851
852 a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
853 left[base_y_c[2]], left[base_y_c[3]],
854 left[base_y_c[4]], left[base_y_c[5]],
855 left[base_y_c[6]], left[base_y_c[7]]);
856 a0_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
857 left[base_y_c[10]], left[base_y_c[11]],
858 left[base_y_c[12]], left[base_y_c[13]],
859 left[base_y_c[14]], left[base_y_c[15]]);
860 base_y = _mm_add_epi16(base_y, c1);
861 base_y_h = _mm_add_epi16(base_y_h, c1);
862 _mm_store_si128((__m128i *)base_y_c, base_y);
863 _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);
864
865 a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
866 left[base_y_c[2]], left[base_y_c[3]],
867 left[base_y_c[4]], left[base_y_c[5]],
868 left[base_y_c[6]], left[base_y_c[7]]);
869 a1_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
870 left[base_y_c[10]], left[base_y_c[11]],
871 left[base_y_c[12]], left[base_y_c[13]],
872 left[base_y_c[14]], left[base_y_c[15]]);
873 }
874 shifty = _mm_srli_epi16(_mm_and_si128(y_reg, c3f), 1);
875 shifty_h = _mm_srli_epi16(_mm_and_si128(y_reg_h, c3f), 1);
876
877 diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
878 a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32
879 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
880
881 b = _mm_mullo_epi16(diff, shifty);
882 res = _mm_add_epi16(a32, b);
883 res = _mm_srli_epi16(res, 5); // 16 16-bit values
884
885 diff = _mm_sub_epi16(a1_y_h, a0_y_h); // a[x+1] - a[x]
886 a32 = _mm_slli_epi16(a0_y_h, 5); // a[x] * 32
887 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
888
889 b = _mm_mullo_epi16(diff, shifty_h);
890 res1 = _mm_add_epi16(a32, b);
891 res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values
892 resy = _mm_packus_epi16(res, res1);
893 } else {
894 resy = _mm_setzero_si128();
895 }
896 resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
897 _mm_storeu_si128((__m128i *)(dst + j), resxy);
898 } // for j
899 dst += stride;
900 }
901 }
902
903 // Directional prediction, zone 2: 90 < angle < 180
void av1_dr_prediction_z2_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
905 const uint8_t *above, const uint8_t *left,
906 int upsample_above, int upsample_left, int dx,
907 int dy) {
908 assert(dx > 0);
909 assert(dy > 0);
910 switch (bw) {
911 case 4:
912 dr_prediction_z2_Nx4_sse4_1(bh, dst, stride, above, left, upsample_above,
913 upsample_left, dx, dy);
914 break;
915 case 8:
916 dr_prediction_z2_Nx8_sse4_1(bh, dst, stride, above, left, upsample_above,
917 upsample_left, dx, dy);
918 break;
919 default:
920 dr_prediction_z2_HxW_sse4_1(bh, bw, dst, stride, above, left,
921 upsample_above, upsample_left, dx, dy);
922 }
923 return;
924 }
925
926 // z3 functions
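// Directional prediction, zone 3: 180 < angle < 270. Each block is predicted
// by running the matching z1 kernel along the 'left' edge and transposing the
// result into the destination.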
static void dr_prediction_z3_4x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
928 const uint8_t *left, int upsample_left,
929 int dy) {
930 __m128i dstvec[4], d[4];
931
932 dr_prediction_z1_HxW_internal_sse4_1(4, 4, dstvec, left, upsample_left, dy);
933 transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
934 &d[0], &d[1], &d[2], &d[3]);
935
936 *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
937 *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
938 *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
939 *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
940 return;
941 }
942
static void dr_prediction_z3_8x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
944 const uint8_t *left, int upsample_left,
945 int dy) {
946 __m128i dstvec[8], d[8];
947
948 dr_prediction_z1_HxW_internal_sse4_1(8, 8, dstvec, left, upsample_left, dy);
949 transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
950 &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
951 &d[3]);
952
953 _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
954 _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
955 _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
956 _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
957 _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
958 _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
959 _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
960 _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
961 }
962
static void dr_prediction_z3_4x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
964 const uint8_t *left, int upsample_left,
965 int dy) {
966 __m128i dstvec[4], d[8];
967
968 dr_prediction_z1_HxW_internal_sse4_1(8, 4, dstvec, left, upsample_left, dy);
969 transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
970 &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
971 for (int i = 0; i < 8; i++) {
972 *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
973 }
974 }
975
static void dr_prediction_z3_8x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
977 const uint8_t *left, int upsample_left,
978 int dy) {
979 __m128i dstvec[8], d[4];
980
981 dr_prediction_z1_HxW_internal_sse4_1(4, 8, dstvec, left, upsample_left, dy);
982 transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
983 &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
984 &d[1], &d[2], &d[3]);
985 _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
986 _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
987 _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
988 _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
989 }
990
static void dr_prediction_z3_8x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
992 const uint8_t *left, int upsample_left,
993 int dy) {
994 __m128i dstvec[8], d[8];
995
996 dr_prediction_z1_HxW_internal_sse4_1(16, 8, dstvec, left, upsample_left, dy);
997 transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
998 dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
999 d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
1000 for (int i = 0; i < 8; i++) {
1001 _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
1002 _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
1003 _mm_srli_si128(d[i], 8));
1004 }
1005 }
1006
static void dr_prediction_z3_16x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
1008 const uint8_t *left, int upsample_left,
1009 int dy) {
1010 __m128i dstvec[16], d[16];
1011
1012 dr_prediction_z1_HxW_internal_sse4_1(8, 16, dstvec, left, upsample_left, dy);
1013 transpose16x8_8x16_sse2(
1014 &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
1015 &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
1016 &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
1017 &d[3], &d[4], &d[5], &d[6], &d[7]);
1018
1019 for (int i = 0; i < 8; i++) {
1020 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
1021 }
1022 }
1023
1024 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static void dr_prediction_z3_4x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
1026 const uint8_t *left, int upsample_left,
1027 int dy) {
1028 __m128i dstvec[4], d[16];
1029
1030 dr_prediction_z1_HxW_internal_sse4_1(16, 4, dstvec, left, upsample_left, dy);
1031 transpose4x16_sse2(dstvec, d);
1032 for (int i = 0; i < 16; i++) {
1033 *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
1034 }
1035 }
1036
static void dr_prediction_z3_16x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
1038 const uint8_t *left, int upsample_left,
1039 int dy) {
1040 __m128i dstvec[16], d[8];
1041
1042 dr_prediction_z1_HxW_internal_sse4_1(4, 16, dstvec, left, upsample_left, dy);
1043 for (int i = 4; i < 8; i++) {
1044 d[i] = _mm_setzero_si128();
1045 }
1046 transpose16x8_8x16_sse2(
1047 &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
1048 &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
1049 &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
1050 &d[3], &d[4], &d[5], &d[6], &d[7]);
1051
1052 for (int i = 0; i < 4; i++) {
1053 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
1054 }
1055 }
1056
static void dr_prediction_z3_8x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
1058 const uint8_t *left, int upsample_left,
1059 int dy) {
1060 __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];
1061
1062 dr_prediction_z1_32xN_internal_sse4_1(8, dstvec, dstvec_h, left,
1063 upsample_left, dy);
1064 for (int i = 8; i < 16; i++) {
1065 dstvec[i] = _mm_setzero_si128();
1066 dstvec_h[i] = _mm_setzero_si128();
1067 }
1068 transpose16x16_sse2(dstvec, d);
1069 transpose16x16_sse2(dstvec_h, d_h);
1070
1071 for (int i = 0; i < 16; i++) {
1072 _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
1073 }
1074 for (int i = 0; i < 16; i++) {
1075 _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), d_h[i]);
1076 }
1077 }
1078
static void dr_prediction_z3_32x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
1080 const uint8_t *left, int upsample_left,
1081 int dy) {
1082 __m128i dstvec[32], d[16];
1083
1084 dr_prediction_z1_HxW_internal_sse4_1(8, 32, dstvec, left, upsample_left, dy);
1085
1086 transpose16x8_8x16_sse2(
1087 &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
1088 &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
1089 &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
1090 &d[3], &d[4], &d[5], &d[6], &d[7]);
1091 transpose16x8_8x16_sse2(
1092 &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
1093 &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
1094 &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
1095 &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
1096 &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
1097 &d[6 + 8], &d[7 + 8]);
1098
1099 for (int i = 0; i < 8; i++) {
1100 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
1101 _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
1102 }
1103 }
1104 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1105
static void dr_prediction_z3_16x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
1107 const uint8_t *left,
1108 int upsample_left, int dy) {
1109 __m128i dstvec[16], d[16];
1110
1111 dr_prediction_z1_HxW_internal_sse4_1(16, 16, dstvec, left, upsample_left, dy);
1112 transpose16x16_sse2(dstvec, d);
1113
1114 for (int i = 0; i < 16; i++) {
1115 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
1116 }
1117 }
1118
static void dr_prediction_z3_32x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
1120 const uint8_t *left,
1121 int upsample_left, int dy) {
1122 __m128i dstvec[32], d[32], dstvec_h[32], d_h[32];
1123
1124 dr_prediction_z1_32xN_internal_sse4_1(32, dstvec, dstvec_h, left,
1125 upsample_left, dy);
1126 transpose16x16_sse2(dstvec, d);
1127 transpose16x16_sse2(dstvec_h, d_h);
1128 transpose16x16_sse2(dstvec + 16, d + 16);
1129 transpose16x16_sse2(dstvec_h + 16, d_h + 16);
1130 for (int j = 0; j < 16; j++) {
1131 _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
1132 _mm_storeu_si128((__m128i *)(dst + j * stride + 16), d[j + 16]);
1133 }
1134 for (int j = 0; j < 16; j++) {
1135 _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
1136 _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), d_h[j + 16]);
1137 }
1138 }
1139
static void dr_prediction_z3_64x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
1141 const uint8_t *left,
1142 int upsample_left, int dy) {
1143 uint8_t dstT[64 * 64];
1144 dr_prediction_z1_64xN_sse4_1(64, dstT, 64, left, upsample_left, dy);
1145 transpose(dstT, 64, dst, stride, 64, 64);
1146 }
1147
static void dr_prediction_z3_16x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
1149 const uint8_t *left,
1150 int upsample_left, int dy) {
1151 __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];
1152
1153 dr_prediction_z1_32xN_internal_sse4_1(16, dstvec, dstvec_h, left,
1154 upsample_left, dy);
1155 transpose16x16_sse2(dstvec, d);
1156 transpose16x16_sse2(dstvec_h, d_h);
1157 // store
1158 for (int j = 0; j < 16; j++) {
1159 _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
1160 _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
1161 }
1162 }
1163
static void dr_prediction_z3_32x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
1165 const uint8_t *left,
1166 int upsample_left, int dy) {
1167 __m128i dstvec[32], d[16];
1168
1169 dr_prediction_z1_HxW_internal_sse4_1(16, 32, dstvec, left, upsample_left, dy);
1170 for (int i = 0; i < 32; i += 16) {
1171 transpose16x16_sse2((dstvec + i), d);
1172 for (int j = 0; j < 16; j++) {
1173 _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
1174 }
1175 }
1176 }
1177
static void dr_prediction_z3_32x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
1179 const uint8_t *left,
1180 int upsample_left, int dy) {
1181 uint8_t dstT[64 * 32];
1182 dr_prediction_z1_64xN_sse4_1(32, dstT, 64, left, upsample_left, dy);
1183 transpose(dstT, 64, dst, stride, 32, 64);
1184 }
1185
static void dr_prediction_z3_64x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
1187 const uint8_t *left,
1188 int upsample_left, int dy) {
1189 uint8_t dstT[32 * 64];
1190 dr_prediction_z1_32xN_sse4_1(64, dstT, 32, left, upsample_left, dy);
1191 transpose(dstT, 32, dst, stride, 64, 32);
1192 return;
1193 }
1194
1195 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static void dr_prediction_z3_16x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
1197 const uint8_t *left,
1198 int upsample_left, int dy) {
1199 uint8_t dstT[64 * 16];
1200 dr_prediction_z1_64xN_sse4_1(16, dstT, 64, left, upsample_left, dy);
1201 transpose(dstT, 64, dst, stride, 16, 64);
1202 }
1203
static void dr_prediction_z3_64x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
1205 const uint8_t *left,
1206 int upsample_left, int dy) {
1207 __m128i dstvec[64], d[16];
1208
1209 dr_prediction_z1_HxW_internal_sse4_1(16, 64, dstvec, left, upsample_left, dy);
1210 for (int i = 0; i < 64; i += 16) {
1211 transpose16x16_sse2(dstvec + i, d);
1212 for (int j = 0; j < 16; j++) {
1213 _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
1214 }
1215 }
1216 }
1217 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1218
void av1_dr_prediction_z3_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
1220 const uint8_t *above, const uint8_t *left,
1221 int upsample_left, int dx, int dy) {
1222 (void)above;
1223 (void)dx;
1224 assert(dx == 1);
1225 assert(dy > 0);
1226
1227 if (bw == bh) {
1228 switch (bw) {
1229 case 4:
1230 dr_prediction_z3_4x4_sse4_1(dst, stride, left, upsample_left, dy);
1231 break;
1232 case 8:
1233 dr_prediction_z3_8x8_sse4_1(dst, stride, left, upsample_left, dy);
1234 break;
1235 case 16:
1236 dr_prediction_z3_16x16_sse4_1(dst, stride, left, upsample_left, dy);
1237 break;
1238 case 32:
1239 dr_prediction_z3_32x32_sse4_1(dst, stride, left, upsample_left, dy);
1240 break;
1241 case 64:
1242 dr_prediction_z3_64x64_sse4_1(dst, stride, left, upsample_left, dy);
1243 break;
1244 default: assert(0 && "Invalid block size");
1245 }
1246 } else {
1247 if (bw < bh) {
1248 if (bw + bw == bh) {
1249 switch (bw) {
1250 case 4:
1251 dr_prediction_z3_4x8_sse4_1(dst, stride, left, upsample_left, dy);
1252 break;
1253 case 8:
1254 dr_prediction_z3_8x16_sse4_1(dst, stride, left, upsample_left, dy);
1255 break;
1256 case 16:
1257 dr_prediction_z3_16x32_sse4_1(dst, stride, left, upsample_left, dy);
1258 break;
1259 case 32:
1260 dr_prediction_z3_32x64_sse4_1(dst, stride, left, upsample_left, dy);
1261 break;
1262 default: assert(0 && "Invalid block size");
1263 }
1264 } else {
1265 switch (bw) {
1266 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1267 case 4:
1268 dr_prediction_z3_4x16_sse4_1(dst, stride, left, upsample_left, dy);
1269 break;
1270 case 8:
1271 dr_prediction_z3_8x32_sse4_1(dst, stride, left, upsample_left, dy);
1272 break;
1273 case 16:
1274 dr_prediction_z3_16x64_sse4_1(dst, stride, left, upsample_left, dy);
1275 break;
1276 default: assert(0 && "Invalid block size");
1277 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1278 }
1279 }
1280 } else {
1281 if (bh + bh == bw) {
1282 switch (bh) {
1283 case 4:
1284 dr_prediction_z3_8x4_sse4_1(dst, stride, left, upsample_left, dy);
1285 break;
1286 case 8:
1287 dr_prediction_z3_16x8_sse4_1(dst, stride, left, upsample_left, dy);
1288 break;
1289 case 16:
1290 dr_prediction_z3_32x16_sse4_1(dst, stride, left, upsample_left, dy);
1291 break;
1292 case 32:
1293 dr_prediction_z3_64x32_sse4_1(dst, stride, left, upsample_left, dy);
1294 break;
1295 default: assert(0 && "Invalid block size");
1296 }
1297 } else {
1298 switch (bh) {
1299 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1300 case 4:
1301 dr_prediction_z3_16x4_sse4_1(dst, stride, left, upsample_left, dy);
1302 break;
1303 case 8:
1304 dr_prediction_z3_32x8_sse4_1(dst, stride, left, upsample_left, dy);
1305 break;
1306 case 16:
1307 dr_prediction_z3_64x16_sse4_1(dst, stride, left, upsample_left, dy);
1308 break;
1309 default: assert(0 && "Invalid block size");
1310 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1311 }
1312 }
1313 }
1314 }
1315 }
1316