/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_

#include "config/aom_config.h"

#if HAVE_NEON
#error "Do not use this file for Neon"
#endif

#if HAVE_SSE2
#include "aom_dsp/simd/v128_intrinsics_x86.h"
#else
#include "aom_dsp/simd/v128_intrinsics.h"
#endif

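/* A 256-bit vector emulated as a pair of 128-bit vectors: val[0] holds the
   low 128 bits and val[1] the high 128 bits (see v256_low_v128() and
   v256_high_v128() below). */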
typedef struct {
  v128 val[2];
} v256;

SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }

SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }

SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }

SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }

SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }

SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
  v256 t;
  t.val[1] = hi;
  t.val[0] = lo;
  return t;
}

SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
}

SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
}

SIMD_INLINE v256 v256_load_unaligned(const void *p) {
  return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
                        v128_load_unaligned(p));
}

SIMD_INLINE v256 v256_load_aligned(const void *p) {
  return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
                        v128_load_aligned(p));
}

SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
  v128_store_unaligned(p, a.val[0]);
  v128_store_unaligned((uint8_t *)p + 16, a.val[1]);
}

SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
  v128_store_aligned(p, a.val[0]);
  v128_store_aligned((uint8_t *)p + 16, a.val[1]);
}

SIMD_INLINE v256 v256_zero(void) {
  return v256_from_v128(v128_zero(), v128_zero());
}

SIMD_INLINE v256 v256_dup_8(uint8_t x) {
  v128 t = v128_dup_8(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_16(uint16_t x) {
  v128 t = v128_dup_16(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_32(uint32_t x) {
  v128 t = v128_dup_32(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_64(uint64_t x) {
  v128 t = v128_dup_64(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
  return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
}

SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
  return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
}

SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
  return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
}

SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
  return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
}

typedef struct {
  sad128_internal val[2];
} sad256_internal;

SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
  sad256_internal t;
  t.val[1] = v128_sad_u8_init();
  t.val[0] = v128_sad_u8_init();
  return t;
}

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 16 v256_sad_u8() calls is undefined. */
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
  sad256_internal t;
  t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
  return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
}

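/* Example (illustrative sketch only): accumulating the SAD over a 16-row
   block of 32 pixels per row; `src`, `ref` and `stride` are hypothetical.
   The same init/accumulate/sum pattern applies to the v256_ssd_u8(),
   v256_sad_u16() and v256_ssd_s16() accumulators below.

     sad256_internal acc = v256_sad_u8_init();
     for (int i = 0; i < 16; i++)  // at most 16 calls before summing
       acc = v256_sad_u8(acc, v256_load_unaligned(src + i * stride),
                         v256_load_unaligned(ref + i * stride));
     uint32_t sad = v256_sad_u8_sum(acc);
*/
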
typedef struct {
  ssd128_internal val[2];
} ssd256_internal;

SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
  ssd256_internal t;
  t.val[1] = v128_ssd_u8_init();
  t.val[0] = v128_ssd_u8_init();
  return t;
}

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_u8_sum(). */
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
  ssd256_internal t;
  t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
  return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
}

SIMD_INLINE v256 v256_or(v256 a, v256 b) {
  return v256_from_v128(v128_or(a.val[1], b.val[1]),
                        v128_or(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
  return v256_from_v128(v128_xor(a.val[1], b.val[1]),
                        v128_xor(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_and(v256 a, v256 b) {
  return v256_from_v128(v128_and(a.val[1], b.val[1]),
                        v128_and(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
  return v256_from_v128(v128_andn(a.val[1], b.val[1]),
                        v128_andn(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
  return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
                        v128_add_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
  return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
                        v128_add_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
                        v128_sadd_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
                        v128_sadd_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
                        v128_sadd_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
  return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
                        v128_add_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
  return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
                        v128_add_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_padd_u8(v256 a) {
  return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
}

SIMD_INLINE v256 v256_padd_s16(v256 a) {
  return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
}

SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
  return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
                        v128_sub_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
                        v128_ssub_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
                        v128_ssub_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
  return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
                        v128_sub_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
                        v128_ssub_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
                        v128_ssub_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
  return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
                        v128_sub_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
  return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
                        v128_sub_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_abs_s16(v256 a) {
  return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
}

SIMD_INLINE v256 v256_abs_s8(v256 a) {
  return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
}

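/* Widening multiply: each 16-bit lane of a and b produces a full 32-bit
   product, built by interleaving the low and high 16-bit halves of the
   products (v128_mullo_s16/v128_mulhi_s16). */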
SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
  v128 lo_bits = v128_mullo_s16(a, b);
  v128 hi_bits = v128_mulhi_s16(a, b);
  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
                        v128_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
                        v128_mullo_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
  return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
                        v128_mulhi_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
  return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
                        v128_mullo_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
  return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
                        v128_madd_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
  return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
                        v128_madd_us8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
  return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
                        v128_avg_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
  return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
                        v128_rdavg_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
  return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
                        v128_rdavg_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
  return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
                        v128_avg_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
  return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
                        v128_min_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
  return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
                        v128_max_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
  return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
                        v128_min_s8(a.val[0], b.val[0]));
}

SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
  return (v128_movemask_8(v256_high_v128(a)) << 16) |
         v128_movemask_8(v256_low_v128(a));
}

SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
  return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
                        v128_blend_8(a.val[0], b.val[0], c.val[0]));
}

SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
  return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
                        v128_max_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
  return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
                        v128_min_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
  return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
                        v128_max_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
  return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
                        v128_min_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
  return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
                        v128_max_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
                        v128_ziplo_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
                        v128_ziplo_8(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
                        v128_ziplo_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
                        v128_ziplo_16(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
                        v128_ziplo_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
                        v128_ziplo_32(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
                        v128_ziplo_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
                        v128_ziplo_64(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
  return v256_from_v128(a.val[0], b.val[0]);
}

SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
  return v256_from_v128(a.val[1], b.val[1]);
}

SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
}

SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
}

SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
}

SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
                        v128_unziplo_8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
                        v128_unziphi_8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
                        v128_unziplo_16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
                        v128_unziphi_16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
                        v128_unziplo_32(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
                        v128_unziphi_32(b.val[1], b.val[0]));
}

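/* De-interleave 64-bit lanes from the pair (a, b). On SSE2 this uses
   _mm_shuffle_pd, whose immediate picks one 64-bit half from each operand
   (0 selects the low halves, 3 the high halves); otherwise the result is
   rebuilt from v64 pieces. */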
SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
#if HAVE_SSE2
  return v256_from_v128(
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
                                      _mm_castsi128_pd(a.val[1]), 0)),
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
                                      _mm_castsi128_pd(b.val[1]), 0)));
#else
  return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]),
                       v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
#endif
}

SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
#if HAVE_SSE2
  return v256_from_v128(
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
                                      _mm_castsi128_pd(a.val[1]), 3)),
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
                                      _mm_castsi128_pd(b.val[1]), 3)));
#else
  return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]),
                       v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
#endif
}

SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
                        v128_unpacklo_u8_s16(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
                        v128_unpacklo_u8_s16(a.val[1]));
}

SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
                        v128_unpacklo_s8_s16(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
                        v128_unpacklo_s8_s16(a.val[1]));
}

SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
                        v128_pack_s32_s16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
                        v128_pack_s32_u16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
                        v128_pack_s16_u8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
                        v128_pack_s16_s8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
}

SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
}

SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
                        v128_unpacklo_u16_s32(a.val[0]));
}

SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
                        v128_unpacklo_s16_s32(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
                        v128_unpacklo_u16_s32(a.val[1]));
}

SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
                        v128_unpacklo_s16_s32(a.val[1]));
}

SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
                        v128_cmpgt_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
                        v128_cmplt_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
                        v128_cmpeq_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
                        v128_cmpgt_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
                        v128_cmplt_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
                        v128_cmpeq_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
                        v128_cmpgt_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
                        v128_cmplt_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
                        v128_cmpeq_32(a.val[0], b.val[0]));
}

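/* 32-byte table lookup emulated with two 16-byte shuffles per output half:
   pattern values 0..15 select bytes from the low half of x and 16..31 from
   the high half, with a blend on (pattern < 16) combining the two results. */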
SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
  v128 c16 = v128_dup_8(16);
  v128 maskhi = v128_cmplt_s8(pattern.val[1], c16);
  v128 masklo = v128_cmplt_s8(pattern.val[0], c16);
  return v256_from_v128(
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)),
                   v128_shuffle_8(x.val[0], pattern.val[1]), maskhi),
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)),
                   v128_shuffle_8(x.val[0], pattern.val[0]), masklo));
}

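/* 64-byte table lookup: pattern values 0..31 select bytes from y and 32..63
   select bytes from x. Each half of the table is handled with 16-byte
   shuffles and blends, and a final blend on (pattern < 32) picks between
   the two halves. */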
SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
  v128 c16 = v128_dup_8(16);
  v128 c32 = v128_dup_8(32);
  v128 c48 = v128_dup_8(48);
  v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]);
  v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]);
  v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]);
  v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]);
  v256 r1 = v256_from_v128(
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)),
                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)),
                   maskhi48),
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)),
                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)),
                   masklo48));
  v256 r2 = v256_from_v128(
      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)),
                   v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16),
      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)),
                   v128_shuffle_8(y.val[0], pattern.val[0]), masklo16));
  return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern));
}

SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
  return v256_from_v128(
      v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
      v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
}

SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c));
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v256_shl_n_byte(a, n)                                              \
  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n),         \
                                     v128_shr_n_byte(a.val[0], 16 - (n))), \
                             v128_shl_n_byte(a.val[0], (n)))               \
            : v256_from_v128(                                              \
                  (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
                  v128_zero()))

#define v256_shr_n_byte(a, n)                                                 \
  (n == 0                                                                     \
       ? a                                                                    \
       : ((n) < 16                                                            \
              ? v256_from_v128(v128_shr_n_byte(a.val[1], n),                  \
                               v128_or(v128_shr_n_byte(a.val[0], n),          \
                                       v128_shl_n_byte(a.val[1], 16 - (n))))  \
              : v256_from_v128(                                               \
                    v128_zero(),                                              \
                    (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1])))

#define v256_align(a, b, c) \
  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)

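/* Example (illustrative): for a constant 0 < c < 32, v256_align(a, b, c)
   returns bytes c..31 of b followed by bytes 0..c-1 of a, i.e. it extracts
   32 bytes starting at byte offset c from the 512-bit concatenation with b
   in the low half and a in the high half. */
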
#define v256_shl_n_8(a, n) \
  v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
#define v256_shl_n_16(a, n) \
  v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
#define v256_shl_n_32(a, n) \
  v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
#define v256_shl_n_64(a, n) \
  v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
#define v256_shr_n_u8(a, n) \
  v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
#define v256_shr_n_u16(a, n) \
  v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
#define v256_shr_n_u32(a, n) \
  v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
#define v256_shr_n_u64(a, n) \
  v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
#define v256_shr_n_s8(a, n) \
  v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
#define v256_shr_n_s16(a, n) \
  v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
#define v256_shr_n_s32(a, n) \
  v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
#define v256_shr_n_s64(a, n) \
  v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))

#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))

typedef struct {
  sad128_internal_u16 val[2];
} sad256_internal_u16;

SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
  sad256_internal_u16 t;
  t.val[1] = v128_sad_u16_init();
  t.val[0] = v128_sad_u16_init();
  return t;
}

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u16_sum().
   The result for more than 16 v256_sad_u16() calls is undefined. */
SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
                                             v256 b) {
  sad256_internal_u16 t;
  t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
  return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]);
}

typedef struct {
  ssd128_internal_s16 val[2];
} ssd256_internal_s16;

SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) {
  ssd256_internal_s16 t;
  t.val[1] = v128_ssd_s16_init();
  t.val[0] = v128_ssd_s16_init();
  return t;
}

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_s16_sum(). */
SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
                                             v256 b) {
  ssd256_internal_s16 t;
  t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
  return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
}

#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_