/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_

#include "config/aom_config.h"

#if HAVE_NEON
#error "Do not use this file for Neon"
#endif

#if HAVE_SSE2
#include "aom_dsp/simd/v128_intrinsics_x86.h"
#else
#include "aom_dsp/simd/v128_intrinsics.h"
#endif

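/* A 256-bit vector is emulated as a pair of 128-bit vectors: val[0] holds
   the low 128 bits and val[1] the high 128 bits, as the accessors below
   assume. */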
typedef struct {
  v128 val[2];
} v256;

SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }

SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }

SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }

SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }

SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }

SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
  v256 t;
  t.val[1] = hi;
  t.val[0] = lo;
  return t;
}

SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
}

SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
}

SIMD_INLINE v256 v256_load_unaligned(const void *p) {
  return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
                        v128_load_unaligned(p));
}

SIMD_INLINE v256 v256_load_aligned(const void *p) {
  return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
                        v128_load_aligned(p));
}

SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
  v128_store_unaligned(p, a.val[0]);
  v128_store_unaligned((uint8_t *)p + 16, a.val[1]);
}

SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
  v128_store_aligned(p, a.val[0]);
  v128_store_aligned((uint8_t *)p + 16, a.val[1]);
}

SIMD_INLINE v256 v256_zero(void) {
  return v256_from_v128(v128_zero(), v128_zero());
}

SIMD_INLINE v256 v256_dup_8(uint8_t x) {
  v128 t = v128_dup_8(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_16(uint16_t x) {
  v128 t = v128_dup_16(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_32(uint32_t x) {
  v128 t = v128_dup_32(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_64(uint64_t x) {
  v128 t = v128_dup_64(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
  return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
}

SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
  return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
}

SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
  return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
}

SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
  return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
}

typedef struct {
  sad128_internal val[2];
} sad256_internal;

SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
  sad256_internal t;
  t.val[1] = v128_sad_u8_init();
  t.val[0] = v128_sad_u8_init();
  return t;
}

/* Implementation dependent return value.  Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 16 v256_sad_u8() calls is undefined. */
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
  sad256_internal t;
  t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
  return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
}
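
/* Illustrative sketch (not part of the upstream API): accumulating SADs over
   a 32x16 block, staying within the 16-call limit noted above.  The uint8_t
   pointers src and ref are assumed inputs.

     sad256_internal acc = v256_sad_u8_init();
     for (int i = 0; i < 16; i++)
       acc = v256_sad_u8(acc, v256_load_unaligned(src + i * 32),
                         v256_load_unaligned(ref + i * 32));
     uint32_t sad = v256_sad_u8_sum(acc);
*/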

typedef struct {
  ssd128_internal val[2];
} ssd256_internal;

SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
  ssd256_internal t;
  t.val[1] = v128_ssd_u8_init();
  t.val[0] = v128_ssd_u8_init();
  return t;
}

/* Implementation dependent return value.  Result must be finalised with
 * v256_ssd_u8_sum(). */
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
  ssd256_internal t;
  t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
  return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
}

SIMD_INLINE v256 v256_or(v256 a, v256 b) {
  return v256_from_v128(v128_or(a.val[1], b.val[1]),
                        v128_or(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
  return v256_from_v128(v128_xor(a.val[1], b.val[1]),
                        v128_xor(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_and(v256 a, v256 b) {
  return v256_from_v128(v128_and(a.val[1], b.val[1]),
                        v128_and(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
  return v256_from_v128(v128_andn(a.val[1], b.val[1]),
                        v128_andn(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
  return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
                        v128_add_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
  return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
                        v128_add_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
                        v128_sadd_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
                        v128_sadd_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
                        v128_sadd_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
  return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
                        v128_add_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
  return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
                        v128_add_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_padd_u8(v256 a) {
  return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
}

SIMD_INLINE v256 v256_padd_s16(v256 a) {
  return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
}

SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
  return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
                        v128_sub_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
                        v128_ssub_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
                        v128_ssub_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
  return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
                        v128_sub_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
                        v128_ssub_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
                        v128_ssub_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
  return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
                        v128_sub_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
  return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
                        v128_sub_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_abs_s16(v256 a) {
  return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
}

SIMD_INLINE v256 v256_abs_s8(v256 a) {
  return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
}

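/* Note: this widening multiply interleaves the low and high 16-bit halves of
   each product, so the result holds the eight 32-bit products of the 16-bit
   lanes of a and b. */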
SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
  v128 lo_bits = v128_mullo_s16(a, b);
  v128 hi_bits = v128_mulhi_s16(a, b);
  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
                        v128_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
                        v128_mullo_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
  return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
                        v128_mulhi_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
  return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
                        v128_mullo_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
  return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
                        v128_madd_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
  return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
                        v128_madd_us8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
  return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
                        v128_avg_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
  return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
                        v128_rdavg_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
  return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
                        v128_rdavg_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
  return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
                        v128_avg_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
  return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
                        v128_min_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
  return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
                        v128_max_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
  return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
                        v128_min_s8(a.val[0], b.val[0]));
}

SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
  return (v128_movemask_8(v256_high_v128(a)) << 16) |
         v128_movemask_8(v256_low_v128(a));
}

SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
  return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
                        v128_blend_8(a.val[0], b.val[0], c.val[0]));
}

SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
  return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
                        v128_max_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
  return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
                        v128_min_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
  return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
                        v128_max_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
  return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
                        v128_min_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
  return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
                        v128_max_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
                        v128_ziplo_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
                        v128_ziplo_8(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
                        v128_ziplo_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
                        v128_ziplo_16(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
                        v128_ziplo_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
                        v128_ziplo_32(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
                        v128_ziplo_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
                        v128_ziplo_64(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
  return v256_from_v128(a.val[0], b.val[0]);
}

SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
  return v256_from_v128(a.val[1], b.val[1]);
}

SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
}

SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
}

SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
}

SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
                        v128_unziplo_8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
                        v128_unziphi_8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
                        v128_unziplo_16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
                        v128_unziphi_16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
                        v128_unziplo_32(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
                        v128_unziphi_32(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
#if HAVE_SSE2
  return v256_from_v128(
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
                                      _mm_castsi128_pd(a.val[1]), 0)),
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
                                      _mm_castsi128_pd(b.val[1]), 0)));
#else
  return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]),
                       v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
#endif
}

SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
#if HAVE_SSE2
  return v256_from_v128(
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
                                      _mm_castsi128_pd(a.val[1]), 3)),
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
                                      _mm_castsi128_pd(b.val[1]), 3)));
#else
  return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]),
                       v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
#endif
}

SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
                        v128_unpacklo_u8_s16(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
                        v128_unpacklo_u8_s16(a.val[1]));
}

SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
                        v128_unpacklo_s8_s16(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
                        v128_unpacklo_s8_s16(a.val[1]));
}

SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
                        v128_pack_s32_s16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
                        v128_pack_s32_u16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
                        v128_pack_s16_u8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
                        v128_pack_s16_s8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
}

SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
}

SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
                        v128_unpacklo_u16_s32(a.val[0]));
}

SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
                        v128_unpacklo_s16_s32(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
                        v128_unpacklo_u16_s32(a.val[1]));
}

SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
                        v128_unpacklo_s16_s32(a.val[1]));
}

SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
                        v128_cmpgt_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
                        v128_cmplt_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
                        v128_cmpeq_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
                        v128_cmpgt_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
                        v128_cmplt_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
                        v128_cmpeq_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
                        v128_cmpgt_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
                        v128_cmplt_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
                        v128_cmpeq_32(a.val[0], b.val[0]));
}

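/* The 32-byte shuffle below is emulated with two 16-byte shuffles: both
   halves of x are shuffled and the results blended on an (index < 16) mask,
   so pattern bytes 0..15 select from the low half of x and 16..31 from the
   high half. */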
SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
  v128 c16 = v128_dup_8(16);
  v128 maskhi = v128_cmplt_s8(pattern.val[1], c16);
  v128 masklo = v128_cmplt_s8(pattern.val[0], c16);
  return v256_from_v128(
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)),
                   v128_shuffle_8(x.val[0], pattern.val[1]), maskhi),
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)),
                   v128_shuffle_8(x.val[0], pattern.val[0]), masklo));
}

SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
  v128 c16 = v128_dup_8(16);
  v128 c32 = v128_dup_8(32);
  v128 c48 = v128_dup_8(48);
  v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]);
  v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]);
  v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]);
  v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]);
  v256 r1 = v256_from_v128(
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)),
                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)),
                   maskhi48),
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)),
                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)),
                   masklo48));
  v256 r2 = v256_from_v128(
      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)),
                   v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16),
      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)),
                   v128_shuffle_8(y.val[0], pattern.val[0]), masklo16));
  return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern));
}

SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
  return v256_from_v128(
      v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
      v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
}

SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c));
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v256_shl_n_byte(a, n)                                              \
  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n),         \
                                     v128_shr_n_byte(a.val[0], 16 - (n))), \
                             v128_shl_n_byte(a.val[0], (n)))               \
            : v256_from_v128(                                              \
                  (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
                  v128_zero()))

#define v256_shr_n_byte(a, n)                                                \
  (n == 0                                                                    \
       ? a                                                                   \
       : ((n) < 16                                                           \
              ? v256_from_v128(v128_shr_n_byte(a.val[1], n),                 \
                               v128_or(v128_shr_n_byte(a.val[0], n),         \
                                       v128_shl_n_byte(a.val[1], 16 - (n)))) \
              : v256_from_v128(                                              \
                    v128_zero(),                                             \
                    (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1])))

#define v256_align(a, b, c) \
  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)

#define v256_shl_n_8(a, n) \
  v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
#define v256_shl_n_16(a, n) \
  v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
#define v256_shl_n_32(a, n) \
  v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
#define v256_shl_n_64(a, n) \
  v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
#define v256_shr_n_u8(a, n) \
  v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
#define v256_shr_n_u16(a, n) \
  v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
#define v256_shr_n_u32(a, n) \
  v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
#define v256_shr_n_u64(a, n) \
  v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
#define v256_shr_n_s8(a, n) \
  v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
#define v256_shr_n_s16(a, n) \
  v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
#define v256_shr_n_s32(a, n) \
  v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
#define v256_shr_n_s64(a, n) \
  v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))

#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
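
/* Usage note (illustrative, with hypothetical variables x, y and bits): the
   shift counts passed to the _n_ macros above must be compile-time constants.

     v256 x = v256_dup_16(1234);
     v256 y = v256_shl_n_16(x, 2);    // immediate shift by 2

   For a run-time shift amount, use the function forms instead, e.g.
   v256_shl_16(x, bits). */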

typedef struct {
  sad128_internal_u16 val[2];
} sad256_internal_u16;

SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
  sad256_internal_u16 t;
  t.val[1] = v128_sad_u16_init();
  t.val[0] = v128_sad_u16_init();
  return t;
}

/* Implementation dependent return value.  Result must be finalised with
   v256_sad_u16_sum().
   The result for more than 16 v256_sad_u16() calls is undefined. */
SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
                                             v256 b) {
  sad256_internal_u16 t;
  t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
  return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]);
}

typedef struct {
  ssd128_internal_s16 val[2];
} ssd256_internal_s16;

SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) {
  ssd256_internal_s16 t;
  t.val[1] = v128_ssd_s16_init();
  t.val[0] = v128_ssd_s16_init();
  return t;
}

/* Implementation dependent return value.  Result must be finalised with
 * v256_ssd_s16_sum(). */
SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
                                             v256 b) {
  ssd256_internal_s16 t;
  t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
  return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
}

#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_