/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_

#include <emmintrin.h>
#if defined(__SSSE3__)
#include <tmmintrin.h>
#endif
#if defined(__SSE4_1__)
#include <smmintrin.h>
#endif

typedef __m128i v64;

SIMD_INLINE uint32_t v64_low_u32(v64 a) {
  return (uint32_t)_mm_cvtsi128_si32(a);
}

SIMD_INLINE uint32_t v64_high_u32(v64 a) {
  return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
}

SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); }

SIMD_INLINE int32_t v64_high_s32(v64 a) {
  return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
}

SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  return _mm_packs_epi32(
      _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d),
      _mm_setzero_si128());
}

SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
  return _mm_set_epi32(0, 0, (int32_t)x, (int32_t)y);
}

SIMD_INLINE v64 v64_from_64(uint64_t x) {
#ifdef __x86_64__
  return _mm_cvtsi64_si128((int64_t)x);
#else
  return _mm_set_epi32(0, 0, (int32_t)(x >> 32), (int32_t)x);
#endif
}

SIMD_INLINE uint64_t v64_u64(v64 x) {
  return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32);
}

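/* For illustration only (not part of the original header): the constructor
 * arguments are ordered from the high lane down to the low lane, e.g.
 *
 *   v64 x = v64_from_32(0x01020304, 0x05060708);
 *   // v64_high_u32(x) == 0x01020304, v64_low_u32(x) == 0x05060708,
 *   // so v64_u64(x) == 0x0102030405060708.
 */
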
SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE v64 v64_load_aligned(const void *p) {
  return _mm_loadl_epi64((__m128i *)p);
}

SIMD_INLINE v64 v64_load_unaligned(const void *p) {
  return _mm_loadl_epi64((__m128i *)p);
}

SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
  _mm_storel_epi64((__m128i *)p, a);
}

SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
  _mm_storel_epi64((__m128i *)p, a);
}

#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
#define v64_align(a, b, c) \
  ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
#else
#define v64_align(a, b, c)                                                  \
  ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
       : (b))
#endif
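
/* For example (values illustrative only): with
 *   b = v64_from_64(0x0706050403020100LL) and
 *   a = v64_from_64(0x0f0e0d0c0b0a0908LL),
 * v64_align(a, b, 3) selects the 8 bytes starting 3 bytes into the
 * concatenation of b (low) and a (high), i.e. 0x0a09080706050403.
 */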

SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }

SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }

SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }

SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }

SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }

SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }

SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); }

SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); }

SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }

SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }

SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); }

SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); }

SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); }

SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }

SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }

SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); }

SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }

SIMD_INLINE v64 v64_abs_s16(v64 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi16(a);
#else
  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
#endif
}

SIMD_INLINE v64 v64_abs_s8(v64 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi8(a);
#else
  v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
  return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
#endif
}

SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }

SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8);
}

SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }

SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8);
}

SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }

SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8);
}

SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packs_epi32(t, t);
}

SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
#if defined(__SSE4_1__)
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packus_epi32(t, t);
#else
  const int32_t ah = SIMD_CLAMP(v64_high_s32(a), 0, 65535);
  const int32_t al = SIMD_CLAMP(v64_low_s32(a), 0, 65535);
  const int32_t bh = SIMD_CLAMP(v64_high_s32(b), 0, 65535);
  const int32_t bl = SIMD_CLAMP(v64_low_s32(b), 0, 65535);
  return v64_from_16(ah, al, bh, bl);
#endif
}

SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packus_epi16(t, t);
}

SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packs_epi16(t, t);
}

SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0f0d0b0907050301LL));
#else
  return _mm_packus_epi16(
      _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
      _mm_setzero_si128());
#endif
}

SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0e0c0a0806040200LL));
#else
  return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
#endif
}

SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0f0e0b0a07060302LL));
#else
  return _mm_packs_epi32(
      _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
      _mm_setzero_si128());
#endif
}

SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0d0c090805040100LL));
#else
  return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
#endif
}

SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
  return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
}

SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
}

SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
  return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8);
}

SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16);
}

SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
  return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8);
}

SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
  return _mm_srli_si128(
      _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8);
}

SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(x, pattern);
#else
  v64 output;
  unsigned char *input = (unsigned char *)&x;
  unsigned char *index = (unsigned char *)&pattern;
  unsigned char *selected = (unsigned char *)&output;
  int counter;

  for (counter = 0; counter < 8; counter++) {
    selected[counter] = input[index[counter]];
  }

  return output;
#endif
}
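
/* For example (illustrative only): result byte i is x byte pattern[i], so the
 * byte order of x can be reversed with
 *   v64 rev = v64_shuffle_8(x, v64_from_64(0x0001020304050607LL));
 */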

SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
  __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8),
                             _mm_unpacklo_epi8(b, _mm_setzero_si128()));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
  return (int32_t)v64_low_u32(t);
}

SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
  __m128i r = _mm_madd_epi16(a, b);
#if defined(__SSE4_1__) && defined(__x86_64__)
  __m128i x = _mm_cvtepi32_epi64(r);
  return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
#else
  return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
         (int64_t)_mm_cvtsi128_si32(r);
#endif
}

SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
  return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128()));
}

SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
  return v64_dotp_s16(a, v64_dup_16(1));
}

typedef v64 sad64_internal;

SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return _mm_setzero_si128(); }

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
}

SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); }
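
/* A minimal usage sketch (src, ref, stride and h below are hypothetical
 * names, not part of this header): accumulate at most 32 rows of 8 bytes,
 * then finalise the sum.
 *
 *   sad64_internal acc = v64_sad_u8_init();
 *   for (int i = 0; i < h; i++)  // h <= 32
 *     acc = v64_sad_u8(acc, v64_load_unaligned(src + i * stride),
 *                      v64_load_unaligned(ref + i * stride));
 *   uint32_t sad = v64_sad_u8_sum(acc);
 */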

typedef v64 ssd64_internal;

SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return _mm_setzero_si128(); }

/* Implementation dependent return value. Result must be finalised with
 * v64_ssd_u8_sum(). */
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
  v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b));
  v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b));
  v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h));
  return _mm_add_epi64(
      s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4))));
}

SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return v64_low_u32(s); }
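
/* Usage mirrors the SAD accumulator above (again with hypothetical src, ref,
 * stride and h):
 *
 *   ssd64_internal acc = v64_ssd_u8_init();
 *   for (int i = 0; i < h; i++)
 *     acc = v64_ssd_u8(acc, v64_load_unaligned(src + i * stride),
 *                      v64_load_unaligned(ref + i * stride));
 *   uint32_t ssd = v64_ssd_u8_sum(acc);
 */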

SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); }

SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); }

SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); }

SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); }

SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); }

SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); }

SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_mullo_epi32(a, b);
#else
  return _mm_unpacklo_epi32(
      _mm_mul_epu32(a, b),
      _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)));
#endif
}

SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); }

SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_maddubs_epi16(a, b);
#else
  __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
                             _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
  return _mm_packs_epi32(t, t);
#endif
}

SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); }

SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
  return _mm_sub_epi8(_mm_avg_epu8(a, b),
                      _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
}

SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) {
  return _mm_sub_epi16(_mm_avg_epu16(a, b),
                       _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1)));
}

SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }

SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }

SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); }

SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_min_epi8(a, b);
#else
  v64 mask = _mm_cmplt_epi8(a, b);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_max_epi8(a, b);
#else
  v64 mask = _mm_cmplt_epi8(b, a);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); }

SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); }

SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); }

SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); }

SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); }

SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); }

SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }

SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }

SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
                       _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
}

SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
                       _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
}

SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
  return _mm_packs_epi16(
      _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128((int)(c + 8))),
      a);
}

SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
  return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
  return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
  return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
  return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
  return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
  return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
#define v64_shl_n_8(a, c) \
  _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c))
#define v64_shr_n_u8(a, c) \
  _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c))
#define v64_shr_n_s8(a, c) \
  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c)
#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)
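
/* For example (illustrative only; "bits" is a hypothetical variable):
 *   v64 x = v64_shr_n_u16(a, 2);   // shift count must be a compile-time constant
 *   v64 y = v64_shr_u16(a, bits);  // use the function form for variable counts
 */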

#endif  // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_