/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_

#include <emmintrin.h>
#if defined(__SSSE3__)
#include <tmmintrin.h>
#endif
#if defined(__SSE4_1__)
#include <smmintrin.h>
#endif

typedef __m128i v64;

SIMD_INLINE uint32_t v64_low_u32(v64 a) {
  return (uint32_t)_mm_cvtsi128_si32(a);
}

SIMD_INLINE uint32_t v64_high_u32(v64 a) {
  return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
}

SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); }

SIMD_INLINE int32_t v64_high_s32(v64 a) {
  return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
}

SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  return _mm_packs_epi32(
      _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d),
      _mm_setzero_si128());
}

SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
  return _mm_set_epi32(0, 0, (int32_t)x, (int32_t)y);
}

SIMD_INLINE v64 v64_from_64(uint64_t x) {
#ifdef __x86_64__
  return _mm_cvtsi64_si128((int64_t)x);
#else
  return _mm_set_epi32(0, 0, (int32_t)(x >> 32), (int32_t)x);
#endif
}

SIMD_INLINE uint64_t v64_u64(v64 x) {
  return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32);
}

SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE v64 v64_load_aligned(const void *p) {
  return _mm_loadl_epi64((__m128i *)p);
}

SIMD_INLINE v64 v64_load_unaligned(const void *p) {
  return _mm_loadl_epi64((__m128i *)p);
}

SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
  _mm_storel_epi64((__m128i *)p, a);
}

SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
  _mm_storel_epi64((__m128i *)p, a);
}

#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
#define v64_align(a, b, c) \
  ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
#else
#define v64_align(a, b, c)                                                  \
  ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
       : (b))
#endif

SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }

SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }

SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }

SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }

SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }

SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }

SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); }

SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); }

SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }

SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }

SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); }

SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); }

SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); }

SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }

SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }

SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); }

SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }

SIMD_INLINE v64 v64_abs_s16(v64 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi16(a);
#else
  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
#endif
}

SIMD_INLINE v64 v64_abs_s8(v64 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi8(a);
#else
  v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
  return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
#endif
}

SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }

SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8);
}

SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }

SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8);
}

SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }

SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8);
}

SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packs_epi32(t, t);
}

SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
#if defined(__SSE4_1__)
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packus_epi32(t, t);
#else
  const int32_t ah = SIMD_CLAMP(v64_high_s32(a), 0, 65535);
  const int32_t al = SIMD_CLAMP(v64_low_s32(a), 0, 65535);
  const int32_t bh = SIMD_CLAMP(v64_high_s32(b), 0, 65535);
  const int32_t bl = SIMD_CLAMP(v64_low_s32(b), 0, 65535);
  return v64_from_16(ah, al, bh, bl);
#endif
}

SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packus_epi16(t, t);
}

SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packs_epi16(t, t);
}

SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0f0d0b0907050301LL));
#else
  return _mm_packus_epi16(
      _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
      _mm_setzero_si128());
#endif
}

SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0e0c0a0806040200LL));
#else
  return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
#endif
}

SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0f0e0b0a07060302LL));
#else
  return _mm_packs_epi32(
      _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
      _mm_setzero_si128());
#endif
}

SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0d0c090805040100LL));
#else
  return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
#endif
}

SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
  return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
}

SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
}

SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
  return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8);
}

SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16);
}

SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
  return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8);
}

SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
  return _mm_srli_si128(
      _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8);
}

SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(x, pattern);
#else
  v64 output;
  unsigned char *input = (unsigned char *)&x;
  unsigned char *index = (unsigned char *)&pattern;
  unsigned char *selected = (unsigned char *)&output;
  int counter;

  for (counter = 0; counter < 8; counter++) {
    selected[counter] = input[index[counter]];
  }

  return output;
#endif
}

SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
  __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8),
                             _mm_unpacklo_epi8(b, _mm_setzero_si128()));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
  return (int32_t)v64_low_u32(t);
}

SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
  __m128i r = _mm_madd_epi16(a, b);
#if defined(__SSE4_1__) && defined(__x86_64__)
  __m128i x = _mm_cvtepi32_epi64(r);
  return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
#else
  return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
         (int64_t)_mm_cvtsi128_si32(r);
#endif
}

SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
  return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128()));
}

SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
  return v64_dotp_s16(a, v64_dup_16(1));
}

typedef v64 sad64_internal;

SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return _mm_setzero_si128(); }

/* Implementation dependent return value.  Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
}

SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); }
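
/* Illustrative (non-normative) usage sketch of the SAD accumulator above,
   assuming two hypothetical 8-byte-wide rows `src` and `ref`:

     sad64_internal acc = v64_sad_u8_init();
     acc = v64_sad_u8(acc, v64_load_unaligned(src), v64_load_unaligned(ref));
     acc = v64_sad_u8(acc, v64_load_unaligned(src + 8),
                      v64_load_unaligned(ref + 8));
     uint32_t sad = v64_sad_u8_sum(acc);

   Only the value returned by v64_sad_u8_sum() is meaningful; the intermediate
   accumulator layout is implementation dependent, and, per the comment above,
   no more than 32 v64_sad_u8() accumulations should feed one sum. */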

typedef v64 ssd64_internal;

SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return _mm_setzero_si128(); }

/* Implementation dependent return value.  Result must be finalised with
 * v64_ssd_u8_sum(). */
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
  v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b));
  v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b));
  v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h));
  return _mm_add_epi64(
      s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4))));
}

SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return v64_low_u32(s); }
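
/* Illustrative (non-normative) sketch: the SSD accumulator follows the same
   init/accumulate/sum protocol as the SAD accumulator, e.g. for one
   hypothetical 8-byte row pair `src`/`ref`:

     ssd64_internal acc = v64_ssd_u8_init();
     acc = v64_ssd_u8(acc, v64_load_unaligned(src), v64_load_unaligned(ref));
     uint32_t ssd = v64_ssd_u8_sum(acc);
*/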

SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); }

SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); }

SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); }

SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); }

SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); }

SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); }

SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_mullo_epi32(a, b);
#else
  return _mm_unpacklo_epi32(
      _mm_mul_epu32(a, b),
      _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)));
#endif
}

SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); }

SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_maddubs_epi16(a, b);
#else
  __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
                             _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
  return _mm_packs_epi32(t, t);
#endif
}

SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); }

SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
  return _mm_sub_epi8(_mm_avg_epu8(a, b),
                      _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
}

SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) {
  return _mm_sub_epi16(_mm_avg_epu16(a, b),
                       _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1)));
}

SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }

SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }

SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); }

SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_min_epi8(a, b);
#else
  v64 mask = _mm_cmplt_epi8(a, b);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_max_epi8(a, b);
#else
  v64 mask = _mm_cmplt_epi8(b, a);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); }

SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); }

SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); }

SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); }

SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); }

SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); }

SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }

SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }

SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
                       _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
}

SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
                       _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
}

SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
  return _mm_packs_epi16(
      _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128((int)(c + 8))),
      a);
}

SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
  return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
  return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
  return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
  return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
  return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
  return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
#define v64_shl_n_8(a, c) \
  _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c))
#define v64_shr_n_u8(a, c) \
  _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c))
#define v64_shr_n_s8(a, c) \
  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c)
#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)
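
/* Illustrative (non-normative) sketch of the distinction above, assuming a
   hypothetical vector `a` and a run-time shift amount `bits`:

     v64 x = v64_shr_n_u16(a, 2);    // OK: the count is a compile-time
                                     // constant, as the macro forms require
     v64 y = v64_shr_u16(a, bits);   // OK: variable counts use the function
                                     // forms, which pass the count in a
                                     // register via _mm_cvtsi32_si128()

   Passing a non-constant count to the v64_*_n_* macros may fail to compile,
   since the underlying immediate-shift intrinsics need constant operands. */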

#endif  // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_