xref: /aosp_15_r20/prebuilts/clang-tools/linux-x86/lib64/clang/19/include/xmmintrin.h (revision bed243d3d9cd544cfb038bfa7be843dedc6e6bf7)
1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __XMMINTRIN_H
11 #define __XMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <mmintrin.h>
18 
/* Internal 128-bit vector types: 4 x i32 and 4 x float lanes. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
/* Public SSE register type: 16-byte aligned [4 x float]. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Unaligned variant of __m128 (alignment 1) for unaligned load/store. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));

/* This header should only be included in a hosted environment as it depends on
 * a standard library to provide allocation routines. */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

/* Define the default attributes for the functions in this file: force
 * inlining, require the SSE (but not EVEX512) target feature, and declare
 * the 128-bit (or 64-bit for the MMX variant) vector width used. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))
41 
42 /// Adds the 32-bit float values in the low-order bits of the operands.
43 ///
44 /// \headerfile <x86intrin.h>
45 ///
46 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
47 ///
48 /// \param __a
49 ///    A 128-bit vector of [4 x float] containing one of the source operands.
50 ///    The lower 32 bits of this operand are used in the calculation.
51 /// \param __b
52 ///    A 128-bit vector of [4 x float] containing one of the source operands.
53 ///    The lower 32 bits of this operand are used in the calculation.
54 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
55 ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
56 ///    the upper 96 bits of the first source operand.
57 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_add_ss(__m128 __a,__m128 __b)58 _mm_add_ss(__m128 __a, __m128 __b)
59 {
60   __a[0] += __b[0];
61   return __a;
62 }
63 
64 /// Adds two 128-bit vectors of [4 x float], and returns the results of
65 ///    the addition.
66 ///
67 /// \headerfile <x86intrin.h>
68 ///
69 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
70 ///
71 /// \param __a
72 ///    A 128-bit vector of [4 x float] containing one of the source operands.
73 /// \param __b
74 ///    A 128-bit vector of [4 x float] containing one of the source operands.
75 /// \returns A 128-bit vector of [4 x float] containing the sums of both
76 ///    operands.
77 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_add_ps(__m128 __a,__m128 __b)78 _mm_add_ps(__m128 __a, __m128 __b)
79 {
80   return (__m128)((__v4sf)__a + (__v4sf)__b);
81 }
82 
83 /// Subtracts the 32-bit float value in the low-order bits of the second
84 ///    operand from the corresponding value in the first operand.
85 ///
86 /// \headerfile <x86intrin.h>
87 ///
88 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
89 ///
90 /// \param __a
91 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
92 ///    of this operand are used in the calculation.
93 /// \param __b
94 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
95 ///    bits of this operand are used in the calculation.
96 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
97 ///    difference of the lower 32 bits of both operands. The upper 96 bits are
98 ///    copied from the upper 96 bits of the first source operand.
99 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sub_ss(__m128 __a,__m128 __b)100 _mm_sub_ss(__m128 __a, __m128 __b)
101 {
102   __a[0] -= __b[0];
103   return __a;
104 }
105 
106 /// Subtracts each of the values of the second operand from the first
107 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
108 ///    the results of the subtraction.
109 ///
110 /// \headerfile <x86intrin.h>
111 ///
112 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
113 ///
114 /// \param __a
115 ///    A 128-bit vector of [4 x float] containing the minuend.
116 /// \param __b
117 ///    A 128-bit vector of [4 x float] containing the subtrahend.
118 /// \returns A 128-bit vector of [4 x float] containing the differences between
119 ///    both operands.
120 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sub_ps(__m128 __a,__m128 __b)121 _mm_sub_ps(__m128 __a, __m128 __b)
122 {
123   return (__m128)((__v4sf)__a - (__v4sf)__b);
124 }
125 
126 /// Multiplies two 32-bit float values in the low-order bits of the
127 ///    operands.
128 ///
129 /// \headerfile <x86intrin.h>
130 ///
131 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
132 ///
133 /// \param __a
134 ///    A 128-bit vector of [4 x float] containing one of the source operands.
135 ///    The lower 32 bits of this operand are used in the calculation.
136 /// \param __b
137 ///    A 128-bit vector of [4 x float] containing one of the source operands.
138 ///    The lower 32 bits of this operand are used in the calculation.
139 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
140 ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
141 ///    bits of the first source operand.
142 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mul_ss(__m128 __a,__m128 __b)143 _mm_mul_ss(__m128 __a, __m128 __b)
144 {
145   __a[0] *= __b[0];
146   return __a;
147 }
148 
149 /// Multiplies two 128-bit vectors of [4 x float] and returns the
150 ///    results of the multiplication.
151 ///
152 /// \headerfile <x86intrin.h>
153 ///
154 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
155 ///
156 /// \param __a
157 ///    A 128-bit vector of [4 x float] containing one of the source operands.
158 /// \param __b
159 ///    A 128-bit vector of [4 x float] containing one of the source operands.
160 /// \returns A 128-bit vector of [4 x float] containing the products of both
161 ///    operands.
162 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mul_ps(__m128 __a,__m128 __b)163 _mm_mul_ps(__m128 __a, __m128 __b)
164 {
165   return (__m128)((__v4sf)__a * (__v4sf)__b);
166 }
167 
168 /// Divides the value in the low-order 32 bits of the first operand by
169 ///    the corresponding value in the second operand.
170 ///
171 /// \headerfile <x86intrin.h>
172 ///
173 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
174 ///
175 /// \param __a
176 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
177 ///    bits of this operand are used in the calculation.
178 /// \param __b
179 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
180 ///    of this operand are used in the calculation.
181 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
182 ///    lower 32 bits of both operands. The upper 96 bits are copied from the
183 ///    upper 96 bits of the first source operand.
184 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_div_ss(__m128 __a,__m128 __b)185 _mm_div_ss(__m128 __a, __m128 __b)
186 {
187   __a[0] /= __b[0];
188   return __a;
189 }
190 
191 /// Divides two 128-bit vectors of [4 x float].
192 ///
193 /// \headerfile <x86intrin.h>
194 ///
195 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
196 ///
197 /// \param __a
198 ///    A 128-bit vector of [4 x float] containing the dividend.
199 /// \param __b
200 ///    A 128-bit vector of [4 x float] containing the divisor.
201 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
202 ///    operands.
203 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_div_ps(__m128 __a,__m128 __b)204 _mm_div_ps(__m128 __a, __m128 __b)
205 {
206   return (__m128)((__v4sf)__a / (__v4sf)__b);
207 }
208 
209 /// Calculates the square root of the value stored in the low-order bits
210 ///    of a 128-bit vector of [4 x float].
211 ///
212 /// \headerfile <x86intrin.h>
213 ///
214 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
215 ///
216 /// \param __a
217 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
218 ///    used in the calculation.
219 /// \returns A 128-bit vector of [4 x float] containing the square root of the
220 ///    value in the low-order bits of the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ss(__m128 __a)
{
  /* Lane 0 holds sqrt(__a[0]); the builtin carries lanes 1-3 through. */
  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
}
226 
227 /// Calculates the square roots of the values stored in a 128-bit vector
228 ///    of [4 x float].
229 ///
230 /// \headerfile <x86intrin.h>
231 ///
232 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
233 ///
234 /// \param __a
235 ///    A 128-bit vector of [4 x float].
236 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
237 ///    values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ps(__m128 __a)
{
  /* Full-width square root: each of the four lanes independently. */
  return __builtin_ia32_sqrtps((__v4sf)__a);
}
243 
244 /// Calculates the approximate reciprocal of the value stored in the
245 ///    low-order bits of a 128-bit vector of [4 x float].
246 ///
247 /// \headerfile <x86intrin.h>
248 ///
249 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
250 ///
251 /// \param __a
252 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
253 ///    used in the calculation.
254 /// \returns A 128-bit vector of [4 x float] containing the approximate
255 ///    reciprocal of the value in the low-order bits of the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rcp_ss(__m128 __a)
{
  /* Approximate 1/__a[0] in lane 0 (RCPSS); lanes 1-3 pass through. */
  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
}
261 
262 /// Calculates the approximate reciprocals of the values stored in a
263 ///    128-bit vector of [4 x float].
264 ///
265 /// \headerfile <x86intrin.h>
266 ///
267 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
268 ///
269 /// \param __a
270 ///    A 128-bit vector of [4 x float].
271 /// \returns A 128-bit vector of [4 x float] containing the approximate
272 ///    reciprocals of the values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rcp_ps(__m128 __a)
{
  /* Approximate reciprocal of each of the four lanes (RCPPS). */
  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
}
278 
279 /// Calculates the approximate reciprocal of the square root of the value
280 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
281 ///
282 /// \headerfile <x86intrin.h>
283 ///
284 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
285 ///
286 /// \param __a
287 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
288 ///    used in the calculation.
289 /// \returns A 128-bit vector of [4 x float] containing the approximate
290 ///    reciprocal of the square root of the value in the low-order bits of the
291 ///    operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rsqrt_ss(__m128 __a)
{
  /* Approximate 1/sqrt(__a[0]) in lane 0 (RSQRTSS); lanes 1-3 pass through. */
  return __builtin_ia32_rsqrtss((__v4sf)__a);
}
297 
298 /// Calculates the approximate reciprocals of the square roots of the
299 ///    values stored in a 128-bit vector of [4 x float].
300 ///
301 /// \headerfile <x86intrin.h>
302 ///
303 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
304 ///
305 /// \param __a
306 ///    A 128-bit vector of [4 x float].
307 /// \returns A 128-bit vector of [4 x float] containing the approximate
308 ///    reciprocals of the square roots of the values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rsqrt_ps(__m128 __a)
{
  /* Approximate reciprocal square root of each lane (RSQRTPS). */
  return __builtin_ia32_rsqrtps((__v4sf)__a);
}
314 
315 /// Compares two 32-bit float values in the low-order bits of both
316 ///    operands and returns the lesser value in the low-order bits of the
317 ///    vector of [4 x float].
318 ///
319 /// \headerfile <x86intrin.h>
320 ///
321 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
322 ///
323 /// \param __a
324 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
325 ///    32 bits of this operand are used in the comparison.
326 /// \param __b
327 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
328 ///    32 bits of this operand are used in the comparison.
329 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
330 ///    minimum value between both operands. The upper 96 bits are copied from
331 ///    the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_min_ss(__m128 __a, __m128 __b)
{
  /* Lane 0 gets the lesser of __a[0], __b[0]; lanes 1-3 come from __a. */
  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
}
337 
338 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
339 ///    of each pair of values.
340 ///
341 /// \headerfile <x86intrin.h>
342 ///
343 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
344 ///
345 /// \param __a
346 ///    A 128-bit vector of [4 x float] containing one of the operands.
347 /// \param __b
348 ///    A 128-bit vector of [4 x float] containing one of the operands.
349 /// \returns A 128-bit vector of [4 x float] containing the minimum values
350 ///    between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_min_ps(__m128 __a, __m128 __b)
{
  /* Per-lane minimum of the two vectors (MINPS). */
  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
}
356 
357 /// Compares two 32-bit float values in the low-order bits of both
358 ///    operands and returns the greater value in the low-order bits of a 128-bit
359 ///    vector of [4 x float].
360 ///
361 /// \headerfile <x86intrin.h>
362 ///
363 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
364 ///
365 /// \param __a
366 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
367 ///    32 bits of this operand are used in the comparison.
368 /// \param __b
369 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
370 ///    32 bits of this operand are used in the comparison.
371 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
372 ///    maximum value between both operands. The upper 96 bits are copied from
373 ///    the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_max_ss(__m128 __a, __m128 __b)
{
  /* Lane 0 gets the greater of __a[0], __b[0]; lanes 1-3 come from __a. */
  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
}
379 
380 /// Compares two 128-bit vectors of [4 x float] and returns the greater
381 ///    of each pair of values.
382 ///
383 /// \headerfile <x86intrin.h>
384 ///
385 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
386 ///
387 /// \param __a
388 ///    A 128-bit vector of [4 x float] containing one of the operands.
389 /// \param __b
390 ///    A 128-bit vector of [4 x float] containing one of the operands.
391 /// \returns A 128-bit vector of [4 x float] containing the maximum values
392 ///    between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_max_ps(__m128 __a, __m128 __b)
{
  /* Per-lane maximum of the two vectors (MAXPS). */
  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
}
398 
399 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
400 ///
401 /// \headerfile <x86intrin.h>
402 ///
403 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
404 ///
405 /// \param __a
406 ///    A 128-bit vector containing one of the source operands.
407 /// \param __b
408 ///    A 128-bit vector containing one of the source operands.
409 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
410 ///    values between both operands.
411 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_and_ps(__m128 __a,__m128 __b)412 _mm_and_ps(__m128 __a, __m128 __b)
413 {
414   return (__m128)((__v4su)__a & (__v4su)__b);
415 }
416 
417 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
418 ///    the one's complement of the values contained in the first source
419 ///    operand.
420 ///
421 /// \headerfile <x86intrin.h>
422 ///
423 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
424 ///
425 /// \param __a
426 ///    A 128-bit vector of [4 x float] containing the first source operand. The
427 ///    one's complement of this value is used in the bitwise AND.
428 /// \param __b
429 ///    A 128-bit vector of [4 x float] containing the second source operand.
430 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
431 ///    one's complement of the first operand and the values in the second
432 ///    operand.
433 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_andnot_ps(__m128 __a,__m128 __b)434 _mm_andnot_ps(__m128 __a, __m128 __b)
435 {
436   return (__m128)(~(__v4su)__a & (__v4su)__b);
437 }
438 
439 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
440 ///
441 /// \headerfile <x86intrin.h>
442 ///
443 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
444 ///
445 /// \param __a
446 ///    A 128-bit vector of [4 x float] containing one of the source operands.
447 /// \param __b
448 ///    A 128-bit vector of [4 x float] containing one of the source operands.
449 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
450 ///    values between both operands.
451 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_or_ps(__m128 __a,__m128 __b)452 _mm_or_ps(__m128 __a, __m128 __b)
453 {
454   return (__m128)((__v4su)__a | (__v4su)__b);
455 }
456 
457 /// Performs a bitwise exclusive OR of two 128-bit vectors of
458 ///    [4 x float].
459 ///
460 /// \headerfile <x86intrin.h>
461 ///
462 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
463 ///
464 /// \param __a
465 ///    A 128-bit vector of [4 x float] containing one of the source operands.
466 /// \param __b
467 ///    A 128-bit vector of [4 x float] containing one of the source operands.
468 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
469 ///    of the values between both operands.
470 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_xor_ps(__m128 __a,__m128 __b)471 _mm_xor_ps(__m128 __a, __m128 __b)
472 {
473   return (__m128)((__v4su)__a ^ (__v4su)__b);
474 }
475 
476 /// Compares two 32-bit float values in the low-order bits of both
477 ///    operands for equality.
478 ///
479 ///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
480 ///    low-order bits of a vector [4 x float].
481 ///
482 /// \headerfile <x86intrin.h>
483 ///
484 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
485 ///
486 /// \param __a
487 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
488 ///    32 bits of this operand are used in the comparison.
489 /// \param __b
490 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
491 ///    32 bits of this operand are used in the comparison.
492 /// \returns A 128-bit vector of [4 x float] containing the comparison results
493 ///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpeq_ss(__m128 __a, __m128 __b)
{
  /* Lane 0: 0xFFFFFFFF if __a[0] == __b[0], else 0; lanes 1-3 from __a. */
  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
}
499 
500 /// Compares each of the corresponding 32-bit float values of the
501 ///    128-bit vectors of [4 x float] for equality.
502 ///
503 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
504 ///
505 /// \headerfile <x86intrin.h>
506 ///
507 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
508 ///
509 /// \param __a
510 ///    A 128-bit vector of [4 x float].
511 /// \param __b
512 ///    A 128-bit vector of [4 x float].
513 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpeq_ps(__m128 __a, __m128 __b)
{
  /* Per-lane equality mask: all-ones where equal, zero elsewhere. */
  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
}
519 
520 /// Compares two 32-bit float values in the low-order bits of both
521 ///    operands to determine if the value in the first operand is less than the
522 ///    corresponding value in the second operand.
523 ///
524 ///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
525 ///    low-order bits of a vector of [4 x float].
526 ///
527 /// \headerfile <x86intrin.h>
528 ///
529 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
530 ///
531 /// \param __a
532 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
533 ///    32 bits of this operand are used in the comparison.
534 /// \param __b
535 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
536 ///    32 bits of this operand are used in the comparison.
537 /// \returns A 128-bit vector of [4 x float] containing the comparison results
538 ///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmplt_ss(__m128 __a, __m128 __b)
{
  /* Lane 0: 0xFFFFFFFF if __a[0] < __b[0], else 0; lanes 1-3 from __a. */
  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
}
544 
545 /// Compares each of the corresponding 32-bit float values of the
546 ///    128-bit vectors of [4 x float] to determine if the values in the first
547 ///    operand are less than those in the second operand.
548 ///
549 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
550 ///
551 /// \headerfile <x86intrin.h>
552 ///
553 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
554 ///
555 /// \param __a
556 ///    A 128-bit vector of [4 x float].
557 /// \param __b
558 ///    A 128-bit vector of [4 x float].
559 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmplt_ps(__m128 __a, __m128 __b)
{
  /* Per-lane mask: all-ones where __a < __b, zero elsewhere. */
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
}
565 
566 /// Compares two 32-bit float values in the low-order bits of both
567 ///    operands to determine if the value in the first operand is less than or
568 ///    equal to the corresponding value in the second operand.
569 ///
570 ///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in
571 ///    the low-order bits of a vector of [4 x float].
572 ///
573 /// \headerfile <x86intrin.h>
574 ///
575 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
576 ///
577 /// \param __a
578 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
579 ///    32 bits of this operand are used in the comparison.
580 /// \param __b
581 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
582 ///    32 bits of this operand are used in the comparison.
583 /// \returns A 128-bit vector of [4 x float] containing the comparison results
584 ///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmple_ss(__m128 __a, __m128 __b)
{
  /* Lane 0: 0xFFFFFFFF if __a[0] <= __b[0], else 0; lanes 1-3 from __a. */
  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
}
590 
591 /// Compares each of the corresponding 32-bit float values of the
592 ///    128-bit vectors of [4 x float] to determine if the values in the first
593 ///    operand are less than or equal to those in the second operand.
594 ///
595 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
596 ///
597 /// \headerfile <x86intrin.h>
598 ///
599 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
600 ///
601 /// \param __a
602 ///    A 128-bit vector of [4 x float].
603 /// \param __b
604 ///    A 128-bit vector of [4 x float].
605 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmple_ps(__m128 __a, __m128 __b)
{
  /* Per-lane mask: all-ones where __a <= __b, zero elsewhere. */
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
}
611 
612 /// Compares two 32-bit float values in the low-order bits of both
613 ///    operands to determine if the value in the first operand is greater than
614 ///    the corresponding value in the second operand.
615 ///
616 ///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
617 ///    low-order bits of a vector of [4 x float].
618 ///
619 /// \headerfile <x86intrin.h>
620 ///
621 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
622 ///
623 /// \param __a
624 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
625 ///    32 bits of this operand are used in the comparison.
626 /// \param __b
627 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
628 ///    32 bits of this operand are used in the comparison.
629 /// \returns A 128-bit vector of [4 x float] containing the comparison results
630 ///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ss(__m128 __a, __m128 __b)
{
  /* __a > __b is computed as __b < __a (CMPLTSS with swapped operands).
     The shuffle indices (4,1,2,3) select the compare mask for lane 0 and
     lanes 1-3 of __a, preserving the scalar-result convention of keeping
     the first operand's upper lanes. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
638 
639 /// Compares each of the corresponding 32-bit float values of the
640 ///    128-bit vectors of [4 x float] to determine if the values in the first
641 ///    operand are greater than those in the second operand.
642 ///
643 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
644 ///
645 /// \headerfile <x86intrin.h>
646 ///
647 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
648 ///
649 /// \param __a
650 ///    A 128-bit vector of [4 x float].
651 /// \param __b
652 ///    A 128-bit vector of [4 x float].
653 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ps(__m128 __a, __m128 __b)
{
  /* __a > __b implemented as __b < __a (CMPLTPS with swapped operands). */
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
}
659 
660 /// Compares two 32-bit float values in the low-order bits of both
661 ///    operands to determine if the value in the first operand is greater than
662 ///    or equal to the corresponding value in the second operand.
663 ///
664 ///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
665 ///    low-order bits of a vector of [4 x float].
666 ///
667 /// \headerfile <x86intrin.h>
668 ///
669 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
670 ///
671 /// \param __a
672 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
673 ///    32 bits of this operand are used in the comparison.
674 /// \param __b
675 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
676 ///    32 bits of this operand are used in the comparison.
677 /// \returns A 128-bit vector of [4 x float] containing the comparison results
678 ///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ss(__m128 __a, __m128 __b)
{
  /* __a >= __b is computed as __b <= __a (CMPLESS with swapped operands).
     The shuffle indices (4,1,2,3) put the compare mask in lane 0 and keep
     lanes 1-3 of __a. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
686 
687 /// Compares each of the corresponding 32-bit float values of the
688 ///    128-bit vectors of [4 x float] to determine if the values in the first
689 ///    operand are greater than or equal to those in the second operand.
690 ///
691 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
692 ///
693 /// \headerfile <x86intrin.h>
694 ///
695 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
696 ///
697 /// \param __a
698 ///    A 128-bit vector of [4 x float].
699 /// \param __b
700 ///    A 128-bit vector of [4 x float].
701 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ps(__m128 __a, __m128 __b)
{
  /* __a >= __b implemented as __b <= __a (CMPLEPS with swapped operands). */
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
}
707 
708 /// Compares two 32-bit float values in the low-order bits of both operands
709 ///    for inequality.
710 ///
711 ///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
712 ///    low-order bits of a vector of [4 x float].
713 ///
714 /// \headerfile <x86intrin.h>
715 ///
716 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
717 ///   instructions.
718 ///
719 /// \param __a
720 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
721 ///    32 bits of this operand are used in the comparison.
722 /// \param __b
723 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
724 ///    32 bits of this operand are used in the comparison.
725 /// \returns A 128-bit vector of [4 x float] containing the comparison results
726 ///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpneq_ss(__m128 __a, __m128 __b)
{
  /* Lane 0: 0xFFFFFFFF if __a[0] != __b[0], else 0; lanes 1-3 from __a. */
  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
}
732 
733 /// Compares each of the corresponding 32-bit float values of the
734 ///    128-bit vectors of [4 x float] for inequality.
735 ///
736 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
737 ///
738 /// \headerfile <x86intrin.h>
739 ///
740 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
741 ///   instructions.
742 ///
743 /// \param __a
744 ///    A 128-bit vector of [4 x float].
745 /// \param __b
746 ///    A 128-bit vector of [4 x float].
747 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpneq_ps(__m128 __a, __m128 __b)
{
  /* Per-lane mask: all-ones where lanes differ, zero elsewhere. */
  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
}
753 
754 /// Compares two 32-bit float values in the low-order bits of both
755 ///    operands to determine if the value in the first operand is not less than
756 ///    the corresponding value in the second operand.
757 ///
758 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
759 ///    low-order bits of a vector of [4 x float].
760 ///
761 /// \headerfile <x86intrin.h>
762 ///
763 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
764 ///   instructions.
765 ///
766 /// \param __a
767 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
768 ///    32 bits of this operand are used in the comparison.
769 /// \param __b
770 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
771 ///    32 bits of this operand are used in the comparison.
772 /// \returns A 128-bit vector of [4 x float] containing the comparison results
773 ///    in the low-order bits.
774 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnlt_ss(__m128 __a,__m128 __b)775 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
776 {
777   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
778 }
779 
780 /// Compares each of the corresponding 32-bit float values of the
781 ///    128-bit vectors of [4 x float] to determine if the values in the first
782 ///    operand are not less than those in the second operand.
783 ///
784 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
785 ///
786 /// \headerfile <x86intrin.h>
787 ///
788 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
789 ///   instructions.
790 ///
791 /// \param __a
792 ///    A 128-bit vector of [4 x float].
793 /// \param __b
794 ///    A 128-bit vector of [4 x float].
795 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
796 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnlt_ps(__m128 __a,__m128 __b)797 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
798 {
799   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
800 }
801 
802 /// Compares two 32-bit float values in the low-order bits of both
803 ///    operands to determine if the value in the first operand is not less than
804 ///    or equal to the corresponding value in the second operand.
805 ///
806 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
807 ///    low-order bits of a vector of [4 x float].
808 ///
809 /// \headerfile <x86intrin.h>
810 ///
811 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
812 ///   instructions.
813 ///
814 /// \param __a
815 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
816 ///    32 bits of this operand are used in the comparison.
817 /// \param __b
818 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
819 ///    32 bits of this operand are used in the comparison.
820 /// \returns A 128-bit vector of [4 x float] containing the comparison results
821 ///    in the low-order bits.
822 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnle_ss(__m128 __a,__m128 __b)823 _mm_cmpnle_ss(__m128 __a, __m128 __b)
824 {
825   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
826 }
827 
828 /// Compares each of the corresponding 32-bit float values of the
829 ///    128-bit vectors of [4 x float] to determine if the values in the first
830 ///    operand are not less than or equal to those in the second operand.
831 ///
832 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
833 ///
834 /// \headerfile <x86intrin.h>
835 ///
836 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
837 ///   instructions.
838 ///
839 /// \param __a
840 ///    A 128-bit vector of [4 x float].
841 /// \param __b
842 ///    A 128-bit vector of [4 x float].
843 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
844 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnle_ps(__m128 __a,__m128 __b)845 _mm_cmpnle_ps(__m128 __a, __m128 __b)
846 {
847   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
848 }
849 
850 /// Compares two 32-bit float values in the low-order bits of both
851 ///    operands to determine if the value in the first operand is not greater
852 ///    than the corresponding value in the second operand.
853 ///
854 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
855 ///    low-order bits of a vector of [4 x float].
856 ///
857 /// \headerfile <x86intrin.h>
858 ///
859 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
860 ///   instructions.
861 ///
862 /// \param __a
863 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
864 ///    32 bits of this operand are used in the comparison.
865 /// \param __b
866 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
867 ///    32 bits of this operand are used in the comparison.
868 /// \returns A 128-bit vector of [4 x float] containing the comparison results
869 ///    in the low-order bits.
870 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpngt_ss(__m128 __a,__m128 __b)871 _mm_cmpngt_ss(__m128 __a, __m128 __b)
872 {
873   return (__m128)__builtin_shufflevector((__v4sf)__a,
874                                          (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
875                                          4, 1, 2, 3);
876 }
877 
878 /// Compares each of the corresponding 32-bit float values of the
879 ///    128-bit vectors of [4 x float] to determine if the values in the first
880 ///    operand are not greater than those in the second operand.
881 ///
882 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
883 ///
884 /// \headerfile <x86intrin.h>
885 ///
886 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
887 ///   instructions.
888 ///
889 /// \param __a
890 ///    A 128-bit vector of [4 x float].
891 /// \param __b
892 ///    A 128-bit vector of [4 x float].
893 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
894 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpngt_ps(__m128 __a,__m128 __b)895 _mm_cmpngt_ps(__m128 __a, __m128 __b)
896 {
897   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
898 }
899 
900 /// Compares two 32-bit float values in the low-order bits of both
901 ///    operands to determine if the value in the first operand is not greater
902 ///    than or equal to the corresponding value in the second operand.
903 ///
904 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
905 ///    low-order bits of a vector of [4 x float].
906 ///
907 /// \headerfile <x86intrin.h>
908 ///
909 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
910 ///   instructions.
911 ///
912 /// \param __a
913 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
914 ///    32 bits of this operand are used in the comparison.
915 /// \param __b
916 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
917 ///    32 bits of this operand are used in the comparison.
918 /// \returns A 128-bit vector of [4 x float] containing the comparison results
919 ///    in the low-order bits.
920 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnge_ss(__m128 __a,__m128 __b)921 _mm_cmpnge_ss(__m128 __a, __m128 __b)
922 {
923   return (__m128)__builtin_shufflevector((__v4sf)__a,
924                                          (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
925                                          4, 1, 2, 3);
926 }
927 
928 /// Compares each of the corresponding 32-bit float values of the
929 ///    128-bit vectors of [4 x float] to determine if the values in the first
930 ///    operand are not greater than or equal to those in the second operand.
931 ///
932 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
933 ///
934 /// \headerfile <x86intrin.h>
935 ///
936 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
937 ///   instructions.
938 ///
939 /// \param __a
940 ///    A 128-bit vector of [4 x float].
941 /// \param __b
942 ///    A 128-bit vector of [4 x float].
943 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
944 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnge_ps(__m128 __a,__m128 __b)945 _mm_cmpnge_ps(__m128 __a, __m128 __b)
946 {
947   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
948 }
949 
950 /// Compares two 32-bit float values in the low-order bits of both
951 ///    operands to determine if the value in the first operand is ordered with
952 ///    respect to the corresponding value in the second operand.
953 ///
954 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
955 ///    low-order bits of a vector of [4 x float].
956 ///
957 /// \headerfile <x86intrin.h>
958 ///
959 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
960 ///   instructions.
961 ///
962 /// \param __a
963 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
964 ///    32 bits of this operand are used in the comparison.
965 /// \param __b
966 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
967 ///    32 bits of this operand are used in the comparison.
968 /// \returns A 128-bit vector of [4 x float] containing the comparison results
969 ///    in the low-order bits.
970 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpord_ss(__m128 __a,__m128 __b)971 _mm_cmpord_ss(__m128 __a, __m128 __b)
972 {
973   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
974 }
975 
976 /// Compares each of the corresponding 32-bit float values of the
977 ///    128-bit vectors of [4 x float] to determine if the values in the first
978 ///    operand are ordered with respect to those in the second operand.
979 ///
980 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
981 ///
982 /// \headerfile <x86intrin.h>
983 ///
984 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
985 ///   instructions.
986 ///
987 /// \param __a
988 ///    A 128-bit vector of [4 x float].
989 /// \param __b
990 ///    A 128-bit vector of [4 x float].
991 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
992 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpord_ps(__m128 __a,__m128 __b)993 _mm_cmpord_ps(__m128 __a, __m128 __b)
994 {
995   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
996 }
997 
998 /// Compares two 32-bit float values in the low-order bits of both
999 ///    operands to determine if the value in the first operand is unordered
1000 ///    with respect to the corresponding value in the second operand.
1001 ///
1002 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
1003 ///    low-order bits of a vector of [4 x float].
1004 ///
1005 /// \headerfile <x86intrin.h>
1006 ///
1007 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
1008 ///   instructions.
1009 ///
1010 /// \param __a
1011 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
1012 ///    32 bits of this operand are used in the comparison.
1013 /// \param __b
1014 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
1015 ///    32 bits of this operand are used in the comparison.
1016 /// \returns A 128-bit vector of [4 x float] containing the comparison results
1017 ///    in the low-order bits.
1018 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpunord_ss(__m128 __a,__m128 __b)1019 _mm_cmpunord_ss(__m128 __a, __m128 __b)
1020 {
1021   return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
1022 }
1023 
1024 /// Compares each of the corresponding 32-bit float values of the
1025 ///    128-bit vectors of [4 x float] to determine if the values in the first
1026 ///    operand are unordered with respect to those in the second operand.
1027 ///
1028 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
1029 ///
1030 /// \headerfile <x86intrin.h>
1031 ///
1032 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
1033 ///   instructions.
1034 ///
1035 /// \param __a
1036 ///    A 128-bit vector of [4 x float].
1037 /// \param __b
1038 ///    A 128-bit vector of [4 x float].
1039 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1040 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpunord_ps(__m128 __a,__m128 __b)1041 _mm_cmpunord_ps(__m128 __a, __m128 __b)
1042 {
1043   return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1044 }
1045 
1046 /// Compares two 32-bit float values in the low-order bits of both
1047 ///    operands for equality.
1048 ///
1049 ///    The comparison returns 0 for false, 1 for true. If either of the two
1050 ///    lower floating-point values is NaN, returns 0.
1051 ///
1052 /// \headerfile <x86intrin.h>
1053 ///
1054 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1055 ///   instructions.
1056 ///
1057 /// \param __a
1058 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1059 ///    used in the comparison.
1060 /// \param __b
1061 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1062 ///    used in the comparison.
1063 /// \returns An integer containing the comparison results.
1064 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_ss(__m128 __a,__m128 __b)1065 _mm_comieq_ss(__m128 __a, __m128 __b)
1066 {
1067   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1068 }
1069 
1070 /// Compares two 32-bit float values in the low-order bits of both
1071 ///    operands to determine if the first operand is less than the second
1072 ///    operand.
1073 ///
1074 ///    The comparison returns 0 for false, 1 for true. If either of the two
1075 ///    lower floating-point values is NaN, returns 0.
1076 ///
1077 /// \headerfile <x86intrin.h>
1078 ///
1079 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1080 ///   instructions.
1081 ///
1082 /// \param __a
1083 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1084 ///    used in the comparison.
1085 /// \param __b
1086 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1087 ///    used in the comparison.
1088 /// \returns An integer containing the comparison results.
1089 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_ss(__m128 __a,__m128 __b)1090 _mm_comilt_ss(__m128 __a, __m128 __b)
1091 {
1092   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1093 }
1094 
1095 /// Compares two 32-bit float values in the low-order bits of both
1096 ///    operands to determine if the first operand is less than or equal to the
1097 ///    second operand.
1098 ///
1099 ///    The comparison returns 0 for false, 1 for true. If either of the two
1100 ///    lower floating-point values is NaN, returns 0.
1101 ///
1102 /// \headerfile <x86intrin.h>
1103 ///
1104 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1105 ///
1106 /// \param __a
1107 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1108 ///    used in the comparison.
1109 /// \param __b
1110 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1111 ///    used in the comparison.
1112 /// \returns An integer containing the comparison results.
1113 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_ss(__m128 __a,__m128 __b)1114 _mm_comile_ss(__m128 __a, __m128 __b)
1115 {
1116   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1117 }
1118 
1119 /// Compares two 32-bit float values in the low-order bits of both
1120 ///    operands to determine if the first operand is greater than the second
1121 ///    operand.
1122 ///
1123 ///    The comparison returns 0 for false, 1 for true. If either of the two
1124 ///    lower floating-point values is NaN, returns 0.
1125 ///
1126 /// \headerfile <x86intrin.h>
1127 ///
1128 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1129 ///
1130 /// \param __a
1131 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1132 ///    used in the comparison.
1133 /// \param __b
1134 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1135 ///    used in the comparison.
1136 /// \returns An integer containing the comparison results.
1137 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_ss(__m128 __a,__m128 __b)1138 _mm_comigt_ss(__m128 __a, __m128 __b)
1139 {
1140   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1141 }
1142 
1143 /// Compares two 32-bit float values in the low-order bits of both
1144 ///    operands to determine if the first operand is greater than or equal to
1145 ///    the second operand.
1146 ///
1147 ///    The comparison returns 0 for false, 1 for true. If either of the two
1148 ///    lower floating-point values is NaN, returns 0.
1149 ///
1150 /// \headerfile <x86intrin.h>
1151 ///
1152 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1153 ///
1154 /// \param __a
1155 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1156 ///    used in the comparison.
1157 /// \param __b
1158 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1159 ///    used in the comparison.
1160 /// \returns An integer containing the comparison results.
1161 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_ss(__m128 __a,__m128 __b)1162 _mm_comige_ss(__m128 __a, __m128 __b)
1163 {
1164   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1165 }
1166 
1167 /// Compares two 32-bit float values in the low-order bits of both
1168 ///    operands to determine if the first operand is not equal to the second
1169 ///    operand.
1170 ///
1171 ///    The comparison returns 0 for false, 1 for true. If either of the two
1172 ///    lower floating-point values is NaN, returns 0.
1173 ///
1174 /// \headerfile <x86intrin.h>
1175 ///
1176 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1177 ///
1178 /// \param __a
1179 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1180 ///    used in the comparison.
1181 /// \param __b
1182 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1183 ///    used in the comparison.
1184 /// \returns An integer containing the comparison results.
1185 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_ss(__m128 __a,__m128 __b)1186 _mm_comineq_ss(__m128 __a, __m128 __b)
1187 {
1188   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1189 }
1190 
1191 /// Performs an unordered comparison of two 32-bit float values using
1192 ///    the low-order bits of both operands to determine equality.
1193 ///
1194 ///    The comparison returns 0 for false, 1 for true.  If either of the two
1195 ///    lower floating-point values is NaN, returns 0.
1196 ///
1197 /// \headerfile <x86intrin.h>
1198 ///
1199 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1200 ///
1201 /// \param __a
1202 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1203 ///    used in the comparison.
1204 /// \param __b
1205 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1206 ///    used in the comparison.
1207 /// \returns An integer containing the comparison results.
1208 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_ss(__m128 __a,__m128 __b)1209 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1210 {
1211   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1212 }
1213 
1214 /// Performs an unordered comparison of two 32-bit float values using
1215 ///    the low-order bits of both operands to determine if the first operand is
1216 ///    less than the second operand.
1217 ///
1218 ///    The comparison returns 0 for false, 1 for true.  If either of the two
1219 ///    lower floating-point values is NaN, returns 0.
1220 ///
1221 /// \headerfile <x86intrin.h>
1222 ///
1223 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1224 ///
1225 /// \param __a
1226 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1227 ///    used in the comparison.
1228 /// \param __b
1229 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1230 ///    used in the comparison.
1231 /// \returns An integer containing the comparison results.
1232 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_ss(__m128 __a,__m128 __b)1233 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1234 {
1235   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1236 }
1237 
1238 /// Performs an unordered comparison of two 32-bit float values using
1239 ///    the low-order bits of both operands to determine if the first operand is
1240 ///    less than or equal to the second operand.
1241 ///
1242 ///    The comparison returns 0 for false, 1 for true.  If either of the two
1243 ///    lower floating-point values is NaN, returns 0.
1244 ///
1245 /// \headerfile <x86intrin.h>
1246 ///
1247 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1248 ///
1249 /// \param __a
1250 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1251 ///    used in the comparison.
1252 /// \param __b
1253 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1254 ///    used in the comparison.
1255 /// \returns An integer containing the comparison results.
1256 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_ss(__m128 __a,__m128 __b)1257 _mm_ucomile_ss(__m128 __a, __m128 __b)
1258 {
1259   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1260 }
1261 
1262 /// Performs an unordered comparison of two 32-bit float values using
1263 ///    the low-order bits of both operands to determine if the first operand is
1264 ///    greater than the second operand.
1265 ///
1266 ///    The comparison returns 0 for false, 1 for true.  If either of the two
1267 ///    lower floating-point values is NaN, returns 0.
1268 ///
1269 /// \headerfile <x86intrin.h>
1270 ///
1271 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1272 ///
1273 /// \param __a
1274 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1275 ///    used in the comparison.
1276 /// \param __b
1277 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1278 ///    used in the comparison.
1279 /// \returns An integer containing the comparison results.
1280 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomigt_ss(__m128 __a,__m128 __b)1281 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1282 {
1283   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1284 }
1285 
1286 /// Performs an unordered comparison of two 32-bit float values using
1287 ///    the low-order bits of both operands to determine if the first operand is
1288 ///    greater than or equal to the second operand.
1289 ///
1290 ///    The comparison returns 0 for false, 1 for true.  If either of the two
1291 ///    lower floating-point values is NaN, returns 0.
1292 ///
1293 /// \headerfile <x86intrin.h>
1294 ///
1295 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1296 ///
1297 /// \param __a
1298 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1299 ///    used in the comparison.
1300 /// \param __b
1301 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1302 ///    used in the comparison.
1303 /// \returns An integer containing the comparison results.
1304 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomige_ss(__m128 __a,__m128 __b)1305 _mm_ucomige_ss(__m128 __a, __m128 __b)
1306 {
1307   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1308 }
1309 
1310 /// Performs an unordered comparison of two 32-bit float values using
1311 ///    the low-order bits of both operands to determine inequality.
1312 ///
1313 ///    The comparison returns 0 for false, 1 for true.  If either of the two
1314 ///    lower floating-point values is NaN, returns 0.
1315 ///
1316 /// \headerfile <x86intrin.h>
1317 ///
1318 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1319 ///
1320 /// \param __a
1321 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1322 ///    used in the comparison.
1323 /// \param __b
1324 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1325 ///    used in the comparison.
1326 /// \returns An integer containing the comparison results.
1327 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_ss(__m128 __a,__m128 __b)1328 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1329 {
1330   return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1331 }
1332 
1333 /// Converts a float value contained in the lower 32 bits of a vector of
1334 ///    [4 x float] into a 32-bit integer.
1335 ///
1336 /// \headerfile <x86intrin.h>
1337 ///
1338 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1339 ///   instructions.
1340 ///
1341 /// \param __a
1342 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1343 ///    used in the conversion.
1344 /// \returns A 32-bit integer containing the converted value.
1345 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtss_si32(__m128 __a)1346 _mm_cvtss_si32(__m128 __a)
1347 {
1348   return __builtin_ia32_cvtss2si((__v4sf)__a);
1349 }
1350 
1351 /// Converts a float value contained in the lower 32 bits of a vector of
1352 ///    [4 x float] into a 32-bit integer.
1353 ///
1354 /// \headerfile <x86intrin.h>
1355 ///
1356 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1357 ///   instructions.
1358 ///
1359 /// \param __a
1360 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1361 ///    used in the conversion.
1362 /// \returns A 32-bit integer containing the converted value.
1363 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvt_ss2si(__m128 __a)1364 _mm_cvt_ss2si(__m128 __a)
1365 {
1366   return _mm_cvtss_si32(__a);
1367 }
1368 
1369 #ifdef __x86_64__
1370 
1371 /// Converts a float value contained in the lower 32 bits of a vector of
1372 ///    [4 x float] into a 64-bit integer.
1373 ///
1374 /// \headerfile <x86intrin.h>
1375 ///
1376 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1377 ///   instructions.
1378 ///
1379 /// \param __a
1380 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1381 ///    used in the conversion.
1382 /// \returns A 64-bit integer containing the converted value.
1383 static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtss_si64(__m128 __a)1384 _mm_cvtss_si64(__m128 __a)
1385 {
1386   return __builtin_ia32_cvtss2si64((__v4sf)__a);
1387 }
1388 
1389 #endif
1390 
1391 /// Converts two low-order float values in a 128-bit vector of
1392 ///    [4 x float] into a 64-bit vector of [2 x i32].
1393 ///
1394 /// \headerfile <x86intrin.h>
1395 ///
1396 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1397 ///
1398 /// \param __a
1399 ///    A 128-bit vector of [4 x float].
1400 /// \returns A 64-bit integer vector containing the converted values.
1401 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtps_pi32(__m128 __a)1402 _mm_cvtps_pi32(__m128 __a)
1403 {
1404   return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1405 }
1406 
1407 /// Converts two low-order float values in a 128-bit vector of
1408 ///    [4 x float] into a 64-bit vector of [2 x i32].
1409 ///
1410 /// \headerfile <x86intrin.h>
1411 ///
1412 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1413 ///
1414 /// \param __a
1415 ///    A 128-bit vector of [4 x float].
1416 /// \returns A 64-bit integer vector containing the converted values.
1417 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvt_ps2pi(__m128 __a)1418 _mm_cvt_ps2pi(__m128 __a)
1419 {
1420   return _mm_cvtps_pi32(__a);
1421 }
1422 
1423 /// Converts a float value contained in the lower 32 bits of a vector of
1424 ///    [4 x float] into a 32-bit integer, truncating the result when it is
1425 ///    inexact.
1426 ///
1427 /// \headerfile <x86intrin.h>
1428 ///
1429 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1430 ///   instructions.
1431 ///
1432 /// \param __a
1433 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1434 ///    used in the conversion.
1435 /// \returns A 32-bit integer containing the converted value.
1436 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvttss_si32(__m128 __a)1437 _mm_cvttss_si32(__m128 __a)
1438 {
1439   return __builtin_ia32_cvttss2si((__v4sf)__a);
1440 }
1441 
1442 /// Converts a float value contained in the lower 32 bits of a vector of
1443 ///    [4 x float] into a 32-bit integer, truncating the result when it is
1444 ///    inexact.
1445 ///
1446 /// \headerfile <x86intrin.h>
1447 ///
1448 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1449 ///   instructions.
1450 ///
1451 /// \param __a
1452 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1453 ///    used in the conversion.
1454 /// \returns A 32-bit integer containing the converted value.
1455 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtt_ss2si(__m128 __a)1456 _mm_cvtt_ss2si(__m128 __a)
1457 {
1458   return _mm_cvttss_si32(__a);
1459 }
1460 
1461 #ifdef __x86_64__
1462 /// Converts a float value contained in the lower 32 bits of a vector of
1463 ///    [4 x float] into a 64-bit integer, truncating the result when it is
1464 ///    inexact.
1465 ///
1466 /// \headerfile <x86intrin.h>
1467 ///
1468 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1469 ///   instructions.
1470 ///
1471 /// \param __a
1472 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1473 ///    used in the conversion.
1474 /// \returns A 64-bit integer containing the converted value.
1475 static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvttss_si64(__m128 __a)1476 _mm_cvttss_si64(__m128 __a)
1477 {
1478   return __builtin_ia32_cvttss2si64((__v4sf)__a);
1479 }
1480 #endif
1481 
1482 /// Converts two low-order float values in a 128-bit vector of
1483 ///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1484 ///    when it is inexact.
1485 ///
1486 /// \headerfile <x86intrin.h>
1487 ///
1488 /// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1489 ///   instructions.
1490 ///
1491 /// \param __a
1492 ///    A 128-bit vector of [4 x float].
1493 /// \returns A 64-bit integer vector containing the converted values.
1494 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvttps_pi32(__m128 __a)1495 _mm_cvttps_pi32(__m128 __a)
1496 {
1497   return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1498 }
1499 
1500 /// Converts two low-order float values in a 128-bit vector of [4 x
1501 ///    float] into a 64-bit vector of [2 x i32], truncating the result when it
1502 ///    is inexact.
1503 ///
1504 /// \headerfile <x86intrin.h>
1505 ///
1506 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1507 ///
1508 /// \param __a
1509 ///    A 128-bit vector of [4 x float].
1510 /// \returns A 64-bit integer vector containing the converted values.
1511 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtt_ps2pi(__m128 __a)1512 _mm_cvtt_ps2pi(__m128 __a)
1513 {
1514   return _mm_cvttps_pi32(__a);
1515 }
1516 
1517 /// Converts a 32-bit signed integer value into a floating point value
1518 ///    and writes it to the lower 32 bits of the destination. The remaining
1519 ///    higher order elements of the destination vector are copied from the
1520 ///    corresponding elements in the first operand.
1521 ///
1522 /// \headerfile <x86intrin.h>
1523 ///
1524 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1525 ///
1526 /// \param __a
1527 ///    A 128-bit vector of [4 x float].
1528 /// \param __b
1529 ///    A 32-bit signed integer operand containing the value to be converted.
1530 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1531 ///    converted value of the second operand. The upper 96 bits are copied from
1532 ///    the upper 96 bits of the first operand.
1533 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsi32_ss(__m128 __a,int __b)1534 _mm_cvtsi32_ss(__m128 __a, int __b)
1535 {
1536   __a[0] = __b;
1537   return __a;
1538 }
1539 
1540 /// Converts a 32-bit signed integer value into a floating point value
1541 ///    and writes it to the lower 32 bits of the destination. The remaining
1542 ///    higher order elements of the destination are copied from the
1543 ///    corresponding elements in the first operand.
1544 ///
1545 /// \headerfile <x86intrin.h>
1546 ///
1547 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1548 ///
1549 /// \param __a
1550 ///    A 128-bit vector of [4 x float].
1551 /// \param __b
1552 ///    A 32-bit signed integer operand containing the value to be converted.
1553 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1554 ///    converted value of the second operand. The upper 96 bits are copied from
1555 ///    the upper 96 bits of the first operand.
1556 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvt_si2ss(__m128 __a,int __b)1557 _mm_cvt_si2ss(__m128 __a, int __b)
1558 {
1559   return _mm_cvtsi32_ss(__a, __b);
1560 }
1561 
1562 #ifdef __x86_64__
1563 
1564 /// Converts a 64-bit signed integer value into a floating point value
1565 ///    and writes it to the lower 32 bits of the destination. The remaining
1566 ///    higher order elements of the destination are copied from the
1567 ///    corresponding elements in the first operand.
1568 ///
1569 /// \headerfile <x86intrin.h>
1570 ///
1571 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1572 ///
1573 /// \param __a
1574 ///    A 128-bit vector of [4 x float].
1575 /// \param __b
1576 ///    A 64-bit signed integer operand containing the value to be converted.
1577 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1578 ///    converted value of the second operand. The upper 96 bits are copied from
1579 ///    the upper 96 bits of the first operand.
1580 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsi64_ss(__m128 __a,long long __b)1581 _mm_cvtsi64_ss(__m128 __a, long long __b)
1582 {
1583   __a[0] = __b;
1584   return __a;
1585 }
1586 
1587 #endif
1588 
/// Converts two elements of a 64-bit vector of [2 x i32] into two
///    floating point values and writes them to the lower 64-bits of the
///    destination. The remaining higher order elements of the destination are
///    copied from the corresponding elements in the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
///    and written to the corresponding low-order elements in the destination.
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    converted value of the second operand. The upper 64 bits are copied from
///    the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
  /* Lanes 0-1 get the converted integers; lanes 2-3 pass through from __a. */
  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
}
1611 
/// Converts two elements of a 64-bit vector of [2 x i32] into two
///    floating point values and writes them to the lower 64-bits of the
///    destination. The remaining higher order elements of the destination are
///    copied from the corresponding elements in the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
///    and written to the corresponding low-order elements in the destination.
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    converted value from the second operand. The upper 64 bits are copied
///    from the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
{
  /* Legacy alternate name for _mm_cvtpi32_ps. */
  return _mm_cvtpi32_ps(__a, __b);
}
1634 
1635 /// Extracts a float value contained in the lower 32 bits of a vector of
1636 ///    [4 x float].
1637 ///
1638 /// \headerfile <x86intrin.h>
1639 ///
1640 /// This intrinsic has no corresponding instruction.
1641 ///
1642 /// \param __a
1643 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1644 ///    used in the extraction.
1645 /// \returns A 32-bit float containing the extracted value.
1646 static __inline__ float __DEFAULT_FN_ATTRS
_mm_cvtss_f32(__m128 __a)1647 _mm_cvtss_f32(__m128 __a)
1648 {
1649   return __a[0];
1650 }
1651 
/// Loads two packed float values from the address \a __p into the
///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
///     are copied from the low-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
///    of the destination.
/// \param __p
///    A pointer to two packed float values. Bits [63:0] are written to bits
///    [127:64] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the moved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadh_pi(__m128 __a, const __m64 *__p)
{
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  /* __packed__ permits an unaligned load; __may_alias__ makes the read
     legal regardless of the pointed-to memory's declared type. */
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
  /* Widen the 2-element vector to 4 lanes, then combine: lanes 0-1 from
     __a, lanes 0-1 of the loaded data (indices 4-5) as the high half. */
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
}
1678 
/// Loads two packed float values from the address \a __p into the
///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
///    are copied from the high-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
///    [127:64] of the destination.
/// \param __p
///    A pointer to two packed float values. Bits [63:0] are written to bits
///    [63:0] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the moved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadl_pi(__m128 __a, const __m64 *__p)
{
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  /* __packed__ permits an unaligned load; __may_alias__ makes the read
     legal regardless of the pointed-to memory's declared type. */
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
  /* Widen the 2-element vector to 4 lanes, then combine: lanes 0-1 of the
     loaded data (indices 4-5) as the low half, lanes 2-3 from __a. */
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
}
1705 
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
///    32 bits of the vector are initialized with the single-precision
///    floating-point value loaded from a specified memory location. The upper
///    96 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __p
///    A pointer to a 32-bit memory location containing a single-precision
///    floating-point value.
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
///    lower 32 bits contain the value loaded from the memory location. The
///    upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ss(const float *__p)
{
  /* The packed, may_alias wrapper struct makes the load safe for any
     alignment and any declared type of the pointed-to memory. */
  struct __mm_load_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
  return __extension__ (__m128){ __u, 0, 0, 0 };
}
1730 
/// Loads a 32-bit float value and duplicates it to all four vector
///    elements of a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a float value to be loaded and duplicated.
/// \returns A 128-bit vector of [4 x float] containing the loaded and
///    duplicated values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load1_ps(const float *__p)
{
  /* The packed, may_alias wrapper struct makes the load safe for any
     alignment and any declared type of the pointed-to memory. */
  struct __mm_load1_ps_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
  return __extension__ (__m128){ __u, __u, __u, __u };
}
1752 
/* Alternate (classic) name for _mm_load1_ps. */
#define        _mm_load_ps1(p) _mm_load1_ps(p)
1754 
1755 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1756 ///    memory location.
1757 ///
1758 /// \headerfile <x86intrin.h>
1759 ///
1760 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1761 ///
1762 /// \param __p
1763 ///    A pointer to a 128-bit memory location. The address of the memory
1764 ///    location has to be 128-bit aligned.
1765 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1766 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ps(const float * __p)1767 _mm_load_ps(const float *__p)
1768 {
1769   return *(const __m128*)__p;
1770 }
1771 
/// Loads a 128-bit floating-point vector of [4 x float] from an
///    unaligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadu_ps(const float *__p)
{
  /* __m128_u is the 1-byte-aligned vector type; wrapping it in a packed,
     may_alias struct yields a safe unaligned load from any memory. */
  struct __loadu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ps*)__p)->__v;
}
1791 
1792 /// Loads four packed float values, in reverse order, from an aligned
1793 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1794 ///
1795 /// \headerfile <x86intrin.h>
1796 ///
1797 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1798 ///    instruction.
1799 ///
1800 /// \param __p
1801 ///    A pointer to a 128-bit memory location. The address of the memory
1802 ///    location has to be 128-bit aligned.
1803 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1804 ///    in reverse order.
1805 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadr_ps(const float * __p)1806 _mm_loadr_ps(const float *__p)
1807 {
1808   __m128 __a = _mm_load_ps(__p);
1809   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1810 }
1811 
/// Create a 128-bit vector of [4 x float] with undefined values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \returns A 128-bit vector of [4 x float] containing undefined values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_undefined_ps(void)
{
  /* The builtin produces an explicitly undefined value without reading
     any uninitialized storage. */
  return (__m128)__builtin_ia32_undef128();
}
1824 
1825 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1826 ///    32 bits of the vector are initialized with the specified single-precision
1827 ///    floating-point value. The upper 96 bits are set to zero.
1828 ///
1829 /// \headerfile <x86intrin.h>
1830 ///
1831 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1832 ///
1833 /// \param __w
1834 ///    A single-precision floating-point value used to initialize the lower 32
1835 ///    bits of the result.
1836 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1837 ///    lower 32 bits contain the value provided in the source operand. The
1838 ///    upper 96 bits are set to zero.
1839 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ss(float __w)1840 _mm_set_ss(float __w)
1841 {
1842   return __extension__ (__m128){ __w, 0, 0, 0 };
1843 }
1844 
1845 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1846 ///    of the four single-precision floating-point vector elements set to the
1847 ///    specified single-precision floating-point value.
1848 ///
1849 /// \headerfile <x86intrin.h>
1850 ///
1851 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1852 ///
1853 /// \param __w
1854 ///    A single-precision floating-point value used to initialize each vector
1855 ///    element of the result.
1856 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1857 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set1_ps(float __w)1858 _mm_set1_ps(float __w)
1859 {
1860   return __extension__ (__m128){ __w, __w, __w, __w };
1861 }
1862 
1863 /* Microsoft specific. */
1864 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1865 ///    of the four single-precision floating-point vector elements set to the
1866 ///    specified single-precision floating-point value.
1867 ///
1868 /// \headerfile <x86intrin.h>
1869 ///
1870 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1871 ///
1872 /// \param __w
1873 ///    A single-precision floating-point value used to initialize each vector
1874 ///    element of the result.
1875 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1876 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ps1(float __w)1877 _mm_set_ps1(float __w)
1878 {
1879     return _mm_set1_ps(__w);
1880 }
1881 
1882 /// Constructs a 128-bit floating-point vector of [4 x float]
1883 ///    initialized with the specified single-precision floating-point values.
1884 ///
1885 /// \headerfile <x86intrin.h>
1886 ///
1887 /// This intrinsic is a utility function and does not correspond to a specific
1888 ///    instruction.
1889 ///
1890 /// \param __z
1891 ///    A single-precision floating-point value used to initialize bits [127:96]
1892 ///    of the result.
1893 /// \param __y
1894 ///    A single-precision floating-point value used to initialize bits [95:64]
1895 ///    of the result.
1896 /// \param __x
1897 ///    A single-precision floating-point value used to initialize bits [63:32]
1898 ///    of the result.
1899 /// \param __w
1900 ///    A single-precision floating-point value used to initialize bits [31:0]
1901 ///    of the result.
1902 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1903 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ps(float __z,float __y,float __x,float __w)1904 _mm_set_ps(float __z, float __y, float __x, float __w)
1905 {
1906   return __extension__ (__m128){ __w, __x, __y, __z };
1907 }
1908 
1909 /// Constructs a 128-bit floating-point vector of [4 x float],
1910 ///    initialized in reverse order with the specified 32-bit single-precision
1911 ///    float-point values.
1912 ///
1913 /// \headerfile <x86intrin.h>
1914 ///
1915 /// This intrinsic is a utility function and does not correspond to a specific
1916 ///    instruction.
1917 ///
1918 /// \param __z
1919 ///    A single-precision floating-point value used to initialize bits [31:0]
1920 ///    of the result.
1921 /// \param __y
1922 ///    A single-precision floating-point value used to initialize bits [63:32]
1923 ///    of the result.
1924 /// \param __x
1925 ///    A single-precision floating-point value used to initialize bits [95:64]
1926 ///    of the result.
1927 /// \param __w
1928 ///    A single-precision floating-point value used to initialize bits [127:96]
1929 ///    of the result.
1930 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1931 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setr_ps(float __z,float __y,float __x,float __w)1932 _mm_setr_ps(float __z, float __y, float __x, float __w)
1933 {
1934   return __extension__ (__m128){ __z, __y, __x, __w };
1935 }
1936 
1937 /// Constructs a 128-bit floating-point vector of [4 x float] initialized
1938 ///    to zero.
1939 ///
1940 /// \headerfile <x86intrin.h>
1941 ///
1942 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1943 ///
1944 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
1945 ///    all elements set to zero.
1946 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setzero_ps(void)1947 _mm_setzero_ps(void)
1948 {
1949   return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
1950 }
1951 
/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
///
/// \param __p
///    A pointer to a 64-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pi(__m64 *__p, __m128 __a)
{
  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
  /* __packed__ permits an unaligned store; __may_alias__ makes the write
     legal regardless of the pointed-to memory's declared type. */
  struct __mm_storeh_pi_struct {
    __mm_storeh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  /* Lanes 2-3 of __a form the 64-bit value written to *__p. */
  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
}
1972 
1973 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1974 ///     memory location.
1975 ///
1976 /// \headerfile <x86intrin.h>
1977 ///
1978 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1979 ///
1980 /// \param __p
1981 ///    A pointer to a memory location that will receive the float values.
1982 /// \param __a
1983 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1984 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_pi(__m64 * __p,__m128 __a)1985 _mm_storel_pi(__m64 *__p, __m128 __a)
1986 {
1987   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1988   struct __mm_storeh_pi_struct {
1989     __mm_storeh_pi_v2f32 __u;
1990   } __attribute__((__packed__, __may_alias__));
1991   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1992 }
1993 
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
///     memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __p
///    A pointer to a 32-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ss(float *__p, __m128 __a)
{
  /* The packed, may_alias wrapper struct makes the store safe for any
     alignment and any declared type of the pointed-to memory. */
  struct __mm_store_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
}
2013 
/// Stores a 128-bit vector of [4 x float] to an unaligned memory
///    location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_ps(float *__p, __m128 __a)
{
  /* __m128_u is the 1-byte-aligned vector type; wrapping it in a packed,
     may_alias struct yields a safe unaligned store to any memory. */
  struct __storeu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ps*)__p)->__v = __a;
}
2034 
2035 /// Stores a 128-bit vector of [4 x float] into an aligned memory
2036 ///    location.
2037 ///
2038 /// \headerfile <x86intrin.h>
2039 ///
2040 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2041 ///
2042 /// \param __p
2043 ///    A pointer to a 128-bit memory location. The address of the memory
2044 ///    location has to be 16-byte aligned.
2045 /// \param __a
2046 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2047 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps(float * __p,__m128 __a)2048 _mm_store_ps(float *__p, __m128 __a)
2049 {
2050   *(__m128*)__p = __a;
2051 }
2052 
2053 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2054 ///    four contiguous elements in an aligned memory location.
2055 ///
2056 /// \headerfile <x86intrin.h>
2057 ///
2058 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2059 ///    instruction.
2060 ///
2061 /// \param __p
2062 ///    A pointer to a 128-bit memory location.
2063 /// \param __a
2064 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2065 ///    of the four contiguous elements pointed by \a __p.
2066 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_ps(float * __p,__m128 __a)2067 _mm_store1_ps(float *__p, __m128 __a)
2068 {
2069   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2070   _mm_store_ps(__p, __a);
2071 }
2072 
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
///    four contiguous elements in an aligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
///    of the four contiguous elements pointed by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps1(float *__p, __m128 __a)
{
  /* Alternate name for _mm_store1_ps. */
  _mm_store1_ps(__p, __a);
}
2091 
2092 /// Stores float values from a 128-bit vector of [4 x float] to an
2093 ///    aligned memory location in reverse order.
2094 ///
2095 /// \headerfile <x86intrin.h>
2096 ///
2097 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2098 ///    instruction.
2099 ///
2100 /// \param __p
2101 ///    A pointer to a 128-bit memory location. The address of the memory
2102 ///    location has to be 128-bit aligned.
2103 /// \param __a
2104 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2105 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_ps(float * __p,__m128 __a)2106 _mm_storer_ps(float *__p, __m128 __a)
2107 {
2108   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2109   _mm_store_ps(__p, __a);
2110 }
2111 
/* Hint values for _mm_prefetch(): bit 2 selects the "expect writes" (ET)
   forms, and bits [1:0] encode the temporal-locality level (see the
   _mm_prefetch macro below, which decodes exactly these bits). */
#define _MM_HINT_ET0 7
#define _MM_HINT_ET1 6
#define _MM_HINT_T0  3
#define _MM_HINT_T1  2
#define _MM_HINT_T2  1
#define _MM_HINT_NTA 0
2118 
#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA, PREFETCHT0, PREFETCHT1,
///    or PREFETCHT2 </c> instruction, depending on \a sel.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation. Bit 2 of \a sel is passed to __builtin_prefetch as the
///    read/write hint and bits [1:0] as the locality level: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated.
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                                                 ((sel) >> 2) & 1, (sel) & 0x3))
#endif
2150 
/// Stores a 64-bit integer in the specified aligned memory location. To
///    minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
///
/// \param __p
///    A pointer to an aligned memory location used to store the register value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS_MMX
_mm_stream_pi(void *__p, __m64 __a)
{
  /* Non-temporal store: bypasses the cache hierarchy. */
  __builtin_ia32_movntq((__m64 *)__p, __a);
}
2168 
/// Moves packed float values from a 128-bit vector of [4 x float] to a
///    128-bit aligned memory location. To minimize caching, the data is flagged
///    as non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit aligned memory location that will receive the
///    single-precision floating-point values.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ps(void *__p, __m128 __a)
{
  /* Non-temporal store; __p must satisfy __m128 (16-byte) alignment. */
  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
2187 
/* _mm_sfence is declared with C linkage so the same symbol is used from
   both C and C++; the definition is provided by the compiler runtime. */
#if defined(__cplusplus)
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between store
///    instructions preceding this instruction and store instructions following
///    this instruction, ensuring the system completes all previous stores
///    before executing subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
2206 
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Macro arguments are fully parenthesized so expression arguments
   (e.g. "x + y") expand correctly. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2229 
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits are to be used
///    in the destination. \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.  \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Macro arguments are fully parenthesized so expression arguments
   (e.g. "x + y") expand correctly. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2260 
/// Compares each of the corresponding packed 16-bit integer values of
///    the 64-bit integer vectors, and writes the greater value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_max_pi16(__m64 __a, __m64 __b)
{
  /* Signed 16-bit element-wise maximum. */
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
}
2279 
/// Compares each of the corresponding packed 8-bit unsigned integer
///    values of the 64-bit integer vectors, and writes the greater value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_max_pu8(__m64 __a, __m64 __b)
{
  /* Unsigned 8-bit element-wise maximum. */
  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
}
2298 
/// Compares each of the corresponding packed 16-bit integer values of
///    the 64-bit integer vectors, and writes the lesser value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_min_pi16(__m64 __a, __m64 __b)
{
  /* Signed 16-bit element-wise minimum. */
  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
}
2317 
/// Compares each of the corresponding packed 8-bit unsigned integer
///    values of the 64-bit integer vectors, and writes the lesser value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_min_pu8(__m64 __a, __m64 __b)
{
  /* Lane-wise unsigned 8-bit minimum via the PMINUB builtin. */
  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
}
2336 
/// Takes the most significant bit from each 8-bit element in a 64-bit
///    integer vector to create an 8-bit mask value. Zero-extends the value to
///    32-bit integer and writes it to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing the values with bits to be extracted.
/// \returns The most significant bit from each 8-bit element in \a __a,
///    written to bits [7:0].
static __inline__ int __DEFAULT_FN_ATTRS_MMX
_mm_movemask_pi8(__m64 __a)
{
  /* Gather the sign bit of each of the 8 bytes into bits [7:0]. */
  return __builtin_ia32_pmovmskb((__v8qi)__a);
}
2354 
/// Multiplies packed 16-bit unsigned integer values and writes the
///    high-order 16 bits of each 32-bit product to the corresponding bits in
///    the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_mulhi_pu16(__m64 __a, __m64 __b)
{
  /* High 16 bits of each unsigned 16x16-bit product (PMULHUW). */
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
}
2373 
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
///    destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
/* Implemented as a macro: PSHUFW encodes the selector as an immediate, so
   \a n must be a compile-time constant. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2410 
/// Conditionally copies the values from each 8-bit element in the first
///    64-bit integer vector operand to the specified memory location, as
///    specified by the most significant bit in the corresponding element in the
///    second 64-bit integer vector operand.
///
///    To minimize caching, the data is flagged as non-temporal
///    (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
///
/// \param __d
///    A 64-bit integer vector containing the values with elements to be copied.
/// \param __n
///    A 64-bit integer vector operand. The most significant bit from each 8-bit
///    element determines whether the corresponding element in operand \a __d
///    is copied. If the most significant bit of a given element is 1, the
///    corresponding element in operand \a __d is copied.
/// \param __p
///    A pointer to a 64-bit memory location that will receive the conditionally
///    copied integer values. The address of the memory location does not have
///    to be aligned.
static __inline__ void __DEFAULT_FN_ATTRS_MMX
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
  /* Non-temporal byte-masked store of __d to *__p, selected by the sign
     bit of each byte of __n. */
  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
}
2439 
/// Computes the rounded averages of the packed unsigned 8-bit integer
///    values and writes the averages to the corresponding bits in the
///    destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the averages of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_avg_pu8(__m64 __a, __m64 __b)
{
  /* Rounded unsigned 8-bit average via the PAVGB builtin. */
  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
}
2458 
/// Computes the rounded averages of the packed unsigned 16-bit integer
///    values and writes the averages to the corresponding bits in the
///    destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the averages of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_avg_pu16(__m64 __a, __m64 __b)
{
  /* Rounded unsigned 16-bit average via the PAVGW builtin. */
  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
}
2477 
/// Subtracts the corresponding 8-bit unsigned integer values of the two
///    64-bit vector operands and computes the absolute value for each of the
///    differences. Then the sum of the 8 absolute differences is written to
///    the bits [15:0] of the destination; the remaining bits [63:16] are
///    cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
///    sets of absolute differences between both operands. The upper bits are
///    cleared.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sad_pu8(__m64 __a, __m64 __b)
{
  /* Sum of absolute byte differences; result lands in bits [15:0] (PSADBW). */
  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
}
2499 
#if defined(__cplusplus)
extern "C" {
#endif

/// Returns the contents of the MXCSR register as a 32-bit unsigned
///    integer value.
///
///    There are several groups of macros associated with this
///    intrinsic, including:
///    <ul>
///    <li>
///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, the following expression checks if an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);

/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up:
///    \code
///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
///    \endcode
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);

#if defined(__cplusplus)
} // extern "C"
#endif
2613 
/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
/* Implemented as a macro: SHUFPS encodes the selector as an immediate, so
   \a mask must be a compile-time constant. */
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2656 
/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [95:64] are written to bits [31:0] of the destination. \n
///    Bits [127:96] are written to bits [95:64] of the destination.
/// \param __b
///    A 128-bit vector of [4 x float].
///    Bits [95:64] are written to bits [63:32] of the destination. \n
///    Bits [127:96] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpackhi_ps(__m128 __a, __m128 __b)
{
  /* Result element order: { a2, b2, a3, b3 }. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
}
2678 
/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [31:0] are written to bits [31:0] of the destination.  \n
///    Bits [63:32] are written to bits [95:64] of the destination.
/// \param __b
///    A 128-bit vector of [4 x float]. \n
///    Bits [31:0] are written to bits [63:32] of the destination. \n
///    Bits [63:32] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpacklo_ps(__m128 __a, __m128 __b)
{
  /* Result element order: { a0, b0, a1, b1 }. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
}
2700 
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
///    32 bits are set to the lower 32 bits of the second parameter. The upper
///    96 bits are set to the upper 96 bits of the first parameter.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
///    instruction.
///
/// \param __a
///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
///    written to the upper 96 bits of the result.
/// \param __b
///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
///    written to the lower 32 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_move_ss(__m128 __a, __m128 __b)
{
  /* Overwrite only element 0 of __a; elements 1-3 pass through unchanged. */
  __a[0] = __b[0];
  return __a;
}
2723 
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
///    64 bits are set to the upper 64 bits of the second parameter. The upper
///    64 bits are set to the upper 64 bits of the first parameter.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
///
/// \param __a
///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
///    written to the upper 64 bits of the result.
/// \param __b
///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
///    written to the lower 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehl_ps(__m128 __a, __m128 __b)
{
  /* Result element order: { b2, b3, a2, a3 }. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
}
2744 
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
///    64 bits are set to the lower 64 bits of the first parameter. The upper
///    64 bits are set to the lower 64 bits of the second parameter.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
///
/// \param __a
///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
///    written to the lower 64 bits of the result.
/// \param __b
///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
///    written to the upper 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movelh_ps(__m128 __a, __m128 __b)
{
  /* Result element order: { a0, a1, b0, b1 }. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
}
2765 
/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
///    float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
///
/// \param __a
///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
///    from the corresponding elements in this operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
///    values from the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi16_ps(__m64 __a)
{
  __m64 __b, __c;
  __m128 __r;

  /* (0 > __a) yields all-ones for negative lanes: the mask needed to
     sign-extend each 16-bit element to 32 bits when unpacked below. */
  __b = _mm_setzero_si64();
  __b = _mm_cmpgt_pi16(__b, __a);
  /* Convert the two high (sign-extended) elements into the low lanes... */
  __c = _mm_unpackhi_pi16(__a, __b);
  __r = _mm_setzero_ps();
  __r = _mm_cvtpi32_ps(__r, __c);
  /* ...shift them into the high half, then convert the two low elements. */
  __r = _mm_movelh_ps(__r, __r);
  __c = _mm_unpacklo_pi16(__a, __b);
  __r = _mm_cvtpi32_ps(__r, __c);

  return __r;
}
2795 
/// Converts a 64-bit vector of 16-bit unsigned integer values into a
///    128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
///
/// \param __a
///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
///    destination are copied from the corresponding elements in this operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
///    values from the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpu16_ps(__m64 __a)
{
  __m64 __b, __c;
  __m128 __r;

  /* Unpacking against zero widens each 16-bit element to 32 bits
     (zero-extension, since the values are unsigned). */
  __b = _mm_setzero_si64();
  /* Convert the two high elements into the low lanes... */
  __c = _mm_unpackhi_pi16(__a, __b);
  __r = _mm_setzero_ps();
  __r = _mm_cvtpi32_ps(__r, __c);
  /* ...shift them into the high half, then convert the two low elements. */
  __r = _mm_movelh_ps(__r, __r);
  __c = _mm_unpacklo_pi16(__a, __b);
  __r = _mm_cvtpi32_ps(__r, __c);

  return __r;
}
2824 
/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
///    into a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
///
/// \param __a
///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
///    from the corresponding lower 4 elements in this operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
///    values from the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi8_ps(__m64 __a)
{
  __m64 __b;

  /* (0 > __a) yields all-ones for negative bytes; unpacking against that
     mask sign-extends the low four bytes to 16 bits. */
  __b = _mm_setzero_si64();
  __b = _mm_cmpgt_pi8(__b, __a);
  __b = _mm_unpacklo_pi8(__a, __b);

  /* Delegate the 16-bit -> float conversion. */
  return _mm_cvtpi16_ps(__b);
}
2848 
/// Converts the lower four unsigned 8-bit integer values from a 64-bit
///    vector of [8 x u8] into a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
///
/// \param __a
///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
///    destination are copied from the corresponding lower 4 elements in this
///    operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
///    values from the source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpu8_ps(__m64 __a)
{
  __m64 __b;

  /* Unpacking against zero zero-extends the low four bytes to 16 bits;
     the widened values fit in i16, so the signed path below is safe. */
  __b = _mm_setzero_si64();
  __b = _mm_unpacklo_pi8(__a, __b);

  return _mm_cvtpi16_ps(__b);
}
2872 
/// Converts the two 32-bit signed integer values from each 64-bit vector
///    operand of [2 x i32] into a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
///
/// \param __a
///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
///    copied from the elements in this operand.
/// \param __b
///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
///    copied from the elements in this operand.
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    copied and converted values from the first operand. The upper 64 bits
///    contain the copied and converted values from the second operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
  __m128 __c;

  /* Convert __b into the low lanes, move it to the high half, then
     convert __a into the now-free low lanes. */
  __c = _mm_setzero_ps();
  __c = _mm_cvtpi32_ps(__c, __b);
  __c = _mm_movelh_ps(__c, __c);

  return _mm_cvtpi32_ps(__c, __a);
}
2900 
/// Converts each single-precision floating-point element of a 128-bit
///    floating-point vector of [4 x float] into a 16-bit signed integer, and
///    packs the results into a 64-bit integer vector of [4 x i16].
///
///    If the floating-point element is NaN or infinity, or if the
///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
///    it is converted to 0x8000. Otherwise if the floating-point element is
///    greater than 0x7FFF, it is converted to 0x7FFF.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
///
/// \param __a
///    A 128-bit floating-point vector of [4 x float].
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtps_pi16(__m128 __a)
{
  __m64 __b, __c;

  /* Convert the low two floats, move the high pair down, convert those,
     then narrow both i32 pairs to i16 with signed saturation. */
  __b = _mm_cvtps_pi32(__a);
  __a = _mm_movehl_ps(__a, __a);
  __c = _mm_cvtps_pi32(__a);

  return _mm_packs_pi32(__b, __c);
}
2929 
/// Converts each single-precision floating-point element of a 128-bit
///    floating-point vector of [4 x float] into an 8-bit signed integer, and
///    packs the results into the lower 32 bits of a 64-bit integer vector of
///    [8 x i8]. The upper 32 bits of the vector are set to 0.
///
///    If the floating-point element is NaN or infinity, or if the
///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
///    is converted to 0x80. Otherwise if the floating-point element is greater
///    than 0x7F, it is converted to 0x7F.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
///
/// \param __a
///    128-bit floating-point vector of [4 x float].
/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
///    converted values and the upper 32 bits are set to zero.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtps_pi8(__m128 __a)
{
  __m64 __b, __c;

  /* Convert to [4 x i16] first, then narrow to i8 with signed saturation;
     packing against zero clears the upper four result bytes. */
  __b = _mm_cvtps_pi16(__a);
  __c = _mm_setzero_si64();

  return _mm_packs_pi16(__b, __c);
}
2958 
/// Extracts the sign bits from each single-precision floating-point
///    element of a 128-bit floating-point vector of [4 x float] and returns the
///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
///
/// \param __a
///    A 128-bit floating-point vector of [4 x float].
/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
///    single-precision floating-point element of the parameter. Bits [31:4] are
///    set to zero.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_ps(__m128 __a)
{
  /* Gather the sign bit of each float into bits [3:0]. */
  return __builtin_ia32_movmskps((__v4sf)__a);
}
2978 
/* Compare */
/* Comparison-predicate encodings for the immediate operand (bits [4:0]) of
   _mm_cmp_ps / _mm_cmp_ss. Only the eight SSE-era predicates are defined
   in this header. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
2988 
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/* Implemented as a macro: the predicate \a c is encoded as an immediate, so
   it must be a compile-time constant (use the _CMP_* macros above). */
#define _mm_cmp_ps(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3021 
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/* Implemented as a macro: the predicate \a c is encoded as an immediate, so
   it must be a compile-time constant (use the _CMP_* macros above). */
#define _mm_cmp_ss(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3054 
/* Align a variable or aggregate on a 16-byte boundary (GCC-style attribute). */
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Build the 8-bit immediate used by the shuffle intrinsics. Each argument is
   a 2-bit source-element index (0-3): \a z selects the element placed in the
   highest destination position, down to \a w for the lowest. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
3058 
/* MXCSR exception status flags, bits [5:0]. */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
#define _MM_EXCEPT_MASK       (0x003fU)

/* MXCSR exception mask bits, bits [12:7]; a set bit suppresses the
   corresponding floating-point exception. */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
#define _MM_MASK_MASK         (0x1f80U)

/* MXCSR rounding-control field, bits [14:13]. */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (0x6000U)

/* MXCSR flush-to-zero control, bit 15. */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)
3084 
/* Read an individual MXCSR field via _mm_getcsr(). */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Update an individual MXCSR field, leaving the others unchanged. Each macro
   performs a separate read (_mm_getcsr) and write (_mm_setcsr) of the whole
   register, so the update is not atomic. */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
3094 
/* Transpose, in place, the 4x4 matrix whose rows are the [4 x float] vectors
   row0..row3: first interleave low/high element pairs with unpacklo/unpackhi,
   then recombine 64-bit halves with movelh/movehl. Each rowN argument is both
   read and written, so it must be an lvalue. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
3107 
/* Aliases for compatibility: the legacy _m_* (MMX-era) names for the SSE
   integer intrinsics defined earlier in this header. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* NOTE(review): aliases only the bare identifier `_m_` to `_mm_`; since
   `_m_foo` is a single preprocessor token, this does NOT rewrite other
   _m_-prefixed names. Presumably kept for gcc header parity — gcc's
   xmmintrin.h has the same define. */
#define _m_ _mm_
3123 
/* The function-attribute helper macros are private to this header. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
#include <emmintrin.h>
#endif

#endif /* __XMMINTRIN_H */
3133