xref: /aosp_15_r20/external/clang/lib/Headers/avxintrin.h (revision 67e74705e28f6214e480b399dd47ea732279e315)
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23*67e74705SXin Li 
24*67e74705SXin Li #ifndef __IMMINTRIN_H
25*67e74705SXin Li #error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26*67e74705SXin Li #endif
27*67e74705SXin Li 
28*67e74705SXin Li #ifndef __AVXINTRIN_H
29*67e74705SXin Li #define __AVXINTRIN_H
30*67e74705SXin Li 
/* Internal 256-bit (32-byte) vector types, one per element type. They are
 * used for casts inside the intrinsic implementations. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types used in the intrinsic signatures. */
typedef float __m256 __attribute__((__vector_size__(32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Define the default attributes for the functions in this file: always
 * inlined, no debug info, and compiled for the "avx" target feature. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
54*67e74705SXin Li 
/* Arithmetic */
56*67e74705SXin Li /// \brief Adds two 256-bit vectors of [4 x double].
57*67e74705SXin Li ///
58*67e74705SXin Li /// \headerfile <x86intrin.h>
59*67e74705SXin Li ///
60*67e74705SXin Li /// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
61*67e74705SXin Li ///
62*67e74705SXin Li /// \param __a
63*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the source operands.
64*67e74705SXin Li /// \param __b
65*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the source operands.
66*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the sums of both
67*67e74705SXin Li ///    operands.
68*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_add_pd(__m256d __a,__m256d __b)69*67e74705SXin Li _mm256_add_pd(__m256d __a, __m256d __b)
70*67e74705SXin Li {
71*67e74705SXin Li   return (__m256d)((__v4df)__a+(__v4df)__b);
72*67e74705SXin Li }
73*67e74705SXin Li 
74*67e74705SXin Li /// \brief Adds two 256-bit vectors of [8 x float].
75*67e74705SXin Li ///
76*67e74705SXin Li /// \headerfile <x86intrin.h>
77*67e74705SXin Li ///
78*67e74705SXin Li /// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.
79*67e74705SXin Li ///
80*67e74705SXin Li /// \param __a
81*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the source operands.
82*67e74705SXin Li /// \param __b
83*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the source operands.
84*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the sums of both
85*67e74705SXin Li ///    operands.
86*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_add_ps(__m256 __a,__m256 __b)87*67e74705SXin Li _mm256_add_ps(__m256 __a, __m256 __b)
88*67e74705SXin Li {
89*67e74705SXin Li   return (__m256)((__v8sf)__a+(__v8sf)__b);
90*67e74705SXin Li }
91*67e74705SXin Li 
92*67e74705SXin Li /// \brief Subtracts two 256-bit vectors of [4 x double].
93*67e74705SXin Li ///
94*67e74705SXin Li /// \headerfile <x86intrin.h>
95*67e74705SXin Li ///
96*67e74705SXin Li /// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
97*67e74705SXin Li ///
98*67e74705SXin Li /// \param __a
99*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing the minuend.
100*67e74705SXin Li /// \param __b
101*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing the subtrahend.
102*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the differences between
103*67e74705SXin Li ///    both operands.
104*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_sub_pd(__m256d __a,__m256d __b)105*67e74705SXin Li _mm256_sub_pd(__m256d __a, __m256d __b)
106*67e74705SXin Li {
107*67e74705SXin Li   return (__m256d)((__v4df)__a-(__v4df)__b);
108*67e74705SXin Li }
109*67e74705SXin Li 
110*67e74705SXin Li /// \brief Subtracts two 256-bit vectors of [8 x float].
111*67e74705SXin Li ///
112*67e74705SXin Li /// \headerfile <x86intrin.h>
113*67e74705SXin Li ///
114*67e74705SXin Li /// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.
115*67e74705SXin Li ///
116*67e74705SXin Li /// \param __a
117*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing the minuend.
118*67e74705SXin Li /// \param __b
119*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing the subtrahend.
120*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the differences between
121*67e74705SXin Li ///    both operands.
122*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_sub_ps(__m256 __a,__m256 __b)123*67e74705SXin Li _mm256_sub_ps(__m256 __a, __m256 __b)
124*67e74705SXin Li {
125*67e74705SXin Li   return (__m256)((__v8sf)__a-(__v8sf)__b);
126*67e74705SXin Li }
127*67e74705SXin Li 
128*67e74705SXin Li /// \brief Adds the even-indexed values and subtracts the odd-indexed values of
129*67e74705SXin Li ///    two 256-bit vectors of [4 x double].
130*67e74705SXin Li ///
131*67e74705SXin Li /// \headerfile <x86intrin.h>
132*67e74705SXin Li ///
133*67e74705SXin Li /// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.
134*67e74705SXin Li ///
135*67e74705SXin Li /// \param __a
136*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing the left source operand.
137*67e74705SXin Li /// \param __b
138*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing the right source operand.
139*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the alternating sums
140*67e74705SXin Li ///    and differences between both operands.
141*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_addsub_pd(__m256d __a,__m256d __b)142*67e74705SXin Li _mm256_addsub_pd(__m256d __a, __m256d __b)
143*67e74705SXin Li {
144*67e74705SXin Li   return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
145*67e74705SXin Li }
146*67e74705SXin Li 
147*67e74705SXin Li /// \brief Adds the even-indexed values and subtracts the odd-indexed values of
148*67e74705SXin Li ///    two 256-bit vectors of [8 x float].
149*67e74705SXin Li ///
150*67e74705SXin Li /// \headerfile <x86intrin.h>
151*67e74705SXin Li ///
152*67e74705SXin Li /// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.
153*67e74705SXin Li ///
154*67e74705SXin Li /// \param __a
155*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing the left source operand.
156*67e74705SXin Li /// \param __b
157*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing the right source operand.
158*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the alternating sums and
159*67e74705SXin Li ///    differences between both operands.
160*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_addsub_ps(__m256 __a,__m256 __b)161*67e74705SXin Li _mm256_addsub_ps(__m256 __a, __m256 __b)
162*67e74705SXin Li {
163*67e74705SXin Li   return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
164*67e74705SXin Li }
165*67e74705SXin Li 
166*67e74705SXin Li /// \brief Divides two 256-bit vectors of [4 x double].
167*67e74705SXin Li ///
168*67e74705SXin Li /// \headerfile <x86intrin.h>
169*67e74705SXin Li ///
170*67e74705SXin Li /// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
171*67e74705SXin Li ///
172*67e74705SXin Li /// \param __a
173*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing the dividend.
174*67e74705SXin Li /// \param __b
175*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing the divisor.
176*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the quotients of both
177*67e74705SXin Li ///    operands.
178*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_div_pd(__m256d __a,__m256d __b)179*67e74705SXin Li _mm256_div_pd(__m256d __a, __m256d __b)
180*67e74705SXin Li {
181*67e74705SXin Li   return (__m256d)((__v4df)__a/(__v4df)__b);
182*67e74705SXin Li }
183*67e74705SXin Li 
184*67e74705SXin Li /// \brief Divides two 256-bit vectors of [8 x float].
185*67e74705SXin Li ///
186*67e74705SXin Li /// \headerfile <x86intrin.h>
187*67e74705SXin Li ///
188*67e74705SXin Li /// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.
189*67e74705SXin Li ///
190*67e74705SXin Li /// \param __a
191*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing the dividend.
192*67e74705SXin Li /// \param __b
193*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing the divisor.
194*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the quotients of both
195*67e74705SXin Li ///    operands.
196*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_div_ps(__m256 __a,__m256 __b)197*67e74705SXin Li _mm256_div_ps(__m256 __a, __m256 __b)
198*67e74705SXin Li {
199*67e74705SXin Li   return (__m256)((__v8sf)__a/(__v8sf)__b);
200*67e74705SXin Li }
201*67e74705SXin Li 
202*67e74705SXin Li /// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
203*67e74705SXin Li ///    of each pair of values.
204*67e74705SXin Li ///
205*67e74705SXin Li /// \headerfile <x86intrin.h>
206*67e74705SXin Li ///
207*67e74705SXin Li /// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
208*67e74705SXin Li ///
209*67e74705SXin Li /// \param __a
210*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the operands.
211*67e74705SXin Li /// \param __b
212*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the operands.
213*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the maximum values
214*67e74705SXin Li ///    between both operands.
215*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_max_pd(__m256d __a,__m256d __b)216*67e74705SXin Li _mm256_max_pd(__m256d __a, __m256d __b)
217*67e74705SXin Li {
218*67e74705SXin Li   return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
219*67e74705SXin Li }
220*67e74705SXin Li 
221*67e74705SXin Li /// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
222*67e74705SXin Li ///    of each pair of values.
223*67e74705SXin Li ///
224*67e74705SXin Li /// \headerfile <x86intrin.h>
225*67e74705SXin Li ///
226*67e74705SXin Li /// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.
227*67e74705SXin Li ///
228*67e74705SXin Li /// \param __a
229*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the operands.
230*67e74705SXin Li /// \param __b
231*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the operands.
232*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the maximum values
233*67e74705SXin Li ///    between both operands.
234*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_max_ps(__m256 __a,__m256 __b)235*67e74705SXin Li _mm256_max_ps(__m256 __a, __m256 __b)
236*67e74705SXin Li {
237*67e74705SXin Li   return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
238*67e74705SXin Li }
239*67e74705SXin Li 
240*67e74705SXin Li /// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
241*67e74705SXin Li ///    of each pair of values.
242*67e74705SXin Li ///
243*67e74705SXin Li /// \headerfile <x86intrin.h>
244*67e74705SXin Li ///
245*67e74705SXin Li /// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
246*67e74705SXin Li ///
247*67e74705SXin Li /// \param __a
248*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the operands.
249*67e74705SXin Li /// \param __b
250*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the operands.
251*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the minimum values
252*67e74705SXin Li ///    between both operands.
253*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_min_pd(__m256d __a,__m256d __b)254*67e74705SXin Li _mm256_min_pd(__m256d __a, __m256d __b)
255*67e74705SXin Li {
256*67e74705SXin Li   return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
257*67e74705SXin Li }
258*67e74705SXin Li 
259*67e74705SXin Li /// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
260*67e74705SXin Li ///    of each pair of values.
261*67e74705SXin Li ///
262*67e74705SXin Li /// \headerfile <x86intrin.h>
263*67e74705SXin Li ///
264*67e74705SXin Li /// This intrinsic corresponds to the \c VMINPS / MINPS instruction.
265*67e74705SXin Li ///
266*67e74705SXin Li /// \param __a
267*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the operands.
268*67e74705SXin Li /// \param __b
269*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the operands.
270*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the minimum values
271*67e74705SXin Li ///    between both operands.
272*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_min_ps(__m256 __a,__m256 __b)273*67e74705SXin Li _mm256_min_ps(__m256 __a, __m256 __b)
274*67e74705SXin Li {
275*67e74705SXin Li   return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
276*67e74705SXin Li }
277*67e74705SXin Li 
278*67e74705SXin Li /// \brief Multiplies two 256-bit vectors of [4 x double].
279*67e74705SXin Li ///
280*67e74705SXin Li /// \headerfile <x86intrin.h>
281*67e74705SXin Li ///
282*67e74705SXin Li /// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
283*67e74705SXin Li ///
284*67e74705SXin Li /// \param __a
285*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the operands.
286*67e74705SXin Li /// \param __b
287*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the operands.
288*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the products of both
289*67e74705SXin Li ///    operands.
290*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_mul_pd(__m256d __a,__m256d __b)291*67e74705SXin Li _mm256_mul_pd(__m256d __a, __m256d __b)
292*67e74705SXin Li {
293*67e74705SXin Li   return (__m256d)((__v4df)__a * (__v4df)__b);
294*67e74705SXin Li }
295*67e74705SXin Li 
296*67e74705SXin Li /// \brief Multiplies two 256-bit vectors of [8 x float].
297*67e74705SXin Li ///
298*67e74705SXin Li /// \headerfile <x86intrin.h>
299*67e74705SXin Li ///
300*67e74705SXin Li /// This intrinsic corresponds to the \c VMULPS / MULPS instruction.
301*67e74705SXin Li ///
302*67e74705SXin Li /// \param __a
303*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the operands.
304*67e74705SXin Li /// \param __b
305*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the operands.
306*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the products of both
307*67e74705SXin Li ///    operands.
308*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_mul_ps(__m256 __a,__m256 __b)309*67e74705SXin Li _mm256_mul_ps(__m256 __a, __m256 __b)
310*67e74705SXin Li {
311*67e74705SXin Li   return (__m256)((__v8sf)__a * (__v8sf)__b);
312*67e74705SXin Li }
313*67e74705SXin Li 
314*67e74705SXin Li /// \brief Calculates the square roots of the values in a 256-bit vector of
315*67e74705SXin Li ///    [4 x double].
316*67e74705SXin Li ///
317*67e74705SXin Li /// \headerfile <x86intrin.h>
318*67e74705SXin Li ///
319*67e74705SXin Li /// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
320*67e74705SXin Li ///
321*67e74705SXin Li /// \param __a
322*67e74705SXin Li ///    A 256-bit vector of [4 x double].
323*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the square roots of the
324*67e74705SXin Li ///    values in the operand.
325*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_sqrt_pd(__m256d __a)326*67e74705SXin Li _mm256_sqrt_pd(__m256d __a)
327*67e74705SXin Li {
328*67e74705SXin Li   return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
329*67e74705SXin Li }
330*67e74705SXin Li 
331*67e74705SXin Li /// \brief Calculates the square roots of the values in a 256-bit vector of
332*67e74705SXin Li ///    [8 x float].
333*67e74705SXin Li ///
334*67e74705SXin Li /// \headerfile <x86intrin.h>
335*67e74705SXin Li ///
336*67e74705SXin Li /// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.
337*67e74705SXin Li ///
338*67e74705SXin Li /// \param __a
339*67e74705SXin Li ///    A 256-bit vector of [8 x float].
340*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the square roots of the
341*67e74705SXin Li ///    values in the operand.
342*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_sqrt_ps(__m256 __a)343*67e74705SXin Li _mm256_sqrt_ps(__m256 __a)
344*67e74705SXin Li {
345*67e74705SXin Li   return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
346*67e74705SXin Li }
347*67e74705SXin Li 
348*67e74705SXin Li /// \brief Calculates the reciprocal square roots of the values in a 256-bit
349*67e74705SXin Li ///    vector of [8 x float].
350*67e74705SXin Li ///
351*67e74705SXin Li /// \headerfile <x86intrin.h>
352*67e74705SXin Li ///
353*67e74705SXin Li /// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.
354*67e74705SXin Li ///
355*67e74705SXin Li /// \param __a
356*67e74705SXin Li ///    A 256-bit vector of [8 x float].
357*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the reciprocal square
358*67e74705SXin Li ///    roots of the values in the operand.
359*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_rsqrt_ps(__m256 __a)360*67e74705SXin Li _mm256_rsqrt_ps(__m256 __a)
361*67e74705SXin Li {
362*67e74705SXin Li   return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
363*67e74705SXin Li }
364*67e74705SXin Li 
365*67e74705SXin Li /// \brief Calculates the reciprocals of the values in a 256-bit vector of
366*67e74705SXin Li ///    [8 x float].
367*67e74705SXin Li ///
368*67e74705SXin Li /// \headerfile <x86intrin.h>
369*67e74705SXin Li ///
370*67e74705SXin Li /// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.
371*67e74705SXin Li ///
372*67e74705SXin Li /// \param __a
373*67e74705SXin Li ///    A 256-bit vector of [8 x float].
374*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
375*67e74705SXin Li ///    values in the operand.
376*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_rcp_ps(__m256 __a)377*67e74705SXin Li _mm256_rcp_ps(__m256 __a)
378*67e74705SXin Li {
379*67e74705SXin Li   return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
380*67e74705SXin Li }
381*67e74705SXin Li 
/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation.
///    Bits [7:4] are reserved.
///    Bit [3] is a precision exception value:
///    0: A normal PE exception is used.
///    1: The PE field is not updated.
///    Bit [2] is the rounding control source:
///    0: Use bits [1:0] of M.
///    1: Use the current MXCSR setting.
///    Bits [1:0] contain the rounding control definition:
///    00: Nearest.
///    01: Downward (toward negative infinity).
///    10: Upward (toward positive infinity).
///    11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Written as a plain parenthesised expression (no statement expression) so the
 * macro can be used anywhere an expression is allowed. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
413*67e74705SXin Li 
/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation.
///    Bits [7:4] are reserved.
///    Bit [3] is a precision exception value:
///    0: A normal PE exception is used.
///    1: The PE field is not updated.
///    Bit [2] is the rounding control source:
///    0: Use bits [1:0] of M.
///    1: Use the current MXCSR setting.
///    Bits [1:0] contain the rounding control definition:
///    00: Nearest.
///    01: Downward (toward negative infinity).
///    10: Upward (toward positive infinity).
///    11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Written as a plain parenthesised expression (no statement expression) so the
 * macro can be used anywhere an expression is allowed. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
445*67e74705SXin Li 
/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
/* Forwards to _mm256_round_pd with _MM_FROUND_CEIL; that constant is not
 * defined in this part of the file and must come from another header. */
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
462*67e74705SXin Li 
/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
/* Forwards to _mm256_round_pd with _MM_FROUND_FLOOR; that constant is not
 * defined in this part of the file and must come from another header. */
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
480*67e74705SXin Li 
/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
/* Forwards to _mm256_round_ps with _MM_FROUND_CEIL; that constant is not
 * defined in this part of the file and must come from another header. */
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
497*67e74705SXin Li 
/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
/* Forwards to _mm256_round_ps with _MM_FROUND_FLOOR; that constant is not
 * defined in this part of the file and must come from another header. */
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
514*67e74705SXin Li 
/* Logical */
516*67e74705SXin Li /// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
517*67e74705SXin Li ///
518*67e74705SXin Li /// \headerfile <x86intrin.h>
519*67e74705SXin Li ///
520*67e74705SXin Li /// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
521*67e74705SXin Li ///
522*67e74705SXin Li /// \param __a
523*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the source operands.
524*67e74705SXin Li /// \param __b
525*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the source operands.
526*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
527*67e74705SXin Li ///    values between both operands.
528*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_and_pd(__m256d __a,__m256d __b)529*67e74705SXin Li _mm256_and_pd(__m256d __a, __m256d __b)
530*67e74705SXin Li {
531*67e74705SXin Li   return (__m256d)((__v4du)__a & (__v4du)__b);
532*67e74705SXin Li }
533*67e74705SXin Li 
534*67e74705SXin Li /// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
535*67e74705SXin Li ///
536*67e74705SXin Li /// \headerfile <x86intrin.h>
537*67e74705SXin Li ///
538*67e74705SXin Li /// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.
539*67e74705SXin Li ///
540*67e74705SXin Li /// \param __a
541*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the source operands.
542*67e74705SXin Li /// \param __b
543*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the source operands.
544*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
545*67e74705SXin Li ///    values between both operands.
546*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_and_ps(__m256 __a,__m256 __b)547*67e74705SXin Li _mm256_and_ps(__m256 __a, __m256 __b)
548*67e74705SXin Li {
549*67e74705SXin Li   return (__m256)((__v8su)__a & (__v8su)__b);
550*67e74705SXin Li }
551*67e74705SXin Li 
552*67e74705SXin Li /// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
553*67e74705SXin Li ///    the one's complement of the values contained in the first source operand.
554*67e74705SXin Li ///
555*67e74705SXin Li /// \headerfile <x86intrin.h>
556*67e74705SXin Li ///
557*67e74705SXin Li /// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.
558*67e74705SXin Li ///
559*67e74705SXin Li /// \param __a
560*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing the left source operand. The
561*67e74705SXin Li ///    one's complement of this value is used in the bitwise AND.
562*67e74705SXin Li /// \param __b
563*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing the right source operand.
564*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
565*67e74705SXin Li ///    values of the second operand and the one's complement of the first
566*67e74705SXin Li ///    operand.
567*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_andnot_pd(__m256d __a,__m256d __b)568*67e74705SXin Li _mm256_andnot_pd(__m256d __a, __m256d __b)
569*67e74705SXin Li {
570*67e74705SXin Li   return (__m256d)(~(__v4du)__a & (__v4du)__b);
571*67e74705SXin Li }
572*67e74705SXin Li 
573*67e74705SXin Li /// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
574*67e74705SXin Li ///    the one's complement of the values contained in the first source operand.
575*67e74705SXin Li ///
576*67e74705SXin Li /// \headerfile <x86intrin.h>
577*67e74705SXin Li ///
578*67e74705SXin Li /// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.
579*67e74705SXin Li ///
580*67e74705SXin Li /// \param __a
581*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing the left source operand. The
582*67e74705SXin Li ///    one's complement of this value is used in the bitwise AND.
583*67e74705SXin Li /// \param __b
584*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing the right source operand.
585*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
586*67e74705SXin Li ///    values of the second operand and the one's complement of the first
587*67e74705SXin Li ///    operand.
588*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_andnot_ps(__m256 __a,__m256 __b)589*67e74705SXin Li _mm256_andnot_ps(__m256 __a, __m256 __b)
590*67e74705SXin Li {
591*67e74705SXin Li   return (__m256)(~(__v8su)__a & (__v8su)__b);
592*67e74705SXin Li }
593*67e74705SXin Li 
594*67e74705SXin Li /// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
595*67e74705SXin Li ///
596*67e74705SXin Li /// \headerfile <x86intrin.h>
597*67e74705SXin Li ///
598*67e74705SXin Li /// This intrinsic corresponds to the \c VORPD / ORPD instruction.
599*67e74705SXin Li ///
600*67e74705SXin Li /// \param __a
601*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the source operands.
602*67e74705SXin Li /// \param __b
603*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the source operands.
604*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
605*67e74705SXin Li ///    values between both operands.
606*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_or_pd(__m256d __a,__m256d __b)607*67e74705SXin Li _mm256_or_pd(__m256d __a, __m256d __b)
608*67e74705SXin Li {
609*67e74705SXin Li   return (__m256d)((__v4du)__a | (__v4du)__b);
610*67e74705SXin Li }
611*67e74705SXin Li 
612*67e74705SXin Li /// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
613*67e74705SXin Li ///
614*67e74705SXin Li /// \headerfile <x86intrin.h>
615*67e74705SXin Li ///
616*67e74705SXin Li /// This intrinsic corresponds to the \c VORPS / ORPS instruction.
617*67e74705SXin Li ///
618*67e74705SXin Li /// \param __a
619*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the source operands.
620*67e74705SXin Li /// \param __b
621*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the source operands.
622*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
623*67e74705SXin Li ///    values between both operands.
624*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_or_ps(__m256 __a,__m256 __b)625*67e74705SXin Li _mm256_or_ps(__m256 __a, __m256 __b)
626*67e74705SXin Li {
627*67e74705SXin Li   return (__m256)((__v8su)__a | (__v8su)__b);
628*67e74705SXin Li }
629*67e74705SXin Li 
630*67e74705SXin Li /// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
631*67e74705SXin Li ///
632*67e74705SXin Li /// \headerfile <x86intrin.h>
633*67e74705SXin Li ///
634*67e74705SXin Li /// This intrinsic corresponds to the \c VXORPD / XORPD instruction.
635*67e74705SXin Li ///
636*67e74705SXin Li /// \param __a
637*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the source operands.
638*67e74705SXin Li /// \param __b
639*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the source operands.
640*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
641*67e74705SXin Li ///    values between both operands.
642*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_xor_pd(__m256d __a,__m256d __b)643*67e74705SXin Li _mm256_xor_pd(__m256d __a, __m256d __b)
644*67e74705SXin Li {
645*67e74705SXin Li   return (__m256d)((__v4du)__a ^ (__v4du)__b);
646*67e74705SXin Li }
647*67e74705SXin Li 
648*67e74705SXin Li /// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
649*67e74705SXin Li ///
650*67e74705SXin Li /// \headerfile <x86intrin.h>
651*67e74705SXin Li ///
652*67e74705SXin Li /// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
653*67e74705SXin Li ///
654*67e74705SXin Li /// \param __a
655*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the source operands.
656*67e74705SXin Li /// \param __b
657*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the source operands.
658*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
659*67e74705SXin Li ///    values between both operands.
660*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_xor_ps(__m256 __a,__m256 __b)661*67e74705SXin Li _mm256_xor_ps(__m256 __a, __m256 __b)
662*67e74705SXin Li {
663*67e74705SXin Li   return (__m256)((__v8su)__a ^ (__v8su)__b);
664*67e74705SXin Li }
665*67e74705SXin Li 
666*67e74705SXin Li /* Horizontal arithmetic */
667*67e74705SXin Li /// \brief Horizontally adds the adjacent pairs of values contained in two
668*67e74705SXin Li ///    256-bit vectors of [4 x double].
669*67e74705SXin Li ///
670*67e74705SXin Li /// \headerfile <x86intrin.h>
671*67e74705SXin Li ///
672*67e74705SXin Li /// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.
673*67e74705SXin Li ///
674*67e74705SXin Li /// \param __a
675*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the source operands.
676*67e74705SXin Li ///    The horizontal sums of the values are returned in the even-indexed
677*67e74705SXin Li ///    elements of a vector of [4 x double].
678*67e74705SXin Li /// \param __b
679*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the source operands.
680*67e74705SXin Li ///    The horizontal sums of the values are returned in the odd-indexed
681*67e74705SXin Li ///    elements of a vector of [4 x double].
682*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
683*67e74705SXin Li ///    both operands.
684*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_hadd_pd(__m256d __a,__m256d __b)685*67e74705SXin Li _mm256_hadd_pd(__m256d __a, __m256d __b)
686*67e74705SXin Li {
687*67e74705SXin Li   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
688*67e74705SXin Li }
689*67e74705SXin Li 
690*67e74705SXin Li /// \brief Horizontally adds the adjacent pairs of values contained in two
691*67e74705SXin Li ///    256-bit vectors of [8 x float].
692*67e74705SXin Li ///
693*67e74705SXin Li /// \headerfile <x86intrin.h>
694*67e74705SXin Li ///
695*67e74705SXin Li /// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.
696*67e74705SXin Li ///
697*67e74705SXin Li /// \param __a
698*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the source operands.
699*67e74705SXin Li ///    The horizontal sums of the values are returned in the elements with
700*67e74705SXin Li ///    index 0, 1, 4, 5 of a vector of [8 x float].
701*67e74705SXin Li /// \param __b
702*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the source operands.
703*67e74705SXin Li ///    The horizontal sums of the values are returned in the elements with
704*67e74705SXin Li ///    index 2, 3, 6, 7 of a vector of [8 x float].
705*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
706*67e74705SXin Li ///    both operands.
707*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_hadd_ps(__m256 __a,__m256 __b)708*67e74705SXin Li _mm256_hadd_ps(__m256 __a, __m256 __b)
709*67e74705SXin Li {
710*67e74705SXin Li   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
711*67e74705SXin Li }
712*67e74705SXin Li 
713*67e74705SXin Li /// \brief Horizontally subtracts the adjacent pairs of values contained in two
714*67e74705SXin Li ///    256-bit vectors of [4 x double].
715*67e74705SXin Li ///
716*67e74705SXin Li /// \headerfile <x86intrin.h>
717*67e74705SXin Li ///
718*67e74705SXin Li /// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.
719*67e74705SXin Li ///
720*67e74705SXin Li /// \param __a
721*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the source operands.
722*67e74705SXin Li ///    The horizontal differences between the values are returned in the
723*67e74705SXin Li ///    even-indexed elements of a vector of [4 x double].
724*67e74705SXin Li /// \param __b
725*67e74705SXin Li ///    A 256-bit vector of [4 x double] containing one of the source operands.
726*67e74705SXin Li ///    The horizontal differences between the values are returned in the
727*67e74705SXin Li ///    odd-indexed elements of a vector of [4 x double].
728*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the horizontal
729*67e74705SXin Li ///    differences of both operands.
730*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_hsub_pd(__m256d __a,__m256d __b)731*67e74705SXin Li _mm256_hsub_pd(__m256d __a, __m256d __b)
732*67e74705SXin Li {
733*67e74705SXin Li   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
734*67e74705SXin Li }
735*67e74705SXin Li 
736*67e74705SXin Li /// \brief Horizontally subtracts the adjacent pairs of values contained in two
737*67e74705SXin Li ///    256-bit vectors of [8 x float].
738*67e74705SXin Li ///
739*67e74705SXin Li /// \headerfile <x86intrin.h>
740*67e74705SXin Li ///
741*67e74705SXin Li /// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction.
742*67e74705SXin Li ///
743*67e74705SXin Li /// \param __a
744*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the source operands.
745*67e74705SXin Li ///    The horizontal differences between the values are returned in the
746*67e74705SXin Li ///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
747*67e74705SXin Li /// \param __b
748*67e74705SXin Li ///    A 256-bit vector of [8 x float] containing one of the source operands.
749*67e74705SXin Li ///    The horizontal differences between the values are returned in the
750*67e74705SXin Li ///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
751*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the horizontal
752*67e74705SXin Li ///    differences of both operands.
753*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_hsub_ps(__m256 __a,__m256 __b)754*67e74705SXin Li _mm256_hsub_ps(__m256 __a, __m256 __b)
755*67e74705SXin Li {
756*67e74705SXin Li   return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
757*67e74705SXin Li }
758*67e74705SXin Li 
759*67e74705SXin Li /* Vector permutations */
760*67e74705SXin Li /// \brief Copies the values in a 128-bit vector of [2 x double] as specified
761*67e74705SXin Li ///    by the 128-bit integer vector operand.
762*67e74705SXin Li ///
763*67e74705SXin Li /// \headerfile <x86intrin.h>
764*67e74705SXin Li ///
765*67e74705SXin Li /// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
766*67e74705SXin Li ///
767*67e74705SXin Li /// \param __a
768*67e74705SXin Li ///    A 128-bit vector of [2 x double].
769*67e74705SXin Li /// \param __c
770*67e74705SXin Li ///    A 128-bit integer vector operand specifying how the values are to be
771*67e74705SXin Li ///    copied.
772*67e74705SXin Li ///    Bit [1]:
773*67e74705SXin Li ///    0: Bits [63:0] of the source are copied to bits [63:0] of the
774*67e74705SXin Li ///    returned vector.
775*67e74705SXin Li ///    1: Bits [127:64] of the source are copied to bits [63:0] of the
776*67e74705SXin Li ///    returned vector.
777*67e74705SXin Li ///    Bit [65]:
778*67e74705SXin Li ///    0: Bits [63:0] of the source are copied to bits [127:64] of the
779*67e74705SXin Li ///    returned vector.
780*67e74705SXin Li ///    1: Bits [127:64] of the source are copied to bits [127:64] of the
781*67e74705SXin Li ///    returned vector.
782*67e74705SXin Li /// \returns A 128-bit vector of [2 x double] containing the copied values.
783*67e74705SXin Li static __inline __m128d __DEFAULT_FN_ATTRS
_mm_permutevar_pd(__m128d __a,__m128i __c)784*67e74705SXin Li _mm_permutevar_pd(__m128d __a, __m128i __c)
785*67e74705SXin Li {
786*67e74705SXin Li   return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
787*67e74705SXin Li }
788*67e74705SXin Li 
789*67e74705SXin Li /// \brief Copies the values in a 256-bit vector of [4 x double] as
790*67e74705SXin Li ///    specified by the 256-bit integer vector operand.
791*67e74705SXin Li ///
792*67e74705SXin Li /// \headerfile <x86intrin.h>
793*67e74705SXin Li ///
794*67e74705SXin Li /// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
795*67e74705SXin Li ///
796*67e74705SXin Li /// \param __a
797*67e74705SXin Li ///    A 256-bit vector of [4 x double].
798*67e74705SXin Li /// \param __c
799*67e74705SXin Li ///    A 256-bit integer vector operand specifying how the values are to be
800*67e74705SXin Li ///    copied.
801*67e74705SXin Li ///    Bit [1]:
802*67e74705SXin Li ///    0: Bits [63:0] of the source are copied to bits [63:0] of the
803*67e74705SXin Li ///    returned vector.
804*67e74705SXin Li ///    1: Bits [127:64] of the source are copied to bits [63:0] of the
805*67e74705SXin Li ///    returned vector.
806*67e74705SXin Li ///    Bit [65]:
807*67e74705SXin Li ///    0: Bits [63:0] of the source are copied to bits [127:64] of the
808*67e74705SXin Li ///    returned vector.
809*67e74705SXin Li ///    1: Bits [127:64] of the source are copied to bits [127:64] of the
810*67e74705SXin Li ///    returned vector.
811*67e74705SXin Li ///    Bit [129]:
812*67e74705SXin Li ///    0: Bits [191:128] of the source are copied to bits [191:128] of the
813*67e74705SXin Li ///    returned vector.
814*67e74705SXin Li ///    1: Bits [255:192] of the source are copied to bits [191:128] of the
815*67e74705SXin Li ///    returned vector.
816*67e74705SXin Li ///    Bit [193]:
817*67e74705SXin Li ///    0: Bits [191:128] of the source are copied to bits [255:192] of the
818*67e74705SXin Li ///    returned vector.
819*67e74705SXin Li ///    1: Bits [255:192] of the source are copied to bits [255:192] of the
820*67e74705SXin Li ///    returned vector.
821*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the copied values.
822*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_permutevar_pd(__m256d __a,__m256i __c)823*67e74705SXin Li _mm256_permutevar_pd(__m256d __a, __m256i __c)
824*67e74705SXin Li {
825*67e74705SXin Li   return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
826*67e74705SXin Li }
827*67e74705SXin Li 
828*67e74705SXin Li /// \brief Copies the values stored in a 128-bit vector of [4 x float] as
829*67e74705SXin Li ///    specified by the 128-bit integer vector operand.
830*67e74705SXin Li ///
831*67e74705SXin Li /// \headerfile <x86intrin.h>
832*67e74705SXin Li ///
833*67e74705SXin Li /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
834*67e74705SXin Li ///
835*67e74705SXin Li /// \param __a
836*67e74705SXin Li ///    A 128-bit vector of [4 x float].
837*67e74705SXin Li /// \param __c
838*67e74705SXin Li ///    A 128-bit integer vector operand specifying how the values are to be
839*67e74705SXin Li ///    copied.
840*67e74705SXin Li ///    Bits [1:0]:
841*67e74705SXin Li ///    00: Bits [31:0] of the source are copied to bits [31:0] of the
842*67e74705SXin Li ///    returned vector.
843*67e74705SXin Li ///    01: Bits [63:32] of the source are copied to bits [31:0] of the
844*67e74705SXin Li ///    returned vector.
845*67e74705SXin Li ///    10: Bits [95:64] of the source are copied to bits [31:0] of the
846*67e74705SXin Li ///    returned vector.
847*67e74705SXin Li ///    11: Bits [127:96] of the source are copied to bits [31:0] of the
848*67e74705SXin Li ///    returned vector.
849*67e74705SXin Li ///    Bits [33:32]:
850*67e74705SXin Li ///    00: Bits [31:0] of the source are copied to bits [63:32] of the
851*67e74705SXin Li ///    returned vector.
852*67e74705SXin Li ///    01: Bits [63:32] of the source are copied to bits [63:32] of the
853*67e74705SXin Li ///    returned vector.
854*67e74705SXin Li ///    10: Bits [95:64] of the source are copied to bits [63:32] of the
855*67e74705SXin Li ///    returned vector.
856*67e74705SXin Li ///    11: Bits [127:96] of the source are copied to bits [63:32] of the
857*67e74705SXin Li ///    returned vector.
858*67e74705SXin Li ///    Bits [65:64]:
859*67e74705SXin Li ///    00: Bits [31:0] of the source are copied to bits [95:64] of the
860*67e74705SXin Li ///    returned vector.
861*67e74705SXin Li ///    01: Bits [63:32] of the source are copied to bits [95:64] of the
862*67e74705SXin Li ///    returned vector.
863*67e74705SXin Li ///    10: Bits [95:64] of the source are copied to bits [95:64] of the
864*67e74705SXin Li ///    returned vector.
865*67e74705SXin Li ///    11: Bits [127:96] of the source are copied to bits [95:64] of the
866*67e74705SXin Li ///    returned vector.
867*67e74705SXin Li ///    Bits [97:96]:
868*67e74705SXin Li ///    00: Bits [31:0] of the source are copied to bits [127:96] of the
869*67e74705SXin Li ///    returned vector.
870*67e74705SXin Li ///    01: Bits [63:32] of the source are copied to bits [127:96] of the
871*67e74705SXin Li ///    returned vector.
872*67e74705SXin Li ///    10: Bits [95:64] of the source are copied to bits [127:96] of the
873*67e74705SXin Li ///    returned vector.
874*67e74705SXin Li ///    11: Bits [127:96] of the source are copied to bits [127:96] of the
875*67e74705SXin Li ///    returned vector.
876*67e74705SXin Li /// \returns A 128-bit vector of [4 x float] containing the copied values.
877*67e74705SXin Li static __inline __m128 __DEFAULT_FN_ATTRS
_mm_permutevar_ps(__m128 __a,__m128i __c)878*67e74705SXin Li _mm_permutevar_ps(__m128 __a, __m128i __c)
879*67e74705SXin Li {
880*67e74705SXin Li   return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
881*67e74705SXin Li }
882*67e74705SXin Li 
883*67e74705SXin Li /// \brief Copies the values stored in a 256-bit vector of [8 x float] as
884*67e74705SXin Li ///    specified by the 256-bit integer vector operand.
885*67e74705SXin Li ///
886*67e74705SXin Li /// \headerfile <x86intrin.h>
887*67e74705SXin Li ///
888*67e74705SXin Li /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
889*67e74705SXin Li ///
890*67e74705SXin Li /// \param __a
891*67e74705SXin Li ///    A 256-bit vector of [8 x float].
892*67e74705SXin Li /// \param __c
893*67e74705SXin Li ///    A 256-bit integer vector operand specifying how the values are to be
894*67e74705SXin Li ///    copied.
895*67e74705SXin Li ///    Bits [1:0]:
896*67e74705SXin Li ///    00: Bits [31:0] of the source are copied to bits [31:0] of the
897*67e74705SXin Li ///    returned vector.
898*67e74705SXin Li ///    01: Bits [63:32] of the source are copied to bits [31:0] of the
899*67e74705SXin Li ///    returned vector.
900*67e74705SXin Li ///    10: Bits [95:64] of the source are copied to bits [31:0] of the
901*67e74705SXin Li ///    returned vector.
902*67e74705SXin Li ///    11: Bits [127:96] of the source are copied to bits [31:0] of the
903*67e74705SXin Li ///    returned vector.
904*67e74705SXin Li ///    Bits [33:32]:
905*67e74705SXin Li ///    00: Bits [31:0] of the source are copied to bits [63:32] of the
906*67e74705SXin Li ///    returned vector.
907*67e74705SXin Li ///    01: Bits [63:32] of the source are copied to bits [63:32] of the
908*67e74705SXin Li ///    returned vector.
909*67e74705SXin Li ///    10: Bits [95:64] of the source are copied to bits [63:32] of the
910*67e74705SXin Li ///    returned vector.
911*67e74705SXin Li ///    11: Bits [127:96] of the source are copied to bits [63:32] of the
912*67e74705SXin Li ///    returned vector.
913*67e74705SXin Li ///    Bits [65:64]:
914*67e74705SXin Li ///    00: Bits [31:0] of the source are copied to bits [95:64] of the
915*67e74705SXin Li ///    returned vector.
916*67e74705SXin Li ///    01: Bits [63:32] of the source are copied to bits [95:64] of the
917*67e74705SXin Li ///    returned vector.
918*67e74705SXin Li ///    10: Bits [95:64] of the source are copied to bits [95:64] of the
919*67e74705SXin Li ///    returned vector.
920*67e74705SXin Li ///    11: Bits [127:96] of the source are copied to bits [95:64] of the
921*67e74705SXin Li ///    returned vector.
922*67e74705SXin Li ///    Bits [97:96]:
923*67e74705SXin Li ///    00: Bits [31:0] of the source are copied to bits [127:96] of the
924*67e74705SXin Li ///    returned vector.
925*67e74705SXin Li ///    01: Bits [63:32] of the source are copied to bits [127:96] of the
926*67e74705SXin Li ///    returned vector.
927*67e74705SXin Li ///    10: Bits [95:64] of the source are copied to bits [127:96] of the
928*67e74705SXin Li ///    returned vector.
929*67e74705SXin Li ///    11: Bits [127:96] of the source are copied to bits [127:96] of the
930*67e74705SXin Li ///    returned vector.
931*67e74705SXin Li ///    Bits [129:128]:
932*67e74705SXin Li ///    00: Bits [159:128] of the source are copied to bits [159:128] of the
933*67e74705SXin Li ///    returned vector.
934*67e74705SXin Li ///    01: Bits [191:160] of the source are copied to bits [159:128] of the
935*67e74705SXin Li ///    returned vector.
936*67e74705SXin Li ///    10: Bits [223:192] of the source are copied to bits [159:128] of the
937*67e74705SXin Li ///    returned vector.
938*67e74705SXin Li ///    11: Bits [255:224] of the source are copied to bits [159:128] of the
939*67e74705SXin Li ///    returned vector.
940*67e74705SXin Li ///    Bits [161:160]:
941*67e74705SXin Li ///    00: Bits [159:128] of the source are copied to bits [191:160] of the
942*67e74705SXin Li ///    returned vector.
943*67e74705SXin Li ///    01: Bits [191:160] of the source are copied to bits [191:160] of the
944*67e74705SXin Li ///    returned vector.
945*67e74705SXin Li ///    10: Bits [223:192] of the source are copied to bits [191:160] of the
946*67e74705SXin Li ///    returned vector.
947*67e74705SXin Li ///    11: Bits [255:224] of the source are copied to bits [191:160] of the
948*67e74705SXin Li ///    returned vector.
949*67e74705SXin Li ///    Bits [193:192]:
950*67e74705SXin Li ///    00: Bits [159:128] of the source are copied to bits [223:192] of the
951*67e74705SXin Li ///    returned vector.
952*67e74705SXin Li ///    01: Bits [191:160] of the source are copied to bits [223:192] of the
953*67e74705SXin Li ///    returned vector.
954*67e74705SXin Li ///    10: Bits [223:192] of the source are copied to bits [223:192] of the
955*67e74705SXin Li ///    returned vector.
956*67e74705SXin Li ///    11: Bits [255:224] of the source are copied to bits [223:192] of the
957*67e74705SXin Li ///    returned vector.
958*67e74705SXin Li ///    Bits [225:224]:
959*67e74705SXin Li ///    00: Bits [159:128] of the source are copied to bits [255:224] of the
960*67e74705SXin Li ///    returned vector.
961*67e74705SXin Li ///    01: Bits [191:160] of the source are copied to bits [255:224] of the
962*67e74705SXin Li ///    returned vector.
963*67e74705SXin Li ///    10: Bits [223:192] of the source are copied to bits [255:224] of the
964*67e74705SXin Li ///    returned vector.
965*67e74705SXin Li ///    11: Bits [255:224] of the source are copied to bits [255:224] of the
966*67e74705SXin Li ///    returned vector.
967*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the copied values.
968*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_permutevar_ps(__m256 __a,__m256i __c)969*67e74705SXin Li _mm256_permutevar_ps(__m256 __a, __m256i __c)
970*67e74705SXin Li {
971*67e74705SXin Li   return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
972*67e74705SXin Li }
973*67e74705SXin Li 
/// \brief Copies the values in a 128-bit vector of [2 x double] as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
///
/// \param A
///    A 128-bit vector of [2 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be copied.
///    Bit [0]:
///    0: Bits [63:0] of the source are copied to bits [63:0] of the
///    returned vector.
///    1: Bits [127:64] of the source are copied to bits [63:0] of the
///    returned vector.
///    Bit [1]:
///    0: Bits [63:0] of the source are copied to bits [127:64] of the
///    returned vector.
///    1: Bits [127:64] of the source are copied to bits [127:64] of the
///    returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
///
/// \note This is a macro: \a C is expanded once per result element (twice),
///    so it should be a constant expression without side effects. Each
///    shuffle index is masked to 0 or 1; the undefined vector passed as the
///    second shuffle operand is only a placeholder.
#define _mm_permute_pd(A, C) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
                                   (__v2df)_mm_undefined_pd(), \
                                   ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
1004*67e74705SXin Li 
/// \brief Copies the values in a 256-bit vector of [4 x double] as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
///
/// \param A
///    A 256-bit vector of [4 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be copied.
///    Bit [0]:
///    0: Bits [63:0] of the source are copied to bits [63:0] of the
///    returned vector.
///    1: Bits [127:64] of the source are copied to bits [63:0] of the
///    returned vector.
///    Bit [1]:
///    0: Bits [63:0] of the source are copied to bits [127:64] of the
///    returned vector.
///    1: Bits [127:64] of the source are copied to bits [127:64] of the
///    returned vector.
///    Bit [2]:
///    0: Bits [191:128] of the source are copied to bits [191:128] of the
///    returned vector.
///    1: Bits [255:192] of the source are copied to bits [191:128] of the
///    returned vector.
///    Bit [3]:
///    0: Bits [191:128] of the source are copied to bits [255:192] of the
///    returned vector.
///    1: Bits [255:192] of the source are copied to bits [255:192] of the
///    returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
///
/// \note This is a macro: \a C is expanded once per result element (four
///    times), so it should be a constant expression without side effects.
///    The shuffle indices are 0 or 1 for the two low result elements and 2 or
///    3 for the two high ones; the undefined vector passed as the second
///    shuffle operand is only a placeholder.
#define _mm256_permute_pd(A, C) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
                                   (__v4df)_mm256_undefined_pd(), \
                                   0 + (((C) >> 0) & 0x1), \
                                   0 + (((C) >> 1) & 0x1), \
                                   2 + (((C) >> 2) & 0x1), \
                                   2 + (((C) >> 3) & 0x1)); })
1048*67e74705SXin Li 
1049*67e74705SXin Li /// \brief Copies the values in a 128-bit vector of [4 x float] as
1050*67e74705SXin Li ///    specified by the immediate integer operand.
1051*67e74705SXin Li ///
1052*67e74705SXin Li /// \headerfile <x86intrin.h>
1053*67e74705SXin Li ///
1054*67e74705SXin Li /// \code
1055*67e74705SXin Li /// __m128 _mm_permute_ps(__m128 A, const int C);
1056*67e74705SXin Li /// \endcode
1057*67e74705SXin Li ///
1058*67e74705SXin Li /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
1059*67e74705SXin Li ///
1060*67e74705SXin Li /// \param A
1061*67e74705SXin Li ///    A 128-bit vector of [4 x float].
1062*67e74705SXin Li /// \param C
1063*67e74705SXin Li ///    An immediate integer operand specifying how the values are to be copied.
1064*67e74705SXin Li ///    Bits [1:0]:
1065*67e74705SXin Li ///    00: Bits [31:0] of the source are copied to bits [31:0] of the
1066*67e74705SXin Li ///    returned vector.
1067*67e74705SXin Li ///    01: Bits [63:32] of the source are copied to bits [31:0] of the
1068*67e74705SXin Li ///    returned vector.
1069*67e74705SXin Li ///    10: Bits [95:64] of the source are copied to bits [31:0] of the
1070*67e74705SXin Li ///    returned vector.
1071*67e74705SXin Li ///    11: Bits [127:96] of the source are copied to bits [31:0] of the
1072*67e74705SXin Li ///    returned vector.
1073*67e74705SXin Li ///    Bits [3:2]:
1074*67e74705SXin Li ///    00: Bits [31:0] of the source are copied to bits [63:32] of the
1075*67e74705SXin Li ///    returned vector.
1076*67e74705SXin Li ///    01: Bits [63:32] of the source are copied to bits [63:32] of the
1077*67e74705SXin Li ///    returned vector.
1078*67e74705SXin Li ///    10: Bits [95:64] of the source are copied to bits [63:32] of the
1079*67e74705SXin Li ///    returned vector.
1080*67e74705SXin Li ///    11: Bits [127:96] of the source are copied to bits [63:32] of the
1081*67e74705SXin Li ///    returned vector.
1082*67e74705SXin Li ///    Bits [5:4]:
1083*67e74705SXin Li ///    00: Bits [31:0] of the source are copied to bits [95:64] of the
1084*67e74705SXin Li ///    returned vector.
1085*67e74705SXin Li ///    01: Bits [63:32] of the source are copied to bits [95:64] of the
1086*67e74705SXin Li ///    returned vector.
1087*67e74705SXin Li ///    10: Bits [95:64] of the source are copied to bits [95:64] of the
1088*67e74705SXin Li ///    returned vector.
1089*67e74705SXin Li ///    11: Bits [127:96] of the source are copied to bits [95:64] of the
1090*67e74705SXin Li ///    returned vector.
1091*67e74705SXin Li ///    Bits [7:6]:
1092*67e74705SXin Li ///    00: Bits [31:0] of the source are copied to bits [127:96] of the
1093*67e74705SXin Li ///    returned vector.
1094*67e74705SXin Li ///    01: Bits [63:32] of the source are copied to bits [127:96] of the
1095*67e74705SXin Li ///    returned vector.
1096*67e74705SXin Li ///    10: Bits [95:64] of the source are copied to bits [127:96] of the
1097*67e74705SXin Li ///    returned vector.
1098*67e74705SXin Li ///    11: Bits [127:96] of the source are copied to bits [127:96] of the
1099*67e74705SXin Li ///    returned vector.
1100*67e74705SXin Li /// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) __extension__ ({ \
  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
                                  (__v4sf)_mm_undefined_ps(), \
                                  (((C) >> 0) & 0x3), (((C) >> 2) & 0x3), \
                                  (((C) >> 4) & 0x3), (((C) >> 6) & 0x3)); })
1106*67e74705SXin Li 
/// \brief Copies the values in a 256-bit vector of [8 x float] as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMILPS instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be copied.
///    Each 2-bit field of C is applied twice: once to select an element from
///    the lower 128 bits of the source, and once to select an element from the
///    upper 128 bits of the source.
///    Lower 128 bits:
///    Bits [1:0]:
///    00: Bits [31:0] of the source are copied to bits [31:0] of the
///    returned vector.
///    01: Bits [63:32] of the source are copied to bits [31:0] of the
///    returned vector.
///    10: Bits [95:64] of the source are copied to bits [31:0] of the
///    returned vector.
///    11: Bits [127:96] of the source are copied to bits [31:0] of the
///    returned vector.
///    Bits [3:2]:
///    00: Bits [31:0] of the source are copied to bits [63:32] of the
///    returned vector.
///    01: Bits [63:32] of the source are copied to bits [63:32] of the
///    returned vector.
///    10: Bits [95:64] of the source are copied to bits [63:32] of the
///    returned vector.
///    11: Bits [127:96] of the source are copied to bits [63:32] of the
///    returned vector.
///    Bits [5:4]:
///    00: Bits [31:0] of the source are copied to bits [95:64] of the
///    returned vector.
///    01: Bits [63:32] of the source are copied to bits [95:64] of the
///    returned vector.
///    10: Bits [95:64] of the source are copied to bits [95:64] of the
///    returned vector.
///    11: Bits [127:96] of the source are copied to bits [95:64] of the
///    returned vector.
///    Bits [7:6]:
///    00: Bits [31:0] of the source are copied to bits [127:96] of the
///    returned vector.
///    01: Bits [63:32] of the source are copied to bits [127:96] of the
///    returned vector.
///    10: Bits [95:64] of the source are copied to bits [127:96] of the
///    returned vector.
///    11: Bits [127:96] of the source are copied to bits [127:96] of the
///    returned vector.
///    Upper 128 bits (controlled by the same fields of C):
///    Bits [1:0]:
///    00: Bits [159:128] of the source are copied to bits [159:128] of the
///    returned vector.
///    01: Bits [191:160] of the source are copied to bits [159:128] of the
///    returned vector.
///    10: Bits [223:192] of the source are copied to bits [159:128] of the
///    returned vector.
///    11: Bits [255:224] of the source are copied to bits [159:128] of the
///    returned vector.
///    Bits [3:2]:
///    00: Bits [159:128] of the source are copied to bits [191:160] of the
///    returned vector.
///    01: Bits [191:160] of the source are copied to bits [191:160] of the
///    returned vector.
///    10: Bits [223:192] of the source are copied to bits [191:160] of the
///    returned vector.
///    11: Bits [255:224] of the source are copied to bits [191:160] of the
///    returned vector.
///    Bits [5:4]:
///    00: Bits [159:128] of the source are copied to bits [223:192] of the
///    returned vector.
///    01: Bits [191:160] of the source are copied to bits [223:192] of the
///    returned vector.
///    10: Bits [223:192] of the source are copied to bits [223:192] of the
///    returned vector.
///    11: Bits [255:224] of the source are copied to bits [223:192] of the
///    returned vector.
///    Bits [7:6]:
///    00: Bits [159:128] of the source are copied to bits [255:224] of the
///    returned vector.
///    01: Bits [191:160] of the source are copied to bits [255:224] of the
///    returned vector.
///    10: Bits [223:192] of the source are copied to bits [255:224] of the
///    returned vector.
///    11: Bits [255:224] of the source are copied to bits [255:224] of the
///    returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
                                  (__v8sf)_mm256_undefined_ps(), \
                                  0 + (((C) >> 0) & 0x3), \
                                  0 + (((C) >> 2) & 0x3), \
                                  0 + (((C) >> 4) & 0x3), \
                                  0 + (((C) >> 6) & 0x3), \
                                  4 + (((C) >> 0) & 0x3), \
                                  4 + (((C) >> 2) & 0x3), \
                                  4 + (((C) >> 4) & 0x3), \
                                  4 + (((C) >> 6) & 0x3)); })
1206*67e74705SXin Li 
/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2F128 instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted.
///    Bits [1:0]:
///    00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
///    destination.
///    01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
///    destination.
///    10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
///    destination.
///    11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
///    destination.
///    Bits [5:4]:
///    00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
///    destination.
///    01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
///    destination.
///    10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
///    destination.
///    11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                           (__v4df)(__m256d)(V2), (M)); })
1247*67e74705SXin Li 
/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2F128 instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted.
///    Bits [1:0]:
///    00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
///    destination.
///    01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
///    destination.
///    10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
///    destination.
///    11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
///    destination.
///    Bits [5:4]:
///    00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
///    destination.
///    01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
///    destination.
///    10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
///    destination.
///    11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
///    destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                          (__v8sf)(__m256)(V2), (M)); })
1288*67e74705SXin Li 
/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2F128 instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be copied.
///    Bits [1:0]:
///    00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
///    destination.
///    01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
///    destination.
///    10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
///    destination.
///    11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
///    destination.
///    Bits [5:4]:
///    00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
///    destination.
///    01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
///    destination.
///    10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
///    destination.
///    11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
///    destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                           (__v8si)(__m256i)(V2), (M)); })
1328*67e74705SXin Li 
/* Vector Blend */
/// \brief Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VBLENDPD instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand V1 is copied to the same position in the destination.
///    When a mask bit is 1, the corresponding 64-bit element in operand V2 is
///    copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
                                   (__v4df)(__m256d)(V2), \
                                   (((M) & 0x01) ? 4 : 0), \
                                   (((M) & 0x02) ? 5 : 1), \
                                   (((M) & 0x04) ? 6 : 2), \
                                   (((M) & 0x08) ? 7 : 3)); })
1361*67e74705SXin Li 
/// \brief Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VBLENDPS instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand V1 is copied to the same position in the destination.
///    When a mask bit is 1, the corresponding 32-bit element in operand V2 is
///    copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), \
                                  (((M) & 0x01) ?  8 : 0), \
                                  (((M) & 0x02) ?  9 : 1), \
                                  (((M) & 0x04) ? 10 : 2), \
                                  (((M) & 0x08) ? 11 : 3), \
                                  (((M) & 0x10) ? 12 : 4), \
                                  (((M) & 0x20) ? 13 : 5), \
                                  (((M) & 0x40) ? 14 : 6), \
                                  (((M) & 0x80) ? 15 : 7)); })
1397*67e74705SXin Li 
1398*67e74705SXin Li /// \brief Merges 64-bit double-precision data values stored in either of the
1399*67e74705SXin Li ///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1400*67e74705SXin Li ///    operand.
1401*67e74705SXin Li ///
1402*67e74705SXin Li /// \headerfile <x86intrin.h>
1403*67e74705SXin Li ///
1404*67e74705SXin Li /// This intrinsic corresponds to the \c VBLENDVPD / BLENDVPD instruction.
1405*67e74705SXin Li ///
1406*67e74705SXin Li /// \param __a
1407*67e74705SXin Li ///    A 256-bit vector of [4 x double].
1408*67e74705SXin Li /// \param __b
1409*67e74705SXin Li ///    A 256-bit vector of [4 x double].
1410*67e74705SXin Li /// \param __c
1411*67e74705SXin Li ///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1412*67e74705SXin Li ///    how the values are to be copied. The position of the mask bit corresponds
1413*67e74705SXin Li ///    to the most significant bit of a copied value. When a mask bit is 0, the
1414*67e74705SXin Li ///    corresponding 64-bit element in operand __a is copied to the same
1415*67e74705SXin Li ///    position in the destination. When a mask bit is 1, the corresponding
1416*67e74705SXin Li ///    64-bit element in operand __b is copied to the same position in the
1417*67e74705SXin Li ///    destination.
1418*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the copied values.
1419*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_blendv_pd(__m256d __a,__m256d __b,__m256d __c)1420*67e74705SXin Li _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1421*67e74705SXin Li {
1422*67e74705SXin Li   return (__m256d)__builtin_ia32_blendvpd256(
1423*67e74705SXin Li     (__v4df)__a, (__v4df)__b, (__v4df)__c);
1424*67e74705SXin Li }
1425*67e74705SXin Li 
1426*67e74705SXin Li /// \brief Merges 32-bit single-precision data values stored in either of the
1427*67e74705SXin Li ///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1428*67e74705SXin Li ///    operand.
1429*67e74705SXin Li ///
1430*67e74705SXin Li /// \headerfile <x86intrin.h>
1431*67e74705SXin Li ///
1432*67e74705SXin Li /// This intrinsic corresponds to the \c VBLENDVPS / BLENDVPS instruction.
1433*67e74705SXin Li ///
1434*67e74705SXin Li /// \param __a
1435*67e74705SXin Li ///    A 256-bit vector of [8 x float].
1436*67e74705SXin Li /// \param __b
1437*67e74705SXin Li ///    A 256-bit vector of [8 x float].
1438*67e74705SXin Li /// \param __c
1439*67e74705SXin Li ///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1440*67e74705SXin Li ///    and 31 specifying how the values are to be copied. The position of the
1441*67e74705SXin Li ///    mask bit corresponds to the most significant bit of a copied value. When
1442*67e74705SXin Li ///    a mask bit is 0, the corresponding 32-bit element in operand __a is
1443*67e74705SXin Li ///    copied to the same position in the destination. When a mask bit is 1, the
1444*67e74705SXin Li ///    corresponding 32-bit element in operand __b is copied to the same
1445*67e74705SXin Li ///    position in the destination.
1446*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the copied values.
1447*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_blendv_ps(__m256 __a,__m256 __b,__m256 __c)1448*67e74705SXin Li _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1449*67e74705SXin Li {
1450*67e74705SXin Li   return (__m256)__builtin_ia32_blendvps256(
1451*67e74705SXin Li     (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1452*67e74705SXin Li }
1453*67e74705SXin Li 
/* Vector Dot Product */
/// \brief Computes two dot products in parallel, using the lower and upper
///    halves of two [8 x float] vectors as input to the two computations, and
///    returning the two dot products in the lower and upper halves of the
///    [8 x float] result. The immediate integer operand controls which
///    input elements will contribute to the dot product, and where the final
///    results are returned. In general, for each dot product, the four
///    corresponding elements of the input vectors are multiplied; the first
///    two and second two products are summed, then the two sums are added to
///    form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VDPPS instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. Bits [7:4] determine which elements of
///    the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] subvector. If a bit is set, the corresponding elements from the
///    two input vectors are used as an input for dot product; otherwise that
///    input is treated as zero. Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                 (__v8sf)(__m256)(V2), (M)); })
1494*67e74705SXin Li 
/* Vector shuffle */
/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand. The four selected elements in
///    each operand are copied to the destination according to the bits
///    specified in the immediate operand. The selected elements from the first
///    256-bit operand are copied to bits [63:0] and bits [191:128] of the
///    destination, and the selected elements from the second 256-bit operand
///    are copied to bits [127:64] and bits [255:192] of the destination. For
///    example, if bits [7:0] of the immediate operand contain a value of 0xFF,
///    the 256-bit destination vector would contain the following values: b[7],
///    b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the \c VSHUFPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from a and b. Bits [3:0] specify the values copied from operand a.
///    Bits [7:4] specify the values copied from operand b.
///    The destinations within the 256-bit destination are assigned values as
///    follows, according to the bit value assignments described below:
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///    destination.
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///    destination.
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///    destination.
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///    the destination.
///    Bit value assignments:
///    00: Bits [31:0] and [159:128] are copied from the selected operand.
///    01: Bits [63:32] and [191:160] are copied from the selected operand.
///    10: Bits [95:64] and [223:192] are copied from the selected operand.
///    11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), \
                                  0  + (((mask) >> 0) & 0x3), \
                                  0  + (((mask) >> 2) & 0x3), \
                                  8  + (((mask) >> 4) & 0x3), \
                                  8  + (((mask) >> 6) & 0x3), \
                                  4  + (((mask) >> 0) & 0x3), \
                                  4  + (((mask) >> 2) & 0x3), \
                                  12 + (((mask) >> 4) & 0x3), \
                                  12 + (((mask) >> 6) & 0x3)); })
1554*67e74705SXin Li 
/// \brief Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand. The selected
///    elements from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] in the destination, and the selected elements from the
///    second 256-bit operand are copied to bits [127:64] and bits [255:192] in
///    the destination. For example, if bits [3:0] of the immediate operand
///    contain a value of 0xF, the 256-bit destination vector would contain the
///    following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the \c VSHUFPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    a and b:
///    Bit [0]=0: Bits [63:0] are copied from a to bits [63:0] of the
///    destination.
///    Bit [0]=1: Bits [127:64] are copied from a to bits [63:0] of the
///    destination.
///    Bit [1]=0: Bits [63:0] are copied from b to bits [127:64] of the
///    destination.
///    Bit [1]=1: Bits [127:64] are copied from b to bits [127:64] of the
///    destination.
///    Bit [2]=0: Bits [191:128] are copied from a to bits [191:128] of the
///    destination.
///    Bit [2]=1: Bits [255:192] are copied from a to bits [191:128] of the
///    destination.
///    Bit [3]=0: Bits [191:128] are copied from b to bits [255:192] of the
///    destination.
///    Bit [3]=1: Bits [255:192] are copied from b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), \
                                   0 + (((mask) >> 0) & 0x1), \
                                   4 + (((mask) >> 1) & 0x1), \
                                   2 + (((mask) >> 2) & 0x1), \
                                   6 + (((mask) >> 3) & 0x1)); })
1603*67e74705SXin Li 
/* Compare */
/* Predicate encodings for the immediate operand of the cmp intrinsics.
 * Name suffixes: O = ordered, U = unordered, S = signaling, Q = non-signaling
 * (quiet). */
#define _CMP_EQ_OQ    0x00 /* Equal; ordered, non-signaling */
#define _CMP_LT_OS    0x01 /* Less-than; ordered, signaling */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal; ordered, signaling */
#define _CMP_UNORD_Q  0x03 /* Unordered; non-signaling */
#define _CMP_NEQ_UQ   0x04 /* Not-equal; unordered, non-signaling */
#define _CMP_NLT_US   0x05 /* Not-less-than; unordered, signaling */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal; unordered, signaling */
#define _CMP_ORD_Q    0x07 /* Ordered; non-signaling */
#define _CMP_EQ_UQ    0x08 /* Equal; unordered, non-signaling */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal; unordered, signaling */
#define _CMP_NGT_US   0x0a /* Not-greater-than; unordered, signaling */
#define _CMP_FALSE_OQ 0x0b /* False; ordered, non-signaling */
#define _CMP_NEQ_OQ   0x0c /* Not-equal; ordered, non-signaling */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal; ordered, signaling */
#define _CMP_GT_OS    0x0e /* Greater-than; ordered, signaling */
#define _CMP_TRUE_UQ  0x0f /* True; unordered, non-signaling */
#define _CMP_EQ_OS    0x10 /* Equal; ordered, signaling */
#define _CMP_LT_OQ    0x11 /* Less-than; ordered, non-signaling */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal; ordered, non-signaling */
#define _CMP_UNORD_S  0x13 /* Unordered; signaling */
#define _CMP_NEQ_US   0x14 /* Not-equal; unordered, signaling */
#define _CMP_NLT_UQ   0x15 /* Not-less-than; unordered, non-signaling */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal; unordered, non-signaling */
#define _CMP_ORD_S    0x17 /* Ordered; signaling */
#define _CMP_EQ_US    0x18 /* Equal; unordered, signaling */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal; unordered, non-signaling */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than; unordered, non-signaling */
#define _CMP_FALSE_OS 0x1b /* False; ordered, signaling */
#define _CMP_NEQ_OS   0x1c /* Not-equal; ordered, signaling */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal; ordered, non-signaling */
#define _CMP_GT_OQ    0x1e /* Greater-than; ordered, non-signaling */
#define _CMP_TRUE_US  0x1f /* True; unordered, signaling */
1637*67e74705SXin Li 
/// \brief Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. Returns a [2 x double] vector consisting of
///    two doubles corresponding to the two comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Pass one of the _CMP_* constants defined in this
///    header; they distinguish the ordered/unordered and signaling/
///    non-signaling variants of each predicate:
///    00h, 08h, 10h, 18h: Equal
///    01h, 11h: Less than
///    09h, 19h: Not greater than or equal
///    02h, 12h: Less than or equal
///    0Ah, 1Ah: Not greater than
///    03h, 13h: Unordered
///    0Bh, 1Bh: False
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 15h: Not less than
///    0Dh, 1Dh: Greater than or equal
///    06h, 16h: Not less than or equal
///    0Eh, 1Eh: Greater than
///    07h, 17h: Ordered
///    0Fh, 1Fh: True
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmppd( \
      (__v2df)(__m128d)(a), \
      (__v2df)(__m128d)(b), \
      (c)); })
1673*67e74705SXin Li 
/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand. Returns a [4 x float] vector consisting of four floats
///    corresponding to the four comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Pass one of the _CMP_* constants defined in this
///    header; they distinguish the ordered/unordered and signaling/
///    non-signaling variants of each predicate:
///    00h, 08h, 10h, 18h: Equal
///    01h, 11h: Less than
///    09h, 19h: Not greater than or equal
///    02h, 12h: Less than or equal
///    0Ah, 1Ah: Not greater than
///    03h, 13h: Unordered
///    0Bh, 1Bh: False
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 15h: Not less than
///    0Dh, 1Dh: Greater than or equal
///    06h, 16h: Not less than or equal
///    0Eh, 1Eh: Greater than
///    07h, 17h: Ordered
///    0Fh, 1Fh: True
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpps( \
      (__v4sf)(__m128)(a), \
      (__v4sf)(__m128)(b), \
      (c)); })
1709*67e74705SXin Li 
/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand. Returns a [4 x double] vector consisting of
///    four doubles corresponding to the four comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Pass one of the _CMP_* constants defined in this
///    header; they distinguish the ordered/unordered and signaling/
///    non-signaling variants of each predicate:
///    00h, 08h, 10h, 18h: Equal
///    01h, 11h: Less than
///    09h, 19h: Not greater than or equal
///    02h, 12h: Less than or equal
///    0Ah, 1Ah: Not greater than
///    03h, 13h: Unordered
///    0Bh, 1Bh: False
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 15h: Not less than
///    0Dh, 1Dh: Greater than or equal
///    06h, 16h: Not less than or equal
///    0Eh, 1Eh: Greater than
///    07h, 17h: Ordered
///    0Fh, 1Fh: True
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  (__m256d)__builtin_ia32_cmppd256( \
      (__v4df)(__m256d)(a), \
      (__v4df)(__m256d)(b), \
      (c)); })
1745*67e74705SXin Li 
/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand. Returns a [8 x float] vector consisting of eight floats
///    corresponding to the eight comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Pass one of the _CMP_* constants defined in this
///    header; they distinguish the ordered/unordered and signaling/
///    non-signaling variants of each predicate:
///    00h, 08h, 10h, 18h: Equal
///    01h, 11h: Less than
///    09h, 19h: Not greater than or equal
///    02h, 12h: Less than or equal
///    0Ah, 1Ah: Not greater than
///    03h, 13h: Unordered
///    0Bh, 1Bh: False
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 15h: Not less than
///    0Dh, 1Dh: Greater than or equal
///    06h, 16h: Not less than or equal
///    0Eh, 1Eh: Greater than
///    07h, 17h: Ordered
///    0Fh, 1Fh: True
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  (__m256)__builtin_ia32_cmpps256( \
      (__v8sf)(__m256)(a), \
      (__v8sf)(__m256)(b), \
      (c)); })
1781*67e74705SXin Li 
/// \brief Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. If the result is true, all 64 bits of the
///    destination vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPSD / CMPSD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Pass one of the _CMP_* constants defined in this
///    header; they distinguish the ordered/unordered and signaling/
///    non-signaling variants of each predicate:
///    00h, 08h, 10h, 18h: Equal
///    01h, 11h: Less than
///    09h, 19h: Not greater than or equal
///    02h, 12h: Less than or equal
///    0Ah, 1Ah: Not greater than
///    03h, 13h: Unordered
///    0Bh, 1Bh: False
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 15h: Not less than
///    0Dh, 1Dh: Greater than or equal
///    06h, 16h: Not less than or equal
///    0Eh, 1Eh: Greater than
///    07h, 17h: Ordered
///    0Fh, 1Fh: True
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmpsd( \
      (__v2df)(__m128d)(a), \
      (__v2df)(__m128d)(b), \
      (c)); })
1816*67e74705SXin Li 
/// \brief Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand. If the result is true, all 32 bits of the destination
///    vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPSS / CMPSS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Pass one of the _CMP_* constants defined in this
///    header; they distinguish the ordered/unordered and signaling/
///    non-signaling variants of each predicate:
///    00h, 08h, 10h, 18h: Equal
///    01h, 11h: Less than
///    09h, 19h: Not greater than or equal
///    02h, 12h: Less than or equal
///    0Ah, 1Ah: Not greater than
///    03h, 13h: Unordered
///    0Bh, 1Bh: False
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 15h: Not less than
///    0Dh, 1Dh: Greater than or equal
///    06h, 16h: Not less than or equal
///    0Eh, 1Eh: Greater than
///    07h, 17h: Ordered
///    0Fh, 1Fh: True
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpss( \
      (__v4sf)(__m128)(a), \
      (__v4sf)(__m128)(b), \
      (c)); })
1851*67e74705SXin Li 
1852*67e74705SXin Li /// \brief Takes a [8 x i32] vector and returns the vector element value
1853*67e74705SXin Li ///    indexed by the immediate constant operand.
1854*67e74705SXin Li ///
1855*67e74705SXin Li /// \headerfile <x86intrin.h>
1856*67e74705SXin Li ///
1857*67e74705SXin Li /// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
1858*67e74705SXin Li ///   EXTRACTF128+COMPOSITE instruction.
1859*67e74705SXin Li ///
1860*67e74705SXin Li /// \param __a
1861*67e74705SXin Li ///    A 256-bit vector of [8 x i32].
1862*67e74705SXin Li /// \param __imm
1863*67e74705SXin Li ///    An immediate integer operand with bits [2:0] determining which vector
1864*67e74705SXin Li ///    element is extracted and returned.
1865*67e74705SXin Li /// \returns A 32-bit integer containing the extracted 32 bits of extended
1866*67e74705SXin Li ///    packed data.
1867*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_extract_epi32(__m256i __a,const int __imm)1868*67e74705SXin Li _mm256_extract_epi32(__m256i __a, const int __imm)
1869*67e74705SXin Li {
1870*67e74705SXin Li   __v8si __b = (__v8si)__a;
1871*67e74705SXin Li   return __b[__imm & 7];
1872*67e74705SXin Li }
1873*67e74705SXin Li 
1874*67e74705SXin Li /// \brief Takes a [16 x i16] vector and returns the vector element value
1875*67e74705SXin Li ///    indexed by the immediate constant operand.
1876*67e74705SXin Li ///
1877*67e74705SXin Li /// \headerfile <x86intrin.h>
1878*67e74705SXin Li ///
1879*67e74705SXin Li /// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
1880*67e74705SXin Li ///    EXTRACTF128+COMPOSITE instruction.
1881*67e74705SXin Li ///
1882*67e74705SXin Li /// \param __a
1883*67e74705SXin Li ///    A 256-bit integer vector of [16 x i16].
1884*67e74705SXin Li /// \param __imm
1885*67e74705SXin Li ///    An immediate integer operand with bits [3:0] determining which vector
1886*67e74705SXin Li ///    element is extracted and returned.
1887*67e74705SXin Li /// \returns A 32-bit integer containing the extracted 16 bits of zero extended
1888*67e74705SXin Li ///    packed data.
1889*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_extract_epi16(__m256i __a,const int __imm)1890*67e74705SXin Li _mm256_extract_epi16(__m256i __a, const int __imm)
1891*67e74705SXin Li {
1892*67e74705SXin Li   __v16hi __b = (__v16hi)__a;
1893*67e74705SXin Li   return (unsigned short)__b[__imm & 15];
1894*67e74705SXin Li }
1895*67e74705SXin Li 
1896*67e74705SXin Li /// \brief Takes a [32 x i8] vector and returns the vector element value
1897*67e74705SXin Li ///    indexed by the immediate constant operand.
1898*67e74705SXin Li ///
1899*67e74705SXin Li /// \headerfile <x86intrin.h>
1900*67e74705SXin Li ///
1901*67e74705SXin Li /// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
1902*67e74705SXin Li ///    EXTRACTF128+COMPOSITE instruction.
1903*67e74705SXin Li ///
1904*67e74705SXin Li /// \param __a
1905*67e74705SXin Li ///    A 256-bit integer vector of [32 x i8].
1906*67e74705SXin Li /// \param __imm
1907*67e74705SXin Li ///    An immediate integer operand with bits [4:0] determining which vector
1908*67e74705SXin Li ///    element is extracted and returned.
1909*67e74705SXin Li /// \returns A 32-bit integer containing the extracted 8 bits of zero extended
1910*67e74705SXin Li ///    packed data.
1911*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_extract_epi8(__m256i __a,const int __imm)1912*67e74705SXin Li _mm256_extract_epi8(__m256i __a, const int __imm)
1913*67e74705SXin Li {
1914*67e74705SXin Li   __v32qi __b = (__v32qi)__a;
1915*67e74705SXin Li   return (unsigned char)__b[__imm & 31];
1916*67e74705SXin Li }
1917*67e74705SXin Li 
1918*67e74705SXin Li #ifdef __x86_64__
1919*67e74705SXin Li /// \brief Takes a [4 x i64] vector and returns the vector element value
1920*67e74705SXin Li ///    indexed by the immediate constant operand.
1921*67e74705SXin Li ///
1922*67e74705SXin Li /// \headerfile <x86intrin.h>
1923*67e74705SXin Li ///
1924*67e74705SXin Li /// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
1925*67e74705SXin Li ///    EXTRACTF128+COMPOSITE instruction.
1926*67e74705SXin Li ///
1927*67e74705SXin Li /// \param __a
1928*67e74705SXin Li ///    A 256-bit integer vector of [4 x i64].
1929*67e74705SXin Li /// \param __imm
1930*67e74705SXin Li ///    An immediate integer operand with bits [1:0] determining which vector
1931*67e74705SXin Li ///    element is extracted and returned.
1932*67e74705SXin Li /// \returns A 64-bit integer containing the extracted 64 bits of extended
1933*67e74705SXin Li ///    packed data.
1934*67e74705SXin Li static __inline long long  __DEFAULT_FN_ATTRS
_mm256_extract_epi64(__m256i __a,const int __imm)1935*67e74705SXin Li _mm256_extract_epi64(__m256i __a, const int __imm)
1936*67e74705SXin Li {
1937*67e74705SXin Li   __v4di __b = (__v4di)__a;
1938*67e74705SXin Li   return __b[__imm & 3];
1939*67e74705SXin Li }
1940*67e74705SXin Li #endif
1941*67e74705SXin Li 
1942*67e74705SXin Li /// \brief Takes a [8 x i32] vector and replaces the vector element value
1943*67e74705SXin Li ///    indexed by the immediate constant operand by a new value. Returns the
1944*67e74705SXin Li ///    modified vector.
1945*67e74705SXin Li ///
1946*67e74705SXin Li /// \headerfile <x86intrin.h>
1947*67e74705SXin Li ///
1948*67e74705SXin Li /// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
1949*67e74705SXin Li ///    INSERTF128+COMPOSITE instruction.
1950*67e74705SXin Li ///
1951*67e74705SXin Li /// \param __a
1952*67e74705SXin Li ///    A vector of [8 x i32] to be used by the insert operation.
1953*67e74705SXin Li /// \param __b
1954*67e74705SXin Li ///    An integer value. The replacement value for the insert operation.
1955*67e74705SXin Li /// \param __imm
1956*67e74705SXin Li ///    An immediate integer specifying the index of the vector element to be
1957*67e74705SXin Li ///    replaced.
1958*67e74705SXin Li /// \returns A copy of vector __a, after replacing its element indexed by __imm
1959*67e74705SXin Li ///     with __b.
1960*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi32(__m256i __a,int __b,int const __imm)1961*67e74705SXin Li _mm256_insert_epi32(__m256i __a, int __b, int const __imm)
1962*67e74705SXin Li {
1963*67e74705SXin Li   __v8si __c = (__v8si)__a;
1964*67e74705SXin Li   __c[__imm & 7] = __b;
1965*67e74705SXin Li   return (__m256i)__c;
1966*67e74705SXin Li }
1967*67e74705SXin Li 
1968*67e74705SXin Li 
1969*67e74705SXin Li /// \brief Takes a [16 x i16] vector and replaces the vector element value
1970*67e74705SXin Li ///    indexed by the immediate constant operand with a new value. Returns the
1971*67e74705SXin Li ///    modified vector.
1972*67e74705SXin Li ///
1973*67e74705SXin Li /// \headerfile <x86intrin.h>
1974*67e74705SXin Li ///
1975*67e74705SXin Li /// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
1976*67e74705SXin Li ///    INSERTF128+COMPOSITE instruction.
1977*67e74705SXin Li ///
1978*67e74705SXin Li /// \param __a
1979*67e74705SXin Li ///    A vector of [16 x i16] to be used by the insert operation.
1980*67e74705SXin Li /// \param __b
1981*67e74705SXin Li ///    An i16 integer value. The replacement value for the insert operation.
1982*67e74705SXin Li /// \param __imm
1983*67e74705SXin Li ///    An immediate integer specifying the index of the vector element to be
1984*67e74705SXin Li ///    replaced.
1985*67e74705SXin Li /// \returns A copy of vector __a, after replacing its element indexed by __imm
1986*67e74705SXin Li ///     with __b.
1987*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi16(__m256i __a,int __b,int const __imm)1988*67e74705SXin Li _mm256_insert_epi16(__m256i __a, int __b, int const __imm)
1989*67e74705SXin Li {
1990*67e74705SXin Li   __v16hi __c = (__v16hi)__a;
1991*67e74705SXin Li   __c[__imm & 15] = __b;
1992*67e74705SXin Li   return (__m256i)__c;
1993*67e74705SXin Li }
1994*67e74705SXin Li 
1995*67e74705SXin Li /// \brief Takes a [32 x i8] vector and replaces the vector element value
1996*67e74705SXin Li ///    indexed by the immediate constant operand with a new value. Returns the
1997*67e74705SXin Li ///    modified vector.
1998*67e74705SXin Li ///
1999*67e74705SXin Li /// \headerfile <x86intrin.h>
2000*67e74705SXin Li ///
2001*67e74705SXin Li /// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
2002*67e74705SXin Li ///    INSERTF128+COMPOSITE instruction.
2003*67e74705SXin Li ///
2004*67e74705SXin Li /// \param __a
2005*67e74705SXin Li ///    A vector of [32 x i8] to be used by the insert operation.
2006*67e74705SXin Li /// \param __b
2007*67e74705SXin Li ///    An i8 integer value. The replacement value for the insert operation.
2008*67e74705SXin Li /// \param __imm
2009*67e74705SXin Li ///    An immediate integer specifying the index of the vector element to be
2010*67e74705SXin Li ///    replaced.
2011*67e74705SXin Li /// \returns A copy of vector __a, after replacing its element indexed by __imm
2012*67e74705SXin Li ///    with __b.
2013*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi8(__m256i __a,int __b,int const __imm)2014*67e74705SXin Li _mm256_insert_epi8(__m256i __a, int __b, int const __imm)
2015*67e74705SXin Li {
2016*67e74705SXin Li   __v32qi __c = (__v32qi)__a;
2017*67e74705SXin Li   __c[__imm & 31] = __b;
2018*67e74705SXin Li   return (__m256i)__c;
2019*67e74705SXin Li }
2020*67e74705SXin Li 
2021*67e74705SXin Li #ifdef __x86_64__
2022*67e74705SXin Li /// \brief Takes a [4 x i64] vector and replaces the vector element value
2023*67e74705SXin Li ///    indexed by the immediate constant operand with a new value. Returns the
2024*67e74705SXin Li ///    modified vector.
2025*67e74705SXin Li ///
2026*67e74705SXin Li /// \headerfile <x86intrin.h>
2027*67e74705SXin Li ///
2028*67e74705SXin Li /// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
2029*67e74705SXin Li ///    INSERTF128+COMPOSITE instruction.
2030*67e74705SXin Li ///
2031*67e74705SXin Li /// \param __a
2032*67e74705SXin Li ///    A vector of [4 x i64] to be used by the insert operation.
2033*67e74705SXin Li /// \param __b
2034*67e74705SXin Li ///    A 64-bit integer value. The replacement value for the insert operation.
2035*67e74705SXin Li /// \param __imm
2036*67e74705SXin Li ///    An immediate integer specifying the index of the vector element to be
2037*67e74705SXin Li ///    replaced.
2038*67e74705SXin Li /// \returns A copy of vector __a, after replacing its element indexed by __imm
2039*67e74705SXin Li ///     with __b.
2040*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi64(__m256i __a,long long __b,int const __imm)2041*67e74705SXin Li _mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
2042*67e74705SXin Li {
2043*67e74705SXin Li   __v4di __c = (__v4di)__a;
2044*67e74705SXin Li   __c[__imm & 3] = __b;
2045*67e74705SXin Li   return (__m256i)__c;
2046*67e74705SXin Li }
2047*67e74705SXin Li #endif
2048*67e74705SXin Li 
2049*67e74705SXin Li /* Conversion */
2050*67e74705SXin Li /// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
2051*67e74705SXin Li ///
2052*67e74705SXin Li /// \headerfile <x86intrin.h>
2053*67e74705SXin Li ///
2054*67e74705SXin Li /// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction.
2055*67e74705SXin Li ///
2056*67e74705SXin Li /// \param __a
2057*67e74705SXin Li ///    A 128-bit integer vector of [4 x i32].
2058*67e74705SXin Li /// \returns A 256-bit vector of [4 x double] containing the converted values.
2059*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_cvtepi32_pd(__m128i __a)2060*67e74705SXin Li _mm256_cvtepi32_pd(__m128i __a)
2061*67e74705SXin Li {
2062*67e74705SXin Li   return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2063*67e74705SXin Li }
2064*67e74705SXin Li 
2065*67e74705SXin Li /// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
2066*67e74705SXin Li ///
2067*67e74705SXin Li /// \headerfile <x86intrin.h>
2068*67e74705SXin Li ///
2069*67e74705SXin Li /// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
2070*67e74705SXin Li ///
2071*67e74705SXin Li /// \param __a
2072*67e74705SXin Li ///    A 256-bit integer vector.
2073*67e74705SXin Li /// \returns A 256-bit vector of [8 x float] containing the converted values.
2074*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_cvtepi32_ps(__m256i __a)2075*67e74705SXin Li _mm256_cvtepi32_ps(__m256i __a)
2076*67e74705SXin Li {
2077*67e74705SXin Li   return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
2078*67e74705SXin Li }
2079*67e74705SXin Li 
2080*67e74705SXin Li /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2081*67e74705SXin Li ///    [4 x float].
2082*67e74705SXin Li ///
2083*67e74705SXin Li /// \headerfile <x86intrin.h>
2084*67e74705SXin Li ///
2085*67e74705SXin Li /// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction.
2086*67e74705SXin Li ///
2087*67e74705SXin Li /// \param __a
2088*67e74705SXin Li ///    A 256-bit vector of [4 x double].
2089*67e74705SXin Li /// \returns A 128-bit vector of [4 x float] containing the converted values.
2090*67e74705SXin Li static __inline __m128 __DEFAULT_FN_ATTRS
_mm256_cvtpd_ps(__m256d __a)2091*67e74705SXin Li _mm256_cvtpd_ps(__m256d __a)
2092*67e74705SXin Li {
2093*67e74705SXin Li   return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2094*67e74705SXin Li }
2095*67e74705SXin Li 
2096*67e74705SXin Li /// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
2097*67e74705SXin Li ///
2098*67e74705SXin Li /// \headerfile <x86intrin.h>
2099*67e74705SXin Li ///
2100*67e74705SXin Li /// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
2101*67e74705SXin Li ///
2102*67e74705SXin Li /// \param __a
2103*67e74705SXin Li ///    A 256-bit vector of [8 x float].
2104*67e74705SXin Li /// \returns A 256-bit integer vector containing the converted values.
2105*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_cvtps_epi32(__m256 __a)2106*67e74705SXin Li _mm256_cvtps_epi32(__m256 __a)
2107*67e74705SXin Li {
2108*67e74705SXin Li   return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2109*67e74705SXin Li }
2110*67e74705SXin Li 
2111*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_cvtps_pd(__m128 __a)2112*67e74705SXin Li _mm256_cvtps_pd(__m128 __a)
2113*67e74705SXin Li {
2114*67e74705SXin Li   return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2115*67e74705SXin Li }
2116*67e74705SXin Li 
2117*67e74705SXin Li static __inline __m128i __DEFAULT_FN_ATTRS
_mm256_cvttpd_epi32(__m256d __a)2118*67e74705SXin Li _mm256_cvttpd_epi32(__m256d __a)
2119*67e74705SXin Li {
2120*67e74705SXin Li   return (__m128i)__builtin_convertvector((__v4df) __a, __v4si);
2121*67e74705SXin Li }
2122*67e74705SXin Li 
2123*67e74705SXin Li static __inline __m128i __DEFAULT_FN_ATTRS
_mm256_cvtpd_epi32(__m256d __a)2124*67e74705SXin Li _mm256_cvtpd_epi32(__m256d __a)
2125*67e74705SXin Li {
2126*67e74705SXin Li   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2127*67e74705SXin Li }
2128*67e74705SXin Li 
2129*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_cvttps_epi32(__m256 __a)2130*67e74705SXin Li _mm256_cvttps_epi32(__m256 __a)
2131*67e74705SXin Li {
2132*67e74705SXin Li   return (__m256i)__builtin_convertvector((__v8sf) __a, __v8si);
2133*67e74705SXin Li }
2134*67e74705SXin Li 
2135*67e74705SXin Li static __inline double __DEFAULT_FN_ATTRS
_mm256_cvtsd_f64(__m256d __a)2136*67e74705SXin Li _mm256_cvtsd_f64(__m256d __a)
2137*67e74705SXin Li {
2138*67e74705SXin Li  return __a[0];
2139*67e74705SXin Li }
2140*67e74705SXin Li 
2141*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_cvtsi256_si32(__m256i __a)2142*67e74705SXin Li _mm256_cvtsi256_si32(__m256i __a)
2143*67e74705SXin Li {
2144*67e74705SXin Li  __v8si __b = (__v8si)__a;
2145*67e74705SXin Li  return __b[0];
2146*67e74705SXin Li }
2147*67e74705SXin Li 
2148*67e74705SXin Li static __inline float __DEFAULT_FN_ATTRS
_mm256_cvtss_f32(__m256 __a)2149*67e74705SXin Li _mm256_cvtss_f32(__m256 __a)
2150*67e74705SXin Li {
2151*67e74705SXin Li  return __a[0];
2152*67e74705SXin Li }
2153*67e74705SXin Li 
2154*67e74705SXin Li /* Vector replicate */
2155*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_movehdup_ps(__m256 __a)2156*67e74705SXin Li _mm256_movehdup_ps(__m256 __a)
2157*67e74705SXin Li {
2158*67e74705SXin Li   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2159*67e74705SXin Li }
2160*67e74705SXin Li 
2161*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_moveldup_ps(__m256 __a)2162*67e74705SXin Li _mm256_moveldup_ps(__m256 __a)
2163*67e74705SXin Li {
2164*67e74705SXin Li   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2165*67e74705SXin Li }
2166*67e74705SXin Li 
2167*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_movedup_pd(__m256d __a)2168*67e74705SXin Li _mm256_movedup_pd(__m256d __a)
2169*67e74705SXin Li {
2170*67e74705SXin Li   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2171*67e74705SXin Li }
2172*67e74705SXin Li 
2173*67e74705SXin Li /* Unpack and Interleave */
2174*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_unpackhi_pd(__m256d __a,__m256d __b)2175*67e74705SXin Li _mm256_unpackhi_pd(__m256d __a, __m256d __b)
2176*67e74705SXin Li {
2177*67e74705SXin Li   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2178*67e74705SXin Li }
2179*67e74705SXin Li 
2180*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_unpacklo_pd(__m256d __a,__m256d __b)2181*67e74705SXin Li _mm256_unpacklo_pd(__m256d __a, __m256d __b)
2182*67e74705SXin Li {
2183*67e74705SXin Li   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2184*67e74705SXin Li }
2185*67e74705SXin Li 
2186*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_unpackhi_ps(__m256 __a,__m256 __b)2187*67e74705SXin Li _mm256_unpackhi_ps(__m256 __a, __m256 __b)
2188*67e74705SXin Li {
2189*67e74705SXin Li   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2190*67e74705SXin Li }
2191*67e74705SXin Li 
2192*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_unpacklo_ps(__m256 __a,__m256 __b)2193*67e74705SXin Li _mm256_unpacklo_ps(__m256 __a, __m256 __b)
2194*67e74705SXin Li {
2195*67e74705SXin Li   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2196*67e74705SXin Li }
2197*67e74705SXin Li 
2198*67e74705SXin Li /* Bit Test */
2199*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm_testz_pd(__m128d __a,__m128d __b)2200*67e74705SXin Li _mm_testz_pd(__m128d __a, __m128d __b)
2201*67e74705SXin Li {
2202*67e74705SXin Li   return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2203*67e74705SXin Li }
2204*67e74705SXin Li 
2205*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm_testc_pd(__m128d __a,__m128d __b)2206*67e74705SXin Li _mm_testc_pd(__m128d __a, __m128d __b)
2207*67e74705SXin Li {
2208*67e74705SXin Li   return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2209*67e74705SXin Li }
2210*67e74705SXin Li 
2211*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm_testnzc_pd(__m128d __a,__m128d __b)2212*67e74705SXin Li _mm_testnzc_pd(__m128d __a, __m128d __b)
2213*67e74705SXin Li {
2214*67e74705SXin Li   return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2215*67e74705SXin Li }
2216*67e74705SXin Li 
2217*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm_testz_ps(__m128 __a,__m128 __b)2218*67e74705SXin Li _mm_testz_ps(__m128 __a, __m128 __b)
2219*67e74705SXin Li {
2220*67e74705SXin Li   return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2221*67e74705SXin Li }
2222*67e74705SXin Li 
2223*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm_testc_ps(__m128 __a,__m128 __b)2224*67e74705SXin Li _mm_testc_ps(__m128 __a, __m128 __b)
2225*67e74705SXin Li {
2226*67e74705SXin Li   return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2227*67e74705SXin Li }
2228*67e74705SXin Li 
2229*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm_testnzc_ps(__m128 __a,__m128 __b)2230*67e74705SXin Li _mm_testnzc_ps(__m128 __a, __m128 __b)
2231*67e74705SXin Li {
2232*67e74705SXin Li   return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2233*67e74705SXin Li }
2234*67e74705SXin Li 
2235*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_testz_pd(__m256d __a,__m256d __b)2236*67e74705SXin Li _mm256_testz_pd(__m256d __a, __m256d __b)
2237*67e74705SXin Li {
2238*67e74705SXin Li   return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2239*67e74705SXin Li }
2240*67e74705SXin Li 
2241*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_testc_pd(__m256d __a,__m256d __b)2242*67e74705SXin Li _mm256_testc_pd(__m256d __a, __m256d __b)
2243*67e74705SXin Li {
2244*67e74705SXin Li   return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2245*67e74705SXin Li }
2246*67e74705SXin Li 
2247*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_testnzc_pd(__m256d __a,__m256d __b)2248*67e74705SXin Li _mm256_testnzc_pd(__m256d __a, __m256d __b)
2249*67e74705SXin Li {
2250*67e74705SXin Li   return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2251*67e74705SXin Li }
2252*67e74705SXin Li 
2253*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_testz_ps(__m256 __a,__m256 __b)2254*67e74705SXin Li _mm256_testz_ps(__m256 __a, __m256 __b)
2255*67e74705SXin Li {
2256*67e74705SXin Li   return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2257*67e74705SXin Li }
2258*67e74705SXin Li 
2259*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_testc_ps(__m256 __a,__m256 __b)2260*67e74705SXin Li _mm256_testc_ps(__m256 __a, __m256 __b)
2261*67e74705SXin Li {
2262*67e74705SXin Li   return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2263*67e74705SXin Li }
2264*67e74705SXin Li 
2265*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_testnzc_ps(__m256 __a,__m256 __b)2266*67e74705SXin Li _mm256_testnzc_ps(__m256 __a, __m256 __b)
2267*67e74705SXin Li {
2268*67e74705SXin Li   return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2269*67e74705SXin Li }
2270*67e74705SXin Li 
2271*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_testz_si256(__m256i __a,__m256i __b)2272*67e74705SXin Li _mm256_testz_si256(__m256i __a, __m256i __b)
2273*67e74705SXin Li {
2274*67e74705SXin Li   return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2275*67e74705SXin Li }
2276*67e74705SXin Li 
2277*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_testc_si256(__m256i __a,__m256i __b)2278*67e74705SXin Li _mm256_testc_si256(__m256i __a, __m256i __b)
2279*67e74705SXin Li {
2280*67e74705SXin Li   return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2281*67e74705SXin Li }
2282*67e74705SXin Li 
2283*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_testnzc_si256(__m256i __a,__m256i __b)2284*67e74705SXin Li _mm256_testnzc_si256(__m256i __a, __m256i __b)
2285*67e74705SXin Li {
2286*67e74705SXin Li   return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2287*67e74705SXin Li }
2288*67e74705SXin Li 
2289*67e74705SXin Li /* Vector extract sign mask */
2290*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_movemask_pd(__m256d __a)2291*67e74705SXin Li _mm256_movemask_pd(__m256d __a)
2292*67e74705SXin Li {
2293*67e74705SXin Li   return __builtin_ia32_movmskpd256((__v4df)__a);
2294*67e74705SXin Li }
2295*67e74705SXin Li 
2296*67e74705SXin Li static __inline int __DEFAULT_FN_ATTRS
_mm256_movemask_ps(__m256 __a)2297*67e74705SXin Li _mm256_movemask_ps(__m256 __a)
2298*67e74705SXin Li {
2299*67e74705SXin Li   return __builtin_ia32_movmskps256((__v8sf)__a);
2300*67e74705SXin Li }
2301*67e74705SXin Li 
/* Vector zero */
2303*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_zeroall(void)2304*67e74705SXin Li _mm256_zeroall(void)
2305*67e74705SXin Li {
2306*67e74705SXin Li   __builtin_ia32_vzeroall();
2307*67e74705SXin Li }
2308*67e74705SXin Li 
2309*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_zeroupper(void)2310*67e74705SXin Li _mm256_zeroupper(void)
2311*67e74705SXin Li {
2312*67e74705SXin Li   __builtin_ia32_vzeroupper();
2313*67e74705SXin Li }
2314*67e74705SXin Li 
2315*67e74705SXin Li /* Vector load with broadcast */
2316*67e74705SXin Li static __inline __m128 __DEFAULT_FN_ATTRS
_mm_broadcast_ss(float const * __a)2317*67e74705SXin Li _mm_broadcast_ss(float const *__a)
2318*67e74705SXin Li {
2319*67e74705SXin Li   float __f = *__a;
2320*67e74705SXin Li   return (__m128)(__v4sf){ __f, __f, __f, __f };
2321*67e74705SXin Li }
2322*67e74705SXin Li 
2323*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_broadcast_sd(double const * __a)2324*67e74705SXin Li _mm256_broadcast_sd(double const *__a)
2325*67e74705SXin Li {
2326*67e74705SXin Li   double __d = *__a;
2327*67e74705SXin Li   return (__m256d)(__v4df){ __d, __d, __d, __d };
2328*67e74705SXin Li }
2329*67e74705SXin Li 
2330*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_ss(float const * __a)2331*67e74705SXin Li _mm256_broadcast_ss(float const *__a)
2332*67e74705SXin Li {
2333*67e74705SXin Li   float __f = *__a;
2334*67e74705SXin Li   return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
2335*67e74705SXin Li }
2336*67e74705SXin Li 
2337*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_broadcast_pd(__m128d const * __a)2338*67e74705SXin Li _mm256_broadcast_pd(__m128d const *__a)
2339*67e74705SXin Li {
2340*67e74705SXin Li   return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
2341*67e74705SXin Li }
2342*67e74705SXin Li 
2343*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_ps(__m128 const * __a)2344*67e74705SXin Li _mm256_broadcast_ps(__m128 const *__a)
2345*67e74705SXin Li {
2346*67e74705SXin Li   return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
2347*67e74705SXin Li }
2348*67e74705SXin Li 
2349*67e74705SXin Li /* SIMD load ops */
2350*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_load_pd(double const * __p)2351*67e74705SXin Li _mm256_load_pd(double const *__p)
2352*67e74705SXin Li {
2353*67e74705SXin Li   return *(__m256d *)__p;
2354*67e74705SXin Li }
2355*67e74705SXin Li 
2356*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_load_ps(float const * __p)2357*67e74705SXin Li _mm256_load_ps(float const *__p)
2358*67e74705SXin Li {
2359*67e74705SXin Li   return *(__m256 *)__p;
2360*67e74705SXin Li }
2361*67e74705SXin Li 
2362*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_loadu_pd(double const * __p)2363*67e74705SXin Li _mm256_loadu_pd(double const *__p)
2364*67e74705SXin Li {
2365*67e74705SXin Li   struct __loadu_pd {
2366*67e74705SXin Li     __m256d __v;
2367*67e74705SXin Li   } __attribute__((__packed__, __may_alias__));
2368*67e74705SXin Li   return ((struct __loadu_pd*)__p)->__v;
2369*67e74705SXin Li }
2370*67e74705SXin Li 
2371*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_loadu_ps(float const * __p)2372*67e74705SXin Li _mm256_loadu_ps(float const *__p)
2373*67e74705SXin Li {
2374*67e74705SXin Li   struct __loadu_ps {
2375*67e74705SXin Li     __m256 __v;
2376*67e74705SXin Li   } __attribute__((__packed__, __may_alias__));
2377*67e74705SXin Li   return ((struct __loadu_ps*)__p)->__v;
2378*67e74705SXin Li }
2379*67e74705SXin Li 
2380*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_load_si256(__m256i const * __p)2381*67e74705SXin Li _mm256_load_si256(__m256i const *__p)
2382*67e74705SXin Li {
2383*67e74705SXin Li   return *__p;
2384*67e74705SXin Li }
2385*67e74705SXin Li 
2386*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu_si256(__m256i const * __p)2387*67e74705SXin Li _mm256_loadu_si256(__m256i const *__p)
2388*67e74705SXin Li {
2389*67e74705SXin Li   struct __loadu_si256 {
2390*67e74705SXin Li     __m256i __v;
2391*67e74705SXin Li   } __attribute__((__packed__, __may_alias__));
2392*67e74705SXin Li   return ((struct __loadu_si256*)__p)->__v;
2393*67e74705SXin Li }
2394*67e74705SXin Li 
2395*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_lddqu_si256(__m256i const * __p)2396*67e74705SXin Li _mm256_lddqu_si256(__m256i const *__p)
2397*67e74705SXin Li {
2398*67e74705SXin Li   return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
2399*67e74705SXin Li }
2400*67e74705SXin Li 
2401*67e74705SXin Li /* SIMD store ops */
2402*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_store_pd(double * __p,__m256d __a)2403*67e74705SXin Li _mm256_store_pd(double *__p, __m256d __a)
2404*67e74705SXin Li {
2405*67e74705SXin Li   *(__m256d *)__p = __a;
2406*67e74705SXin Li }
2407*67e74705SXin Li 
2408*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_store_ps(float * __p,__m256 __a)2409*67e74705SXin Li _mm256_store_ps(float *__p, __m256 __a)
2410*67e74705SXin Li {
2411*67e74705SXin Li   *(__m256 *)__p = __a;
2412*67e74705SXin Li }
2413*67e74705SXin Li 
2414*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_pd(double * __p,__m256d __a)2415*67e74705SXin Li _mm256_storeu_pd(double *__p, __m256d __a)
2416*67e74705SXin Li {
2417*67e74705SXin Li   struct __storeu_pd {
2418*67e74705SXin Li     __m256d __v;
2419*67e74705SXin Li   } __attribute__((__packed__, __may_alias__));
2420*67e74705SXin Li   ((struct __storeu_pd*)__p)->__v = __a;
2421*67e74705SXin Li }
2422*67e74705SXin Li 
2423*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_ps(float * __p,__m256 __a)2424*67e74705SXin Li _mm256_storeu_ps(float *__p, __m256 __a)
2425*67e74705SXin Li {
2426*67e74705SXin Li   struct __storeu_ps {
2427*67e74705SXin Li     __m256 __v;
2428*67e74705SXin Li   } __attribute__((__packed__, __may_alias__));
2429*67e74705SXin Li   ((struct __storeu_ps*)__p)->__v = __a;
2430*67e74705SXin Li }
2431*67e74705SXin Li 
2432*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_store_si256(__m256i * __p,__m256i __a)2433*67e74705SXin Li _mm256_store_si256(__m256i *__p, __m256i __a)
2434*67e74705SXin Li {
2435*67e74705SXin Li   *__p = __a;
2436*67e74705SXin Li }
2437*67e74705SXin Li 
2438*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_si256(__m256i * __p,__m256i __a)2439*67e74705SXin Li _mm256_storeu_si256(__m256i *__p, __m256i __a)
2440*67e74705SXin Li {
2441*67e74705SXin Li   struct __storeu_si256 {
2442*67e74705SXin Li     __m256i __v;
2443*67e74705SXin Li   } __attribute__((__packed__, __may_alias__));
2444*67e74705SXin Li   ((struct __storeu_si256*)__p)->__v = __a;
2445*67e74705SXin Li }
2446*67e74705SXin Li 
2447*67e74705SXin Li /* Conditional load ops */
2448*67e74705SXin Li static __inline __m128d __DEFAULT_FN_ATTRS
_mm_maskload_pd(double const * __p,__m128i __m)2449*67e74705SXin Li _mm_maskload_pd(double const *__p, __m128i __m)
2450*67e74705SXin Li {
2451*67e74705SXin Li   return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
2452*67e74705SXin Li }
2453*67e74705SXin Li 
2454*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_maskload_pd(double const * __p,__m256i __m)2455*67e74705SXin Li _mm256_maskload_pd(double const *__p, __m256i __m)
2456*67e74705SXin Li {
2457*67e74705SXin Li   return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
2458*67e74705SXin Li                                                (__v4di)__m);
2459*67e74705SXin Li }
2460*67e74705SXin Li 
2461*67e74705SXin Li static __inline __m128 __DEFAULT_FN_ATTRS
_mm_maskload_ps(float const * __p,__m128i __m)2462*67e74705SXin Li _mm_maskload_ps(float const *__p, __m128i __m)
2463*67e74705SXin Li {
2464*67e74705SXin Li   return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
2465*67e74705SXin Li }
2466*67e74705SXin Li 
2467*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_maskload_ps(float const * __p,__m256i __m)2468*67e74705SXin Li _mm256_maskload_ps(float const *__p, __m256i __m)
2469*67e74705SXin Li {
2470*67e74705SXin Li   return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
2471*67e74705SXin Li }
2472*67e74705SXin Li 
2473*67e74705SXin Li /* Conditional store ops */
2474*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_maskstore_ps(float * __p,__m256i __m,__m256 __a)2475*67e74705SXin Li _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
2476*67e74705SXin Li {
2477*67e74705SXin Li   __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
2478*67e74705SXin Li }
2479*67e74705SXin Li 
2480*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm_maskstore_pd(double * __p,__m128i __m,__m128d __a)2481*67e74705SXin Li _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
2482*67e74705SXin Li {
2483*67e74705SXin Li   __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
2484*67e74705SXin Li }
2485*67e74705SXin Li 
2486*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_maskstore_pd(double * __p,__m256i __m,__m256d __a)2487*67e74705SXin Li _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
2488*67e74705SXin Li {
2489*67e74705SXin Li   __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
2490*67e74705SXin Li }
2491*67e74705SXin Li 
2492*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm_maskstore_ps(float * __p,__m128i __m,__m128 __a)2493*67e74705SXin Li _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
2494*67e74705SXin Li {
2495*67e74705SXin Li   __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
2496*67e74705SXin Li }
2497*67e74705SXin Li 
2498*67e74705SXin Li /* Cacheability support ops */
2499*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_si256(__m256i * __a,__m256i __b)2500*67e74705SXin Li _mm256_stream_si256(__m256i *__a, __m256i __b)
2501*67e74705SXin Li {
2502*67e74705SXin Li   __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
2503*67e74705SXin Li }
2504*67e74705SXin Li 
2505*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_pd(double * __a,__m256d __b)2506*67e74705SXin Li _mm256_stream_pd(double *__a, __m256d __b)
2507*67e74705SXin Li {
2508*67e74705SXin Li   __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
2509*67e74705SXin Li }
2510*67e74705SXin Li 
2511*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_ps(float * __p,__m256 __a)2512*67e74705SXin Li _mm256_stream_ps(float *__p, __m256 __a)
2513*67e74705SXin Li {
2514*67e74705SXin Li   __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
2515*67e74705SXin Li }
2516*67e74705SXin Li 
2517*67e74705SXin Li /* Create vectors */
2518*67e74705SXin Li static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_undefined_pd(void)2519*67e74705SXin Li _mm256_undefined_pd(void)
2520*67e74705SXin Li {
2521*67e74705SXin Li   return (__m256d)__builtin_ia32_undef256();
2522*67e74705SXin Li }
2523*67e74705SXin Li 
2524*67e74705SXin Li static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_undefined_ps(void)2525*67e74705SXin Li _mm256_undefined_ps(void)
2526*67e74705SXin Li {
2527*67e74705SXin Li   return (__m256)__builtin_ia32_undef256();
2528*67e74705SXin Li }
2529*67e74705SXin Li 
2530*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_undefined_si256(void)2531*67e74705SXin Li _mm256_undefined_si256(void)
2532*67e74705SXin Li {
2533*67e74705SXin Li   return (__m256i)__builtin_ia32_undef256();
2534*67e74705SXin Li }
2535*67e74705SXin Li 
2536*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_set_pd(double __a,double __b,double __c,double __d)2537*67e74705SXin Li _mm256_set_pd(double __a, double __b, double __c, double __d)
2538*67e74705SXin Li {
2539*67e74705SXin Li   return (__m256d){ __d, __c, __b, __a };
2540*67e74705SXin Li }
2541*67e74705SXin Li 
2542*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_set_ps(float __a,float __b,float __c,float __d,float __e,float __f,float __g,float __h)2543*67e74705SXin Li _mm256_set_ps(float __a, float __b, float __c, float __d,
2544*67e74705SXin Li               float __e, float __f, float __g, float __h)
2545*67e74705SXin Li {
2546*67e74705SXin Li   return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
2547*67e74705SXin Li }
2548*67e74705SXin Li 
2549*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi32(int __i0,int __i1,int __i2,int __i3,int __i4,int __i5,int __i6,int __i7)2550*67e74705SXin Li _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
2551*67e74705SXin Li                  int __i4, int __i5, int __i6, int __i7)
2552*67e74705SXin Li {
2553*67e74705SXin Li   return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
2554*67e74705SXin Li }
2555*67e74705SXin Li 
2556*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi16(short __w15,short __w14,short __w13,short __w12,short __w11,short __w10,short __w09,short __w08,short __w07,short __w06,short __w05,short __w04,short __w03,short __w02,short __w01,short __w00)2557*67e74705SXin Li _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
2558*67e74705SXin Li                  short __w11, short __w10, short __w09, short __w08,
2559*67e74705SXin Li                  short __w07, short __w06, short __w05, short __w04,
2560*67e74705SXin Li                  short __w03, short __w02, short __w01, short __w00)
2561*67e74705SXin Li {
2562*67e74705SXin Li   return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
2563*67e74705SXin Li     __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
2564*67e74705SXin Li }
2565*67e74705SXin Li 
2566*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi8(char __b31,char __b30,char __b29,char __b28,char __b27,char __b26,char __b25,char __b24,char __b23,char __b22,char __b21,char __b20,char __b19,char __b18,char __b17,char __b16,char __b15,char __b14,char __b13,char __b12,char __b11,char __b10,char __b09,char __b08,char __b07,char __b06,char __b05,char __b04,char __b03,char __b02,char __b01,char __b00)2567*67e74705SXin Li _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
2568*67e74705SXin Li                 char __b27, char __b26, char __b25, char __b24,
2569*67e74705SXin Li                 char __b23, char __b22, char __b21, char __b20,
2570*67e74705SXin Li                 char __b19, char __b18, char __b17, char __b16,
2571*67e74705SXin Li                 char __b15, char __b14, char __b13, char __b12,
2572*67e74705SXin Li                 char __b11, char __b10, char __b09, char __b08,
2573*67e74705SXin Li                 char __b07, char __b06, char __b05, char __b04,
2574*67e74705SXin Li                 char __b03, char __b02, char __b01, char __b00)
2575*67e74705SXin Li {
2576*67e74705SXin Li   return (__m256i)(__v32qi){
2577*67e74705SXin Li     __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
2578*67e74705SXin Li     __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
2579*67e74705SXin Li     __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
2580*67e74705SXin Li     __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
2581*67e74705SXin Li   };
2582*67e74705SXin Li }
2583*67e74705SXin Li 
2584*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi64x(long long __a,long long __b,long long __c,long long __d)2585*67e74705SXin Li _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
2586*67e74705SXin Li {
2587*67e74705SXin Li   return (__m256i)(__v4di){ __d, __c, __b, __a };
2588*67e74705SXin Li }
2589*67e74705SXin Li 
2590*67e74705SXin Li /* Create vectors with elements in reverse order */
2591*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setr_pd(double __a,double __b,double __c,double __d)2592*67e74705SXin Li _mm256_setr_pd(double __a, double __b, double __c, double __d)
2593*67e74705SXin Li {
2594*67e74705SXin Li   return (__m256d){ __a, __b, __c, __d };
2595*67e74705SXin Li }
2596*67e74705SXin Li 
2597*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setr_ps(float __a,float __b,float __c,float __d,float __e,float __f,float __g,float __h)2598*67e74705SXin Li _mm256_setr_ps(float __a, float __b, float __c, float __d,
2599*67e74705SXin Li                float __e, float __f, float __g, float __h)
2600*67e74705SXin Li {
2601*67e74705SXin Li   return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
2602*67e74705SXin Li }
2603*67e74705SXin Li 
2604*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi32(int __i0,int __i1,int __i2,int __i3,int __i4,int __i5,int __i6,int __i7)2605*67e74705SXin Li _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
2606*67e74705SXin Li                   int __i4, int __i5, int __i6, int __i7)
2607*67e74705SXin Li {
2608*67e74705SXin Li   return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
2609*67e74705SXin Li }
2610*67e74705SXin Li 
2611*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi16(short __w15,short __w14,short __w13,short __w12,short __w11,short __w10,short __w09,short __w08,short __w07,short __w06,short __w05,short __w04,short __w03,short __w02,short __w01,short __w00)2612*67e74705SXin Li _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
2613*67e74705SXin Li        short __w11, short __w10, short __w09, short __w08,
2614*67e74705SXin Li        short __w07, short __w06, short __w05, short __w04,
2615*67e74705SXin Li        short __w03, short __w02, short __w01, short __w00)
2616*67e74705SXin Li {
2617*67e74705SXin Li   return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
2618*67e74705SXin Li     __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
2619*67e74705SXin Li }
2620*67e74705SXin Li 
2621*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi8(char __b31,char __b30,char __b29,char __b28,char __b27,char __b26,char __b25,char __b24,char __b23,char __b22,char __b21,char __b20,char __b19,char __b18,char __b17,char __b16,char __b15,char __b14,char __b13,char __b12,char __b11,char __b10,char __b09,char __b08,char __b07,char __b06,char __b05,char __b04,char __b03,char __b02,char __b01,char __b00)2622*67e74705SXin Li _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
2623*67e74705SXin Li                  char __b27, char __b26, char __b25, char __b24,
2624*67e74705SXin Li                  char __b23, char __b22, char __b21, char __b20,
2625*67e74705SXin Li                  char __b19, char __b18, char __b17, char __b16,
2626*67e74705SXin Li                  char __b15, char __b14, char __b13, char __b12,
2627*67e74705SXin Li                  char __b11, char __b10, char __b09, char __b08,
2628*67e74705SXin Li                  char __b07, char __b06, char __b05, char __b04,
2629*67e74705SXin Li                  char __b03, char __b02, char __b01, char __b00)
2630*67e74705SXin Li {
2631*67e74705SXin Li   return (__m256i)(__v32qi){
2632*67e74705SXin Li     __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
2633*67e74705SXin Li     __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
2634*67e74705SXin Li     __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
2635*67e74705SXin Li     __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
2636*67e74705SXin Li }
2637*67e74705SXin Li 
2638*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi64x(long long __a,long long __b,long long __c,long long __d)2639*67e74705SXin Li _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
2640*67e74705SXin Li {
2641*67e74705SXin Li   return (__m256i)(__v4di){ __a, __b, __c, __d };
2642*67e74705SXin Li }
2643*67e74705SXin Li 
2644*67e74705SXin Li /* Create vectors with repeated elements */
2645*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_set1_pd(double __w)2646*67e74705SXin Li _mm256_set1_pd(double __w)
2647*67e74705SXin Li {
2648*67e74705SXin Li   return (__m256d){ __w, __w, __w, __w };
2649*67e74705SXin Li }
2650*67e74705SXin Li 
2651*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_set1_ps(float __w)2652*67e74705SXin Li _mm256_set1_ps(float __w)
2653*67e74705SXin Li {
2654*67e74705SXin Li   return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
2655*67e74705SXin Li }
2656*67e74705SXin Li 
2657*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set1_epi32(int __i)2658*67e74705SXin Li _mm256_set1_epi32(int __i)
2659*67e74705SXin Li {
2660*67e74705SXin Li   return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
2661*67e74705SXin Li }
2662*67e74705SXin Li 
2663*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set1_epi16(short __w)2664*67e74705SXin Li _mm256_set1_epi16(short __w)
2665*67e74705SXin Li {
2666*67e74705SXin Li   return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
2667*67e74705SXin Li     __w, __w, __w, __w, __w, __w };
2668*67e74705SXin Li }
2669*67e74705SXin Li 
2670*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set1_epi8(char __b)2671*67e74705SXin Li _mm256_set1_epi8(char __b)
2672*67e74705SXin Li {
2673*67e74705SXin Li   return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
2674*67e74705SXin Li     __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
2675*67e74705SXin Li     __b, __b, __b, __b, __b, __b, __b };
2676*67e74705SXin Li }
2677*67e74705SXin Li 
2678*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set1_epi64x(long long __q)2679*67e74705SXin Li _mm256_set1_epi64x(long long __q)
2680*67e74705SXin Li {
2681*67e74705SXin Li   return (__m256i)(__v4di){ __q, __q, __q, __q };
2682*67e74705SXin Li }
2683*67e74705SXin Li 
/* Create zeroed vectors */
2685*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setzero_pd(void)2686*67e74705SXin Li _mm256_setzero_pd(void)
2687*67e74705SXin Li {
2688*67e74705SXin Li   return (__m256d){ 0, 0, 0, 0 };
2689*67e74705SXin Li }
2690*67e74705SXin Li 
2691*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setzero_ps(void)2692*67e74705SXin Li _mm256_setzero_ps(void)
2693*67e74705SXin Li {
2694*67e74705SXin Li   return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
2695*67e74705SXin Li }
2696*67e74705SXin Li 
2697*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setzero_si256(void)2698*67e74705SXin Li _mm256_setzero_si256(void)
2699*67e74705SXin Li {
2700*67e74705SXin Li   return (__m256i){ 0LL, 0LL, 0LL, 0LL };
2701*67e74705SXin Li }
2702*67e74705SXin Li 
2703*67e74705SXin Li /* Cast between vector types */
2704*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_castpd_ps(__m256d __a)2705*67e74705SXin Li _mm256_castpd_ps(__m256d __a)
2706*67e74705SXin Li {
2707*67e74705SXin Li   return (__m256)__a;
2708*67e74705SXin Li }
2709*67e74705SXin Li 
2710*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_castpd_si256(__m256d __a)2711*67e74705SXin Li _mm256_castpd_si256(__m256d __a)
2712*67e74705SXin Li {
2713*67e74705SXin Li   return (__m256i)__a;
2714*67e74705SXin Li }
2715*67e74705SXin Li 
2716*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_castps_pd(__m256 __a)2717*67e74705SXin Li _mm256_castps_pd(__m256 __a)
2718*67e74705SXin Li {
2719*67e74705SXin Li   return (__m256d)__a;
2720*67e74705SXin Li }
2721*67e74705SXin Li 
2722*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_castps_si256(__m256 __a)2723*67e74705SXin Li _mm256_castps_si256(__m256 __a)
2724*67e74705SXin Li {
2725*67e74705SXin Li   return (__m256i)__a;
2726*67e74705SXin Li }
2727*67e74705SXin Li 
2728*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_castsi256_ps(__m256i __a)2729*67e74705SXin Li _mm256_castsi256_ps(__m256i __a)
2730*67e74705SXin Li {
2731*67e74705SXin Li   return (__m256)__a;
2732*67e74705SXin Li }
2733*67e74705SXin Li 
2734*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_castsi256_pd(__m256i __a)2735*67e74705SXin Li _mm256_castsi256_pd(__m256i __a)
2736*67e74705SXin Li {
2737*67e74705SXin Li   return (__m256d)__a;
2738*67e74705SXin Li }
2739*67e74705SXin Li 
2740*67e74705SXin Li static __inline __m128d __DEFAULT_FN_ATTRS
_mm256_castpd256_pd128(__m256d __a)2741*67e74705SXin Li _mm256_castpd256_pd128(__m256d __a)
2742*67e74705SXin Li {
2743*67e74705SXin Li   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
2744*67e74705SXin Li }
2745*67e74705SXin Li 
2746*67e74705SXin Li static __inline __m128 __DEFAULT_FN_ATTRS
_mm256_castps256_ps128(__m256 __a)2747*67e74705SXin Li _mm256_castps256_ps128(__m256 __a)
2748*67e74705SXin Li {
2749*67e74705SXin Li   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
2750*67e74705SXin Li }
2751*67e74705SXin Li 
2752*67e74705SXin Li static __inline __m128i __DEFAULT_FN_ATTRS
_mm256_castsi256_si128(__m256i __a)2753*67e74705SXin Li _mm256_castsi256_si128(__m256i __a)
2754*67e74705SXin Li {
2755*67e74705SXin Li   return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
2756*67e74705SXin Li }
2757*67e74705SXin Li 
2758*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_castpd128_pd256(__m128d __a)2759*67e74705SXin Li _mm256_castpd128_pd256(__m128d __a)
2760*67e74705SXin Li {
2761*67e74705SXin Li   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
2762*67e74705SXin Li }
2763*67e74705SXin Li 
2764*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_castps128_ps256(__m128 __a)2765*67e74705SXin Li _mm256_castps128_ps256(__m128 __a)
2766*67e74705SXin Li {
2767*67e74705SXin Li   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
2768*67e74705SXin Li }
2769*67e74705SXin Li 
2770*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_castsi128_si256(__m128i __a)2771*67e74705SXin Li _mm256_castsi128_si256(__m128i __a)
2772*67e74705SXin Li {
2773*67e74705SXin Li   return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
2774*67e74705SXin Li }
2775*67e74705SXin Li 
2776*67e74705SXin Li /*
2777*67e74705SXin Li    Vector insert.
2778*67e74705SXin Li    We use macros rather than inlines because we only want to accept
2779*67e74705SXin Li    invocations where the immediate M is a constant expression.
2780*67e74705SXin Li */
/* Replace one 128-bit half of V1 (eight floats) with V2 (four floats).
   Only bit 0 of M is examined: when it is clear V2 lands in elements 0-3
   and V1 supplies elements 4-7; when it is set V1 supplies elements 0-3
   and V2 lands in elements 4-7.  Shuffle indices 0-7 name V1's elements,
   indices 8-11 name the four defined elements of the widened V2. */
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    ((((M) & 1) == 0) ?  8 :  0), \
    ((((M) & 1) == 0) ?  9 :  1), \
    ((((M) & 1) == 0) ? 10 :  2), \
    ((((M) & 1) == 0) ? 11 :  3), \
    ((((M) & 1) == 0) ?  4 :  8), \
    ((((M) & 1) == 0) ?  5 :  9), \
    ((((M) & 1) == 0) ?  6 : 10), \
    ((((M) & 1) == 0) ?  7 : 11) );})
2793*67e74705SXin Li 
/* Replace one 128-bit half of V1 (four doubles) with V2 (two doubles).
   Only bit 0 of M is examined: when it is clear V2 lands in elements 0-1
   and V1 supplies elements 2-3; when it is set V1 supplies elements 0-1
   and V2 lands in elements 2-3.  Shuffle indices 0-3 name V1's elements,
   indices 4-5 name the two defined elements of the widened V2. */
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    ((((M) & 1) == 0) ? 4 : 0), \
    ((((M) & 1) == 0) ? 5 : 1), \
    ((((M) & 1) == 0) ? 2 : 4), \
    ((((M) & 1) == 0) ? 3 : 5) );})
2802*67e74705SXin Li 
/* Replace one 128-bit half of the integer vector V1 with V2.  Only bit 0
   of M is examined: when it is clear V2 lands in 64-bit elements 0-1 and
   V1 supplies elements 2-3; when it is set V1 supplies elements 0-1 and
   V2 lands in elements 2-3.  Shuffle indices 0-3 name V1's elements,
   indices 4-5 name the two defined elements of the widened V2. */
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    ((((M) & 1) == 0) ? 4 : 0), \
    ((((M) & 1) == 0) ? 5 : 1), \
    ((((M) & 1) == 0) ? 2 : 4), \
    ((((M) & 1) == 0) ? 3 : 5) );})
2811*67e74705SXin Li 
2812*67e74705SXin Li /*
2813*67e74705SXin Li    Vector extract.
2814*67e74705SXin Li    We use macros rather than inlines because we only want to accept
2815*67e74705SXin Li    invocations where the immediate M is a constant expression.
2816*67e74705SXin Li */
/* Extract one 128-bit half of V (eight floats) as four floats.  Only bit 0
   of M is examined: clear selects elements 0-3, set selects elements 4-7.
   The second shuffle operand is an undefined vector; every index used is
   below 8, so it is never selected. */
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    ((((M) & 1) == 0) ? 0 : 4), \
    ((((M) & 1) == 0) ? 1 : 5), \
    ((((M) & 1) == 0) ? 2 : 6), \
    ((((M) & 1) == 0) ? 3 : 7) );})
2825*67e74705SXin Li 
/* Extract one 128-bit half of V (four doubles) as two doubles.  Only bit 0
   of M is examined: clear selects elements 0-1, set selects elements 2-3.
   The second shuffle operand is an undefined vector; every index used is
   below 4, so it is never selected. */
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    ((((M) & 1) == 0) ? 0 : 2), \
    ((((M) & 1) == 0) ? 1 : 3) );})
2832*67e74705SXin Li 
/* Extract one 128-bit half of the integer vector V.  Only bit 0 of M is
   examined: clear selects 64-bit elements 0-1, set selects elements 2-3.
   The second shuffle operand is an undefined vector; every index used is
   below 4, so it is never selected. */
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    ((((M) & 1) == 0) ? 0 : 2), \
    ((((M) & 1) == 0) ? 1 : 3) );})
2839*67e74705SXin Li 
2840*67e74705SXin Li /* SIMD load ops (unaligned) */
2841*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_loadu2_m128(float const * __addr_hi,float const * __addr_lo)2842*67e74705SXin Li _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
2843*67e74705SXin Li {
2844*67e74705SXin Li   __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
2845*67e74705SXin Li   return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
2846*67e74705SXin Li }
2847*67e74705SXin Li 
2848*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_loadu2_m128d(double const * __addr_hi,double const * __addr_lo)2849*67e74705SXin Li _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
2850*67e74705SXin Li {
2851*67e74705SXin Li   __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
2852*67e74705SXin Li   return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
2853*67e74705SXin Li }
2854*67e74705SXin Li 
2855*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu2_m128i(__m128i const * __addr_hi,__m128i const * __addr_lo)2856*67e74705SXin Li _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
2857*67e74705SXin Li {
2858*67e74705SXin Li   __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
2859*67e74705SXin Li   return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
2860*67e74705SXin Li }
2861*67e74705SXin Li 
2862*67e74705SXin Li /* SIMD store ops (unaligned) */
2863*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128(float * __addr_hi,float * __addr_lo,__m256 __a)2864*67e74705SXin Li _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
2865*67e74705SXin Li {
2866*67e74705SXin Li   __m128 __v128;
2867*67e74705SXin Li 
2868*67e74705SXin Li   __v128 = _mm256_castps256_ps128(__a);
2869*67e74705SXin Li   _mm_storeu_ps(__addr_lo, __v128);
2870*67e74705SXin Li   __v128 = _mm256_extractf128_ps(__a, 1);
2871*67e74705SXin Li   _mm_storeu_ps(__addr_hi, __v128);
2872*67e74705SXin Li }
2873*67e74705SXin Li 
2874*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128d(double * __addr_hi,double * __addr_lo,__m256d __a)2875*67e74705SXin Li _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
2876*67e74705SXin Li {
2877*67e74705SXin Li   __m128d __v128;
2878*67e74705SXin Li 
2879*67e74705SXin Li   __v128 = _mm256_castpd256_pd128(__a);
2880*67e74705SXin Li   _mm_storeu_pd(__addr_lo, __v128);
2881*67e74705SXin Li   __v128 = _mm256_extractf128_pd(__a, 1);
2882*67e74705SXin Li   _mm_storeu_pd(__addr_hi, __v128);
2883*67e74705SXin Li }
2884*67e74705SXin Li 
2885*67e74705SXin Li static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128i(__m128i * __addr_hi,__m128i * __addr_lo,__m256i __a)2886*67e74705SXin Li _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
2887*67e74705SXin Li {
2888*67e74705SXin Li   __m128i __v128;
2889*67e74705SXin Li 
2890*67e74705SXin Li   __v128 = _mm256_castsi256_si128(__a);
2891*67e74705SXin Li   _mm_storeu_si128(__addr_lo, __v128);
2892*67e74705SXin Li   __v128 = _mm256_extractf128_si256(__a, 1);
2893*67e74705SXin Li   _mm_storeu_si128(__addr_hi, __v128);
2894*67e74705SXin Li }
2895*67e74705SXin Li 
2896*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_set_m128(__m128 __hi,__m128 __lo)2897*67e74705SXin Li _mm256_set_m128 (__m128 __hi, __m128 __lo) {
2898*67e74705SXin Li   return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
2899*67e74705SXin Li }
2900*67e74705SXin Li 
2901*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_set_m128d(__m128d __hi,__m128d __lo)2902*67e74705SXin Li _mm256_set_m128d (__m128d __hi, __m128d __lo) {
2903*67e74705SXin Li   return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
2904*67e74705SXin Li }
2905*67e74705SXin Li 
2906*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_m128i(__m128i __hi,__m128i __lo)2907*67e74705SXin Li _mm256_set_m128i (__m128i __hi, __m128i __lo) {
2908*67e74705SXin Li   return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
2909*67e74705SXin Li }
2910*67e74705SXin Li 
2911*67e74705SXin Li static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setr_m128(__m128 __lo,__m128 __hi)2912*67e74705SXin Li _mm256_setr_m128 (__m128 __lo, __m128 __hi) {
2913*67e74705SXin Li   return _mm256_set_m128(__hi, __lo);
2914*67e74705SXin Li }
2915*67e74705SXin Li 
2916*67e74705SXin Li static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setr_m128d(__m128d __lo,__m128d __hi)2917*67e74705SXin Li _mm256_setr_m128d (__m128d __lo, __m128d __hi) {
2918*67e74705SXin Li   return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
2919*67e74705SXin Li }
2920*67e74705SXin Li 
2921*67e74705SXin Li static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_m128i(__m128i __lo,__m128i __hi)2922*67e74705SXin Li _mm256_setr_m128i (__m128i __lo, __m128i __hi) {
2923*67e74705SXin Li   return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
2924*67e74705SXin Li }
2925*67e74705SXin Li 
2926*67e74705SXin Li #undef __DEFAULT_FN_ATTRS
2927*67e74705SXin Li 
2928*67e74705SXin Li #endif /* __AVXINTRIN_H */
2929