/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ---------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23*67e74705SXin Li #ifndef __IMMINTRIN_H
24*67e74705SXin Li #error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
25*67e74705SXin Li #endif
26*67e74705SXin Li
27*67e74705SXin Li #ifndef __AVX512VLCDINTRIN_H
28*67e74705SXin Li #define __AVX512VLCDINTRIN_H
29*67e74705SXin Li
/* Attributes shared by every intrinsic defined in this file: always inline,
   emit no debug info, and build the body with the avx512vl and avx512cd
   target features enabled. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512cd")))
32*67e74705SXin Li
33*67e74705SXin Li
34*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcastmb_epi64(__mmask8 __A)35*67e74705SXin Li _mm_broadcastmb_epi64 (__mmask8 __A)
36*67e74705SXin Li {
37*67e74705SXin Li return (__m128i) __builtin_ia32_broadcastmb128 (__A);
38*67e74705SXin Li }
39*67e74705SXin Li
40*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastmb_epi64(__mmask8 __A)41*67e74705SXin Li _mm256_broadcastmb_epi64 (__mmask8 __A)
42*67e74705SXin Li {
43*67e74705SXin Li return (__m256i) __builtin_ia32_broadcastmb256 (__A);
44*67e74705SXin Li }
45*67e74705SXin Li
46*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcastmw_epi32(__mmask16 __A)47*67e74705SXin Li _mm_broadcastmw_epi32 (__mmask16 __A)
48*67e74705SXin Li {
49*67e74705SXin Li return (__m128i) __builtin_ia32_broadcastmw128 (__A);
50*67e74705SXin Li }
51*67e74705SXin Li
52*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastmw_epi32(__mmask16 __A)53*67e74705SXin Li _mm256_broadcastmw_epi32 (__mmask16 __A)
54*67e74705SXin Li {
55*67e74705SXin Li return (__m256i) __builtin_ia32_broadcastmw256 (__A);
56*67e74705SXin Li }
57*67e74705SXin Li
58*67e74705SXin Li
59*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_conflict_epi64(__m128i __A)60*67e74705SXin Li _mm_conflict_epi64 (__m128i __A)
61*67e74705SXin Li {
62*67e74705SXin Li return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
63*67e74705SXin Li (__v2di) _mm_undefined_si128 (),
64*67e74705SXin Li (__mmask8) -1);
65*67e74705SXin Li }
66*67e74705SXin Li
67*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_conflict_epi64(__m128i __W,__mmask8 __U,__m128i __A)68*67e74705SXin Li _mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
69*67e74705SXin Li {
70*67e74705SXin Li return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
71*67e74705SXin Li (__v2di) __W,
72*67e74705SXin Li (__mmask8) __U);
73*67e74705SXin Li }
74*67e74705SXin Li
75*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_conflict_epi64(__mmask8 __U,__m128i __A)76*67e74705SXin Li _mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
77*67e74705SXin Li {
78*67e74705SXin Li return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
79*67e74705SXin Li (__v2di)
80*67e74705SXin Li _mm_setzero_di (),
81*67e74705SXin Li (__mmask8) __U);
82*67e74705SXin Li }
83*67e74705SXin Li
84*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_conflict_epi64(__m256i __A)85*67e74705SXin Li _mm256_conflict_epi64 (__m256i __A)
86*67e74705SXin Li {
87*67e74705SXin Li return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
88*67e74705SXin Li (__v4di) _mm256_undefined_si256 (),
89*67e74705SXin Li (__mmask8) -1);
90*67e74705SXin Li }
91*67e74705SXin Li
92*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_conflict_epi64(__m256i __W,__mmask8 __U,__m256i __A)93*67e74705SXin Li _mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
94*67e74705SXin Li {
95*67e74705SXin Li return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
96*67e74705SXin Li (__v4di) __W,
97*67e74705SXin Li (__mmask8) __U);
98*67e74705SXin Li }
99*67e74705SXin Li
100*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_conflict_epi64(__mmask8 __U,__m256i __A)101*67e74705SXin Li _mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
102*67e74705SXin Li {
103*67e74705SXin Li return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
104*67e74705SXin Li (__v4di) _mm256_setzero_si256 (),
105*67e74705SXin Li (__mmask8) __U);
106*67e74705SXin Li }
107*67e74705SXin Li
108*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_conflict_epi32(__m128i __A)109*67e74705SXin Li _mm_conflict_epi32 (__m128i __A)
110*67e74705SXin Li {
111*67e74705SXin Li return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
112*67e74705SXin Li (__v4si) _mm_undefined_si128 (),
113*67e74705SXin Li (__mmask8) -1);
114*67e74705SXin Li }
115*67e74705SXin Li
116*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_conflict_epi32(__m128i __W,__mmask8 __U,__m128i __A)117*67e74705SXin Li _mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
118*67e74705SXin Li {
119*67e74705SXin Li return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
120*67e74705SXin Li (__v4si) __W,
121*67e74705SXin Li (__mmask8) __U);
122*67e74705SXin Li }
123*67e74705SXin Li
124*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_conflict_epi32(__mmask8 __U,__m128i __A)125*67e74705SXin Li _mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
126*67e74705SXin Li {
127*67e74705SXin Li return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
128*67e74705SXin Li (__v4si) _mm_setzero_si128 (),
129*67e74705SXin Li (__mmask8) __U);
130*67e74705SXin Li }
131*67e74705SXin Li
132*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_conflict_epi32(__m256i __A)133*67e74705SXin Li _mm256_conflict_epi32 (__m256i __A)
134*67e74705SXin Li {
135*67e74705SXin Li return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
136*67e74705SXin Li (__v8si) _mm256_undefined_si256 (),
137*67e74705SXin Li (__mmask8) -1);
138*67e74705SXin Li }
139*67e74705SXin Li
140*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_conflict_epi32(__m256i __W,__mmask8 __U,__m256i __A)141*67e74705SXin Li _mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
142*67e74705SXin Li {
143*67e74705SXin Li return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
144*67e74705SXin Li (__v8si) __W,
145*67e74705SXin Li (__mmask8) __U);
146*67e74705SXin Li }
147*67e74705SXin Li
148*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_conflict_epi32(__mmask8 __U,__m256i __A)149*67e74705SXin Li _mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
150*67e74705SXin Li {
151*67e74705SXin Li return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
152*67e74705SXin Li (__v8si)
153*67e74705SXin Li _mm256_setzero_si256 (),
154*67e74705SXin Li (__mmask8) __U);
155*67e74705SXin Li }
156*67e74705SXin Li
157*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lzcnt_epi32(__m128i __A)158*67e74705SXin Li _mm_lzcnt_epi32 (__m128i __A)
159*67e74705SXin Li {
160*67e74705SXin Li return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
161*67e74705SXin Li (__v4si)
162*67e74705SXin Li _mm_setzero_si128 (),
163*67e74705SXin Li (__mmask8) -1);
164*67e74705SXin Li }
165*67e74705SXin Li
166*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_lzcnt_epi32(__m128i __W,__mmask8 __U,__m128i __A)167*67e74705SXin Li _mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
168*67e74705SXin Li {
169*67e74705SXin Li return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
170*67e74705SXin Li (__v4si) __W,
171*67e74705SXin Li (__mmask8) __U);
172*67e74705SXin Li }
173*67e74705SXin Li
174*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_lzcnt_epi32(__mmask8 __U,__m128i __A)175*67e74705SXin Li _mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
176*67e74705SXin Li {
177*67e74705SXin Li return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
178*67e74705SXin Li (__v4si)
179*67e74705SXin Li _mm_setzero_si128 (),
180*67e74705SXin Li (__mmask8) __U);
181*67e74705SXin Li }
182*67e74705SXin Li
183*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_lzcnt_epi32(__m256i __A)184*67e74705SXin Li _mm256_lzcnt_epi32 (__m256i __A)
185*67e74705SXin Li {
186*67e74705SXin Li return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
187*67e74705SXin Li (__v8si)
188*67e74705SXin Li _mm256_setzero_si256 (),
189*67e74705SXin Li (__mmask8) -1);
190*67e74705SXin Li }
191*67e74705SXin Li
192*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_lzcnt_epi32(__m256i __W,__mmask8 __U,__m256i __A)193*67e74705SXin Li _mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
194*67e74705SXin Li {
195*67e74705SXin Li return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
196*67e74705SXin Li (__v8si) __W,
197*67e74705SXin Li (__mmask8) __U);
198*67e74705SXin Li }
199*67e74705SXin Li
200*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_lzcnt_epi32(__mmask8 __U,__m256i __A)201*67e74705SXin Li _mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
202*67e74705SXin Li {
203*67e74705SXin Li return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
204*67e74705SXin Li (__v8si)
205*67e74705SXin Li _mm256_setzero_si256 (),
206*67e74705SXin Li (__mmask8) __U);
207*67e74705SXin Li }
208*67e74705SXin Li
209*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lzcnt_epi64(__m128i __A)210*67e74705SXin Li _mm_lzcnt_epi64 (__m128i __A)
211*67e74705SXin Li {
212*67e74705SXin Li return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
213*67e74705SXin Li (__v2di)
214*67e74705SXin Li _mm_setzero_di (),
215*67e74705SXin Li (__mmask8) -1);
216*67e74705SXin Li }
217*67e74705SXin Li
218*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_lzcnt_epi64(__m128i __W,__mmask8 __U,__m128i __A)219*67e74705SXin Li _mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
220*67e74705SXin Li {
221*67e74705SXin Li return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
222*67e74705SXin Li (__v2di) __W,
223*67e74705SXin Li (__mmask8) __U);
224*67e74705SXin Li }
225*67e74705SXin Li
226*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_lzcnt_epi64(__mmask8 __U,__m128i __A)227*67e74705SXin Li _mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
228*67e74705SXin Li {
229*67e74705SXin Li return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
230*67e74705SXin Li (__v2di)
231*67e74705SXin Li _mm_setzero_di (),
232*67e74705SXin Li (__mmask8) __U);
233*67e74705SXin Li }
234*67e74705SXin Li
235*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_lzcnt_epi64(__m256i __A)236*67e74705SXin Li _mm256_lzcnt_epi64 (__m256i __A)
237*67e74705SXin Li {
238*67e74705SXin Li return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
239*67e74705SXin Li (__v4di)
240*67e74705SXin Li _mm256_setzero_si256 (),
241*67e74705SXin Li (__mmask8) -1);
242*67e74705SXin Li }
243*67e74705SXin Li
244*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_lzcnt_epi64(__m256i __W,__mmask8 __U,__m256i __A)245*67e74705SXin Li _mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
246*67e74705SXin Li {
247*67e74705SXin Li return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
248*67e74705SXin Li (__v4di) __W,
249*67e74705SXin Li (__mmask8) __U);
250*67e74705SXin Li }
251*67e74705SXin Li
252*67e74705SXin Li static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_lzcnt_epi64(__mmask8 __U,__m256i __A)253*67e74705SXin Li _mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
254*67e74705SXin Li {
255*67e74705SXin Li return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
256*67e74705SXin Li (__v4di)
257*67e74705SXin Li _mm256_setzero_si256 (),
258*67e74705SXin Li (__mmask8) __U);
259*67e74705SXin Li }
260*67e74705SXin Li
261*67e74705SXin Li #undef __DEFAULT_FN_ATTRS
262*67e74705SXin Li
263*67e74705SXin Li #endif /* __AVX512VLCDINTRIN_H */
264