/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
10*bed243d3SAndroid Build Coastguard Worker #ifndef __IMMINTRIN_H
11*bed243d3SAndroid Build Coastguard Worker #error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
12*bed243d3SAndroid Build Coastguard Worker #endif
13*bed243d3SAndroid Build Coastguard Worker
14*bed243d3SAndroid Build Coastguard Worker #ifndef __AVX512VLVNNIINTRIN_H
15*bed243d3SAndroid Build Coastguard Worker #define __AVX512VLVNNIINTRIN_H
16*bed243d3SAndroid Build Coastguard Worker
17*bed243d3SAndroid Build Coastguard Worker /* Define the default attributes for the functions in this file. */
18*bed243d3SAndroid Build Coastguard Worker #define __DEFAULT_FN_ATTRS128 \
19*bed243d3SAndroid Build Coastguard Worker __attribute__((__always_inline__, __nodebug__, \
20*bed243d3SAndroid Build Coastguard Worker __target__("avx512vl,avx512vnni,no-evex512"), \
21*bed243d3SAndroid Build Coastguard Worker __min_vector_width__(128)))
22*bed243d3SAndroid Build Coastguard Worker #define __DEFAULT_FN_ATTRS256 \
23*bed243d3SAndroid Build Coastguard Worker __attribute__((__always_inline__, __nodebug__, \
24*bed243d3SAndroid Build Coastguard Worker __target__("avx512vl,avx512vnni,no-evex512"), \
25*bed243d3SAndroid Build Coastguard Worker __min_vector_width__(256)))
26*bed243d3SAndroid Build Coastguard Worker
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// \a S, \a A and \a B are each cast to \c __v8si before being handed to the
/// builtin, and the builtin's result is cast to \c __m256i.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusd_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// \a S, \a A and \a B are each cast to \c __v8si before being handed to the
/// builtin, and the builtin's result is cast to \c __m256i.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusds_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S, and store the packed 32-bit results in DST.
///
/// \a S, \a A and \a B are each cast to \c __v8si before being handed to the
/// builtin, and the builtin's result is cast to \c __m256i.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssd_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S using signed saturation, and store the packed 32-bit
/// results in DST.
///
/// \a S, \a A and \a B are each cast to \c __v8si before being handed to the
/// builtin, and the builtin's result is cast to \c __m256i.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssds_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// \a S, \a A and \a B are each cast to \c __v4si before being handed to the
/// builtin, and the builtin's result is cast to \c __m128i.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusd_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// \a S, \a A and \a B are each cast to \c __v4si before being handed to the
/// builtin, and the builtin's result is cast to \c __m128i.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusds_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S, and store the packed 32-bit results in DST.
///
/// \a S, \a A and \a B are each cast to \c __v4si before being handed to the
/// builtin, and the builtin's result is cast to \c __m128i.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssd_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S using signed saturation, and store the packed 32-bit
/// results in DST.
///
/// \a S, \a A and \a B are each cast to \c __v4si before being handed to the
/// builtin, and the builtin's result is cast to \c __m128i.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssds_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))

179*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S,__mmask8 __U,__m256i __A,__m256i __B)180*bed243d3SAndroid Build Coastguard Worker _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
181*bed243d3SAndroid Build Coastguard Worker {
182*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_selectd_256(__U,
183*bed243d3SAndroid Build Coastguard Worker (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
184*bed243d3SAndroid Build Coastguard Worker (__v8si)__S);
185*bed243d3SAndroid Build Coastguard Worker }
186*bed243d3SAndroid Build Coastguard Worker
187*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbusd_epi32(__mmask8 __U,__m256i __S,__m256i __A,__m256i __B)188*bed243d3SAndroid Build Coastguard Worker _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
189*bed243d3SAndroid Build Coastguard Worker {
190*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_selectd_256(__U,
191*bed243d3SAndroid Build Coastguard Worker (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
192*bed243d3SAndroid Build Coastguard Worker (__v8si)_mm256_setzero_si256());
193*bed243d3SAndroid Build Coastguard Worker }
194*bed243d3SAndroid Build Coastguard Worker
195*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusds_epi32(__m256i __S,__mmask8 __U,__m256i __A,__m256i __B)196*bed243d3SAndroid Build Coastguard Worker _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
197*bed243d3SAndroid Build Coastguard Worker {
198*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_selectd_256(__U,
199*bed243d3SAndroid Build Coastguard Worker (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
200*bed243d3SAndroid Build Coastguard Worker (__v8si)__S);
201*bed243d3SAndroid Build Coastguard Worker }
202*bed243d3SAndroid Build Coastguard Worker
203*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbusds_epi32(__mmask8 __U,__m256i __S,__m256i __A,__m256i __B)204*bed243d3SAndroid Build Coastguard Worker _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
205*bed243d3SAndroid Build Coastguard Worker {
206*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_selectd_256(__U,
207*bed243d3SAndroid Build Coastguard Worker (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
208*bed243d3SAndroid Build Coastguard Worker (__v8si)_mm256_setzero_si256());
209*bed243d3SAndroid Build Coastguard Worker }
210*bed243d3SAndroid Build Coastguard Worker
211*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssd_epi32(__m256i __S,__mmask8 __U,__m256i __A,__m256i __B)212*bed243d3SAndroid Build Coastguard Worker _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
213*bed243d3SAndroid Build Coastguard Worker {
214*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_selectd_256(__U,
215*bed243d3SAndroid Build Coastguard Worker (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
216*bed243d3SAndroid Build Coastguard Worker (__v8si)__S);
217*bed243d3SAndroid Build Coastguard Worker }
218*bed243d3SAndroid Build Coastguard Worker
219*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwssd_epi32(__mmask8 __U,__m256i __S,__m256i __A,__m256i __B)220*bed243d3SAndroid Build Coastguard Worker _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
221*bed243d3SAndroid Build Coastguard Worker {
222*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_selectd_256(__U,
223*bed243d3SAndroid Build Coastguard Worker (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
224*bed243d3SAndroid Build Coastguard Worker (__v8si)_mm256_setzero_si256());
225*bed243d3SAndroid Build Coastguard Worker }
226*bed243d3SAndroid Build Coastguard Worker
227*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssds_epi32(__m256i __S,__mmask8 __U,__m256i __A,__m256i __B)228*bed243d3SAndroid Build Coastguard Worker _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
229*bed243d3SAndroid Build Coastguard Worker {
230*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_selectd_256(__U,
231*bed243d3SAndroid Build Coastguard Worker (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
232*bed243d3SAndroid Build Coastguard Worker (__v8si)__S);
233*bed243d3SAndroid Build Coastguard Worker }
234*bed243d3SAndroid Build Coastguard Worker
235*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwssds_epi32(__mmask8 __U,__m256i __S,__m256i __A,__m256i __B)236*bed243d3SAndroid Build Coastguard Worker _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
237*bed243d3SAndroid Build Coastguard Worker {
238*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_selectd_256(__U,
239*bed243d3SAndroid Build Coastguard Worker (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
240*bed243d3SAndroid Build Coastguard Worker (__v8si)_mm256_setzero_si256());
241*bed243d3SAndroid Build Coastguard Worker }
242*bed243d3SAndroid Build Coastguard Worker
243*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusd_epi32(__m128i __S,__mmask8 __U,__m128i __A,__m128i __B)244*bed243d3SAndroid Build Coastguard Worker _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
245*bed243d3SAndroid Build Coastguard Worker {
246*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_selectd_128(__U,
247*bed243d3SAndroid Build Coastguard Worker (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
248*bed243d3SAndroid Build Coastguard Worker (__v4si)__S);
249*bed243d3SAndroid Build Coastguard Worker }
250*bed243d3SAndroid Build Coastguard Worker
251*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbusd_epi32(__mmask8 __U,__m128i __S,__m128i __A,__m128i __B)252*bed243d3SAndroid Build Coastguard Worker _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
253*bed243d3SAndroid Build Coastguard Worker {
254*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_selectd_128(__U,
255*bed243d3SAndroid Build Coastguard Worker (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
256*bed243d3SAndroid Build Coastguard Worker (__v4si)_mm_setzero_si128());
257*bed243d3SAndroid Build Coastguard Worker }
258*bed243d3SAndroid Build Coastguard Worker
259*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusds_epi32(__m128i __S,__mmask8 __U,__m128i __A,__m128i __B)260*bed243d3SAndroid Build Coastguard Worker _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
261*bed243d3SAndroid Build Coastguard Worker {
262*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_selectd_128(__U,
263*bed243d3SAndroid Build Coastguard Worker (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
264*bed243d3SAndroid Build Coastguard Worker (__v4si)__S);
265*bed243d3SAndroid Build Coastguard Worker }
266*bed243d3SAndroid Build Coastguard Worker
267*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbusds_epi32(__mmask8 __U,__m128i __S,__m128i __A,__m128i __B)268*bed243d3SAndroid Build Coastguard Worker _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
269*bed243d3SAndroid Build Coastguard Worker {
270*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_selectd_128(__U,
271*bed243d3SAndroid Build Coastguard Worker (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
272*bed243d3SAndroid Build Coastguard Worker (__v4si)_mm_setzero_si128());
273*bed243d3SAndroid Build Coastguard Worker }
274*bed243d3SAndroid Build Coastguard Worker
275*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssd_epi32(__m128i __S,__mmask8 __U,__m128i __A,__m128i __B)276*bed243d3SAndroid Build Coastguard Worker _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
277*bed243d3SAndroid Build Coastguard Worker {
278*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_selectd_128(__U,
279*bed243d3SAndroid Build Coastguard Worker (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
280*bed243d3SAndroid Build Coastguard Worker (__v4si)__S);
281*bed243d3SAndroid Build Coastguard Worker }
282*bed243d3SAndroid Build Coastguard Worker
283*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwssd_epi32(__mmask8 __U,__m128i __S,__m128i __A,__m128i __B)284*bed243d3SAndroid Build Coastguard Worker _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
285*bed243d3SAndroid Build Coastguard Worker {
286*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_selectd_128(__U,
287*bed243d3SAndroid Build Coastguard Worker (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
288*bed243d3SAndroid Build Coastguard Worker (__v4si)_mm_setzero_si128());
289*bed243d3SAndroid Build Coastguard Worker }
290*bed243d3SAndroid Build Coastguard Worker
291*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssds_epi32(__m128i __S,__mmask8 __U,__m128i __A,__m128i __B)292*bed243d3SAndroid Build Coastguard Worker _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
293*bed243d3SAndroid Build Coastguard Worker {
294*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_selectd_128(__U,
295*bed243d3SAndroid Build Coastguard Worker (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
296*bed243d3SAndroid Build Coastguard Worker (__v4si)__S);
297*bed243d3SAndroid Build Coastguard Worker }
298*bed243d3SAndroid Build Coastguard Worker
299*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwssds_epi32(__mmask8 __U,__m128i __S,__m128i __A,__m128i __B)300*bed243d3SAndroid Build Coastguard Worker _mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
301*bed243d3SAndroid Build Coastguard Worker {
302*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_selectd_128(__U,
303*bed243d3SAndroid Build Coastguard Worker (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
304*bed243d3SAndroid Build Coastguard Worker (__v4si)_mm_setzero_si128());
305*bed243d3SAndroid Build Coastguard Worker }
306*bed243d3SAndroid Build Coastguard Worker
307*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS128
308*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS256
309*bed243d3SAndroid Build Coastguard Worker
310*bed243d3SAndroid Build Coastguard Worker #endif
311