1*bed243d3SAndroid Build Coastguard Worker /*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
2*bed243d3SAndroid Build Coastguard Worker *
3*bed243d3SAndroid Build Coastguard Worker * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*bed243d3SAndroid Build Coastguard Worker * See https://llvm.org/LICENSE.txt for license information.
5*bed243d3SAndroid Build Coastguard Worker * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*bed243d3SAndroid Build Coastguard Worker *
7*bed243d3SAndroid Build Coastguard Worker *===-----------------------------------------------------------------------===
8*bed243d3SAndroid Build Coastguard Worker */
9*bed243d3SAndroid Build Coastguard Worker
10*bed243d3SAndroid Build Coastguard Worker #ifndef __IMMINTRIN_H
11*bed243d3SAndroid Build Coastguard Worker #error \
12*bed243d3SAndroid Build Coastguard Worker "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13*bed243d3SAndroid Build Coastguard Worker #endif // __IMMINTRIN_H
14*bed243d3SAndroid Build Coastguard Worker
15*bed243d3SAndroid Build Coastguard Worker #ifndef __AVXVNNIINT16INTRIN_H
16*bed243d3SAndroid Build Coastguard Worker #define __AVXVNNIINT16INTRIN_H
17*bed243d3SAndroid Build Coastguard Worker
/* Define the default attributes for the functions in this file.
 *
 * Every intrinsic below is declared with one of these attribute sets:
 * always-inline, no debug info, compiled for the "avxvnniint16" target
 * feature, and tagged with the minimum vector width (128 or 256 bits) that
 * matches the vector type it operates on.
 */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
                 __min_vector_width__(256)))
25*bed243d3SAndroid Build Coastguard Worker
26*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
27*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
28*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 2 results with the corresponding
29*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
30*bed243d3SAndroid Build Coastguard Worker ///
31*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
32*bed243d3SAndroid Build Coastguard Worker ///
33*bed243d3SAndroid Build Coastguard Worker /// \code
34*bed243d3SAndroid Build Coastguard Worker /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
35*bed243d3SAndroid Build Coastguard Worker /// \endcode
36*bed243d3SAndroid Build Coastguard Worker ///
37*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPWSUD instruction.
38*bed243d3SAndroid Build Coastguard Worker ///
39*bed243d3SAndroid Build Coastguard Worker /// \param __W
40*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
41*bed243d3SAndroid Build Coastguard Worker /// \param __A
42*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [8 x short].
43*bed243d3SAndroid Build Coastguard Worker /// \param __B
44*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [8 x unsigned short].
45*bed243d3SAndroid Build Coastguard Worker /// \returns
46*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
47*bed243d3SAndroid Build Coastguard Worker ///
48*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
49*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 3
50*bed243d3SAndroid Build Coastguard Worker /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
51*bed243d3SAndroid Build Coastguard Worker /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
52*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
53*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
54*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:128] := 0
55*bed243d3SAndroid Build Coastguard Worker /// \endcode
_mm_dpwsud_epi32(__m128i __W,__m128i __A,__m128i __B)56*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
57*bed243d3SAndroid Build Coastguard Worker __m128i __A,
58*bed243d3SAndroid Build Coastguard Worker __m128i __B) {
59*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
60*bed243d3SAndroid Build Coastguard Worker (__v4si)__B);
61*bed243d3SAndroid Build Coastguard Worker }
62*bed243d3SAndroid Build Coastguard Worker
63*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
64*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
65*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 2 results with the corresponding
66*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
67*bed243d3SAndroid Build Coastguard Worker ///
68*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
69*bed243d3SAndroid Build Coastguard Worker ///
70*bed243d3SAndroid Build Coastguard Worker /// \code
71*bed243d3SAndroid Build Coastguard Worker /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
72*bed243d3SAndroid Build Coastguard Worker /// \endcode
73*bed243d3SAndroid Build Coastguard Worker ///
74*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPWSUD instruction.
75*bed243d3SAndroid Build Coastguard Worker ///
76*bed243d3SAndroid Build Coastguard Worker /// \param __W
77*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
78*bed243d3SAndroid Build Coastguard Worker /// \param __A
79*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x short].
80*bed243d3SAndroid Build Coastguard Worker /// \param __B
81*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x unsigned short].
82*bed243d3SAndroid Build Coastguard Worker /// \returns
83*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
84*bed243d3SAndroid Build Coastguard Worker ///
85*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
86*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 7
87*bed243d3SAndroid Build Coastguard Worker /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
88*bed243d3SAndroid Build Coastguard Worker /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
89*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
90*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
91*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:256] := 0
92*bed243d3SAndroid Build Coastguard Worker /// \endcode
93*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsud_epi32(__m256i __W,__m256i __A,__m256i __B)94*bed243d3SAndroid Build Coastguard Worker _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
95*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
96*bed243d3SAndroid Build Coastguard Worker (__v8si)__B);
97*bed243d3SAndroid Build Coastguard Worker }
98*bed243d3SAndroid Build Coastguard Worker
99*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
100*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
101*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 2 results with the corresponding
102*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W with signed saturation, and store the packed
103*bed243d3SAndroid Build Coastguard Worker /// 32-bit results in \a dst.
104*bed243d3SAndroid Build Coastguard Worker ///
105*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
106*bed243d3SAndroid Build Coastguard Worker ///
107*bed243d3SAndroid Build Coastguard Worker /// \code
108*bed243d3SAndroid Build Coastguard Worker /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
109*bed243d3SAndroid Build Coastguard Worker /// \endcode
110*bed243d3SAndroid Build Coastguard Worker ///
111*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
112*bed243d3SAndroid Build Coastguard Worker ///
113*bed243d3SAndroid Build Coastguard Worker /// \param __W
114*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
115*bed243d3SAndroid Build Coastguard Worker /// \param __A
116*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [8 x short].
117*bed243d3SAndroid Build Coastguard Worker /// \param __B
118*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [8 x unsigned short].
119*bed243d3SAndroid Build Coastguard Worker /// \returns
120*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
121*bed243d3SAndroid Build Coastguard Worker ///
122*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
123*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 3
124*bed243d3SAndroid Build Coastguard Worker /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
125*bed243d3SAndroid Build Coastguard Worker /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
126*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
127*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
128*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:128] := 0
129*bed243d3SAndroid Build Coastguard Worker /// \endcode
_mm_dpwsuds_epi32(__m128i __W,__m128i __A,__m128i __B)130*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
131*bed243d3SAndroid Build Coastguard Worker __m128i __A,
132*bed243d3SAndroid Build Coastguard Worker __m128i __B) {
133*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
134*bed243d3SAndroid Build Coastguard Worker (__v4si)__B);
135*bed243d3SAndroid Build Coastguard Worker }
136*bed243d3SAndroid Build Coastguard Worker
137*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
138*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
139*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 2 results with the corresponding
140*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W with signed saturation, and store the packed
141*bed243d3SAndroid Build Coastguard Worker /// 32-bit results in \a dst.
142*bed243d3SAndroid Build Coastguard Worker ///
143*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
144*bed243d3SAndroid Build Coastguard Worker ///
145*bed243d3SAndroid Build Coastguard Worker /// \code
146*bed243d3SAndroid Build Coastguard Worker /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
147*bed243d3SAndroid Build Coastguard Worker /// \endcode
148*bed243d3SAndroid Build Coastguard Worker ///
149*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
150*bed243d3SAndroid Build Coastguard Worker ///
151*bed243d3SAndroid Build Coastguard Worker /// \param __W
152*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
153*bed243d3SAndroid Build Coastguard Worker /// \param __A
154*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x short].
155*bed243d3SAndroid Build Coastguard Worker /// \param __B
156*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x unsigned short].
157*bed243d3SAndroid Build Coastguard Worker /// \returns
158*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
159*bed243d3SAndroid Build Coastguard Worker ///
160*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
161*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 7
162*bed243d3SAndroid Build Coastguard Worker /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
163*bed243d3SAndroid Build Coastguard Worker /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
164*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
165*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
166*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:256] := 0
167*bed243d3SAndroid Build Coastguard Worker /// \endcode
168*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsuds_epi32(__m256i __W,__m256i __A,__m256i __B)169*bed243d3SAndroid Build Coastguard Worker _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
170*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
171*bed243d3SAndroid Build Coastguard Worker (__v8si)__B);
172*bed243d3SAndroid Build Coastguard Worker }
173*bed243d3SAndroid Build Coastguard Worker
174*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
175*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
176*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 2 results with the corresponding
177*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
178*bed243d3SAndroid Build Coastguard Worker ///
179*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
180*bed243d3SAndroid Build Coastguard Worker ///
181*bed243d3SAndroid Build Coastguard Worker /// \code
182*bed243d3SAndroid Build Coastguard Worker /// __m128i _mm_dpbusd_epi32(__m128i __W, __m128i __A, __m128i __B)
183*bed243d3SAndroid Build Coastguard Worker /// \endcode
184*bed243d3SAndroid Build Coastguard Worker ///
185*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPWUSD instruction.
186*bed243d3SAndroid Build Coastguard Worker ///
187*bed243d3SAndroid Build Coastguard Worker /// \param __W
188*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
189*bed243d3SAndroid Build Coastguard Worker /// \param __A
190*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [8 x unsigned short].
191*bed243d3SAndroid Build Coastguard Worker /// \param __B
192*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [8 x short].
193*bed243d3SAndroid Build Coastguard Worker /// \returns
194*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
195*bed243d3SAndroid Build Coastguard Worker ///
196*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
197*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 3
198*bed243d3SAndroid Build Coastguard Worker /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
199*bed243d3SAndroid Build Coastguard Worker /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
200*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
201*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
202*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:128] := 0
203*bed243d3SAndroid Build Coastguard Worker /// \endcode
_mm_dpwusd_epi32(__m128i __W,__m128i __A,__m128i __B)204*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
205*bed243d3SAndroid Build Coastguard Worker __m128i __A,
206*bed243d3SAndroid Build Coastguard Worker __m128i __B) {
207*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
208*bed243d3SAndroid Build Coastguard Worker (__v4si)__B);
209*bed243d3SAndroid Build Coastguard Worker }
210*bed243d3SAndroid Build Coastguard Worker
211*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
212*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
213*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 2 results with the corresponding
214*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
215*bed243d3SAndroid Build Coastguard Worker ///
216*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
217*bed243d3SAndroid Build Coastguard Worker ///
218*bed243d3SAndroid Build Coastguard Worker /// \code
219*bed243d3SAndroid Build Coastguard Worker /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
220*bed243d3SAndroid Build Coastguard Worker /// \endcode
221*bed243d3SAndroid Build Coastguard Worker ///
222*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPWUSD instruction.
223*bed243d3SAndroid Build Coastguard Worker ///
224*bed243d3SAndroid Build Coastguard Worker /// \param __W
225*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
226*bed243d3SAndroid Build Coastguard Worker /// \param __A
227*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x unsigned short].
228*bed243d3SAndroid Build Coastguard Worker /// \param __B
229*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x short].
230*bed243d3SAndroid Build Coastguard Worker /// \returns
231*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
232*bed243d3SAndroid Build Coastguard Worker ///
233*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
234*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 7
235*bed243d3SAndroid Build Coastguard Worker /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
236*bed243d3SAndroid Build Coastguard Worker /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
237*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
238*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
239*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:256] := 0
240*bed243d3SAndroid Build Coastguard Worker /// \endcode
241*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusd_epi32(__m256i __W,__m256i __A,__m256i __B)242*bed243d3SAndroid Build Coastguard Worker _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
243*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
244*bed243d3SAndroid Build Coastguard Worker (__v8si)__B);
245*bed243d3SAndroid Build Coastguard Worker }
246*bed243d3SAndroid Build Coastguard Worker
247*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
248*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
249*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 2 results with the corresponding
250*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W with signed saturation, and store the packed
251*bed243d3SAndroid Build Coastguard Worker /// 32-bit results in \a dst.
252*bed243d3SAndroid Build Coastguard Worker ///
253*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
254*bed243d3SAndroid Build Coastguard Worker ///
255*bed243d3SAndroid Build Coastguard Worker /// \code
256*bed243d3SAndroid Build Coastguard Worker /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
257*bed243d3SAndroid Build Coastguard Worker /// \endcode
258*bed243d3SAndroid Build Coastguard Worker ///
259*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
260*bed243d3SAndroid Build Coastguard Worker ///
261*bed243d3SAndroid Build Coastguard Worker /// \param __W
262*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
263*bed243d3SAndroid Build Coastguard Worker /// \param __A
264*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [8 x unsigned short].
265*bed243d3SAndroid Build Coastguard Worker /// \param __B
266*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [8 x short].
267*bed243d3SAndroid Build Coastguard Worker /// \returns
268*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
269*bed243d3SAndroid Build Coastguard Worker ///
270*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
271*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 3
272*bed243d3SAndroid Build Coastguard Worker /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
273*bed243d3SAndroid Build Coastguard Worker /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
274*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
275*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
276*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:128] := 0
277*bed243d3SAndroid Build Coastguard Worker /// \endcode
_mm_dpwusds_epi32(__m128i __W,__m128i __A,__m128i __B)278*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
279*bed243d3SAndroid Build Coastguard Worker __m128i __A,
280*bed243d3SAndroid Build Coastguard Worker __m128i __B) {
281*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
282*bed243d3SAndroid Build Coastguard Worker (__v4si)__B);
283*bed243d3SAndroid Build Coastguard Worker }
284*bed243d3SAndroid Build Coastguard Worker
285*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
286*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
287*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 2 results with the corresponding
288*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W with signed saturation, and store the packed
289*bed243d3SAndroid Build Coastguard Worker /// 32-bit results in \a dst.
290*bed243d3SAndroid Build Coastguard Worker ///
291*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
292*bed243d3SAndroid Build Coastguard Worker ///
293*bed243d3SAndroid Build Coastguard Worker /// \code
294*bed243d3SAndroid Build Coastguard Worker /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
295*bed243d3SAndroid Build Coastguard Worker /// \endcode
296*bed243d3SAndroid Build Coastguard Worker ///
297*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
298*bed243d3SAndroid Build Coastguard Worker ///
299*bed243d3SAndroid Build Coastguard Worker /// \param __W
300*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
301*bed243d3SAndroid Build Coastguard Worker /// \param __A
302*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x unsigned short].
303*bed243d3SAndroid Build Coastguard Worker /// \param __B
304*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x short].
305*bed243d3SAndroid Build Coastguard Worker /// \returns
306*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
307*bed243d3SAndroid Build Coastguard Worker ///
308*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
309*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 7
310*bed243d3SAndroid Build Coastguard Worker /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
311*bed243d3SAndroid Build Coastguard Worker /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
312*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
313*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
314*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:256] := 0
315*bed243d3SAndroid Build Coastguard Worker /// \endcode
316*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusds_epi32(__m256i __W,__m256i __A,__m256i __B)317*bed243d3SAndroid Build Coastguard Worker _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
318*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
319*bed243d3SAndroid Build Coastguard Worker (__v8si)__B);
320*bed243d3SAndroid Build Coastguard Worker }
321*bed243d3SAndroid Build Coastguard Worker
322*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
323*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
324*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 2 results with the corresponding
325*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
326*bed243d3SAndroid Build Coastguard Worker ///
327*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
328*bed243d3SAndroid Build Coastguard Worker ///
329*bed243d3SAndroid Build Coastguard Worker /// \code
330*bed243d3SAndroid Build Coastguard Worker /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
331*bed243d3SAndroid Build Coastguard Worker /// \endcode
332*bed243d3SAndroid Build Coastguard Worker ///
333*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPWUUD instruction.
334*bed243d3SAndroid Build Coastguard Worker ///
335*bed243d3SAndroid Build Coastguard Worker /// \param __W
336*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x unsigned int].
337*bed243d3SAndroid Build Coastguard Worker /// \param __A
338*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [8 x unsigned short].
339*bed243d3SAndroid Build Coastguard Worker /// \param __B
340*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [8 x unsigned short].
341*bed243d3SAndroid Build Coastguard Worker /// \returns
342*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x unsigned int].
343*bed243d3SAndroid Build Coastguard Worker ///
344*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
345*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 3
346*bed243d3SAndroid Build Coastguard Worker /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
347*bed243d3SAndroid Build Coastguard Worker /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
348*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
349*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
350*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:128] := 0
351*bed243d3SAndroid Build Coastguard Worker /// \endcode
_mm_dpwuud_epi32(__m128i __W,__m128i __A,__m128i __B)352*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
353*bed243d3SAndroid Build Coastguard Worker __m128i __A,
354*bed243d3SAndroid Build Coastguard Worker __m128i __B) {
355*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
356*bed243d3SAndroid Build Coastguard Worker (__v4si)__B);
357*bed243d3SAndroid Build Coastguard Worker }
358*bed243d3SAndroid Build Coastguard Worker
359*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
360*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
361*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 2 results with the corresponding
362*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
363*bed243d3SAndroid Build Coastguard Worker ///
364*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
365*bed243d3SAndroid Build Coastguard Worker ///
366*bed243d3SAndroid Build Coastguard Worker /// \code
367*bed243d3SAndroid Build Coastguard Worker /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
368*bed243d3SAndroid Build Coastguard Worker /// \endcode
369*bed243d3SAndroid Build Coastguard Worker ///
370*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPWUUD instruction.
371*bed243d3SAndroid Build Coastguard Worker ///
372*bed243d3SAndroid Build Coastguard Worker /// \param __W
373*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x unsigned int].
374*bed243d3SAndroid Build Coastguard Worker /// \param __A
375*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x unsigned short].
376*bed243d3SAndroid Build Coastguard Worker /// \param __B
377*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x unsigned short].
378*bed243d3SAndroid Build Coastguard Worker /// \returns
379*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x unsigned int].
380*bed243d3SAndroid Build Coastguard Worker ///
381*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
382*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 7
383*bed243d3SAndroid Build Coastguard Worker /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
384*bed243d3SAndroid Build Coastguard Worker /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
385*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
386*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
387*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:256] := 0
388*bed243d3SAndroid Build Coastguard Worker /// \endcode
389*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuud_epi32(__m256i __W,__m256i __A,__m256i __B)390*bed243d3SAndroid Build Coastguard Worker _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
391*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
392*bed243d3SAndroid Build Coastguard Worker (__v8si)__B);
393*bed243d3SAndroid Build Coastguard Worker }
394*bed243d3SAndroid Build Coastguard Worker
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
///    unsigned 32-bit results. Sum these 2 results with the corresponding
///    32-bit integer in \a __W with unsigned saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x unsigned int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
_mm_dpwuuds_epi32(__m128i __W,__m128i __A,__m128i __B)426*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
427*bed243d3SAndroid Build Coastguard Worker __m128i __A,
428*bed243d3SAndroid Build Coastguard Worker __m128i __B) {
429*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
430*bed243d3SAndroid Build Coastguard Worker (__v4si)__B);
431*bed243d3SAndroid Build Coastguard Worker }
432*bed243d3SAndroid Build Coastguard Worker
433*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
434*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
435*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 2 results with the corresponding
436*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W with signed saturation, and store the packed
437*bed243d3SAndroid Build Coastguard Worker /// 32-bit results in \a dst.
438*bed243d3SAndroid Build Coastguard Worker ///
439*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
440*bed243d3SAndroid Build Coastguard Worker ///
441*bed243d3SAndroid Build Coastguard Worker /// \code
442*bed243d3SAndroid Build Coastguard Worker /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
443*bed243d3SAndroid Build Coastguard Worker /// \endcode
444*bed243d3SAndroid Build Coastguard Worker ///
445*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
446*bed243d3SAndroid Build Coastguard Worker ///
447*bed243d3SAndroid Build Coastguard Worker /// \param __W
448*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x unsigned int].
449*bed243d3SAndroid Build Coastguard Worker /// \param __A
450*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x unsigned short].
451*bed243d3SAndroid Build Coastguard Worker /// \param __B
452*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x unsigned short].
453*bed243d3SAndroid Build Coastguard Worker /// \returns
454*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x unsigned int].
455*bed243d3SAndroid Build Coastguard Worker ///
456*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
457*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 7
458*bed243d3SAndroid Build Coastguard Worker /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
459*bed243d3SAndroid Build Coastguard Worker /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
460*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
461*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
462*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:256] := 0
463*bed243d3SAndroid Build Coastguard Worker /// \endcode
464*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuuds_epi32(__m256i __W,__m256i __A,__m256i __B)465*bed243d3SAndroid Build Coastguard Worker _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
466*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
467*bed243d3SAndroid Build Coastguard Worker (__v8si)__B);
468*bed243d3SAndroid Build Coastguard Worker }
469*bed243d3SAndroid Build Coastguard Worker
470*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS128
471*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS256
472*bed243d3SAndroid Build Coastguard Worker
473*bed243d3SAndroid Build Coastguard Worker #endif // __AVXVNNIINT16INTRIN_H
474