/* xref: /aosp_15_r20/prebuilts/clang-tools/linux-x86/clang-headers/avxvnniintrin.h (revision bed243d3d9cd544cfb038bfa7be843dedc6e6bf7) */
/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
24*bed243d3SAndroid Build Coastguard Worker #ifndef __IMMINTRIN_H
25*bed243d3SAndroid Build Coastguard Worker #error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
26*bed243d3SAndroid Build Coastguard Worker #endif
27*bed243d3SAndroid Build Coastguard Worker 
28*bed243d3SAndroid Build Coastguard Worker #ifndef __AVXVNNIINTRIN_H
29*bed243d3SAndroid Build Coastguard Worker #define __AVXVNNIINTRIN_H
30*bed243d3SAndroid Build Coastguard Worker 
/* The following intrinsics, defined in avx512vlvnniintrin.h, can also be used
 * for AVXVNNI: */
/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
40*bed243d3SAndroid Build Coastguard Worker 
/* Intrinsics with _avx_ prefix are for compatibility with msvc. */

/* Default attributes for the functions in this file. Both sets request
 * always-inline, no debug info and the "avxvnni" target feature; they differ
 * only in the minimum vector width (256 vs. 128 bits). Both macros are
 * #undef'd again at the end of the header. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"),        \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"),        \
                 __min_vector_width__(128)))
45*bed243d3SAndroid Build Coastguard Worker 
46*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
47*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
48*bed243d3SAndroid Build Coastguard Worker /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
49*bed243d3SAndroid Build Coastguard Worker /// in \a __S, and store the packed 32-bit results in DST.
50*bed243d3SAndroid Build Coastguard Worker ///
51*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
52*bed243d3SAndroid Build Coastguard Worker ///
53*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
54*bed243d3SAndroid Build Coastguard Worker ///    FOR j := 0 to 7
55*bed243d3SAndroid Build Coastguard Worker ///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
56*bed243d3SAndroid Build Coastguard Worker ///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
57*bed243d3SAndroid Build Coastguard Worker ///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
58*bed243d3SAndroid Build Coastguard Worker ///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
59*bed243d3SAndroid Build Coastguard Worker ///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
60*bed243d3SAndroid Build Coastguard Worker ///    ENDFOR
61*bed243d3SAndroid Build Coastguard Worker ///    DST[MAX:256] := 0
62*bed243d3SAndroid Build Coastguard Worker /// \endcode
63*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusd_avx_epi32(__m256i __S,__m256i __A,__m256i __B)64*bed243d3SAndroid Build Coastguard Worker _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
65*bed243d3SAndroid Build Coastguard Worker {
66*bed243d3SAndroid Build Coastguard Worker   return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
67*bed243d3SAndroid Build Coastguard Worker }
68*bed243d3SAndroid Build Coastguard Worker 
69*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
70*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
71*bed243d3SAndroid Build Coastguard Worker /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
72*bed243d3SAndroid Build Coastguard Worker /// in \a __S using signed saturation, and store the packed 32-bit results in DST.
73*bed243d3SAndroid Build Coastguard Worker ///
74*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
75*bed243d3SAndroid Build Coastguard Worker ///
76*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
77*bed243d3SAndroid Build Coastguard Worker ///    FOR j := 0 to 7
78*bed243d3SAndroid Build Coastguard Worker ///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
79*bed243d3SAndroid Build Coastguard Worker ///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
80*bed243d3SAndroid Build Coastguard Worker ///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
81*bed243d3SAndroid Build Coastguard Worker ///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
82*bed243d3SAndroid Build Coastguard Worker ///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
83*bed243d3SAndroid Build Coastguard Worker ///    ENDFOR
84*bed243d3SAndroid Build Coastguard Worker ///    DST[MAX:256] := 0
85*bed243d3SAndroid Build Coastguard Worker /// \endcode
86*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusds_avx_epi32(__m256i __S,__m256i __A,__m256i __B)87*bed243d3SAndroid Build Coastguard Worker _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
88*bed243d3SAndroid Build Coastguard Worker {
89*bed243d3SAndroid Build Coastguard Worker   return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
90*bed243d3SAndroid Build Coastguard Worker }
91*bed243d3SAndroid Build Coastguard Worker 
92*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
93*bed243d3SAndroid Build Coastguard Worker /// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
94*bed243d3SAndroid Build Coastguard Worker /// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
95*bed243d3SAndroid Build Coastguard Worker ///  and store the packed 32-bit results in DST.
96*bed243d3SAndroid Build Coastguard Worker ///
97*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
98*bed243d3SAndroid Build Coastguard Worker ///
99*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
100*bed243d3SAndroid Build Coastguard Worker ///    FOR j := 0 to 7
101*bed243d3SAndroid Build Coastguard Worker ///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
102*bed243d3SAndroid Build Coastguard Worker ///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
103*bed243d3SAndroid Build Coastguard Worker ///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
104*bed243d3SAndroid Build Coastguard Worker ///    ENDFOR
105*bed243d3SAndroid Build Coastguard Worker ///    DST[MAX:256] := 0
106*bed243d3SAndroid Build Coastguard Worker /// \endcode
107*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssd_avx_epi32(__m256i __S,__m256i __A,__m256i __B)108*bed243d3SAndroid Build Coastguard Worker _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
109*bed243d3SAndroid Build Coastguard Worker {
110*bed243d3SAndroid Build Coastguard Worker   return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
111*bed243d3SAndroid Build Coastguard Worker }
112*bed243d3SAndroid Build Coastguard Worker 
113*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
114*bed243d3SAndroid Build Coastguard Worker /// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
115*bed243d3SAndroid Build Coastguard Worker /// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
116*bed243d3SAndroid Build Coastguard Worker /// using signed saturation, and store the packed 32-bit results in DST.
117*bed243d3SAndroid Build Coastguard Worker ///
118*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
119*bed243d3SAndroid Build Coastguard Worker ///
120*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
121*bed243d3SAndroid Build Coastguard Worker ///    FOR j := 0 to 7
122*bed243d3SAndroid Build Coastguard Worker ///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
123*bed243d3SAndroid Build Coastguard Worker ///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
124*bed243d3SAndroid Build Coastguard Worker ///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
125*bed243d3SAndroid Build Coastguard Worker ///    ENDFOR
126*bed243d3SAndroid Build Coastguard Worker ///    DST[MAX:256] := 0
127*bed243d3SAndroid Build Coastguard Worker /// \endcode
128*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssds_avx_epi32(__m256i __S,__m256i __A,__m256i __B)129*bed243d3SAndroid Build Coastguard Worker _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
130*bed243d3SAndroid Build Coastguard Worker {
131*bed243d3SAndroid Build Coastguard Worker   return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
132*bed243d3SAndroid Build Coastguard Worker }
133*bed243d3SAndroid Build Coastguard Worker 
134*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
135*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
136*bed243d3SAndroid Build Coastguard Worker /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
137*bed243d3SAndroid Build Coastguard Worker /// in \a __S, and store the packed 32-bit results in DST.
138*bed243d3SAndroid Build Coastguard Worker ///
139*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
140*bed243d3SAndroid Build Coastguard Worker ///
141*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
142*bed243d3SAndroid Build Coastguard Worker ///    FOR j := 0 to 3
143*bed243d3SAndroid Build Coastguard Worker ///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
144*bed243d3SAndroid Build Coastguard Worker ///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
145*bed243d3SAndroid Build Coastguard Worker ///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
146*bed243d3SAndroid Build Coastguard Worker ///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
147*bed243d3SAndroid Build Coastguard Worker ///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
148*bed243d3SAndroid Build Coastguard Worker ///    ENDFOR
149*bed243d3SAndroid Build Coastguard Worker ///    DST[MAX:128] := 0
150*bed243d3SAndroid Build Coastguard Worker /// \endcode
151*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusd_avx_epi32(__m128i __S,__m128i __A,__m128i __B)152*bed243d3SAndroid Build Coastguard Worker _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
153*bed243d3SAndroid Build Coastguard Worker {
154*bed243d3SAndroid Build Coastguard Worker   return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
155*bed243d3SAndroid Build Coastguard Worker }
156*bed243d3SAndroid Build Coastguard Worker 
157*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
158*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
159*bed243d3SAndroid Build Coastguard Worker /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
160*bed243d3SAndroid Build Coastguard Worker /// in \a __S using signed saturation, and store the packed 32-bit results in DST.
161*bed243d3SAndroid Build Coastguard Worker ///
162*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
163*bed243d3SAndroid Build Coastguard Worker ///
164*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
165*bed243d3SAndroid Build Coastguard Worker ///    FOR j := 0 to 3
166*bed243d3SAndroid Build Coastguard Worker ///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
167*bed243d3SAndroid Build Coastguard Worker ///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
168*bed243d3SAndroid Build Coastguard Worker ///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
169*bed243d3SAndroid Build Coastguard Worker ///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
170*bed243d3SAndroid Build Coastguard Worker ///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
171*bed243d3SAndroid Build Coastguard Worker ///    ENDFOR
172*bed243d3SAndroid Build Coastguard Worker ///    DST[MAX:128] := 0
173*bed243d3SAndroid Build Coastguard Worker /// \endcode
174*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusds_avx_epi32(__m128i __S,__m128i __A,__m128i __B)175*bed243d3SAndroid Build Coastguard Worker _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
176*bed243d3SAndroid Build Coastguard Worker {
177*bed243d3SAndroid Build Coastguard Worker   return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
178*bed243d3SAndroid Build Coastguard Worker }
179*bed243d3SAndroid Build Coastguard Worker 
180*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
181*bed243d3SAndroid Build Coastguard Worker /// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
182*bed243d3SAndroid Build Coastguard Worker /// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
183*bed243d3SAndroid Build Coastguard Worker /// and store the packed 32-bit results in DST.
184*bed243d3SAndroid Build Coastguard Worker ///
185*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
186*bed243d3SAndroid Build Coastguard Worker ///
187*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
188*bed243d3SAndroid Build Coastguard Worker ///    FOR j := 0 to 3
189*bed243d3SAndroid Build Coastguard Worker ///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
190*bed243d3SAndroid Build Coastguard Worker ///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
191*bed243d3SAndroid Build Coastguard Worker ///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
192*bed243d3SAndroid Build Coastguard Worker ///    ENDFOR
193*bed243d3SAndroid Build Coastguard Worker ///    DST[MAX:128] := 0
194*bed243d3SAndroid Build Coastguard Worker /// \endcode
195*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssd_avx_epi32(__m128i __S,__m128i __A,__m128i __B)196*bed243d3SAndroid Build Coastguard Worker _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
197*bed243d3SAndroid Build Coastguard Worker {
198*bed243d3SAndroid Build Coastguard Worker   return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
199*bed243d3SAndroid Build Coastguard Worker }
200*bed243d3SAndroid Build Coastguard Worker 
201*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
202*bed243d3SAndroid Build Coastguard Worker /// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
203*bed243d3SAndroid Build Coastguard Worker /// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
204*bed243d3SAndroid Build Coastguard Worker /// using signed saturation, and store the packed 32-bit results in DST.
205*bed243d3SAndroid Build Coastguard Worker ///
206*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
207*bed243d3SAndroid Build Coastguard Worker ///
208*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
209*bed243d3SAndroid Build Coastguard Worker ///    FOR j := 0 to 3
210*bed243d3SAndroid Build Coastguard Worker ///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
211*bed243d3SAndroid Build Coastguard Worker ///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
212*bed243d3SAndroid Build Coastguard Worker ///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
213*bed243d3SAndroid Build Coastguard Worker ///    ENDFOR
214*bed243d3SAndroid Build Coastguard Worker ///    DST[MAX:128] := 0
215*bed243d3SAndroid Build Coastguard Worker /// \endcode
216*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssds_avx_epi32(__m128i __S,__m128i __A,__m128i __B)217*bed243d3SAndroid Build Coastguard Worker _mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
218*bed243d3SAndroid Build Coastguard Worker {
219*bed243d3SAndroid Build Coastguard Worker   return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
220*bed243d3SAndroid Build Coastguard Worker }
221*bed243d3SAndroid Build Coastguard Worker 
222*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS128
223*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS256
224*bed243d3SAndroid Build Coastguard Worker 
225*bed243d3SAndroid Build Coastguard Worker #endif // __AVXVNNIINTRIN_H
226