/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9*bed243d3SAndroid Build Coastguard Worker #ifndef __IMMINTRIN_H
10*bed243d3SAndroid Build Coastguard Worker #error \
11*bed243d3SAndroid Build Coastguard Worker "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12*bed243d3SAndroid Build Coastguard Worker #endif
13*bed243d3SAndroid Build Coastguard Worker
14*bed243d3SAndroid Build Coastguard Worker #ifndef __AVXVNNIINT8INTRIN_H
15*bed243d3SAndroid Build Coastguard Worker #define __AVXVNNIINT8INTRIN_H
16*bed243d3SAndroid Build Coastguard Worker
/* Define the default attributes for the functions in this file.
 *
 * Both variants force inlining, suppress debug info for the wrapper itself and
 * enable the "avxvnniint8" target feature for the function body; they differ
 * only in the minimum vector width they request (256 vs. 128 bits). */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
                 __min_vector_width__(128)))
24*bed243d3SAndroid Build Coastguard Worker
25*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
26*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
27*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 4 results with the corresponding
28*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
29*bed243d3SAndroid Build Coastguard Worker ///
30*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
31*bed243d3SAndroid Build Coastguard Worker ///
32*bed243d3SAndroid Build Coastguard Worker /// \code
33*bed243d3SAndroid Build Coastguard Worker /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
34*bed243d3SAndroid Build Coastguard Worker /// \endcode
35*bed243d3SAndroid Build Coastguard Worker ///
36*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPBSSD instruction.
37*bed243d3SAndroid Build Coastguard Worker ///
38*bed243d3SAndroid Build Coastguard Worker /// \param __A
39*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [16 x char].
40*bed243d3SAndroid Build Coastguard Worker /// \param __B
41*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [16 x char].
42*bed243d3SAndroid Build Coastguard Worker /// \returns
43*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
44*bed243d3SAndroid Build Coastguard Worker ///
45*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
46*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 3
47*bed243d3SAndroid Build Coastguard Worker /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
48*bed243d3SAndroid Build Coastguard Worker /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
49*bed243d3SAndroid Build Coastguard Worker /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
50*bed243d3SAndroid Build Coastguard Worker /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
51*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
52*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
53*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:128] := 0
54*bed243d3SAndroid Build Coastguard Worker /// \endcode
_mm_dpbssd_epi32(__m128i __W,__m128i __A,__m128i __B)55*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
56*bed243d3SAndroid Build Coastguard Worker __m128i __A,
57*bed243d3SAndroid Build Coastguard Worker __m128i __B) {
58*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
59*bed243d3SAndroid Build Coastguard Worker (__v4si)__B);
60*bed243d3SAndroid Build Coastguard Worker }
61*bed243d3SAndroid Build Coastguard Worker
62*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
63*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
64*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 4 results with the corresponding
65*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
66*bed243d3SAndroid Build Coastguard Worker ///
67*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
68*bed243d3SAndroid Build Coastguard Worker ///
69*bed243d3SAndroid Build Coastguard Worker /// \code
70*bed243d3SAndroid Build Coastguard Worker /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
71*bed243d3SAndroid Build Coastguard Worker /// \endcode
72*bed243d3SAndroid Build Coastguard Worker ///
73*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPBSSD instruction.
74*bed243d3SAndroid Build Coastguard Worker ///
75*bed243d3SAndroid Build Coastguard Worker /// \param __A
76*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [32 x char].
77*bed243d3SAndroid Build Coastguard Worker /// \param __B
78*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [32 x char].
79*bed243d3SAndroid Build Coastguard Worker /// \returns
80*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
81*bed243d3SAndroid Build Coastguard Worker ///
82*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
83*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 7
84*bed243d3SAndroid Build Coastguard Worker /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
85*bed243d3SAndroid Build Coastguard Worker /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
86*bed243d3SAndroid Build Coastguard Worker /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
87*bed243d3SAndroid Build Coastguard Worker /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
88*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
89*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
90*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:256] := 0
91*bed243d3SAndroid Build Coastguard Worker /// \endcode
92*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssd_epi32(__m256i __W,__m256i __A,__m256i __B)93*bed243d3SAndroid Build Coastguard Worker _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
94*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
95*bed243d3SAndroid Build Coastguard Worker (__v8si)__B);
96*bed243d3SAndroid Build Coastguard Worker }
97*bed243d3SAndroid Build Coastguard Worker
98*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
99*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
100*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 4 results with the corresponding
101*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W with signed saturation, and store the packed
102*bed243d3SAndroid Build Coastguard Worker /// 32-bit results in \a dst.
103*bed243d3SAndroid Build Coastguard Worker ///
104*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
105*bed243d3SAndroid Build Coastguard Worker ///
106*bed243d3SAndroid Build Coastguard Worker /// \code
107*bed243d3SAndroid Build Coastguard Worker /// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
108*bed243d3SAndroid Build Coastguard Worker /// \endcode
109*bed243d3SAndroid Build Coastguard Worker ///
110*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPBSSD instruction.
111*bed243d3SAndroid Build Coastguard Worker ///
112*bed243d3SAndroid Build Coastguard Worker /// \param __A
113*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [16 x char].
114*bed243d3SAndroid Build Coastguard Worker /// \param __B
115*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [16 x char].
116*bed243d3SAndroid Build Coastguard Worker /// \returns
117*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
118*bed243d3SAndroid Build Coastguard Worker ///
119*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
120*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 3
121*bed243d3SAndroid Build Coastguard Worker /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
122*bed243d3SAndroid Build Coastguard Worker /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
123*bed243d3SAndroid Build Coastguard Worker /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
124*bed243d3SAndroid Build Coastguard Worker /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
125*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
126*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
127*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:128] := 0
128*bed243d3SAndroid Build Coastguard Worker /// \endcode
_mm_dpbssds_epi32(__m128i __W,__m128i __A,__m128i __B)129*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
130*bed243d3SAndroid Build Coastguard Worker __m128i __A,
131*bed243d3SAndroid Build Coastguard Worker __m128i __B) {
132*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
133*bed243d3SAndroid Build Coastguard Worker (__v4si)__B);
134*bed243d3SAndroid Build Coastguard Worker }
135*bed243d3SAndroid Build Coastguard Worker
136*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
137*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
138*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 4 results with the corresponding
139*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W with signed saturation, and store the packed
140*bed243d3SAndroid Build Coastguard Worker /// 32-bit results in \a dst.
141*bed243d3SAndroid Build Coastguard Worker ///
142*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
143*bed243d3SAndroid Build Coastguard Worker ///
144*bed243d3SAndroid Build Coastguard Worker /// \code
145*bed243d3SAndroid Build Coastguard Worker /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
146*bed243d3SAndroid Build Coastguard Worker /// \endcode
147*bed243d3SAndroid Build Coastguard Worker ///
148*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPBSSD instruction.
149*bed243d3SAndroid Build Coastguard Worker ///
150*bed243d3SAndroid Build Coastguard Worker /// \param __A
151*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [32 x char].
152*bed243d3SAndroid Build Coastguard Worker /// \param __B
153*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [32 x char].
154*bed243d3SAndroid Build Coastguard Worker /// \returns
155*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
156*bed243d3SAndroid Build Coastguard Worker ///
157*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
158*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 7
159*bed243d3SAndroid Build Coastguard Worker /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
160*bed243d3SAndroid Build Coastguard Worker /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
161*bed243d3SAndroid Build Coastguard Worker /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
162*bed243d3SAndroid Build Coastguard Worker /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
163*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
164*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
165*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:256] := 0
166*bed243d3SAndroid Build Coastguard Worker /// \endcode
167*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssds_epi32(__m256i __W,__m256i __A,__m256i __B)168*bed243d3SAndroid Build Coastguard Worker _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
169*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
170*bed243d3SAndroid Build Coastguard Worker (__v8si)__B);
171*bed243d3SAndroid Build Coastguard Worker }
172*bed243d3SAndroid Build Coastguard Worker
173*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
174*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
175*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 4 results with the corresponding
176*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
177*bed243d3SAndroid Build Coastguard Worker ///
178*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
179*bed243d3SAndroid Build Coastguard Worker ///
180*bed243d3SAndroid Build Coastguard Worker /// \code
181*bed243d3SAndroid Build Coastguard Worker /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
182*bed243d3SAndroid Build Coastguard Worker /// \endcode
183*bed243d3SAndroid Build Coastguard Worker ///
184*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPBSSD instruction.
185*bed243d3SAndroid Build Coastguard Worker ///
186*bed243d3SAndroid Build Coastguard Worker /// \param __A
187*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [16 x char].
188*bed243d3SAndroid Build Coastguard Worker /// \param __B
189*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [16 x unsigned char].
190*bed243d3SAndroid Build Coastguard Worker /// \returns
191*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
192*bed243d3SAndroid Build Coastguard Worker ///
193*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
194*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 3
195*bed243d3SAndroid Build Coastguard Worker /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
196*bed243d3SAndroid Build Coastguard Worker /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
197*bed243d3SAndroid Build Coastguard Worker /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
198*bed243d3SAndroid Build Coastguard Worker /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
199*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
200*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
201*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:128] := 0
202*bed243d3SAndroid Build Coastguard Worker /// \endcode
_mm_dpbsud_epi32(__m128i __W,__m128i __A,__m128i __B)203*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
204*bed243d3SAndroid Build Coastguard Worker __m128i __A,
205*bed243d3SAndroid Build Coastguard Worker __m128i __B) {
206*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
207*bed243d3SAndroid Build Coastguard Worker (__v4si)__B);
208*bed243d3SAndroid Build Coastguard Worker }
209*bed243d3SAndroid Build Coastguard Worker
210*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
211*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
212*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 4 results with the corresponding
213*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
214*bed243d3SAndroid Build Coastguard Worker ///
215*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
216*bed243d3SAndroid Build Coastguard Worker ///
217*bed243d3SAndroid Build Coastguard Worker /// \code
218*bed243d3SAndroid Build Coastguard Worker /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
219*bed243d3SAndroid Build Coastguard Worker /// \endcode
220*bed243d3SAndroid Build Coastguard Worker ///
221*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPBSSD instruction.
222*bed243d3SAndroid Build Coastguard Worker ///
223*bed243d3SAndroid Build Coastguard Worker /// \param __A
224*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [32 x char].
225*bed243d3SAndroid Build Coastguard Worker /// \param __B
226*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [32 x unsigned char].
227*bed243d3SAndroid Build Coastguard Worker /// \returns
228*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
229*bed243d3SAndroid Build Coastguard Worker ///
230*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
231*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 7
232*bed243d3SAndroid Build Coastguard Worker /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
233*bed243d3SAndroid Build Coastguard Worker /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
234*bed243d3SAndroid Build Coastguard Worker /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
235*bed243d3SAndroid Build Coastguard Worker /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
236*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
237*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
238*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:256] := 0
239*bed243d3SAndroid Build Coastguard Worker /// \endcode
240*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsud_epi32(__m256i __W,__m256i __A,__m256i __B)241*bed243d3SAndroid Build Coastguard Worker _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
242*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
243*bed243d3SAndroid Build Coastguard Worker (__v8si)__B);
244*bed243d3SAndroid Build Coastguard Worker }
245*bed243d3SAndroid Build Coastguard Worker
246*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
247*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
248*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 4 results with the corresponding
249*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W with signed saturation, and store the packed
250*bed243d3SAndroid Build Coastguard Worker /// 32-bit results in \a dst.
251*bed243d3SAndroid Build Coastguard Worker ///
252*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
253*bed243d3SAndroid Build Coastguard Worker ///
254*bed243d3SAndroid Build Coastguard Worker /// \code
255*bed243d3SAndroid Build Coastguard Worker /// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
256*bed243d3SAndroid Build Coastguard Worker /// \endcode
257*bed243d3SAndroid Build Coastguard Worker ///
258*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPBSSD instruction.
259*bed243d3SAndroid Build Coastguard Worker ///
260*bed243d3SAndroid Build Coastguard Worker /// \param __A
261*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [16 x char].
262*bed243d3SAndroid Build Coastguard Worker /// \param __B
263*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [16 x unsigned char].
264*bed243d3SAndroid Build Coastguard Worker /// \returns
265*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
266*bed243d3SAndroid Build Coastguard Worker ///
267*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
268*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 3
269*bed243d3SAndroid Build Coastguard Worker /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
270*bed243d3SAndroid Build Coastguard Worker /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
271*bed243d3SAndroid Build Coastguard Worker /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
272*bed243d3SAndroid Build Coastguard Worker /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
273*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
274*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
275*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:128] := 0
276*bed243d3SAndroid Build Coastguard Worker /// \endcode
_mm_dpbsuds_epi32(__m128i __W,__m128i __A,__m128i __B)277*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
278*bed243d3SAndroid Build Coastguard Worker __m128i __A,
279*bed243d3SAndroid Build Coastguard Worker __m128i __B) {
280*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
281*bed243d3SAndroid Build Coastguard Worker (__v4si)__B);
282*bed243d3SAndroid Build Coastguard Worker }
283*bed243d3SAndroid Build Coastguard Worker
284*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
285*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
286*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 4 results with the corresponding
287*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W with signed saturation, and store the packed
288*bed243d3SAndroid Build Coastguard Worker /// 32-bit results in \a dst.
289*bed243d3SAndroid Build Coastguard Worker ///
290*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
291*bed243d3SAndroid Build Coastguard Worker ///
292*bed243d3SAndroid Build Coastguard Worker /// \code
293*bed243d3SAndroid Build Coastguard Worker /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
294*bed243d3SAndroid Build Coastguard Worker /// \endcode
295*bed243d3SAndroid Build Coastguard Worker ///
296*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPBSSD instruction.
297*bed243d3SAndroid Build Coastguard Worker ///
298*bed243d3SAndroid Build Coastguard Worker /// \param __A
299*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [32 x char].
300*bed243d3SAndroid Build Coastguard Worker /// \param __B
301*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [32 x unsigned char].
302*bed243d3SAndroid Build Coastguard Worker /// \returns
303*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
304*bed243d3SAndroid Build Coastguard Worker ///
305*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
306*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 7
307*bed243d3SAndroid Build Coastguard Worker /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
308*bed243d3SAndroid Build Coastguard Worker /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
309*bed243d3SAndroid Build Coastguard Worker /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
310*bed243d3SAndroid Build Coastguard Worker /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
311*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
312*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
313*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:256] := 0
314*bed243d3SAndroid Build Coastguard Worker /// \endcode
315*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsuds_epi32(__m256i __W,__m256i __A,__m256i __B)316*bed243d3SAndroid Build Coastguard Worker _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
317*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
318*bed243d3SAndroid Build Coastguard Worker (__v8si)__B);
319*bed243d3SAndroid Build Coastguard Worker }
320*bed243d3SAndroid Build Coastguard Worker
321*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
322*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
323*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 4 results with the corresponding
324*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
325*bed243d3SAndroid Build Coastguard Worker ///
326*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
327*bed243d3SAndroid Build Coastguard Worker ///
328*bed243d3SAndroid Build Coastguard Worker /// \code
329*bed243d3SAndroid Build Coastguard Worker /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
330*bed243d3SAndroid Build Coastguard Worker /// \endcode
331*bed243d3SAndroid Build Coastguard Worker ///
332*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPBSSD instruction.
333*bed243d3SAndroid Build Coastguard Worker ///
334*bed243d3SAndroid Build Coastguard Worker /// \param __A
335*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [16 x unsigned char].
336*bed243d3SAndroid Build Coastguard Worker /// \param __B
337*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [16 x unsigned char].
338*bed243d3SAndroid Build Coastguard Worker /// \returns
339*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
340*bed243d3SAndroid Build Coastguard Worker ///
341*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
342*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 3
343*bed243d3SAndroid Build Coastguard Worker /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
344*bed243d3SAndroid Build Coastguard Worker /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
345*bed243d3SAndroid Build Coastguard Worker /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
346*bed243d3SAndroid Build Coastguard Worker /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
347*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
348*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
349*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:128] := 0
350*bed243d3SAndroid Build Coastguard Worker /// \endcode
_mm_dpbuud_epi32(__m128i __W,__m128i __A,__m128i __B)351*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
352*bed243d3SAndroid Build Coastguard Worker __m128i __A,
353*bed243d3SAndroid Build Coastguard Worker __m128i __B) {
354*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
355*bed243d3SAndroid Build Coastguard Worker (__v4si)__B);
356*bed243d3SAndroid Build Coastguard Worker }
357*bed243d3SAndroid Build Coastguard Worker
358*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
359*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
360*bed243d3SAndroid Build Coastguard Worker /// signed 16-bit results. Sum these 4 results with the corresponding
361*bed243d3SAndroid Build Coastguard Worker /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
362*bed243d3SAndroid Build Coastguard Worker ///
363*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
364*bed243d3SAndroid Build Coastguard Worker ///
365*bed243d3SAndroid Build Coastguard Worker /// \code
366*bed243d3SAndroid Build Coastguard Worker /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
367*bed243d3SAndroid Build Coastguard Worker /// \endcode
368*bed243d3SAndroid Build Coastguard Worker ///
369*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPBSSD instruction.
370*bed243d3SAndroid Build Coastguard Worker ///
371*bed243d3SAndroid Build Coastguard Worker /// \param __A
372*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [32 x unsigned char].
373*bed243d3SAndroid Build Coastguard Worker /// \param __B
374*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [32 x unsigned char].
375*bed243d3SAndroid Build Coastguard Worker /// \returns
376*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
377*bed243d3SAndroid Build Coastguard Worker ///
378*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
379*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 7
380*bed243d3SAndroid Build Coastguard Worker /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
381*bed243d3SAndroid Build Coastguard Worker /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
382*bed243d3SAndroid Build Coastguard Worker /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
383*bed243d3SAndroid Build Coastguard Worker /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
384*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
385*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
386*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:256] := 0
387*bed243d3SAndroid Build Coastguard Worker /// \endcode
388*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuud_epi32(__m256i __W,__m256i __A,__m256i __B)389*bed243d3SAndroid Build Coastguard Worker _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
390*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
391*bed243d3SAndroid Build Coastguard Worker (__v8si)__B);
392*bed243d3SAndroid Build Coastguard Worker }
393*bed243d3SAndroid Build Coastguard Worker
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
399*bed243d3SAndroid Build Coastguard Worker ///
400*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
401*bed243d3SAndroid Build Coastguard Worker ///
402*bed243d3SAndroid Build Coastguard Worker /// \code
403*bed243d3SAndroid Build Coastguard Worker /// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
404*bed243d3SAndroid Build Coastguard Worker /// \endcode
405*bed243d3SAndroid Build Coastguard Worker ///
406*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
407*bed243d3SAndroid Build Coastguard Worker ///
408*bed243d3SAndroid Build Coastguard Worker /// \param __A
409*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [16 x unsigned char].
410*bed243d3SAndroid Build Coastguard Worker /// \param __B
411*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [16 x unsigned char].
412*bed243d3SAndroid Build Coastguard Worker /// \returns
413*bed243d3SAndroid Build Coastguard Worker /// A 128-bit vector of [4 x int].
414*bed243d3SAndroid Build Coastguard Worker ///
415*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
416*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 3
417*bed243d3SAndroid Build Coastguard Worker /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
418*bed243d3SAndroid Build Coastguard Worker /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
419*bed243d3SAndroid Build Coastguard Worker /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
420*bed243d3SAndroid Build Coastguard Worker /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
421*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
422*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
423*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:128] := 0
424*bed243d3SAndroid Build Coastguard Worker /// \endcode
_mm_dpbuuds_epi32(__m128i __W,__m128i __A,__m128i __B)425*bed243d3SAndroid Build Coastguard Worker static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
426*bed243d3SAndroid Build Coastguard Worker __m128i __A,
427*bed243d3SAndroid Build Coastguard Worker __m128i __B) {
428*bed243d3SAndroid Build Coastguard Worker return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
429*bed243d3SAndroid Build Coastguard Worker (__v4si)__B);
430*bed243d3SAndroid Build Coastguard Worker }
431*bed243d3SAndroid Build Coastguard Worker
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
437*bed243d3SAndroid Build Coastguard Worker ///
438*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
439*bed243d3SAndroid Build Coastguard Worker ///
440*bed243d3SAndroid Build Coastguard Worker /// \code
441*bed243d3SAndroid Build Coastguard Worker /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
442*bed243d3SAndroid Build Coastguard Worker /// \endcode
443*bed243d3SAndroid Build Coastguard Worker ///
444*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
445*bed243d3SAndroid Build Coastguard Worker ///
446*bed243d3SAndroid Build Coastguard Worker /// \param __A
447*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [32 x unsigned char].
448*bed243d3SAndroid Build Coastguard Worker /// \param __B
449*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [32 x unsigned char].
450*bed243d3SAndroid Build Coastguard Worker /// \returns
451*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [8 x int].
452*bed243d3SAndroid Build Coastguard Worker ///
453*bed243d3SAndroid Build Coastguard Worker /// \code{.operation}
454*bed243d3SAndroid Build Coastguard Worker /// FOR j := 0 to 7
455*bed243d3SAndroid Build Coastguard Worker /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
456*bed243d3SAndroid Build Coastguard Worker /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
457*bed243d3SAndroid Build Coastguard Worker /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
458*bed243d3SAndroid Build Coastguard Worker /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
459*bed243d3SAndroid Build Coastguard Worker /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
460*bed243d3SAndroid Build Coastguard Worker /// ENDFOR
461*bed243d3SAndroid Build Coastguard Worker /// dst[MAX:256] := 0
462*bed243d3SAndroid Build Coastguard Worker /// \endcode
463*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuuds_epi32(__m256i __W,__m256i __A,__m256i __B)464*bed243d3SAndroid Build Coastguard Worker _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
465*bed243d3SAndroid Build Coastguard Worker return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
466*bed243d3SAndroid Build Coastguard Worker (__v8si)__B);
467*bed243d3SAndroid Build Coastguard Worker }
468*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS128
469*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS256
470*bed243d3SAndroid Build Coastguard Worker
471*bed243d3SAndroid Build Coastguard Worker #endif // __AVXVNNIINT8INTRIN_H
472