/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9*bed243d3SAndroid Build Coastguard Worker #ifndef __IMMINTRIN_H
10*bed243d3SAndroid Build Coastguard Worker #error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
11*bed243d3SAndroid Build Coastguard Worker #endif
12*bed243d3SAndroid Build Coastguard Worker
13*bed243d3SAndroid Build Coastguard Worker #ifdef __SSE2__
14*bed243d3SAndroid Build Coastguard Worker
15*bed243d3SAndroid Build Coastguard Worker #ifndef __AVX512BF16INTRIN_H
16*bed243d3SAndroid Build Coastguard Worker #define __AVX512BF16INTRIN_H
17*bed243d3SAndroid Build Coastguard Worker
18*bed243d3SAndroid Build Coastguard Worker typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
19*bed243d3SAndroid Build Coastguard Worker typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
20*bed243d3SAndroid Build Coastguard Worker typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
21*bed243d3SAndroid Build Coastguard Worker
22*bed243d3SAndroid Build Coastguard Worker #define __DEFAULT_FN_ATTRS512 \
23*bed243d3SAndroid Build Coastguard Worker __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
24*bed243d3SAndroid Build Coastguard Worker __min_vector_width__(512)))
25*bed243d3SAndroid Build Coastguard Worker #define __DEFAULT_FN_ATTRS \
26*bed243d3SAndroid Build Coastguard Worker __attribute__((__always_inline__, __nodebug__, \
27*bed243d3SAndroid Build Coastguard Worker __target__("avx512bf16,no-evex512")))
28*bed243d3SAndroid Build Coastguard Worker
29*bed243d3SAndroid Build Coastguard Worker /// Convert One BF16 Data to One Single Float Data.
30*bed243d3SAndroid Build Coastguard Worker ///
31*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
32*bed243d3SAndroid Build Coastguard Worker ///
33*bed243d3SAndroid Build Coastguard Worker /// This intrinsic does not correspond to a specific instruction.
34*bed243d3SAndroid Build Coastguard Worker ///
35*bed243d3SAndroid Build Coastguard Worker /// \param __A
36*bed243d3SAndroid Build Coastguard Worker /// A bfloat data.
37*bed243d3SAndroid Build Coastguard Worker /// \returns A float data whose sign field and exponent field keep unchanged,
38*bed243d3SAndroid Build Coastguard Worker /// and fraction field is extended to 23 bits.
_mm_cvtsbh_ss(__bf16 __A)39*bed243d3SAndroid Build Coastguard Worker static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
40*bed243d3SAndroid Build Coastguard Worker return __builtin_ia32_cvtsbf162ss_32(__A);
41*bed243d3SAndroid Build Coastguard Worker }
42*bed243d3SAndroid Build Coastguard Worker
43*bed243d3SAndroid Build Coastguard Worker /// Convert Two Packed Single Data to One Packed BF16 Data.
44*bed243d3SAndroid Build Coastguard Worker ///
45*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
46*bed243d3SAndroid Build Coastguard Worker ///
47*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
48*bed243d3SAndroid Build Coastguard Worker ///
49*bed243d3SAndroid Build Coastguard Worker /// \param __A
50*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float].
51*bed243d3SAndroid Build Coastguard Worker /// \param __B
52*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float].
53*bed243d3SAndroid Build Coastguard Worker /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
54*bed243d3SAndroid Build Coastguard Worker /// conversion of __B, and higher 256 bits come from conversion of __A.
55*bed243d3SAndroid Build Coastguard Worker static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_cvtne2ps_pbh(__m512 __A,__m512 __B)56*bed243d3SAndroid Build Coastguard Worker _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
57*bed243d3SAndroid Build Coastguard Worker return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
58*bed243d3SAndroid Build Coastguard Worker (__v16sf) __B);
59*bed243d3SAndroid Build Coastguard Worker }
60*bed243d3SAndroid Build Coastguard Worker
61*bed243d3SAndroid Build Coastguard Worker /// Convert Two Packed Single Data to One Packed BF16 Data.
62*bed243d3SAndroid Build Coastguard Worker ///
63*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
64*bed243d3SAndroid Build Coastguard Worker ///
65*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
66*bed243d3SAndroid Build Coastguard Worker ///
67*bed243d3SAndroid Build Coastguard Worker /// \param __A
68*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float].
69*bed243d3SAndroid Build Coastguard Worker /// \param __B
70*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float].
71*bed243d3SAndroid Build Coastguard Worker /// \param __W
72*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [32 x bfloat].
73*bed243d3SAndroid Build Coastguard Worker /// \param __U
74*bed243d3SAndroid Build Coastguard Worker /// A 32-bit mask value specifying what is chosen for each element.
75*bed243d3SAndroid Build Coastguard Worker /// A 1 means conversion of __A or __B. A 0 means element from __W.
76*bed243d3SAndroid Build Coastguard Worker /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
77*bed243d3SAndroid Build Coastguard Worker /// conversion of __B, and higher 256 bits come from conversion of __A.
78*bed243d3SAndroid Build Coastguard Worker static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtne2ps_pbh(__m512bh __W,__mmask32 __U,__m512 __A,__m512 __B)79*bed243d3SAndroid Build Coastguard Worker _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
80*bed243d3SAndroid Build Coastguard Worker return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
81*bed243d3SAndroid Build Coastguard Worker (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
82*bed243d3SAndroid Build Coastguard Worker (__v32bf)__W);
83*bed243d3SAndroid Build Coastguard Worker }
84*bed243d3SAndroid Build Coastguard Worker
85*bed243d3SAndroid Build Coastguard Worker /// Convert Two Packed Single Data to One Packed BF16 Data.
86*bed243d3SAndroid Build Coastguard Worker ///
87*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
88*bed243d3SAndroid Build Coastguard Worker ///
89*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
90*bed243d3SAndroid Build Coastguard Worker ///
91*bed243d3SAndroid Build Coastguard Worker /// \param __A
92*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float].
93*bed243d3SAndroid Build Coastguard Worker /// \param __B
94*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float].
95*bed243d3SAndroid Build Coastguard Worker /// \param __U
96*bed243d3SAndroid Build Coastguard Worker /// A 32-bit mask value specifying what is chosen for each element.
97*bed243d3SAndroid Build Coastguard Worker /// A 1 means conversion of __A or __B. A 0 means element is zero.
98*bed243d3SAndroid Build Coastguard Worker /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
99*bed243d3SAndroid Build Coastguard Worker /// conversion of __B, and higher 256 bits come from conversion of __A.
100*bed243d3SAndroid Build Coastguard Worker static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U,__m512 __A,__m512 __B)101*bed243d3SAndroid Build Coastguard Worker _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
102*bed243d3SAndroid Build Coastguard Worker return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
103*bed243d3SAndroid Build Coastguard Worker (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
104*bed243d3SAndroid Build Coastguard Worker (__v32bf)_mm512_setzero_si512());
105*bed243d3SAndroid Build Coastguard Worker }
106*bed243d3SAndroid Build Coastguard Worker
107*bed243d3SAndroid Build Coastguard Worker /// Convert Packed Single Data to Packed BF16 Data.
108*bed243d3SAndroid Build Coastguard Worker ///
109*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
110*bed243d3SAndroid Build Coastguard Worker ///
111*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
112*bed243d3SAndroid Build Coastguard Worker ///
113*bed243d3SAndroid Build Coastguard Worker /// \param __A
114*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float].
115*bed243d3SAndroid Build Coastguard Worker /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
116*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_cvtneps_pbh(__m512 __A)117*bed243d3SAndroid Build Coastguard Worker _mm512_cvtneps_pbh(__m512 __A) {
118*bed243d3SAndroid Build Coastguard Worker return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
119*bed243d3SAndroid Build Coastguard Worker (__v16bf)_mm256_undefined_si256(),
120*bed243d3SAndroid Build Coastguard Worker (__mmask16)-1);
121*bed243d3SAndroid Build Coastguard Worker }
122*bed243d3SAndroid Build Coastguard Worker
123*bed243d3SAndroid Build Coastguard Worker /// Convert Packed Single Data to Packed BF16 Data.
124*bed243d3SAndroid Build Coastguard Worker ///
125*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
126*bed243d3SAndroid Build Coastguard Worker ///
127*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
128*bed243d3SAndroid Build Coastguard Worker ///
129*bed243d3SAndroid Build Coastguard Worker /// \param __A
130*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float].
131*bed243d3SAndroid Build Coastguard Worker /// \param __W
132*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x bfloat].
133*bed243d3SAndroid Build Coastguard Worker /// \param __U
134*bed243d3SAndroid Build Coastguard Worker /// A 16-bit mask value specifying what is chosen for each element.
135*bed243d3SAndroid Build Coastguard Worker /// A 1 means conversion of __A. A 0 means element from __W.
136*bed243d3SAndroid Build Coastguard Worker /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
137*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtneps_pbh(__m256bh __W,__mmask16 __U,__m512 __A)138*bed243d3SAndroid Build Coastguard Worker _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
139*bed243d3SAndroid Build Coastguard Worker return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
140*bed243d3SAndroid Build Coastguard Worker (__v16bf)__W,
141*bed243d3SAndroid Build Coastguard Worker (__mmask16)__U);
142*bed243d3SAndroid Build Coastguard Worker }
143*bed243d3SAndroid Build Coastguard Worker
144*bed243d3SAndroid Build Coastguard Worker /// Convert Packed Single Data to Packed BF16 Data.
145*bed243d3SAndroid Build Coastguard Worker ///
146*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
147*bed243d3SAndroid Build Coastguard Worker ///
148*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
149*bed243d3SAndroid Build Coastguard Worker ///
150*bed243d3SAndroid Build Coastguard Worker /// \param __A
151*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float].
152*bed243d3SAndroid Build Coastguard Worker /// \param __U
153*bed243d3SAndroid Build Coastguard Worker /// A 16-bit mask value specifying what is chosen for each element.
154*bed243d3SAndroid Build Coastguard Worker /// A 1 means conversion of __A. A 0 means element is zero.
155*bed243d3SAndroid Build Coastguard Worker /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
156*bed243d3SAndroid Build Coastguard Worker static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtneps_pbh(__mmask16 __U,__m512 __A)157*bed243d3SAndroid Build Coastguard Worker _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
158*bed243d3SAndroid Build Coastguard Worker return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
159*bed243d3SAndroid Build Coastguard Worker (__v16bf)_mm256_setzero_si256(),
160*bed243d3SAndroid Build Coastguard Worker (__mmask16)__U);
161*bed243d3SAndroid Build Coastguard Worker }
162*bed243d3SAndroid Build Coastguard Worker
163*bed243d3SAndroid Build Coastguard Worker /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
164*bed243d3SAndroid Build Coastguard Worker ///
165*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
166*bed243d3SAndroid Build Coastguard Worker ///
167*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
168*bed243d3SAndroid Build Coastguard Worker ///
169*bed243d3SAndroid Build Coastguard Worker /// \param __A
170*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [32 x bfloat].
171*bed243d3SAndroid Build Coastguard Worker /// \param __B
172*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [32 x bfloat].
173*bed243d3SAndroid Build Coastguard Worker /// \param __D
174*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float].
175*bed243d3SAndroid Build Coastguard Worker /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
176*bed243d3SAndroid Build Coastguard Worker /// __A, __B and __D
177*bed243d3SAndroid Build Coastguard Worker static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_dpbf16_ps(__m512 __D,__m512bh __A,__m512bh __B)178*bed243d3SAndroid Build Coastguard Worker _mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
179*bed243d3SAndroid Build Coastguard Worker return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
180*bed243d3SAndroid Build Coastguard Worker (__v32bf) __A,
181*bed243d3SAndroid Build Coastguard Worker (__v32bf) __B);
182*bed243d3SAndroid Build Coastguard Worker }
183*bed243d3SAndroid Build Coastguard Worker
184*bed243d3SAndroid Build Coastguard Worker /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
185*bed243d3SAndroid Build Coastguard Worker ///
186*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
187*bed243d3SAndroid Build Coastguard Worker ///
188*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
189*bed243d3SAndroid Build Coastguard Worker ///
190*bed243d3SAndroid Build Coastguard Worker /// \param __A
191*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [32 x bfloat].
192*bed243d3SAndroid Build Coastguard Worker /// \param __B
193*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [32 x bfloat].
194*bed243d3SAndroid Build Coastguard Worker /// \param __D
195*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float].
196*bed243d3SAndroid Build Coastguard Worker /// \param __U
197*bed243d3SAndroid Build Coastguard Worker /// A 16-bit mask value specifying what is chosen for each element.
198*bed243d3SAndroid Build Coastguard Worker /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
199*bed243d3SAndroid Build Coastguard Worker /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
200*bed243d3SAndroid Build Coastguard Worker /// __A, __B and __D
201*bed243d3SAndroid Build Coastguard Worker static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_dpbf16_ps(__m512 __D,__mmask16 __U,__m512bh __A,__m512bh __B)202*bed243d3SAndroid Build Coastguard Worker _mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
203*bed243d3SAndroid Build Coastguard Worker return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
204*bed243d3SAndroid Build Coastguard Worker (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
205*bed243d3SAndroid Build Coastguard Worker (__v16sf)__D);
206*bed243d3SAndroid Build Coastguard Worker }
207*bed243d3SAndroid Build Coastguard Worker
208*bed243d3SAndroid Build Coastguard Worker /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
209*bed243d3SAndroid Build Coastguard Worker ///
210*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
211*bed243d3SAndroid Build Coastguard Worker ///
212*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
213*bed243d3SAndroid Build Coastguard Worker ///
214*bed243d3SAndroid Build Coastguard Worker /// \param __A
215*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [32 x bfloat].
216*bed243d3SAndroid Build Coastguard Worker /// \param __B
217*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [32 x bfloat].
218*bed243d3SAndroid Build Coastguard Worker /// \param __D
219*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float].
220*bed243d3SAndroid Build Coastguard Worker /// \param __U
221*bed243d3SAndroid Build Coastguard Worker /// A 16-bit mask value specifying what is chosen for each element.
222*bed243d3SAndroid Build Coastguard Worker /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
223*bed243d3SAndroid Build Coastguard Worker /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
224*bed243d3SAndroid Build Coastguard Worker /// __A, __B and __D
225*bed243d3SAndroid Build Coastguard Worker static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_dpbf16_ps(__mmask16 __U,__m512 __D,__m512bh __A,__m512bh __B)226*bed243d3SAndroid Build Coastguard Worker _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
227*bed243d3SAndroid Build Coastguard Worker return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
228*bed243d3SAndroid Build Coastguard Worker (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
229*bed243d3SAndroid Build Coastguard Worker (__v16sf)_mm512_setzero_si512());
230*bed243d3SAndroid Build Coastguard Worker }
231*bed243d3SAndroid Build Coastguard Worker
232*bed243d3SAndroid Build Coastguard Worker /// Convert Packed BF16 Data to Packed float Data.
233*bed243d3SAndroid Build Coastguard Worker ///
234*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
235*bed243d3SAndroid Build Coastguard Worker ///
236*bed243d3SAndroid Build Coastguard Worker /// \param __A
237*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x bfloat].
238*bed243d3SAndroid Build Coastguard Worker /// \returns A 512-bit vector of [16 x float] come from conversion of __A
_mm512_cvtpbh_ps(__m256bh __A)239*bed243d3SAndroid Build Coastguard Worker static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
240*bed243d3SAndroid Build Coastguard Worker return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
241*bed243d3SAndroid Build Coastguard Worker (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
242*bed243d3SAndroid Build Coastguard Worker }
243*bed243d3SAndroid Build Coastguard Worker
244*bed243d3SAndroid Build Coastguard Worker /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
245*bed243d3SAndroid Build Coastguard Worker ///
246*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
247*bed243d3SAndroid Build Coastguard Worker ///
248*bed243d3SAndroid Build Coastguard Worker /// \param __U
249*bed243d3SAndroid Build Coastguard Worker /// A 16-bit mask. Elements are zeroed out when the corresponding mask
250*bed243d3SAndroid Build Coastguard Worker /// bit is not set.
251*bed243d3SAndroid Build Coastguard Worker /// \param __A
252*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x bfloat].
253*bed243d3SAndroid Build Coastguard Worker /// \returns A 512-bit vector of [16 x float] come from conversion of __A
254*bed243d3SAndroid Build Coastguard Worker static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpbh_ps(__mmask16 __U,__m256bh __A)255*bed243d3SAndroid Build Coastguard Worker _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
256*bed243d3SAndroid Build Coastguard Worker return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
257*bed243d3SAndroid Build Coastguard Worker (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
258*bed243d3SAndroid Build Coastguard Worker }
259*bed243d3SAndroid Build Coastguard Worker
260*bed243d3SAndroid Build Coastguard Worker /// Convert Packed BF16 Data to Packed float Data using merging mask.
261*bed243d3SAndroid Build Coastguard Worker ///
262*bed243d3SAndroid Build Coastguard Worker /// \headerfile <x86intrin.h>
263*bed243d3SAndroid Build Coastguard Worker ///
264*bed243d3SAndroid Build Coastguard Worker /// \param __S
265*bed243d3SAndroid Build Coastguard Worker /// A 512-bit vector of [16 x float]. Elements are copied from __S when
266*bed243d3SAndroid Build Coastguard Worker /// the corresponding mask bit is not set.
267*bed243d3SAndroid Build Coastguard Worker /// \param __U
268*bed243d3SAndroid Build Coastguard Worker /// A 16-bit mask.
269*bed243d3SAndroid Build Coastguard Worker /// \param __A
270*bed243d3SAndroid Build Coastguard Worker /// A 256-bit vector of [16 x bfloat].
271*bed243d3SAndroid Build Coastguard Worker /// \returns A 512-bit vector of [16 x float] come from conversion of __A
272*bed243d3SAndroid Build Coastguard Worker static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpbh_ps(__m512 __S,__mmask16 __U,__m256bh __A)273*bed243d3SAndroid Build Coastguard Worker _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
274*bed243d3SAndroid Build Coastguard Worker return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
275*bed243d3SAndroid Build Coastguard Worker (__m512i)__S, (__mmask16)__U,
276*bed243d3SAndroid Build Coastguard Worker (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
277*bed243d3SAndroid Build Coastguard Worker }
278*bed243d3SAndroid Build Coastguard Worker
279*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS
280*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS512
281*bed243d3SAndroid Build Coastguard Worker
282*bed243d3SAndroid Build Coastguard Worker #endif
283*bed243d3SAndroid Build Coastguard Worker #endif
284