1*bed243d3SAndroid Build Coastguard Worker /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
2*bed243d3SAndroid Build Coastguard Worker *
3*bed243d3SAndroid Build Coastguard Worker * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*bed243d3SAndroid Build Coastguard Worker * See https://llvm.org/LICENSE.txt for license information.
5*bed243d3SAndroid Build Coastguard Worker * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*bed243d3SAndroid Build Coastguard Worker *
7*bed243d3SAndroid Build Coastguard Worker *===------------------------------------------------------------------------===
8*bed243d3SAndroid Build Coastguard Worker */
9*bed243d3SAndroid Build Coastguard Worker
10*bed243d3SAndroid Build Coastguard Worker #ifndef __IMMINTRIN_H
11*bed243d3SAndroid Build Coastguard Worker #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
12*bed243d3SAndroid Build Coastguard Worker #endif /* __IMMINTRIN_H */
13*bed243d3SAndroid Build Coastguard Worker
14*bed243d3SAndroid Build Coastguard Worker #ifndef __AMXINTRIN_H
15*bed243d3SAndroid Build Coastguard Worker #define __AMXINTRIN_H
16*bed243d3SAndroid Build Coastguard Worker #ifdef __x86_64__
17*bed243d3SAndroid Build Coastguard Worker
18*bed243d3SAndroid Build Coastguard Worker /* Define the default attributes for the functions in this file. */
19*bed243d3SAndroid Build Coastguard Worker #define __DEFAULT_FN_ATTRS_TILE \
20*bed243d3SAndroid Build Coastguard Worker __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
21*bed243d3SAndroid Build Coastguard Worker #define __DEFAULT_FN_ATTRS_INT8 \
22*bed243d3SAndroid Build Coastguard Worker __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
23*bed243d3SAndroid Build Coastguard Worker #define __DEFAULT_FN_ATTRS_BF16 \
24*bed243d3SAndroid Build Coastguard Worker __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
25*bed243d3SAndroid Build Coastguard Worker #define __DEFAULT_FN_ATTRS_FP16 \
26*bed243d3SAndroid Build Coastguard Worker __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
27*bed243d3SAndroid Build Coastguard Worker
28*bed243d3SAndroid Build Coastguard Worker /// Load tile configuration from a 64-byte memory location specified by
29*bed243d3SAndroid Build Coastguard Worker /// "mem_addr". The tile configuration includes the tile type palette, the
30*bed243d3SAndroid Build Coastguard Worker /// number of bytes per row, and the number of rows. If the specified
31*bed243d3SAndroid Build Coastguard Worker /// palette_id is zero, that signifies the init state for both the tile
32*bed243d3SAndroid Build Coastguard Worker /// config and the tile data, and the tiles are zeroed. Any invalid
33*bed243d3SAndroid Build Coastguard Worker /// configurations will result in #GP fault.
34*bed243d3SAndroid Build Coastguard Worker ///
35*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
36*bed243d3SAndroid Build Coastguard Worker ///
37*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
38*bed243d3SAndroid Build Coastguard Worker ///
39*bed243d3SAndroid Build Coastguard Worker /// \param __config
40*bed243d3SAndroid Build Coastguard Worker /// A pointer to 512-bits configuration
41*bed243d3SAndroid Build Coastguard Worker static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void * __config)42*bed243d3SAndroid Build Coastguard Worker _tile_loadconfig(const void *__config) {
43*bed243d3SAndroid Build Coastguard Worker __builtin_ia32_tile_loadconfig(__config);
44*bed243d3SAndroid Build Coastguard Worker }
45*bed243d3SAndroid Build Coastguard Worker
46*bed243d3SAndroid Build Coastguard Worker /// Stores the current tile configuration to a 64-byte memory location
47*bed243d3SAndroid Build Coastguard Worker /// specified by "mem_addr". The tile configuration includes the tile type
48*bed243d3SAndroid Build Coastguard Worker /// palette, the number of bytes per row, and the number of rows. If tiles
49*bed243d3SAndroid Build Coastguard Worker /// are not configured, all zeroes will be stored to memory.
50*bed243d3SAndroid Build Coastguard Worker ///
51*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
52*bed243d3SAndroid Build Coastguard Worker ///
53*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
54*bed243d3SAndroid Build Coastguard Worker ///
55*bed243d3SAndroid Build Coastguard Worker /// \param __config
56*bed243d3SAndroid Build Coastguard Worker /// A pointer to 512-bits configuration
57*bed243d3SAndroid Build Coastguard Worker static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void * __config)58*bed243d3SAndroid Build Coastguard Worker _tile_storeconfig(void *__config) {
59*bed243d3SAndroid Build Coastguard Worker __builtin_ia32_tile_storeconfig(__config);
60*bed243d3SAndroid Build Coastguard Worker }
61*bed243d3SAndroid Build Coastguard Worker
62*bed243d3SAndroid Build Coastguard Worker /// Release the tile configuration to return to the init state, which
63*bed243d3SAndroid Build Coastguard Worker /// releases all storage it currently holds.
64*bed243d3SAndroid Build Coastguard Worker ///
65*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
66*bed243d3SAndroid Build Coastguard Worker ///
67*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
_tile_release(void)68*bed243d3SAndroid Build Coastguard Worker static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
69*bed243d3SAndroid Build Coastguard Worker __builtin_ia32_tilerelease();
70*bed243d3SAndroid Build Coastguard Worker }
71*bed243d3SAndroid Build Coastguard Worker
72*bed243d3SAndroid Build Coastguard Worker /// Load tile rows from memory specifieid by "base" address and "stride" into
73*bed243d3SAndroid Build Coastguard Worker /// destination tile "dst" using the tile configuration previously configured
74*bed243d3SAndroid Build Coastguard Worker /// via "_tile_loadconfig".
75*bed243d3SAndroid Build Coastguard Worker ///
76*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
77*bed243d3SAndroid Build Coastguard Worker ///
78*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
79*bed243d3SAndroid Build Coastguard Worker ///
80*bed243d3SAndroid Build Coastguard Worker /// \param dst
81*bed243d3SAndroid Build Coastguard Worker /// A destination tile. Max size is 1024 Bytes.
82*bed243d3SAndroid Build Coastguard Worker /// \param base
83*bed243d3SAndroid Build Coastguard Worker /// A pointer to base address.
84*bed243d3SAndroid Build Coastguard Worker /// \param stride
85*bed243d3SAndroid Build Coastguard Worker /// The stride between the rows' data to be loaded in memory.
86*bed243d3SAndroid Build Coastguard Worker #define _tile_loadd(dst, base, stride) \
87*bed243d3SAndroid Build Coastguard Worker __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
88*bed243d3SAndroid Build Coastguard Worker (__SIZE_TYPE__)(stride))
89*bed243d3SAndroid Build Coastguard Worker
90*bed243d3SAndroid Build Coastguard Worker /// Load tile rows from memory specifieid by "base" address and "stride" into
91*bed243d3SAndroid Build Coastguard Worker /// destination tile "dst" using the tile configuration previously configured
92*bed243d3SAndroid Build Coastguard Worker /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
93*bed243d3SAndroid Build Coastguard Worker /// that the data will likely not be reused in the near future and the data
94*bed243d3SAndroid Build Coastguard Worker /// caching can be optimized accordingly.
95*bed243d3SAndroid Build Coastguard Worker ///
96*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
97*bed243d3SAndroid Build Coastguard Worker ///
98*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
99*bed243d3SAndroid Build Coastguard Worker ///
100*bed243d3SAndroid Build Coastguard Worker /// \param dst
101*bed243d3SAndroid Build Coastguard Worker /// A destination tile. Max size is 1024 Bytes.
102*bed243d3SAndroid Build Coastguard Worker /// \param base
103*bed243d3SAndroid Build Coastguard Worker /// A pointer to base address.
104*bed243d3SAndroid Build Coastguard Worker /// \param stride
105*bed243d3SAndroid Build Coastguard Worker /// The stride between the rows' data to be loaded in memory.
106*bed243d3SAndroid Build Coastguard Worker #define _tile_stream_loadd(dst, base, stride) \
107*bed243d3SAndroid Build Coastguard Worker __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
108*bed243d3SAndroid Build Coastguard Worker (__SIZE_TYPE__)(stride))
109*bed243d3SAndroid Build Coastguard Worker
110*bed243d3SAndroid Build Coastguard Worker /// Store the tile specified by "src" to memory specifieid by "base" address and
111*bed243d3SAndroid Build Coastguard Worker /// "stride" using the tile configuration previously configured via
112*bed243d3SAndroid Build Coastguard Worker /// "_tile_loadconfig".
113*bed243d3SAndroid Build Coastguard Worker ///
114*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
115*bed243d3SAndroid Build Coastguard Worker ///
116*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
117*bed243d3SAndroid Build Coastguard Worker ///
118*bed243d3SAndroid Build Coastguard Worker /// \param dst
119*bed243d3SAndroid Build Coastguard Worker /// A destination tile. Max size is 1024 Bytes.
120*bed243d3SAndroid Build Coastguard Worker /// \param base
121*bed243d3SAndroid Build Coastguard Worker /// A pointer to base address.
122*bed243d3SAndroid Build Coastguard Worker /// \param stride
123*bed243d3SAndroid Build Coastguard Worker /// The stride between the rows' data to be stored in memory.
124*bed243d3SAndroid Build Coastguard Worker #define _tile_stored(dst, base, stride) \
125*bed243d3SAndroid Build Coastguard Worker __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
126*bed243d3SAndroid Build Coastguard Worker
127*bed243d3SAndroid Build Coastguard Worker /// Zero the tile specified by "tdest".
128*bed243d3SAndroid Build Coastguard Worker ///
129*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
130*bed243d3SAndroid Build Coastguard Worker ///
131*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
132*bed243d3SAndroid Build Coastguard Worker ///
133*bed243d3SAndroid Build Coastguard Worker /// \param tile
134*bed243d3SAndroid Build Coastguard Worker /// The destination tile to be zero. Max size is 1024 Bytes.
135*bed243d3SAndroid Build Coastguard Worker #define _tile_zero(tile) __builtin_ia32_tilezero((tile))
136*bed243d3SAndroid Build Coastguard Worker
137*bed243d3SAndroid Build Coastguard Worker /// Compute dot-product of bytes in tiles with a source/destination accumulator.
138*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
139*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
140*bed243d3SAndroid Build Coastguard Worker /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
141*bed243d3SAndroid Build Coastguard Worker /// and store the 32-bit result back to tile "dst".
142*bed243d3SAndroid Build Coastguard Worker ///
143*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
144*bed243d3SAndroid Build Coastguard Worker ///
145*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
146*bed243d3SAndroid Build Coastguard Worker ///
147*bed243d3SAndroid Build Coastguard Worker /// \param dst
148*bed243d3SAndroid Build Coastguard Worker /// The destination tile. Max size is 1024 Bytes.
149*bed243d3SAndroid Build Coastguard Worker /// \param src0
150*bed243d3SAndroid Build Coastguard Worker /// The 1st source tile. Max size is 1024 Bytes.
151*bed243d3SAndroid Build Coastguard Worker /// \param src1
152*bed243d3SAndroid Build Coastguard Worker /// The 2nd source tile. Max size is 1024 Bytes.
153*bed243d3SAndroid Build Coastguard Worker #define _tile_dpbssd(dst, src0, src1) \
154*bed243d3SAndroid Build Coastguard Worker __builtin_ia32_tdpbssd((dst), (src0), (src1))
155*bed243d3SAndroid Build Coastguard Worker
156*bed243d3SAndroid Build Coastguard Worker /// Compute dot-product of bytes in tiles with a source/destination accumulator.
157*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
158*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
159*bed243d3SAndroid Build Coastguard Worker /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
160*bed243d3SAndroid Build Coastguard Worker /// in "dst", and store the 32-bit result back to tile "dst".
161*bed243d3SAndroid Build Coastguard Worker ///
162*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
163*bed243d3SAndroid Build Coastguard Worker ///
164*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
165*bed243d3SAndroid Build Coastguard Worker ///
166*bed243d3SAndroid Build Coastguard Worker /// \param dst
167*bed243d3SAndroid Build Coastguard Worker /// The destination tile. Max size is 1024 Bytes.
168*bed243d3SAndroid Build Coastguard Worker /// \param src0
169*bed243d3SAndroid Build Coastguard Worker /// The 1st source tile. Max size is 1024 Bytes.
170*bed243d3SAndroid Build Coastguard Worker /// \param src1
171*bed243d3SAndroid Build Coastguard Worker /// The 2nd source tile. Max size is 1024 Bytes.
172*bed243d3SAndroid Build Coastguard Worker #define _tile_dpbsud(dst, src0, src1) \
173*bed243d3SAndroid Build Coastguard Worker __builtin_ia32_tdpbsud((dst), (src0), (src1))
174*bed243d3SAndroid Build Coastguard Worker
175*bed243d3SAndroid Build Coastguard Worker /// Compute dot-product of bytes in tiles with a source/destination accumulator.
176*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
177*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
178*bed243d3SAndroid Build Coastguard Worker /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
179*bed243d3SAndroid Build Coastguard Worker /// and store the 32-bit result back to tile "dst".
180*bed243d3SAndroid Build Coastguard Worker ///
181*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
182*bed243d3SAndroid Build Coastguard Worker ///
183*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
184*bed243d3SAndroid Build Coastguard Worker ///
185*bed243d3SAndroid Build Coastguard Worker /// \param dst
186*bed243d3SAndroid Build Coastguard Worker /// The destination tile. Max size is 1024 Bytes.
187*bed243d3SAndroid Build Coastguard Worker /// \param src0
188*bed243d3SAndroid Build Coastguard Worker /// The 1st source tile. Max size is 1024 Bytes.
189*bed243d3SAndroid Build Coastguard Worker /// \param src1
190*bed243d3SAndroid Build Coastguard Worker /// The 2nd source tile. Max size is 1024 Bytes.
191*bed243d3SAndroid Build Coastguard Worker #define _tile_dpbusd(dst, src0, src1) \
192*bed243d3SAndroid Build Coastguard Worker __builtin_ia32_tdpbusd((dst), (src0), (src1))
193*bed243d3SAndroid Build Coastguard Worker
194*bed243d3SAndroid Build Coastguard Worker /// Compute dot-product of bytes in tiles with a source/destination accumulator.
195*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
196*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
197*bed243d3SAndroid Build Coastguard Worker /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
198*bed243d3SAndroid Build Coastguard Worker /// "dst", and store the 32-bit result back to tile "dst".
199*bed243d3SAndroid Build Coastguard Worker ///
200*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
201*bed243d3SAndroid Build Coastguard Worker ///
202*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
203*bed243d3SAndroid Build Coastguard Worker ///
204*bed243d3SAndroid Build Coastguard Worker /// \param dst
205*bed243d3SAndroid Build Coastguard Worker /// The destination tile. Max size is 1024 Bytes.
206*bed243d3SAndroid Build Coastguard Worker /// \param src0
207*bed243d3SAndroid Build Coastguard Worker /// The 1st source tile. Max size is 1024 Bytes.
208*bed243d3SAndroid Build Coastguard Worker /// \param src1
209*bed243d3SAndroid Build Coastguard Worker /// The 2nd source tile. Max size is 1024 Bytes.
210*bed243d3SAndroid Build Coastguard Worker #define _tile_dpbuud(dst, src0, src1) \
211*bed243d3SAndroid Build Coastguard Worker __builtin_ia32_tdpbuud((dst), (src0), (src1))
212*bed243d3SAndroid Build Coastguard Worker
213*bed243d3SAndroid Build Coastguard Worker /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
214*bed243d3SAndroid Build Coastguard Worker /// src1, accumulating the intermediate single-precision (32-bit) floating-point
215*bed243d3SAndroid Build Coastguard Worker /// elements with elements in "dst", and store the 32-bit result back to tile
216*bed243d3SAndroid Build Coastguard Worker /// "dst".
217*bed243d3SAndroid Build Coastguard Worker ///
218*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
219*bed243d3SAndroid Build Coastguard Worker ///
220*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
221*bed243d3SAndroid Build Coastguard Worker ///
222*bed243d3SAndroid Build Coastguard Worker /// \param dst
223*bed243d3SAndroid Build Coastguard Worker /// The destination tile. Max size is 1024 Bytes.
224*bed243d3SAndroid Build Coastguard Worker /// \param src0
225*bed243d3SAndroid Build Coastguard Worker /// The 1st source tile. Max size is 1024 Bytes.
226*bed243d3SAndroid Build Coastguard Worker /// \param src1
227*bed243d3SAndroid Build Coastguard Worker /// The 2nd source tile. Max size is 1024 Bytes.
228*bed243d3SAndroid Build Coastguard Worker #define _tile_dpbf16ps(dst, src0, src1) \
229*bed243d3SAndroid Build Coastguard Worker __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
230*bed243d3SAndroid Build Coastguard Worker
231*bed243d3SAndroid Build Coastguard Worker /// AMX tile register size can be configured, the maximum size is 16x64=1024
232*bed243d3SAndroid Build Coastguard Worker /// bytes. Since there is no 2D type in llvm IR, we use vector type to
233*bed243d3SAndroid Build Coastguard Worker /// represent 2D tile and the fixed size is maximum amx tile register size.
234*bed243d3SAndroid Build Coastguard Worker typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
235*bed243d3SAndroid Build Coastguard Worker
236*bed243d3SAndroid Build Coastguard Worker /// This is internal intrinsic. C/C++ user should avoid calling it directly.
237*bed243d3SAndroid Build Coastguard Worker static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m,unsigned short n,const void * base,__SIZE_TYPE__ stride)238*bed243d3SAndroid Build Coastguard Worker _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
239*bed243d3SAndroid Build Coastguard Worker __SIZE_TYPE__ stride) {
240*bed243d3SAndroid Build Coastguard Worker return __builtin_ia32_tileloadd64_internal(m, n, base,
241*bed243d3SAndroid Build Coastguard Worker (__SIZE_TYPE__)(stride));
242*bed243d3SAndroid Build Coastguard Worker }
243*bed243d3SAndroid Build Coastguard Worker
244*bed243d3SAndroid Build Coastguard Worker /// This is internal intrinsic. C/C++ user should avoid calling it directly.
245*bed243d3SAndroid Build Coastguard Worker static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loaddt1_internal(unsigned short m,unsigned short n,const void * base,__SIZE_TYPE__ stride)246*bed243d3SAndroid Build Coastguard Worker _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
247*bed243d3SAndroid Build Coastguard Worker __SIZE_TYPE__ stride) {
248*bed243d3SAndroid Build Coastguard Worker return __builtin_ia32_tileloaddt164_internal(m, n, base,
249*bed243d3SAndroid Build Coastguard Worker (__SIZE_TYPE__)(stride));
250*bed243d3SAndroid Build Coastguard Worker }
251*bed243d3SAndroid Build Coastguard Worker
252*bed243d3SAndroid Build Coastguard Worker /// This is internal intrinsic. C/C++ user should avoid calling it directly.
253*bed243d3SAndroid Build Coastguard Worker static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)254*bed243d3SAndroid Build Coastguard Worker _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
255*bed243d3SAndroid Build Coastguard Worker _tile1024i dst, _tile1024i src1, _tile1024i src2) {
256*bed243d3SAndroid Build Coastguard Worker return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
257*bed243d3SAndroid Build Coastguard Worker }
258*bed243d3SAndroid Build Coastguard Worker
259*bed243d3SAndroid Build Coastguard Worker /// This is internal intrinsic. C/C++ user should avoid calling it directly.
260*bed243d3SAndroid Build Coastguard Worker static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbsud_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)261*bed243d3SAndroid Build Coastguard Worker _tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
262*bed243d3SAndroid Build Coastguard Worker _tile1024i dst, _tile1024i src1, _tile1024i src2) {
263*bed243d3SAndroid Build Coastguard Worker return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
264*bed243d3SAndroid Build Coastguard Worker }
265*bed243d3SAndroid Build Coastguard Worker
266*bed243d3SAndroid Build Coastguard Worker /// This is internal intrinsic. C/C++ user should avoid calling it directly.
267*bed243d3SAndroid Build Coastguard Worker static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbusd_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)268*bed243d3SAndroid Build Coastguard Worker _tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
269*bed243d3SAndroid Build Coastguard Worker _tile1024i dst, _tile1024i src1, _tile1024i src2) {
270*bed243d3SAndroid Build Coastguard Worker return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
271*bed243d3SAndroid Build Coastguard Worker }
272*bed243d3SAndroid Build Coastguard Worker
273*bed243d3SAndroid Build Coastguard Worker /// This is internal intrinsic. C/C++ user should avoid calling it directly.
274*bed243d3SAndroid Build Coastguard Worker static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbuud_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)275*bed243d3SAndroid Build Coastguard Worker _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
276*bed243d3SAndroid Build Coastguard Worker _tile1024i dst, _tile1024i src1, _tile1024i src2) {
277*bed243d3SAndroid Build Coastguard Worker return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
278*bed243d3SAndroid Build Coastguard Worker }
279*bed243d3SAndroid Build Coastguard Worker
280*bed243d3SAndroid Build Coastguard Worker /// This is internal intrinsic. C/C++ user should avoid calling it directly.
281*bed243d3SAndroid Build Coastguard Worker static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m,unsigned short n,void * base,__SIZE_TYPE__ stride,_tile1024i tile)282*bed243d3SAndroid Build Coastguard Worker _tile_stored_internal(unsigned short m, unsigned short n, void *base,
283*bed243d3SAndroid Build Coastguard Worker __SIZE_TYPE__ stride, _tile1024i tile) {
284*bed243d3SAndroid Build Coastguard Worker return __builtin_ia32_tilestored64_internal(m, n, base,
285*bed243d3SAndroid Build Coastguard Worker (__SIZE_TYPE__)(stride), tile);
286*bed243d3SAndroid Build Coastguard Worker }
287*bed243d3SAndroid Build Coastguard Worker
288*bed243d3SAndroid Build Coastguard Worker /// This is internal intrinsic. C/C++ user should avoid calling it directly.
289*bed243d3SAndroid Build Coastguard Worker static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
_tile_dpbf16ps_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)290*bed243d3SAndroid Build Coastguard Worker _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
291*bed243d3SAndroid Build Coastguard Worker _tile1024i dst, _tile1024i src1, _tile1024i src2) {
292*bed243d3SAndroid Build Coastguard Worker return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
293*bed243d3SAndroid Build Coastguard Worker }
294*bed243d3SAndroid Build Coastguard Worker
295*bed243d3SAndroid Build Coastguard Worker /// This is internal intrinsic. C/C++ user should avoid calling it directly.
296*bed243d3SAndroid Build Coastguard Worker static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
_tile_dpfp16ps_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)297*bed243d3SAndroid Build Coastguard Worker _tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
298*bed243d3SAndroid Build Coastguard Worker _tile1024i dst, _tile1024i src1, _tile1024i src2) {
299*bed243d3SAndroid Build Coastguard Worker return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
300*bed243d3SAndroid Build Coastguard Worker }
301*bed243d3SAndroid Build Coastguard Worker
302*bed243d3SAndroid Build Coastguard Worker /// This struct pack the shape and tile data together for user. We suggest
303*bed243d3SAndroid Build Coastguard Worker /// initializing the struct as early as possible, because compiler depends
304*bed243d3SAndroid Build Coastguard Worker /// on the shape information to do configure. The constant value is preferred
305*bed243d3SAndroid Build Coastguard Worker /// for optimization by compiler.
306*bed243d3SAndroid Build Coastguard Worker typedef struct __tile1024i_str {
307*bed243d3SAndroid Build Coastguard Worker const unsigned short row;
308*bed243d3SAndroid Build Coastguard Worker const unsigned short col;
309*bed243d3SAndroid Build Coastguard Worker _tile1024i tile;
310*bed243d3SAndroid Build Coastguard Worker } __tile1024i;
311*bed243d3SAndroid Build Coastguard Worker
312*bed243d3SAndroid Build Coastguard Worker /// Load tile rows from memory specifieid by "base" address and "stride" into
313*bed243d3SAndroid Build Coastguard Worker /// destination tile "dst".
314*bed243d3SAndroid Build Coastguard Worker ///
315*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
316*bed243d3SAndroid Build Coastguard Worker ///
317*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
318*bed243d3SAndroid Build Coastguard Worker ///
319*bed243d3SAndroid Build Coastguard Worker /// \param dst
320*bed243d3SAndroid Build Coastguard Worker /// A destination tile. Max size is 1024 Bytes.
321*bed243d3SAndroid Build Coastguard Worker /// \param base
322*bed243d3SAndroid Build Coastguard Worker /// A pointer to base address.
323*bed243d3SAndroid Build Coastguard Worker /// \param stride
324*bed243d3SAndroid Build Coastguard Worker /// The stride between the rows' data to be loaded in memory.
325*bed243d3SAndroid Build Coastguard Worker __DEFAULT_FN_ATTRS_TILE
__tile_loadd(__tile1024i * dst,const void * base,__SIZE_TYPE__ stride)326*bed243d3SAndroid Build Coastguard Worker static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
327*bed243d3SAndroid Build Coastguard Worker __SIZE_TYPE__ stride) {
328*bed243d3SAndroid Build Coastguard Worker dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
329*bed243d3SAndroid Build Coastguard Worker }
330*bed243d3SAndroid Build Coastguard Worker
331*bed243d3SAndroid Build Coastguard Worker /// Load tile rows from memory specifieid by "base" address and "stride" into
332*bed243d3SAndroid Build Coastguard Worker /// destination tile "dst". This intrinsic provides a hint to the implementation
333*bed243d3SAndroid Build Coastguard Worker /// that the data will likely not be reused in the near future and the data
334*bed243d3SAndroid Build Coastguard Worker /// caching can be optimized accordingly.
335*bed243d3SAndroid Build Coastguard Worker ///
336*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
337*bed243d3SAndroid Build Coastguard Worker ///
338*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
339*bed243d3SAndroid Build Coastguard Worker ///
340*bed243d3SAndroid Build Coastguard Worker /// \param dst
341*bed243d3SAndroid Build Coastguard Worker /// A destination tile. Max size is 1024 Bytes.
342*bed243d3SAndroid Build Coastguard Worker /// \param base
343*bed243d3SAndroid Build Coastguard Worker /// A pointer to base address.
344*bed243d3SAndroid Build Coastguard Worker /// \param stride
345*bed243d3SAndroid Build Coastguard Worker /// The stride between the rows' data to be loaded in memory.
346*bed243d3SAndroid Build Coastguard Worker __DEFAULT_FN_ATTRS_TILE
__tile_stream_loadd(__tile1024i * dst,const void * base,__SIZE_TYPE__ stride)347*bed243d3SAndroid Build Coastguard Worker static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
348*bed243d3SAndroid Build Coastguard Worker __SIZE_TYPE__ stride) {
349*bed243d3SAndroid Build Coastguard Worker dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
350*bed243d3SAndroid Build Coastguard Worker }
351*bed243d3SAndroid Build Coastguard Worker
352*bed243d3SAndroid Build Coastguard Worker /// Compute dot-product of bytes in tiles with a source/destination accumulator.
353*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
354*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
355*bed243d3SAndroid Build Coastguard Worker /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
356*bed243d3SAndroid Build Coastguard Worker /// and store the 32-bit result back to tile "dst".
357*bed243d3SAndroid Build Coastguard Worker ///
358*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
359*bed243d3SAndroid Build Coastguard Worker ///
360*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
361*bed243d3SAndroid Build Coastguard Worker ///
362*bed243d3SAndroid Build Coastguard Worker /// \param dst
363*bed243d3SAndroid Build Coastguard Worker /// The destination tile. Max size is 1024 Bytes.
364*bed243d3SAndroid Build Coastguard Worker /// \param src0
365*bed243d3SAndroid Build Coastguard Worker /// The 1st source tile. Max size is 1024 Bytes.
366*bed243d3SAndroid Build Coastguard Worker /// \param src1
367*bed243d3SAndroid Build Coastguard Worker /// The 2nd source tile. Max size is 1024 Bytes.
368*bed243d3SAndroid Build Coastguard Worker __DEFAULT_FN_ATTRS_INT8
__tile_dpbssd(__tile1024i * dst,__tile1024i src0,__tile1024i src1)369*bed243d3SAndroid Build Coastguard Worker static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
370*bed243d3SAndroid Build Coastguard Worker __tile1024i src1) {
371*bed243d3SAndroid Build Coastguard Worker dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
372*bed243d3SAndroid Build Coastguard Worker src0.tile, src1.tile);
373*bed243d3SAndroid Build Coastguard Worker }
374*bed243d3SAndroid Build Coastguard Worker
375*bed243d3SAndroid Build Coastguard Worker /// Compute dot-product of bytes in tiles with a source/destination accumulator.
376*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
377*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
378*bed243d3SAndroid Build Coastguard Worker /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
379*bed243d3SAndroid Build Coastguard Worker /// in "dst", and store the 32-bit result back to tile "dst".
380*bed243d3SAndroid Build Coastguard Worker ///
381*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
382*bed243d3SAndroid Build Coastguard Worker ///
383*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
384*bed243d3SAndroid Build Coastguard Worker ///
385*bed243d3SAndroid Build Coastguard Worker /// \param dst
386*bed243d3SAndroid Build Coastguard Worker /// The destination tile. Max size is 1024 Bytes.
387*bed243d3SAndroid Build Coastguard Worker /// \param src0
388*bed243d3SAndroid Build Coastguard Worker /// The 1st source tile. Max size is 1024 Bytes.
389*bed243d3SAndroid Build Coastguard Worker /// \param src1
390*bed243d3SAndroid Build Coastguard Worker /// The 2nd source tile. Max size is 1024 Bytes.
391*bed243d3SAndroid Build Coastguard Worker __DEFAULT_FN_ATTRS_INT8
__tile_dpbsud(__tile1024i * dst,__tile1024i src0,__tile1024i src1)392*bed243d3SAndroid Build Coastguard Worker static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
393*bed243d3SAndroid Build Coastguard Worker __tile1024i src1) {
394*bed243d3SAndroid Build Coastguard Worker dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
395*bed243d3SAndroid Build Coastguard Worker src0.tile, src1.tile);
396*bed243d3SAndroid Build Coastguard Worker }
397*bed243d3SAndroid Build Coastguard Worker
398*bed243d3SAndroid Build Coastguard Worker /// Compute dot-product of bytes in tiles with a source/destination accumulator.
399*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
400*bed243d3SAndroid Build Coastguard Worker /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
401*bed243d3SAndroid Build Coastguard Worker /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
402*bed243d3SAndroid Build Coastguard Worker /// and store the 32-bit result back to tile "dst".
403*bed243d3SAndroid Build Coastguard Worker ///
404*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
405*bed243d3SAndroid Build Coastguard Worker ///
406*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
407*bed243d3SAndroid Build Coastguard Worker ///
408*bed243d3SAndroid Build Coastguard Worker /// \param dst
409*bed243d3SAndroid Build Coastguard Worker /// The destination tile. Max size is 1024 Bytes.
410*bed243d3SAndroid Build Coastguard Worker /// \param src0
411*bed243d3SAndroid Build Coastguard Worker /// The 1st source tile. Max size is 1024 Bytes.
412*bed243d3SAndroid Build Coastguard Worker /// \param src1
413*bed243d3SAndroid Build Coastguard Worker /// The 2nd source tile. Max size is 1024 Bytes.
414*bed243d3SAndroid Build Coastguard Worker __DEFAULT_FN_ATTRS_INT8
__tile_dpbusd(__tile1024i * dst,__tile1024i src0,__tile1024i src1)415*bed243d3SAndroid Build Coastguard Worker static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
416*bed243d3SAndroid Build Coastguard Worker __tile1024i src1) {
417*bed243d3SAndroid Build Coastguard Worker dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
418*bed243d3SAndroid Build Coastguard Worker src0.tile, src1.tile);
419*bed243d3SAndroid Build Coastguard Worker }
420*bed243d3SAndroid Build Coastguard Worker
421*bed243d3SAndroid Build Coastguard Worker /// Compute dot-product of bytes in tiles with a source/destination accumulator.
422*bed243d3SAndroid Build Coastguard Worker /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
423*bed243d3SAndroid Build Coastguard Worker /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
424*bed243d3SAndroid Build Coastguard Worker /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
425*bed243d3SAndroid Build Coastguard Worker /// "dst", and store the 32-bit result back to tile "dst".
426*bed243d3SAndroid Build Coastguard Worker ///
427*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
428*bed243d3SAndroid Build Coastguard Worker ///
429*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
430*bed243d3SAndroid Build Coastguard Worker ///
431*bed243d3SAndroid Build Coastguard Worker /// \param dst
432*bed243d3SAndroid Build Coastguard Worker /// The destination tile. Max size is 1024 Bytes.
433*bed243d3SAndroid Build Coastguard Worker /// \param src0
434*bed243d3SAndroid Build Coastguard Worker /// The 1st source tile. Max size is 1024 Bytes.
435*bed243d3SAndroid Build Coastguard Worker /// \param src1
436*bed243d3SAndroid Build Coastguard Worker /// The 2nd source tile. Max size is 1024 Bytes.
437*bed243d3SAndroid Build Coastguard Worker __DEFAULT_FN_ATTRS_INT8
__tile_dpbuud(__tile1024i * dst,__tile1024i src0,__tile1024i src1)438*bed243d3SAndroid Build Coastguard Worker static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
439*bed243d3SAndroid Build Coastguard Worker __tile1024i src1) {
440*bed243d3SAndroid Build Coastguard Worker dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
441*bed243d3SAndroid Build Coastguard Worker src0.tile, src1.tile);
442*bed243d3SAndroid Build Coastguard Worker }
443*bed243d3SAndroid Build Coastguard Worker
444*bed243d3SAndroid Build Coastguard Worker /// Store the tile specified by "src" to memory specifieid by "base" address and
445*bed243d3SAndroid Build Coastguard Worker /// "stride".
446*bed243d3SAndroid Build Coastguard Worker ///
447*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
448*bed243d3SAndroid Build Coastguard Worker ///
449*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
450*bed243d3SAndroid Build Coastguard Worker ///
451*bed243d3SAndroid Build Coastguard Worker /// \param base
452*bed243d3SAndroid Build Coastguard Worker /// A pointer to base address.
453*bed243d3SAndroid Build Coastguard Worker /// \param stride
454*bed243d3SAndroid Build Coastguard Worker /// The stride between the rows' data to be stored in memory.
455*bed243d3SAndroid Build Coastguard Worker __DEFAULT_FN_ATTRS_TILE
__tile_stored(void * base,__SIZE_TYPE__ stride,__tile1024i src)456*bed243d3SAndroid Build Coastguard Worker static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
457*bed243d3SAndroid Build Coastguard Worker __tile1024i src) {
458*bed243d3SAndroid Build Coastguard Worker _tile_stored_internal(src.row, src.col, base, stride, src.tile);
459*bed243d3SAndroid Build Coastguard Worker }
460*bed243d3SAndroid Build Coastguard Worker
461*bed243d3SAndroid Build Coastguard Worker /// Zero the tile specified by "dst".
462*bed243d3SAndroid Build Coastguard Worker ///
463*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
464*bed243d3SAndroid Build Coastguard Worker ///
465*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
466*bed243d3SAndroid Build Coastguard Worker ///
467*bed243d3SAndroid Build Coastguard Worker /// \param dst
468*bed243d3SAndroid Build Coastguard Worker /// The destination tile to be zero. Max size is 1024 Bytes.
469*bed243d3SAndroid Build Coastguard Worker __DEFAULT_FN_ATTRS_TILE
__tile_zero(__tile1024i * dst)470*bed243d3SAndroid Build Coastguard Worker static __inline__ void __tile_zero(__tile1024i *dst) {
471*bed243d3SAndroid Build Coastguard Worker dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
472*bed243d3SAndroid Build Coastguard Worker }
473*bed243d3SAndroid Build Coastguard Worker
474*bed243d3SAndroid Build Coastguard Worker /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
475*bed243d3SAndroid Build Coastguard Worker /// src1, accumulating the intermediate single-precision (32-bit) floating-point
476*bed243d3SAndroid Build Coastguard Worker /// elements with elements in "dst", and store the 32-bit result back to tile
477*bed243d3SAndroid Build Coastguard Worker /// "dst".
478*bed243d3SAndroid Build Coastguard Worker ///
479*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
480*bed243d3SAndroid Build Coastguard Worker ///
481*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
482*bed243d3SAndroid Build Coastguard Worker ///
483*bed243d3SAndroid Build Coastguard Worker /// \param dst
484*bed243d3SAndroid Build Coastguard Worker /// The destination tile. Max size is 1024 Bytes.
485*bed243d3SAndroid Build Coastguard Worker /// \param src0
486*bed243d3SAndroid Build Coastguard Worker /// The 1st source tile. Max size is 1024 Bytes.
487*bed243d3SAndroid Build Coastguard Worker /// \param src1
488*bed243d3SAndroid Build Coastguard Worker /// The 2nd source tile. Max size is 1024 Bytes.
489*bed243d3SAndroid Build Coastguard Worker __DEFAULT_FN_ATTRS_BF16
__tile_dpbf16ps(__tile1024i * dst,__tile1024i src0,__tile1024i src1)490*bed243d3SAndroid Build Coastguard Worker static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
491*bed243d3SAndroid Build Coastguard Worker __tile1024i src1) {
492*bed243d3SAndroid Build Coastguard Worker dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
493*bed243d3SAndroid Build Coastguard Worker src0.tile, src1.tile);
494*bed243d3SAndroid Build Coastguard Worker }
495*bed243d3SAndroid Build Coastguard Worker
496*bed243d3SAndroid Build Coastguard Worker /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
497*bed243d3SAndroid Build Coastguard Worker /// src1, accumulating the intermediate single-precision (32-bit) floating-point
498*bed243d3SAndroid Build Coastguard Worker /// elements with elements in "dst", and store the 32-bit result back to tile
499*bed243d3SAndroid Build Coastguard Worker /// "dst".
500*bed243d3SAndroid Build Coastguard Worker ///
501*bed243d3SAndroid Build Coastguard Worker /// \headerfile <immintrin.h>
502*bed243d3SAndroid Build Coastguard Worker ///
503*bed243d3SAndroid Build Coastguard Worker /// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
504*bed243d3SAndroid Build Coastguard Worker ///
505*bed243d3SAndroid Build Coastguard Worker /// \param dst
506*bed243d3SAndroid Build Coastguard Worker /// The destination tile. Max size is 1024 Bytes.
507*bed243d3SAndroid Build Coastguard Worker /// \param src0
508*bed243d3SAndroid Build Coastguard Worker /// The 1st source tile. Max size is 1024 Bytes.
509*bed243d3SAndroid Build Coastguard Worker /// \param src1
510*bed243d3SAndroid Build Coastguard Worker /// The 2nd source tile. Max size is 1024 Bytes.
511*bed243d3SAndroid Build Coastguard Worker __DEFAULT_FN_ATTRS_FP16
__tile_dpfp16ps(__tile1024i * dst,__tile1024i src0,__tile1024i src1)512*bed243d3SAndroid Build Coastguard Worker static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
513*bed243d3SAndroid Build Coastguard Worker __tile1024i src1) {
514*bed243d3SAndroid Build Coastguard Worker dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
515*bed243d3SAndroid Build Coastguard Worker src0.tile, src1.tile);
516*bed243d3SAndroid Build Coastguard Worker }
517*bed243d3SAndroid Build Coastguard Worker
518*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS_TILE
519*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS_INT8
520*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS_BF16
521*bed243d3SAndroid Build Coastguard Worker #undef __DEFAULT_FN_ATTRS_FP16
522*bed243d3SAndroid Build Coastguard Worker
523*bed243d3SAndroid Build Coastguard Worker #endif /* __x86_64__ */
524*bed243d3SAndroid Build Coastguard Worker #endif /* __AMXINTRIN_H */
525