/* xref: /aosp_15_r20/external/clang/lib/Headers/pmmintrin.h (revision 67e74705e28f6214e480b399dd47ea732279e315) */
/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H

#include <emmintrin.h>

/* Define the default attributes for the functions in this file.
 *
 * Every intrinsic below is declared with this macro, which expands to:
 *   __always_inline__  - request inlining at every call site,
 *   __nodebug__        - emit no debug information for the wrapper itself,
 *   __target__("sse3") - compile the wrapper with the "sse3" target feature.
 *
 * The macro is private to this header and is #undef'd at the end of the file.
 */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse3")))

33*67e74705SXin Li /// \brief Loads data from an unaligned memory location to elements in a 128-bit
34*67e74705SXin Li ///    vector. If the address of the data is not 16-byte aligned, the
35*67e74705SXin Li ///    instruction may read two adjacent aligned blocks of memory to retrieve
36*67e74705SXin Li ///    the requested data.
37*67e74705SXin Li ///
38*67e74705SXin Li /// \headerfile <x86intrin.h>
39*67e74705SXin Li ///
40*67e74705SXin Li /// This intrinsic corresponds to the \c VLDDQU instruction.
41*67e74705SXin Li ///
42*67e74705SXin Li /// \param __p
43*67e74705SXin Li ///    A pointer to a 128-bit integer vector containing integer values.
44*67e74705SXin Li /// \returns A 128-bit vector containing the moved values.
45*67e74705SXin Li static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lddqu_si128(__m128i const * __p)46*67e74705SXin Li _mm_lddqu_si128(__m128i const *__p)
47*67e74705SXin Li {
48*67e74705SXin Li   return (__m128i)__builtin_ia32_lddqu((char const *)__p);
49*67e74705SXin Li }
50*67e74705SXin Li 
51*67e74705SXin Li /// \brief Adds the even-indexed values and subtracts the odd-indexed values of
52*67e74705SXin Li ///    two 128-bit vectors of [4 x float].
53*67e74705SXin Li ///
54*67e74705SXin Li /// \headerfile <x86intrin.h>
55*67e74705SXin Li ///
56*67e74705SXin Li /// This intrinsic corresponds to the \c VADDSUBPS instruction.
57*67e74705SXin Li ///
58*67e74705SXin Li /// \param __a
59*67e74705SXin Li ///    A 128-bit vector of [4 x float] containing the left source operand.
60*67e74705SXin Li /// \param __b
61*67e74705SXin Li ///    A 128-bit vector of [4 x float] containing the right source operand.
62*67e74705SXin Li /// \returns A 128-bit vector of [4 x float] containing the alternating sums and
63*67e74705SXin Li ///    differences of both operands.
64*67e74705SXin Li static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_addsub_ps(__m128 __a,__m128 __b)65*67e74705SXin Li _mm_addsub_ps(__m128 __a, __m128 __b)
66*67e74705SXin Li {
67*67e74705SXin Li   return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
68*67e74705SXin Li }
69*67e74705SXin Li 
70*67e74705SXin Li /// \brief Horizontally adds the adjacent pairs of values contained in two
71*67e74705SXin Li ///    128-bit vectors of [4 x float].
72*67e74705SXin Li ///
73*67e74705SXin Li /// \headerfile <x86intrin.h>
74*67e74705SXin Li ///
75*67e74705SXin Li /// This intrinsic corresponds to the \c VHADDPS instruction.
76*67e74705SXin Li ///
77*67e74705SXin Li /// \param __a
78*67e74705SXin Li ///    A 128-bit vector of [4 x float] containing one of the source operands.
79*67e74705SXin Li ///    The horizontal sums of the values are stored in the lower bits of the
80*67e74705SXin Li ///    destination.
81*67e74705SXin Li /// \param __b
82*67e74705SXin Li ///    A 128-bit vector of [4 x float] containing one of the source operands.
83*67e74705SXin Li ///    The horizontal sums of the values are stored in the upper bits of the
84*67e74705SXin Li ///    destination.
85*67e74705SXin Li /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
86*67e74705SXin Li ///    both operands.
87*67e74705SXin Li static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hadd_ps(__m128 __a,__m128 __b)88*67e74705SXin Li _mm_hadd_ps(__m128 __a, __m128 __b)
89*67e74705SXin Li {
90*67e74705SXin Li   return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
91*67e74705SXin Li }
92*67e74705SXin Li 
93*67e74705SXin Li /// \brief Horizontally subtracts the adjacent pairs of values contained in two
94*67e74705SXin Li ///    128-bit vectors of [4 x float].
95*67e74705SXin Li ///
96*67e74705SXin Li /// \headerfile <x86intrin.h>
97*67e74705SXin Li ///
98*67e74705SXin Li /// This intrinsic corresponds to the \c VHSUBPS instruction.
99*67e74705SXin Li ///
100*67e74705SXin Li /// \param __a
101*67e74705SXin Li ///    A 128-bit vector of [4 x float] containing one of the source operands.
102*67e74705SXin Li ///    The horizontal differences between the values are stored in the lower
103*67e74705SXin Li ///    bits of the destination.
104*67e74705SXin Li /// \param __b
105*67e74705SXin Li ///    A 128-bit vector of [4 x float] containing one of the source operands.
106*67e74705SXin Li ///    The horizontal differences between the values are stored in the upper
107*67e74705SXin Li ///    bits of the destination.
108*67e74705SXin Li /// \returns A 128-bit vector of [4 x float] containing the horizontal
109*67e74705SXin Li ///    differences of both operands.
110*67e74705SXin Li static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hsub_ps(__m128 __a,__m128 __b)111*67e74705SXin Li _mm_hsub_ps(__m128 __a, __m128 __b)
112*67e74705SXin Li {
113*67e74705SXin Li   return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
114*67e74705SXin Li }
115*67e74705SXin Li 
116*67e74705SXin Li /// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
117*67e74705SXin Li ///    vector of [4 x float] to float values stored in a 128-bit vector of
118*67e74705SXin Li ///    [4 x float].
119*67e74705SXin Li ///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
120*67e74705SXin Li ///    the destination.
121*67e74705SXin Li ///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
122*67e74705SXin Li ///    destination.
123*67e74705SXin Li ///
124*67e74705SXin Li /// \headerfile <x86intrin.h>
125*67e74705SXin Li ///
126*67e74705SXin Li /// This intrinsic corresponds to the \c VMOVSHDUP instruction.
127*67e74705SXin Li ///
128*67e74705SXin Li /// \param __a
129*67e74705SXin Li ///    A 128-bit vector of [4 x float].
130*67e74705SXin Li /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
131*67e74705SXin Li ///    values.
132*67e74705SXin Li static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehdup_ps(__m128 __a)133*67e74705SXin Li _mm_movehdup_ps(__m128 __a)
134*67e74705SXin Li {
135*67e74705SXin Li   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
136*67e74705SXin Li }
137*67e74705SXin Li 
138*67e74705SXin Li /// \brief Duplicates low-order (even-indexed) values from a 128-bit
139*67e74705SXin Li ///    vector of [4 x float] to float values stored in a 128-bit vector of
140*67e74705SXin Li ///    [4 x float].
141*67e74705SXin Li ///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
142*67e74705SXin Li ///    the destination.
143*67e74705SXin Li ///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
144*67e74705SXin Li ///    destination.
145*67e74705SXin Li ///
146*67e74705SXin Li /// \headerfile <x86intrin.h>
147*67e74705SXin Li ///
148*67e74705SXin Li /// This intrinsic corresponds to the \c VMOVSLDUP instruction.
149*67e74705SXin Li ///
150*67e74705SXin Li /// \param __a
151*67e74705SXin Li ///    A 128-bit vector of [4 x float].
152*67e74705SXin Li /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
153*67e74705SXin Li ///    values.
154*67e74705SXin Li static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_moveldup_ps(__m128 __a)155*67e74705SXin Li _mm_moveldup_ps(__m128 __a)
156*67e74705SXin Li {
157*67e74705SXin Li   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
158*67e74705SXin Li }
159*67e74705SXin Li 
160*67e74705SXin Li /// \brief Adds the even-indexed values and subtracts the odd-indexed values of
161*67e74705SXin Li ///    two 128-bit vectors of [2 x double].
162*67e74705SXin Li ///
163*67e74705SXin Li /// \headerfile <x86intrin.h>
164*67e74705SXin Li ///
165*67e74705SXin Li /// This intrinsic corresponds to the \c VADDSUBPD instruction.
166*67e74705SXin Li ///
167*67e74705SXin Li /// \param __a
168*67e74705SXin Li ///    A 128-bit vector of [2 x double] containing the left source operand.
169*67e74705SXin Li /// \param __b
170*67e74705SXin Li ///    A 128-bit vector of [2 x double] containing the right source operand.
171*67e74705SXin Li /// \returns A 128-bit vector of [2 x double] containing the alternating sums
172*67e74705SXin Li ///    and differences of both operands.
173*67e74705SXin Li static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_addsub_pd(__m128d __a,__m128d __b)174*67e74705SXin Li _mm_addsub_pd(__m128d __a, __m128d __b)
175*67e74705SXin Li {
176*67e74705SXin Li   return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
177*67e74705SXin Li }
178*67e74705SXin Li 
179*67e74705SXin Li /// \brief Horizontally adds the pairs of values contained in two 128-bit
180*67e74705SXin Li ///    vectors of [2 x double].
181*67e74705SXin Li ///
182*67e74705SXin Li /// \headerfile <x86intrin.h>
183*67e74705SXin Li ///
184*67e74705SXin Li /// This intrinsic corresponds to the \c VHADDPD instruction.
185*67e74705SXin Li ///
186*67e74705SXin Li /// \param __a
187*67e74705SXin Li ///    A 128-bit vector of [2 x double] containing one of the source operands.
188*67e74705SXin Li ///    The horizontal sum of the values is stored in the lower bits of the
189*67e74705SXin Li ///    destination.
190*67e74705SXin Li /// \param __b
191*67e74705SXin Li ///    A 128-bit vector of [2 x double] containing one of the source operands.
192*67e74705SXin Li ///    The horizontal sum of the values is stored in the upper bits of the
193*67e74705SXin Li ///    destination.
194*67e74705SXin Li /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
195*67e74705SXin Li ///    both operands.
196*67e74705SXin Li static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hadd_pd(__m128d __a,__m128d __b)197*67e74705SXin Li _mm_hadd_pd(__m128d __a, __m128d __b)
198*67e74705SXin Li {
199*67e74705SXin Li   return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
200*67e74705SXin Li }
201*67e74705SXin Li 
202*67e74705SXin Li /// \brief Horizontally subtracts the pairs of values contained in two 128-bit
203*67e74705SXin Li ///    vectors of [2 x double].
204*67e74705SXin Li ///
205*67e74705SXin Li /// \headerfile <x86intrin.h>
206*67e74705SXin Li ///
207*67e74705SXin Li /// This intrinsic corresponds to the \c VHSUBPD instruction.
208*67e74705SXin Li ///
209*67e74705SXin Li /// \param __a
210*67e74705SXin Li ///    A 128-bit vector of [2 x double] containing one of the source operands.
211*67e74705SXin Li ///    The horizontal difference of the values is stored in the lower bits of
212*67e74705SXin Li ///    the destination.
213*67e74705SXin Li /// \param __b
214*67e74705SXin Li ///    A 128-bit vector of [2 x double] containing one of the source operands.
215*67e74705SXin Li ///    The horizontal difference of the values is stored in the upper bits of
216*67e74705SXin Li ///    the destination.
217*67e74705SXin Li /// \returns A 128-bit vector of [2 x double] containing the horizontal
218*67e74705SXin Li ///    differences of both operands.
219*67e74705SXin Li static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hsub_pd(__m128d __a,__m128d __b)220*67e74705SXin Li _mm_hsub_pd(__m128d __a, __m128d __b)
221*67e74705SXin Li {
222*67e74705SXin Li   return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
223*67e74705SXin Li }
224*67e74705SXin Li 
/// \brief Moves and duplicates one double-precision value to double-precision
///    values stored in a 128-bit vector of [2 x double].
///
///    Implemented as a macro that forwards its argument unchanged to
///    _mm_load1_pd().
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const * dp);
/// \endcode
///
/// This intrinsic corresponds to the \c VMOVDDUP instruction.
///
/// \param dp
///    A pointer to a double-precision value to be moved and duplicated.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
#define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)

242*67e74705SXin Li /// \brief Moves and duplicates the double-precision value in the lower bits of
243*67e74705SXin Li ///    a 128-bit vector of [2 x double] to double-precision values stored in a
244*67e74705SXin Li ///    128-bit vector of [2 x double].
245*67e74705SXin Li ///
246*67e74705SXin Li /// \headerfile <x86intrin.h>
247*67e74705SXin Li ///
248*67e74705SXin Li /// This intrinsic corresponds to the \c VMOVDDUP instruction.
249*67e74705SXin Li ///
250*67e74705SXin Li /// \param __a
251*67e74705SXin Li ///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
252*67e74705SXin Li ///    [127:64] and [63:0] of the destination.
253*67e74705SXin Li /// \returns A 128-bit vector of [2 x double] containing the moved and
254*67e74705SXin Li ///    duplicated values.
255*67e74705SXin Li static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_movedup_pd(__m128d __a)256*67e74705SXin Li _mm_movedup_pd(__m128d __a)
257*67e74705SXin Li {
258*67e74705SXin Li   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
259*67e74705SXin Li }
260*67e74705SXin Li 
/* Denormals-are-zero mode values, as stored in bit 0x0040 of the control
   register value read by _mm_getcsr() and written by _mm_setcsr(). */
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)

/* Mask that isolates the denormals-are-zero bit. */
#define _MM_DENORMALS_ZERO_MASK (0x0040)

/* Returns the current mode: _MM_DENORMALS_ZERO_ON or _MM_DENORMALS_ZERO_OFF. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
/* Read-modify-write: clears the masked bit and ORs in x. x is not masked, so
   pass only _MM_DENORMALS_ZERO_ON or _MM_DENORMALS_ZERO_OFF; any other bits
   set in x are written to the register as well. */
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))

269*67e74705SXin Li /// \brief Establishes a linear address memory range to be monitored and puts
270*67e74705SXin Li ///    the processor in the monitor event pending state. Data stored in the
271*67e74705SXin Li ///    monitored address range causes the processor to exit the pending state.
272*67e74705SXin Li ///
273*67e74705SXin Li /// \headerfile <x86intrin.h>
274*67e74705SXin Li ///
275*67e74705SXin Li /// This intrinsic corresponds to the \c MONITOR instruction.
276*67e74705SXin Li ///
277*67e74705SXin Li /// \param __p
278*67e74705SXin Li ///    The memory range to be monitored. The size of the range is determined by
279*67e74705SXin Li ///    CPUID function 0000_0005h.
280*67e74705SXin Li /// \param __extensions
281*67e74705SXin Li ///    Optional extensions for the monitoring state.
282*67e74705SXin Li /// \param __hints
283*67e74705SXin Li ///    Optional hints for the monitoring state.
284*67e74705SXin Li static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitor(void const * __p,unsigned __extensions,unsigned __hints)285*67e74705SXin Li _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
286*67e74705SXin Li {
287*67e74705SXin Li   __builtin_ia32_monitor((void *)__p, __extensions, __hints);
288*67e74705SXin Li }
289*67e74705SXin Li 
290*67e74705SXin Li /// \brief Used with the MONITOR instruction to wait while the processor is in
291*67e74705SXin Li ///    the monitor event pending state. Data stored in the monitored address
292*67e74705SXin Li ///    range causes the processor to exit the pending state.
293*67e74705SXin Li ///
294*67e74705SXin Li /// \headerfile <x86intrin.h>
295*67e74705SXin Li ///
296*67e74705SXin Li /// This intrinsic corresponds to the \c MWAIT instruction.
297*67e74705SXin Li ///
298*67e74705SXin Li /// \param __extensions
299*67e74705SXin Li ///    Optional extensions for the monitoring state, which may vary by
300*67e74705SXin Li ///    processor.
301*67e74705SXin Li /// \param __hints
302*67e74705SXin Li ///    Optional hints for the monitoring state, which may vary by processor.
303*67e74705SXin Li static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwait(unsigned __extensions,unsigned __hints)304*67e74705SXin Li _mm_mwait(unsigned __extensions, unsigned __hints)
305*67e74705SXin Li {
306*67e74705SXin Li   __builtin_ia32_mwait(__extensions, __hints);
307*67e74705SXin Li }
308*67e74705SXin Li 
#undef __DEFAULT_FN_ATTRS

#endif /* __PMMINTRIN_H */