1*67e74705SXin Li /*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------=== 2*67e74705SXin Li * 3*67e74705SXin Li * Permission is hereby granted, free of charge, to any person obtaining a copy 4*67e74705SXin Li * of this software and associated documentation files (the "Software"), to deal 5*67e74705SXin Li * in the Software without restriction, including without limitation the rights 6*67e74705SXin Li * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7*67e74705SXin Li * copies of the Software, and to permit persons to whom the Software is 8*67e74705SXin Li * furnished to do so, subject to the following conditions: 9*67e74705SXin Li * 10*67e74705SXin Li * The above copyright notice and this permission notice shall be included in 11*67e74705SXin Li * all copies or substantial portions of the Software. 12*67e74705SXin Li * 13*67e74705SXin Li * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14*67e74705SXin Li * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15*67e74705SXin Li * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16*67e74705SXin Li * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17*67e74705SXin Li * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18*67e74705SXin Li * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19*67e74705SXin Li * THE SOFTWARE. 20*67e74705SXin Li * 21*67e74705SXin Li *===-----------------------------------------------------------------------=== 22*67e74705SXin Li */ 23*67e74705SXin Li #ifndef __IMMINTRIN_H 24*67e74705SXin Li #error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead." 
25*67e74705SXin Li #endif 26*67e74705SXin Li 27*67e74705SXin Li #ifndef __AVX512ERINTRIN_H 28*67e74705SXin Li #define __AVX512ERINTRIN_H 29*67e74705SXin Li 30*67e74705SXin Li // exp2a23 31*67e74705SXin Li #define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \ 32*67e74705SXin Li (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ 33*67e74705SXin Li (__v8df)_mm512_setzero_pd(), \ 34*67e74705SXin Li (__mmask8)-1, (int)(R)); }) 35*67e74705SXin Li 36*67e74705SXin Li #define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \ 37*67e74705SXin Li (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ 38*67e74705SXin Li (__v8df)(__m512d)(S), (__mmask8)(M), \ 39*67e74705SXin Li (int)(R)); }) 40*67e74705SXin Li 41*67e74705SXin Li #define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \ 42*67e74705SXin Li (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ 43*67e74705SXin Li (__v8df)_mm512_setzero_pd(), \ 44*67e74705SXin Li (__mmask8)(M), (int)(R)); }) 45*67e74705SXin Li 46*67e74705SXin Li #define _mm512_exp2a23_pd(A) \ 47*67e74705SXin Li _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION) 48*67e74705SXin Li 49*67e74705SXin Li #define _mm512_mask_exp2a23_pd(S, M, A) \ 50*67e74705SXin Li _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) 51*67e74705SXin Li 52*67e74705SXin Li #define _mm512_maskz_exp2a23_pd(M, A) \ 53*67e74705SXin Li _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) 54*67e74705SXin Li 55*67e74705SXin Li #define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \ 56*67e74705SXin Li (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ 57*67e74705SXin Li (__v16sf)_mm512_setzero_ps(), \ 58*67e74705SXin Li (__mmask16)-1, (int)(R)); }) 59*67e74705SXin Li 60*67e74705SXin Li #define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \ 61*67e74705SXin Li (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ 62*67e74705SXin Li (__v16sf)(__m512)(S), (__mmask16)(M), \ 
63*67e74705SXin Li (int)(R)); }) 64*67e74705SXin Li 65*67e74705SXin Li #define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \ 66*67e74705SXin Li (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ 67*67e74705SXin Li (__v16sf)_mm512_setzero_ps(), \ 68*67e74705SXin Li (__mmask16)(M), (int)(R)); }) 69*67e74705SXin Li 70*67e74705SXin Li #define _mm512_exp2a23_ps(A) \ 71*67e74705SXin Li _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION) 72*67e74705SXin Li 73*67e74705SXin Li #define _mm512_mask_exp2a23_ps(S, M, A) \ 74*67e74705SXin Li _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION) 75*67e74705SXin Li 76*67e74705SXin Li #define _mm512_maskz_exp2a23_ps(M, A) \ 77*67e74705SXin Li _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) 78*67e74705SXin Li 79*67e74705SXin Li // rsqrt28 80*67e74705SXin Li #define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \ 81*67e74705SXin Li (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ 82*67e74705SXin Li (__v8df)_mm512_setzero_pd(), \ 83*67e74705SXin Li (__mmask8)-1, (int)(R)); }) 84*67e74705SXin Li 85*67e74705SXin Li #define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \ 86*67e74705SXin Li (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ 87*67e74705SXin Li (__v8df)(__m512d)(S), (__mmask8)(M), \ 88*67e74705SXin Li (int)(R)); }) 89*67e74705SXin Li 90*67e74705SXin Li #define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \ 91*67e74705SXin Li (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ 92*67e74705SXin Li (__v8df)_mm512_setzero_pd(), \ 93*67e74705SXin Li (__mmask8)(M), (int)(R)); }) 94*67e74705SXin Li 95*67e74705SXin Li #define _mm512_rsqrt28_pd(A) \ 96*67e74705SXin Li _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION) 97*67e74705SXin Li 98*67e74705SXin Li #define _mm512_mask_rsqrt28_pd(S, M, A) \ 99*67e74705SXin Li _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) 100*67e74705SXin Li 
/* Zero-masked rsqrt28 for 512-bit double precision: forwards M and A to
 * _mm512_maskz_rsqrt28_round_pd with _MM_FROUND_CUR_DIRECTION as the control
 * value.
 */
#define _mm512_maskz_rsqrt28_pd(M, A) \
  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

/* rsqrt28 -- 512-bit single-precision forms.
 *
 * Each *_round_* macro expands to exactly one call of
 * __builtin_ia32_rsqrt28ps_mask(input, second-operand, mask, control):
 *   A  input vector, cast through __m512 to __v16sf.
 *   S  vector forwarded as the builtin's second operand; the plain and
 *      maskz forms forward _mm512_setzero_ps() instead.
 *   M  mask, cast to __mmask16; the plain form forwards (__mmask16)-1.
 *   R  control value, cast to int.
 * The forms without "_round" forward _MM_FROUND_CUR_DIRECTION as R.
 * NOTE(review): per-lane masking semantics and the accuracy suggested by the
 * "28" suffix come from the builtin -- confirm against the Intel Intrinsics
 * Guide.
 */
#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)_mm512_setzero_ps(), \
                                        (__mmask16)-1, (int)(R)); })

#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)(__m512)(S), (__mmask16)(M), \
                                        (int)(R)); })

#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)_mm512_setzero_ps(), \
                                        (__mmask16)(M), (int)(R)); })

#define _mm512_rsqrt28_ps(A) \
  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)

/* A is forwarded as (A), like every other wrapper in this header. Without
 * the parentheses an operand whose macro expansion contains a top-level
 * comma would be split into two arguments when the inner macro call is
 * rescanned, and the expansion would fail to compile.
 */
#define _mm512_mask_rsqrt28_ps(S, M, A) \
  _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rsqrt28_ps(M, A) \
  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)

/* rsqrt28 -- 128-bit scalar single-precision forms.
 *
 * Each *_round_* macro expands to one call of
 * __builtin_ia32_rsqrt28ss_round_mask(A, B, third-operand, mask, control):
 *   A, B  input vectors, cast through __m128 to __v4sf.
 *   S     vector forwarded as the third operand; the plain and maskz forms
 *         forward _mm_setzero_ps() instead.
 *   M     mask, cast to __mmask8; the plain form forwards (__mmask8)-1.
 *   R     control value, cast to int.
 * The forms without "_round" forward _MM_FROUND_CUR_DIRECTION as R.
 * NOTE(review): which of A/B supplies the computed element and which supplies
 * the remaining elements is decided by the builtin -- confirm against the
 * Intel Intrinsics Guide.
 */
#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)-1, (int)(R)); })

#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)(__m128)(S), \
                                              (__mmask8)(M), (int)(R)); })

#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)(M), (int)(R)); })

#define _mm_rsqrt28_ss(A, B) \
  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rsqrt28_ss(S, M, A, B) \
  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rsqrt28_ss(M, A, B) \
  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

/* rsqrt28 -- 128-bit scalar double-precision "_round" forms.
 *
 * Same layout as the scalar single-precision family, built on
 * __builtin_ia32_rsqrt28sd_round_mask with __m128d/__v2df vectors and
 * _mm_setzero_pd() as the default third operand.
 */
#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)-1, (int)(R)); })

#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)(__m128d)(S), \
                                               (__mmask8)(M), (int)(R)); })

#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)(M), (int)(R)); })

#define _mm_rsqrt28_sd(A, B) \ 174*67e74705SXin Li _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) 175*67e74705SXin Li 176*67e74705SXin Li #define _mm_mask_rsqrt28_sd(S, M, A, B) \ 177*67e74705SXin Li _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) 178*67e74705SXin Li 179*67e74705SXin Li #define _mm_maskz_rsqrt28_sd(M, A, B) \ 180*67e74705SXin Li _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) 181*67e74705SXin Li 182*67e74705SXin Li // rcp28 183*67e74705SXin Li #define _mm512_rcp28_round_pd(A, R) __extension__ ({ \ 184*67e74705SXin Li (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ 185*67e74705SXin Li (__v8df)_mm512_setzero_pd(), \ 186*67e74705SXin Li (__mmask8)-1, (int)(R)); }) 187*67e74705SXin Li 188*67e74705SXin Li #define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \ 189*67e74705SXin Li (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ 190*67e74705SXin Li (__v8df)(__m512d)(S), (__mmask8)(M), \ 191*67e74705SXin Li (int)(R)); }) 192*67e74705SXin Li 193*67e74705SXin Li #define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \ 194*67e74705SXin Li (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ 195*67e74705SXin Li (__v8df)_mm512_setzero_pd(), \ 196*67e74705SXin Li (__mmask8)(M), (int)(R)); }) 197*67e74705SXin Li 198*67e74705SXin Li #define _mm512_rcp28_pd(A) \ 199*67e74705SXin Li _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION) 200*67e74705SXin Li 201*67e74705SXin Li #define _mm512_mask_rcp28_pd(S, M, A) \ 202*67e74705SXin Li _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) 203*67e74705SXin Li 204*67e74705SXin Li #define _mm512_maskz_rcp28_pd(M, A) \ 205*67e74705SXin Li _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) 206*67e74705SXin Li 207*67e74705SXin Li #define _mm512_rcp28_round_ps(A, R) __extension__ ({ \ 208*67e74705SXin Li (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ 209*67e74705SXin Li 
(__v16sf)_mm512_setzero_ps(), \ 210*67e74705SXin Li (__mmask16)-1, (int)(R)); }) 211*67e74705SXin Li 212*67e74705SXin Li #define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \ 213*67e74705SXin Li (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ 214*67e74705SXin Li (__v16sf)(__m512)(S), (__mmask16)(M), \ 215*67e74705SXin Li (int)(R)); }) 216*67e74705SXin Li 217*67e74705SXin Li #define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \ 218*67e74705SXin Li (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ 219*67e74705SXin Li (__v16sf)_mm512_setzero_ps(), \ 220*67e74705SXin Li (__mmask16)(M), (int)(R)); }) 221*67e74705SXin Li 222*67e74705SXin Li #define _mm512_rcp28_ps(A) \ 223*67e74705SXin Li _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION) 224*67e74705SXin Li 225*67e74705SXin Li #define _mm512_mask_rcp28_ps(S, M, A) \ 226*67e74705SXin Li _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION) 227*67e74705SXin Li 228*67e74705SXin Li #define _mm512_maskz_rcp28_ps(M, A) \ 229*67e74705SXin Li _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) 230*67e74705SXin Li 231*67e74705SXin Li #define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \ 232*67e74705SXin Li (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ 233*67e74705SXin Li (__v4sf)(__m128)(B), \ 234*67e74705SXin Li (__v4sf)_mm_setzero_ps(), \ 235*67e74705SXin Li (__mmask8)-1, (int)(R)); }) 236*67e74705SXin Li 237*67e74705SXin Li #define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \ 238*67e74705SXin Li (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ 239*67e74705SXin Li (__v4sf)(__m128)(B), \ 240*67e74705SXin Li (__v4sf)(__m128)(S), \ 241*67e74705SXin Li (__mmask8)(M), (int)(R)); }) 242*67e74705SXin Li 243*67e74705SXin Li #define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \ 244*67e74705SXin Li (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ 245*67e74705SXin Li (__v4sf)(__m128)(B), \ 
246*67e74705SXin Li (__v4sf)_mm_setzero_ps(), \ 247*67e74705SXin Li (__mmask8)(M), (int)(R)); }) 248*67e74705SXin Li 249*67e74705SXin Li #define _mm_rcp28_ss(A, B) \ 250*67e74705SXin Li _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) 251*67e74705SXin Li 252*67e74705SXin Li #define _mm_mask_rcp28_ss(S, M, A, B) \ 253*67e74705SXin Li _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) 254*67e74705SXin Li 255*67e74705SXin Li #define _mm_maskz_rcp28_ss(M, A, B) \ 256*67e74705SXin Li _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) 257*67e74705SXin Li 258*67e74705SXin Li #define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \ 259*67e74705SXin Li (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ 260*67e74705SXin Li (__v2df)(__m128d)(B), \ 261*67e74705SXin Li (__v2df)_mm_setzero_pd(), \ 262*67e74705SXin Li (__mmask8)-1, (int)(R)); }) 263*67e74705SXin Li 264*67e74705SXin Li #define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \ 265*67e74705SXin Li (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ 266*67e74705SXin Li (__v2df)(__m128d)(B), \ 267*67e74705SXin Li (__v2df)(__m128d)(S), \ 268*67e74705SXin Li (__mmask8)(M), (int)(R)); }) 269*67e74705SXin Li 270*67e74705SXin Li #define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \ 271*67e74705SXin Li (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ 272*67e74705SXin Li (__v2df)(__m128d)(B), \ 273*67e74705SXin Li (__v2df)_mm_setzero_pd(), \ 274*67e74705SXin Li (__mmask8)(M), (int)(R)); }) 275*67e74705SXin Li 276*67e74705SXin Li #define _mm_rcp28_sd(A, B) \ 277*67e74705SXin Li _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) 278*67e74705SXin Li 279*67e74705SXin Li #define _mm_mask_rcp28_sd(S, M, A, B) \ 280*67e74705SXin Li _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) 281*67e74705SXin Li 282*67e74705SXin Li #define _mm_maskz_rcp28_sd(M, A, B) \ 283*67e74705SXin Li 
_mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) 284*67e74705SXin Li 285*67e74705SXin Li #endif // __AVX512ERINTRIN_H 286