xref: /aosp_15_r20/external/XNNPACK/src/f32-vscaleextexp/gen/avx512f-p5-scalef-x16.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-vscaleextexp/avx512f-p5-scalef.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <immintrin.h>
13 
14 #include <xnnpack/common.h>
15 #include <xnnpack/intrinsics-polyfill.h>
16 #include <xnnpack/vscaleextexp.h>
17 
18 
xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x16(size_t elements,const float * x,float * y,float scale_value,float scale_exp)19 void xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x16(
20     size_t elements,
21     const float* x,
22     float* y,
23     float scale_value,
24     float scale_exp)
25 {
26   assert(elements % sizeof(float) == 0);
27 
28   const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f);
29   const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62E43p-1f);
30   const __m512 vminus_ln2_lo = _mm512_set1_ps(0x1.05C61p-29f);
31 
32   const __m512 vc0 = _mm512_set1_ps(1.0f);
33   const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f);
34   const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f);
35   const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f);
36   const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f);
37   const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f);
38 
39   const __m512 vscalev = _mm512_set1_ps(scale_value);
40   const __m512 vscalee = _mm512_set1_ps(scale_exp);
41 
42   for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
43     // Load 16 (1x16) inputs at a time.
44     const __m512 vx0 = _mm512_loadu_ps(x);
45     x += 16;
46 
47     // Compute reduced argument elements := round(x / log(2)).
48     const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0);
49 
50     // Compute reduced argument t := x - elements * log(2).
51     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
52     __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_hi, vx0);
53 
54     vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_lo, vt0);
55 
56     // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
57     __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4);
58 
59     vp0 = _mm512_fmadd_ps(vp0, vt0, vc3);
60 
61     vp0 = _mm512_fmadd_ps(vp0, vt0, vc2);
62 
63     vp0 = _mm512_fmadd_ps(vp0, vt0, vc1);
64 
65     vp0 = _mm512_fmadd_ps(vp0, vt0, vc0);
66 
67     // Multiply "extended" floating-point numbers in ("mantissa", "exponent") representation where
68     //  - vnX is "exponent"
69     //  - vpX is "mantissa"
70     //
71     // exp2(ae) * av * exp2(be) * bv =
72     //   = exp2(ae + be) * (av * bv)
73     __m512 vf0 = _mm512_mul_ps(vp0, vscalev);
74 
75     const __m512 ve0 = _mm512_add_ps(vn0, vscalee);
76 
77     // Multiply "mantissa" by the exp2("exponent").
78     vf0 = _mm512_scalef_ps(vf0, ve0);
79 
80     // Store 128 (8x16) results at a time.
81     _mm512_storeu_ps(y, vf0);
82     _mm512_storeu_ps(y + 0, vf0);
83     y += 16;
84   }
85 
86   for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
87     // Load 16 inputs at a time.
88     const __m512 vx = _mm512_loadu_ps(x);
89     x += 16;
90 
91     // Compute reduced argument elements := round(x / log(2)).
92     const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0);
93 
94     // Compute reduced argument t := x - elements * log(2).
95     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
96     __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vx);
97     vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt);
98 
99     // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
100     __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4);
101     vp = _mm512_fmadd_ps(vp, vt, vc3);
102     vp = _mm512_fmadd_ps(vp, vt, vc2);
103     vp = _mm512_fmadd_ps(vp, vt, vc1);
104     vp = _mm512_fmadd_ps(vp, vt, vc0);
105 
106     // Multiply "extended" floating-point numbers in ("mantissa", "exponent") representation.
107     __m512 vf = _mm512_mul_ps(vp, vscalev);
108     const __m512 ve = _mm512_add_ps(vn, vscalee);
109 
110     // Multiply "mantissa" by the exp2("exponent").
111     vf = _mm512_scalef_ps(vf, ve);
112 
113     // Store 16 results at a time.
114     _mm512_storeu_ps(y, vf);
115     y += 16;
116   }
117   if XNN_UNLIKELY(elements != 0) {
118     // Prepare mask for valid 32-bit elements (depends on elements).
119     elements >>= 2 /* log2(sizeof(float)) */;
120     const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << elements) - UINT32_C(1)));
121 
122     // Load up to 15 inputs at a time.
123     const __m512 vx = _mm512_maskz_loadu_ps(vmask, x);
124 
125     // Compute reduced argument elements := round(x / log(2)).
126     const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0);
127 
128     // Compute reduced argument t := x - elements * log(2).
129     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
130     __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vx);
131     vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt);
132 
133     // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
134     __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4);
135     vp = _mm512_fmadd_ps(vp, vt, vc3);
136     vp = _mm512_fmadd_ps(vp, vt, vc2);
137     vp = _mm512_fmadd_ps(vp, vt, vc1);
138     vp = _mm512_fmadd_ps(vp, vt, vc0);
139 
140     // Multiply "extended" floating-point numbers in ("mantissa", "exponent") representation.
141     __m512 vf = _mm512_mul_ps(vp, vscalev);
142     const __m512 ve = _mm512_add_ps(vn, vscalee);
143 
144     // Multiply "mantissa" by the exp2("exponent").
145     vf = _mm512_scalef_ps(vf, ve);
146 
147     // Store up to 15 results at a time.
148     _mm512_mask_storeu_ps(y, vmask, vf);
149   }
150   _mm256_zeroupper();
151 }
152