xref: /aosp_15_r20/external/XNNPACK/src/f32-velu/gen/velu-scalar-rr2-p6-x5.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-velu/scalar-rr2-p6.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 #include <math.h>
12 
13 #include <xnnpack/common.h>
14 #include <xnnpack/math.h>
15 #include <xnnpack/vunary.h>
16 
17 
xnn_f32_velu_ukernel__scalar_rr2_p6_x5(size_t n,const float * x,float * y,const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_f32_velu_ukernel__scalar_rr2_p6_x5(
19     size_t n,
20     const float* x,
21     float* y,
22     const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)])
23 {
24   assert(n % sizeof(float) == 0);
25 
26   const float vprescale = params->scalar_rr2_p6.prescale;
27   const float valpha = params->scalar_rr2_p6.alpha;
28   const float vbeta = params->scalar_rr2_p6.beta;
29   const float vmagic_bias = params->scalar_rr2_p6.magic_bias;
30   const float vlog2e = params->scalar_rr2_p6.log2e;
31   const float vsat_cutoff = params->scalar_rr2_p6.sat_cutoff;
32   const float vminus_ln2_hi = params->scalar_rr2_p6.minus_ln2_hi;
33   const float vminus_ln2_lo = params->scalar_rr2_p6.minus_ln2_lo;
34   const float vc6 = params->scalar_rr2_p6.c6;
35   const float vc5 = params->scalar_rr2_p6.c5;
36   const float vc4 = params->scalar_rr2_p6.c4;
37   const float vc3 = params->scalar_rr2_p6.c3;
38   const float vc2 = params->scalar_rr2_p6.c2;
39   const float vone = params->scalar_rr2_p6.one;
40 
41   for (; n >= 5 * sizeof(float); n -= 5 * sizeof(float)) {
42     float vx0 = x[0];
43     float vx1 = x[1];
44     float vx2 = x[2];
45     float vx3 = x[3];
46     float vx4 = x[4];
47     x += 5;
48 
49     const float vz0 = vx0 * vprescale;
50     const float vz1 = vx1 * vprescale;
51     const float vz2 = vx2 * vprescale;
52     const float vz3 = vx3 * vprescale;
53     const float vz4 = vx4 * vprescale;
54 
55     float vn0 = vz0 * vlog2e + vmagic_bias;
56     float vn1 = vz1 * vlog2e + vmagic_bias;
57     float vn2 = vz2 * vlog2e + vmagic_bias;
58     float vn3 = vz3 * vlog2e + vmagic_bias;
59     float vn4 = vz4 * vlog2e + vmagic_bias;
60 
61     float vs0 = uint32_as_float(float_as_uint32(vn0) << 23);
62     vn0 -= vmagic_bias;
63     float vs1 = uint32_as_float(float_as_uint32(vn1) << 23);
64     vn1 -= vmagic_bias;
65     float vs2 = uint32_as_float(float_as_uint32(vn2) << 23);
66     vn2 -= vmagic_bias;
67     float vs3 = uint32_as_float(float_as_uint32(vn3) << 23);
68     vn3 -= vmagic_bias;
69     float vs4 = uint32_as_float(float_as_uint32(vn4) << 23);
70     vn4 -= vmagic_bias;
71 
72     float vt0 = vn0 * vminus_ln2_hi + vz0;
73     float vt1 = vn1 * vminus_ln2_hi + vz1;
74     float vt2 = vn2 * vminus_ln2_hi + vz2;
75     float vt3 = vn3 * vminus_ln2_hi + vz3;
76     float vt4 = vn4 * vminus_ln2_hi + vz4;
77 
78     vt0 = vn0 * vminus_ln2_lo + vt0;
79     vt1 = vn1 * vminus_ln2_lo + vt1;
80     vt2 = vn2 * vminus_ln2_lo + vt2;
81     vt3 = vn3 * vminus_ln2_lo + vt3;
82     vt4 = vn4 * vminus_ln2_lo + vt4;
83 
84     if XNN_UNPREDICTABLE(vz0 <= vsat_cutoff) {
85       vs0 = 0.0f;
86       vt0 = 0.0f;
87     }
88     if XNN_UNPREDICTABLE(vz1 <= vsat_cutoff) {
89       vs1 = 0.0f;
90       vt1 = 0.0f;
91     }
92     if XNN_UNPREDICTABLE(vz2 <= vsat_cutoff) {
93       vs2 = 0.0f;
94       vt2 = 0.0f;
95     }
96     if XNN_UNPREDICTABLE(vz3 <= vsat_cutoff) {
97       vs3 = 0.0f;
98       vt3 = 0.0f;
99     }
100     if XNN_UNPREDICTABLE(vz4 <= vsat_cutoff) {
101       vs4 = 0.0f;
102       vt4 = 0.0f;
103     }
104 
105     float vp0 = vc6 * vt0 + vc5;
106     float vp1 = vc6 * vt1 + vc5;
107     float vp2 = vc6 * vt2 + vc5;
108     float vp3 = vc6 * vt3 + vc5;
109     float vp4 = vc6 * vt4 + vc5;
110 
111     vp0 = vp0 * vt0 + vc4;
112     vp1 = vp1 * vt1 + vc4;
113     vp2 = vp2 * vt2 + vc4;
114     vp3 = vp3 * vt3 + vc4;
115     vp4 = vp4 * vt4 + vc4;
116 
117     vp0 = vp0 * vt0 + vc3;
118     vp1 = vp1 * vt1 + vc3;
119     vp2 = vp2 * vt2 + vc3;
120     vp3 = vp3 * vt3 + vc3;
121     vp4 = vp4 * vt4 + vc3;
122 
123     vp0 = vp0 * vt0 + vc2;
124     vp1 = vp1 * vt1 + vc2;
125     vp2 = vp2 * vt2 + vc2;
126     vp3 = vp3 * vt3 + vc2;
127     vp4 = vp4 * vt4 + vc2;
128 
129     vp0 *= vt0;
130     vp1 *= vt1;
131     vp2 *= vt2;
132     vp3 *= vt3;
133     vp4 *= vt4;
134 
135     vt0 *= vs0;
136     vs0 -= vone;
137     vt1 *= vs1;
138     vs1 -= vone;
139     vt2 *= vs2;
140     vs2 -= vone;
141     vt3 *= vs3;
142     vs3 -= vone;
143     vt4 *= vs4;
144     vs4 -= vone;
145 
146     vp0 = vp0 * vt0 + vt0;
147     vp1 = vp1 * vt1 + vt1;
148     vp2 = vp2 * vt2 + vt2;
149     vp3 = vp3 * vt3 + vt3;
150     vp4 = vp4 * vt4 + vt4;
151 
152     const float ve0 = (vp0 + vs0) * valpha;
153     float vy0 = vx0 * vbeta;
154     const float ve1 = (vp1 + vs1) * valpha;
155     float vy1 = vx1 * vbeta;
156     const float ve2 = (vp2 + vs2) * valpha;
157     float vy2 = vx2 * vbeta;
158     const float ve3 = (vp3 + vs3) * valpha;
159     float vy3 = vx3 * vbeta;
160     const float ve4 = (vp4 + vs4) * valpha;
161     float vy4 = vx4 * vbeta;
162 
163     if XNN_UNPREDICTABLE(vx0 < 0.0f) {
164       vy0 = ve0;
165     }
166     if XNN_UNPREDICTABLE(vx1 < 0.0f) {
167       vy1 = ve1;
168     }
169     if XNN_UNPREDICTABLE(vx2 < 0.0f) {
170       vy2 = ve2;
171     }
172     if XNN_UNPREDICTABLE(vx3 < 0.0f) {
173       vy3 = ve3;
174     }
175     if XNN_UNPREDICTABLE(vx4 < 0.0f) {
176       vy4 = ve4;
177     }
178 
179     y[0] = vy0;
180     y[1] = vy1;
181     y[2] = vy2;
182     y[3] = vy3;
183     y[4] = vy4;
184     y += 5;
185   }
186   if XNN_UNLIKELY(n != 0) {
187     do {
188       float vx = *x++;
189 
190       const float vz = vx * vprescale;
191 
192       float vn = vz * vlog2e + vmagic_bias;
193       float vs = uint32_as_float(float_as_uint32(vn) << 23);
194       vn -= vmagic_bias;
195 
196       float vt = vn * vminus_ln2_hi + vz;
197       vt = vn * vminus_ln2_lo + vt;
198 
199       if XNN_UNPREDICTABLE(vz <= vsat_cutoff) {
200         vs = 0.0f;
201         vt = 0.0f;
202       }
203 
204       float vp = vc6 * vt + vc5;
205       vp = vp * vt + vc4;
206       vp = vp * vt + vc3;
207       vp = vp * vt + vc2;
208       vp *= vt;
209 
210       vt *= vs;
211       vs -= vone;
212       vp = vp * vt + vt;
213       const float ve = (vp + vs) * valpha;
214 
215       float vy = vx * vbeta;
216       if XNN_UNPREDICTABLE(vx < 0.0f) {
217         vy = ve;
218       }
219 
220       *y++ = vy;
221 
222       n -= sizeof(float);
223     } while (n != 0);
224   }
225 }
226