1*a58d3d2aSXin Li /* Copyright (c) 2018 Mozilla
2*a58d3d2aSXin Li 2008-2011 Octasic Inc.
3*a58d3d2aSXin Li 2012-2017 Jean-Marc Valin */
4*a58d3d2aSXin Li /*
5*a58d3d2aSXin Li Redistribution and use in source and binary forms, with or without
6*a58d3d2aSXin Li modification, are permitted provided that the following conditions
7*a58d3d2aSXin Li are met:
8*a58d3d2aSXin Li
9*a58d3d2aSXin Li - Redistributions of source code must retain the above copyright
10*a58d3d2aSXin Li notice, this list of conditions and the following disclaimer.
11*a58d3d2aSXin Li
12*a58d3d2aSXin Li - Redistributions in binary form must reproduce the above copyright
13*a58d3d2aSXin Li notice, this list of conditions and the following disclaimer in the
14*a58d3d2aSXin Li documentation and/or other materials provided with the distribution.
15*a58d3d2aSXin Li
16*a58d3d2aSXin Li THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17*a58d3d2aSXin Li ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18*a58d3d2aSXin Li LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19*a58d3d2aSXin Li A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
20*a58d3d2aSXin Li CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21*a58d3d2aSXin Li EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22*a58d3d2aSXin Li PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23*a58d3d2aSXin Li PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24*a58d3d2aSXin Li LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25*a58d3d2aSXin Li NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26*a58d3d2aSXin Li SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27*a58d3d2aSXin Li */
28*a58d3d2aSXin Li
29*a58d3d2aSXin Li #ifndef VEC_H
30*a58d3d2aSXin Li #define VEC_H
31*a58d3d2aSXin Li
32*a58d3d2aSXin Li #include "opus_types.h"
33*a58d3d2aSXin Li #include <math.h>
34*a58d3d2aSXin Li #include "arch.h"
35*a58d3d2aSXin Li #include "x86/x86_arch_macros.h"
36*a58d3d2aSXin Li
37*a58d3d2aSXin Li
38*a58d3d2aSXin Li #if defined(__AVX__) || defined(__SSE2__)
39*a58d3d2aSXin Li #include "vec_avx.h"
40*a58d3d2aSXin Li #elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && !defined(DISABLE_NEON)
41*a58d3d2aSXin Li #include "vec_neon.h"
42*a58d3d2aSXin Li #else
43*a58d3d2aSXin Li
44*a58d3d2aSXin Li #include "os_support.h"
45*a58d3d2aSXin Li
46*a58d3d2aSXin Li #define MAX_INPUTS (2048)
47*a58d3d2aSXin Li
48*a58d3d2aSXin Li #define NO_OPTIMIZATIONS
49*a58d3d2aSXin Li
/* Dense matrix-vector product out = W*x, computed in tiles of 16 rows.
 *
 * The element for output row i and input column j is read from
 * weights[j*col_stride + i].  Every tile updates 16 consecutive outputs, so
 * `rows` is expected to be a multiple of 16.  `out` is overwritten.
 */
static inline void sgemv16x1(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
   int row, col, k;
   /* All accumulators start from zero. */
   for (row=0;row<rows;row++) out[row] = 0;
   for (row=0;row<rows;row+=16)
   {
      float * restrict acc = &out[row];
      for (col=0;col<cols;col++)
      {
         const float * restrict wcol = &weights[col*col_stride + row];
         const float xval = x[col];
         /* Add this column's contribution to the 16 outputs of the tile. */
         for (k=0;k<16;k++) acc[k] += wcol[k]*xval;
      }
   }
}
83*a58d3d2aSXin Li
/* Dense matrix-vector product out = W*x, computed in tiles of 8 rows.
 *
 * Same storage convention as the 16-row kernel: the weight for output row i
 * and input column j lives at weights[j*col_stride + i].  `rows` is expected
 * to be a multiple of 8.  `out` is overwritten.
 */
static inline void sgemv8x1(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
   int base, c, lane;
   /* Reset the whole output before accumulating. */
   for (base=0;base<rows;base++) out[base] = 0;
   for (base=0;base<rows;base+=8)
   {
      float * restrict tile = &out[base];
      for (c=0;c<cols;c++)
      {
         const float * restrict wtile = &weights[c*col_stride + base];
         const float xc = x[c];
         for (lane=0;lane<8;lane++) tile[lane] += wtile[lane]*xc;
      }
   }
}
109*a58d3d2aSXin Li
/* Dense matrix-vector product out = W*x with weights[j*col_stride + i]
 * holding the weight for output row i and input column j.
 *
 * Dispatches to the 16-row kernel when `rows` is a multiple of 16, to the
 * 8-row kernel when it is a multiple of 8, and otherwise falls back to a
 * plain row-by-row loop.
 */
static inline void sgemv(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
   int r, c;
   if ((rows&0xf) == 0)
   {
      sgemv16x1(out, weights, rows, cols, col_stride, x);
      return;
   }
   if ((rows&0x7) == 0)
   {
      sgemv8x1(out, weights, rows, cols, col_stride, x);
      return;
   }
   /* Generic path: any row count. */
   for (r=0;r<rows;r++)
   {
      out[r] = 0;
      for (c=0;c<cols;c++)
      {
         out[r] += weights[c*col_stride + r]*x[c];
      }
   }
}
123*a58d3d2aSXin Li
/* Block-sparse float matrix-vector product using 8x4 blocks.
 *
 * For each tile of 8 output rows, `idx` supplies a block count followed by
 * one starting input position per block.  Each block consumes 32 consecutive
 * weights from `w`: w[8*k + r] multiplies x[pos+k] and is added to output r
 * of the tile (k = 0..3, r = 0..7).  `rows` is expected to be a multiple of
 * 8.  `out` is overwritten.
 */
static inline void sparse_sgemv8x4(float *out, const float *w, const int *idx, int rows, const float *x)
{
   int row, blk, r;
   for (row=0;row<rows;row++) out[row] = 0;
   for (row=0;row<rows;row+=8)
   {
      float * restrict acc = &out[row];
      const int nb_blocks = *idx++;
      for (blk=0;blk<nb_blocks;blk++)
      {
         const int pos = *idx++;
         const float x0 = x[pos+0];
         const float x1 = x[pos+1];
         const float x2 = x[pos+2];
         const float x3 = x[pos+3];
         /* One pass per input of the block; each pass updates all 8 rows. */
         for (r=0;r<8;r++) acc[r] += w[r]*x0;
         for (r=0;r<8;r++) acc[r] += w[8+r]*x1;
         for (r=0;r<8;r++) acc[r] += w[16+r]*x2;
         for (r=0;r<8;r++) acc[r] += w[24+r]*x3;
         w += 32;
      }
   }
}
182*a58d3d2aSXin Li
183*a58d3d2aSXin Li #ifdef USE_SU_BIAS
sparse_cgemv8x4(float * out,const opus_int8 * w,const int * idx,const float * scale,int rows,int cols,const float * _x)184*a58d3d2aSXin Li static inline void sparse_cgemv8x4(float *out, const opus_int8 *w, const int *idx, const float *scale, int rows, int cols, const float *_x)
185*a58d3d2aSXin Li {
186*a58d3d2aSXin Li int i, j;
187*a58d3d2aSXin Li unsigned char x[MAX_INPUTS];
188*a58d3d2aSXin Li for (i=0;i<rows;i++) out[i] = 0;
189*a58d3d2aSXin Li for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
190*a58d3d2aSXin Li for (i=0;i<rows;i+=8)
191*a58d3d2aSXin Li {
192*a58d3d2aSXin Li int colblocks;
193*a58d3d2aSXin Li colblocks = *idx++;
194*a58d3d2aSXin Li for (j=0;j<colblocks;j++)
195*a58d3d2aSXin Li {
196*a58d3d2aSXin Li int pos;
197*a58d3d2aSXin Li float * restrict y;
198*a58d3d2aSXin Li int xj0, xj1, xj2, xj3;
199*a58d3d2aSXin Li pos = (*idx++);
200*a58d3d2aSXin Li xj0 = x[pos+0];
201*a58d3d2aSXin Li xj1 = x[pos+1];
202*a58d3d2aSXin Li xj2 = x[pos+2];
203*a58d3d2aSXin Li xj3 = x[pos+3];
204*a58d3d2aSXin Li y = &out[i];
205*a58d3d2aSXin Li y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
206*a58d3d2aSXin Li y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
207*a58d3d2aSXin Li y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
208*a58d3d2aSXin Li y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
209*a58d3d2aSXin Li y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
210*a58d3d2aSXin Li y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
211*a58d3d2aSXin Li y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
212*a58d3d2aSXin Li y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
213*a58d3d2aSXin Li w += 32;
214*a58d3d2aSXin Li }
215*a58d3d2aSXin Li }
216*a58d3d2aSXin Li for (i=0;i<rows;i++) out[i] *= scale[i];
217*a58d3d2aSXin Li }
cgemv8x4(float * out,const opus_int8 * w,const float * scale,int rows,int cols,const float * _x)218*a58d3d2aSXin Li static inline void cgemv8x4(float *out, const opus_int8 *w, const float *scale, int rows, int cols, const float *_x)
219*a58d3d2aSXin Li {
220*a58d3d2aSXin Li int i, j;
221*a58d3d2aSXin Li unsigned char x[MAX_INPUTS];
222*a58d3d2aSXin Li for (i=0;i<rows;i++) out[i] = 0;
223*a58d3d2aSXin Li for (i=0;i<cols;i++) x[i] = 127+(int)floor(.5+127*_x[i]);
224*a58d3d2aSXin Li for (i=0;i<rows;i+=8)
225*a58d3d2aSXin Li {
226*a58d3d2aSXin Li for (j=0;j<cols;j+=4)
227*a58d3d2aSXin Li {
228*a58d3d2aSXin Li float *y;
229*a58d3d2aSXin Li float xj0, xj1, xj2, xj3;
230*a58d3d2aSXin Li xj0 = x[j+0];
231*a58d3d2aSXin Li xj1 = x[j+1];
232*a58d3d2aSXin Li xj2 = x[j+2];
233*a58d3d2aSXin Li xj3 = x[j+3];
234*a58d3d2aSXin Li y = &out[i];
235*a58d3d2aSXin Li y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
236*a58d3d2aSXin Li y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
237*a58d3d2aSXin Li y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
238*a58d3d2aSXin Li y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
239*a58d3d2aSXin Li y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
240*a58d3d2aSXin Li y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
241*a58d3d2aSXin Li y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
242*a58d3d2aSXin Li y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
243*a58d3d2aSXin Li w += 32;
244*a58d3d2aSXin Li }
245*a58d3d2aSXin Li }
246*a58d3d2aSXin Li for (i=0;i<rows;i++) out[i] *= scale[i];
247*a58d3d2aSXin Li }
248*a58d3d2aSXin Li #else
sparse_cgemv8x4(float * out,const opus_int8 * w,const int * idx,const float * scale,int rows,int cols,const float * _x)249*a58d3d2aSXin Li static inline void sparse_cgemv8x4(float *out, const opus_int8 *w, const int *idx, const float *scale, int rows, int cols, const float *_x)
250*a58d3d2aSXin Li {
251*a58d3d2aSXin Li int i, j;
252*a58d3d2aSXin Li opus_int8 x[MAX_INPUTS];
253*a58d3d2aSXin Li for (i=0;i<rows;i++) out[i] = 0;
254*a58d3d2aSXin Li for (i=0;i<cols;i++) x[i] = (int)floor(.5+127*_x[i]);
255*a58d3d2aSXin Li for (i=0;i<rows;i+=8)
256*a58d3d2aSXin Li {
257*a58d3d2aSXin Li int colblocks;
258*a58d3d2aSXin Li colblocks = *idx++;
259*a58d3d2aSXin Li for (j=0;j<colblocks;j++)
260*a58d3d2aSXin Li {
261*a58d3d2aSXin Li int pos;
262*a58d3d2aSXin Li float * restrict y;
263*a58d3d2aSXin Li int xj0, xj1, xj2, xj3;
264*a58d3d2aSXin Li pos = (*idx++);
265*a58d3d2aSXin Li xj0 = x[pos+0];
266*a58d3d2aSXin Li xj1 = x[pos+1];
267*a58d3d2aSXin Li xj2 = x[pos+2];
268*a58d3d2aSXin Li xj3 = x[pos+3];
269*a58d3d2aSXin Li y = &out[i];
270*a58d3d2aSXin Li y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
271*a58d3d2aSXin Li y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
272*a58d3d2aSXin Li y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
273*a58d3d2aSXin Li y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
274*a58d3d2aSXin Li y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
275*a58d3d2aSXin Li y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
276*a58d3d2aSXin Li y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
277*a58d3d2aSXin Li y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
278*a58d3d2aSXin Li w += 32;
279*a58d3d2aSXin Li }
280*a58d3d2aSXin Li }
281*a58d3d2aSXin Li for (i=0;i<rows;i++) out[i] *= scale[i];
282*a58d3d2aSXin Li }
cgemv8x4(float * out,const opus_int8 * w,const float * scale,int rows,int cols,const float * _x)283*a58d3d2aSXin Li static inline void cgemv8x4(float *out, const opus_int8 *w, const float *scale, int rows, int cols, const float *_x)
284*a58d3d2aSXin Li {
285*a58d3d2aSXin Li int i, j;
286*a58d3d2aSXin Li opus_int8 x[MAX_INPUTS];
287*a58d3d2aSXin Li for (i=0;i<rows;i++) out[i] = 0;
288*a58d3d2aSXin Li for (i=0;i<cols;i++) x[i] = (int)floor(.5+127*_x[i]);
289*a58d3d2aSXin Li for (i=0;i<rows;i+=8)
290*a58d3d2aSXin Li {
291*a58d3d2aSXin Li for (j=0;j<cols;j+=4)
292*a58d3d2aSXin Li {
293*a58d3d2aSXin Li float *y;
294*a58d3d2aSXin Li float xj0, xj1, xj2, xj3;
295*a58d3d2aSXin Li xj0 = x[j+0];
296*a58d3d2aSXin Li xj1 = x[j+1];
297*a58d3d2aSXin Li xj2 = x[j+2];
298*a58d3d2aSXin Li xj3 = x[j+3];
299*a58d3d2aSXin Li y = &out[i];
300*a58d3d2aSXin Li y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
301*a58d3d2aSXin Li y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
302*a58d3d2aSXin Li y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
303*a58d3d2aSXin Li y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
304*a58d3d2aSXin Li y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
305*a58d3d2aSXin Li y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
306*a58d3d2aSXin Li y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
307*a58d3d2aSXin Li y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
308*a58d3d2aSXin Li w += 32;
309*a58d3d2aSXin Li }
310*a58d3d2aSXin Li }
311*a58d3d2aSXin Li for (i=0;i<rows;i++) out[i] *= scale[i];
312*a58d3d2aSXin Li }
313*a58d3d2aSXin Li #endif
314*a58d3d2aSXin Li
315*a58d3d2aSXin Li /* No AVX2/FMA support */
316*a58d3d2aSXin Li #ifndef LPCNET_TEST
lpcnet_exp2(float x)317*a58d3d2aSXin Li static inline float lpcnet_exp2(float x)
318*a58d3d2aSXin Li {
319*a58d3d2aSXin Li int integer;
320*a58d3d2aSXin Li float frac;
321*a58d3d2aSXin Li union {
322*a58d3d2aSXin Li float f;
323*a58d3d2aSXin Li opus_uint32 i;
324*a58d3d2aSXin Li } res;
325*a58d3d2aSXin Li integer = floor(x);
326*a58d3d2aSXin Li if (integer < -50)
327*a58d3d2aSXin Li return 0;
328*a58d3d2aSXin Li frac = x-integer;
329*a58d3d2aSXin Li /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
330*a58d3d2aSXin Li res.f = 0.99992522f + frac * (0.69583354f
331*a58d3d2aSXin Li + frac * (0.22606716f + 0.078024523f*frac));
332*a58d3d2aSXin Li res.i = (res.i + (integer<<23)) & 0x7fffffff;
333*a58d3d2aSXin Li return res.f;
334*a58d3d2aSXin Li }
335*a58d3d2aSXin Li #define lpcnet_exp(x) lpcnet_exp2((x)*1.44269504f)
336*a58d3d2aSXin Li
337*a58d3d2aSXin Li #define fmadd(a, b, c) ((a)*(b)+(c))
tanh_approx(float x)338*a58d3d2aSXin Li static OPUS_INLINE float tanh_approx(float x)
339*a58d3d2aSXin Li {
340*a58d3d2aSXin Li const float N0 = 952.52801514f;
341*a58d3d2aSXin Li const float N1 = 96.39235687f;
342*a58d3d2aSXin Li const float N2 = 0.60863042f;
343*a58d3d2aSXin Li const float D0 = 952.72399902f;
344*a58d3d2aSXin Li const float D1 = 413.36801147f;
345*a58d3d2aSXin Li const float D2 = 11.88600922f;
346*a58d3d2aSXin Li float X2, num, den;
347*a58d3d2aSXin Li X2 = x*x;
348*a58d3d2aSXin Li num = fmadd(fmadd(N2, X2, N1), X2, N0);
349*a58d3d2aSXin Li den = fmadd(fmadd(D2, X2, D1), X2, D0);
350*a58d3d2aSXin Li num = num*x/den;
351*a58d3d2aSXin Li return MAX32(-1.f, MIN32(1.f, num));
352*a58d3d2aSXin Li }
353*a58d3d2aSXin Li
/* Logistic sigmoid approximation built on tanh_approx():
 * sigmoid(x) = 0.5 + 0.5*tanh(x/2).
 */
static inline float sigmoid_approx(float x)
{
   float t;
   t = tanh_approx(.5f*x);
   return .5f + .5f*t;
}
358*a58d3d2aSXin Li
/* Writes the approximate exponential of each of the N inputs into y.
 *
 * Despite its name, this function only applies lpcnet_exp() element by
 * element; no normalisation by the sum is done here.
 */
static inline void softmax(float *y, const float *x, int N)
{
   int k = 0;
   while (k < N)
   {
      y[k] = lpcnet_exp(x[k]);
      k++;
   }
}
365*a58d3d2aSXin Li
/* Applies tanh_approx() to each of the N elements of x, storing into y. */
static inline void vec_tanh(float *y, const float *x, int N)
{
   int k = 0;
   while (k < N)
   {
      y[k] = tanh_approx(x[k]);
      k++;
   }
}
374*a58d3d2aSXin Li
/* Applies sigmoid_approx() to each of the N elements of x, storing into y. */
static inline void vec_sigmoid(float *y, const float *x, int N)
{
   int remaining;
   /* Walk both arrays front to back, one element per iteration. */
   for (remaining=N;remaining>0;remaining--)
   {
      *y++ = sigmoid_approx(*x++);
   }
}
383*a58d3d2aSXin Li #endif
384*a58d3d2aSXin Li
385*a58d3d2aSXin Li #define SCALE (128.f*127.f)
386*a58d3d2aSXin Li #define SCALE_1 (1.f/128.f/127.f)
387*a58d3d2aSXin Li
388*a58d3d2aSXin Li #endif /*no optimizations*/
389*a58d3d2aSXin Li #endif /*VEC_H*/
390