/* Copyright (c) 2018-2019 Mozilla
                 2023 Amazon */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef NNET_ARCH_H
#define NNET_ARCH_H

#include "nnet.h"
#include "arch.h"
#include "os_support.h"
#include "vec.h"

#define CAT_SUFFIX2(a,b) a ## b
#define CAT_SUFFIX(a,b) CAT_SUFFIX2(a, b)

#define RTCD_SUF(name) CAT_SUFFIX(name, RTCD_ARCH)

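/* RTCD_SUF() appends the architecture suffix to a function name so the same
   source can be compiled once per SIMD target. As a hedged example (the
   actual suffix depends on how the including .c file defines RTCD_ARCH):
   with "#define RTCD_ARCH c", RTCD_SUF(compute_linear_) expands to
   compute_linear_c via the two-level CAT_SUFFIX token paste. */
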
/* Force vectorization on for DNN code because some of the loops rely on
   compiler vectorization rather than explicitly using intrinsics. */
#if OPUS_GNUC_PREREQ(5,1)
#define GCC_POP_OPTIONS
#pragma GCC push_options
#pragma GCC optimize("tree-vectorize")
#endif


#define MAX_ACTIVATIONS (4096)

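/* Swish activation computed element-wise: y[i] = x[i]*sigmoid(x[i]), using
   the vectorized vec_sigmoid() for the sigmoid part and a scratch buffer
   capped at MAX_ACTIVATIONS elements. */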
static OPUS_INLINE void vec_swish(float *y, const float *x, int N)
{
   int i;
   float tmp[MAX_ACTIVATIONS];
   celt_assert(N <= MAX_ACTIVATIONS);
   vec_sigmoid(tmp, x, N);
   for (i=0;i<N;i++)
      y[i] = x[i]*tmp[i];
}

static OPUS_INLINE float relu(float x)
{
   return x < 0 ? 0 : x;
}

/*#define HIGH_ACCURACY */

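/* Applies the selected activation (sigmoid, tanh, swish, ReLU, softmax or
   linear) element-wise over N values; the vec_* kernels are used unless
   HIGH_ACCURACY is defined, in which case exact scalar math is used for
   sigmoid and tanh. */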
void RTCD_SUF(compute_activation_)(float *output, const float *input, int N, int activation)
{
   int i;
   if (activation == ACTIVATION_SIGMOID) {
#ifdef HIGH_ACCURACY
      for (int n=0; n<N; n++)
      {
         output[n] = 1.f / (1 + exp(-input[n]));
      }
#else
      vec_sigmoid(output, input, N);
#endif
   } else if (activation == ACTIVATION_TANH) {
#ifdef HIGH_ACCURACY
      for (int n=0; n<N; n++)
      {
         output[n] = tanh(input[n]);
      }
#else
      vec_tanh(output, input, N);
#endif
   } else if (activation == ACTIVATION_SWISH) {
      vec_swish(output, input, N);
   } else if (activation == ACTIVATION_RELU) {
      for (i=0;i<N;i++)
         output[i] = relu(input[i]);
   } else if (activation == ACTIVATION_SOFTMAX) {
#ifdef SOFTMAX_HACK
      OPUS_COPY(output, input, N);
      /*for (i=0;i<N;i++)
         output[i] = input[i];*/
#else
      float sum = 0;
      softmax(output, input, N);
      for (i=0;i<N;i++) {
         sum += output[i];
      }
      sum = 1.f/(sum+1e-30);
      for (i=0;i<N;i++)
         output[i] = sum*output[i];
#endif
   } else {
      celt_assert(activation == ACTIVATION_LINEAR);
      if (input != output) {
         for (i=0;i<N;i++)
            output[i] = input[i];
      }
   }
}

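/* Dense/sparse matrix-vector product for a LinearLayer: uses the float GEMV
   path when float_weights is present, the quantized 8-bit path when only the
   int8 weights are present (optionally with the SU bias variant), and falls
   back to clearing the output when the layer has no weights. The optional
   diag term adds the per-gate diagonal recurrent weights used by GRUs. */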
void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const float *in)
{
   int i, M, N;
   const float *bias;
   celt_assert(in != out);
   bias = linear->bias;
   M = linear->nb_inputs;
   N = linear->nb_outputs;
   if (linear->float_weights != NULL) {
      if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
      else sgemv(out, linear->float_weights, N, M, N, in);
   } else if (linear->weights != NULL) {
      if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
      else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
      /* Only use SU biases for integer matrices on SU archs. */
#ifdef USE_SU_BIAS
      bias = linear->subias;
#endif
   }
   else OPUS_CLEAR(out, N);
   if (bias != NULL) {
      for (i=0;i<N;i++) out[i] += bias[i];
   }
   if (linear->diag) {
      /* Diag is only used for GRU recurrent weights. */
      celt_assert(3*M == N);
      for (i=0;i<M;i++) {
         out[i] += linear->diag[i]*in[i];
         out[i+M] += linear->diag[i+M]*in[i];
         out[i+2*M] += linear->diag[i+2*M]*in[i];
      }
   }
}

/* Computes non-padded convolution for input [ ksize1 x in_channels x (len2+ksize2) ],
   kernel [ out_channels x in_channels x ksize1 x ksize2 ],
   storing the output as [ out_channels x len2 ].
   We assume that the output dimension along the ksize1 axis is 1,
   i.e. processing one frame at a time. */
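/* In index form, with the weights laid out as
   [out_channels][in_channels][ktime][kheight] and the input as
   [ktime][in_channels][height+kheight-1]:
      out[i][j] = sum_{m,t,h} weights[i][m][t][h] * in[t][m][j+h]
   which is what the nested loops below compute. */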
static void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int height, int hstride)
{
   int i;
   int in_stride;
   in_stride = height+kheight-1;
   for (i=0;i<out_channels;i++) {
      int m;
      OPUS_CLEAR(&out[i*hstride], height);
      for (m=0;m<in_channels;m++) {
         int t;
         for (t=0;t<ktime;t++) {
            int h;
            for (h=0;h<kheight;h++) {
               int j;
               for (j=0;j<height;j++) {
                  out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + t*kheight + h] *
                                        in[t*in_channels*in_stride + m*in_stride + j + h];
               }
            }
         }
      }
   }
}

/* There are no intrinsics in this function (or the one above) because the gcc (and hopefully other compilers') auto-vectorizer is smart enough to
   produce the right code by itself based on the compile flags. */
static void conv2d_3x3_float(float *out, const float *weights, int in_channels, int out_channels, const float *in, int height, int hstride)
{
   int i;
   int in_stride;
   int kheight, ktime;
   kheight = ktime = 3;
   in_stride = height+kheight-1;
   for (i=0;i<out_channels;i++) {
      int m;
      OPUS_CLEAR(&out[i*hstride], height);
      for (m=0;m<in_channels;m++) {
         int j;
         for (j=0;j<height;j++) {
            /* Unrolled version of previous function -- compiler will figure out the indexing simplifications. */
            out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 0]*in[0*in_channels*in_stride + m*in_stride + j + 0]
                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 1]*in[0*in_channels*in_stride + m*in_stride + j + 1]
                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 2]*in[0*in_channels*in_stride + m*in_stride + j + 2]
                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 0]*in[1*in_channels*in_stride + m*in_stride + j + 0]
                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 1]*in[1*in_channels*in_stride + m*in_stride + j + 1]
                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 2]*in[1*in_channels*in_stride + m*in_stride + j + 2]
                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 0]*in[2*in_channels*in_stride + m*in_stride + j + 0]
                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 1]*in[2*in_channels*in_stride + m*in_stride + j + 1]
                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 2]*in[2*in_channels*in_stride + m*in_stride + j + 2];
         }
      }
   }
}

#define MAX_CONV2D_INPUTS 8192

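/* Streaming 2D convolution over one new frame: in_buf is assembled as
   [ previous (ktime-1) frames from mem | current frame from in ], mem is then
   shifted forward by one frame, and the 3x3 case dispatches to the unrolled
   kernel. Bias and the requested activation are applied per output channel. */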
void RTCD_SUF(compute_conv2d_)(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation)
{
   int i;
   const float *bias;
   float in_buf[MAX_CONV2D_INPUTS];
   int time_stride;
   celt_assert(in != out);
   time_stride = conv->in_channels*(height+conv->kheight-1);
   celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS);
   OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride);
   OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride);
   OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride);
   bias = conv->bias;
   if (conv->kheight == 3 && conv->ktime == 3)
      conv2d_3x3_float(out, conv->float_weights, conv->in_channels, conv->out_channels, in_buf, height, hstride);
   else
      conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride);
   if (bias != NULL) {
      for (i=0;i<conv->out_channels;i++) {
         int j;
         for (j=0;j<height;j++) out[i*hstride+j] += bias[i];
      }
   }
   for (i=0;i<conv->out_channels;i++) {
      RTCD_SUF(compute_activation_)(&out[i*hstride], &out[i*hstride], height, activation);
   }
}

#ifdef GCC_POP_OPTIONS
#pragma GCC pop_options
#endif

#endif