xref: /aosp_15_r20/external/libopus/dnn/nnet_arch.h (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc)
1 /* Copyright (c) 2018-2019 Mozilla
2                  2023 Amazon */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7 
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10 
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14 
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 
28 #ifndef NNET_ARCH_H
29 #define NNET_ARCH_H
30 
31 #include "nnet.h"
32 #include "arch.h"
33 #include "os_support.h"
34 #include "vec.h"
35 
36 #define CAT_SUFFIX2(a,b) a ## b
37 #define CAT_SUFFIX(a,b) CAT_SUFFIX2(a, b)
38 
39 #define RTCD_SUF(name) CAT_SUFFIX(name, RTCD_ARCH)
40 
41 /* Force vectorization on for DNN code because some of the loops rely on
42    compiler vectorization rather than explicitly using intrinsics. */
43 #if OPUS_GNUC_PREREQ(5,1)
44 #define GCC_POP_OPTIONS
45 #pragma GCC push_options
46 #pragma GCC optimize("tree-vectorize")
47 #endif
48 
49 
50 #define MAX_ACTIVATIONS (4096)
51 
vec_swish(float * y,const float * x,int N)52 static OPUS_INLINE void vec_swish(float *y, const float *x, int N)
53 {
54    int i;
55    float tmp[MAX_ACTIVATIONS];
56    celt_assert(N <= MAX_ACTIVATIONS);
57    vec_sigmoid(tmp, x, N);
58    for (i=0;i<N;i++)
59       y[i] = x[i]*tmp[i];
60 }
61 
relu(float x)62 static OPUS_INLINE float relu(float x)
63 {
64    return x < 0 ? 0 : x;
65 }
66 
67 /*#define HIGH_ACCURACY */
68 
RTCD_SUF(compute_activation_)69 void RTCD_SUF(compute_activation_)(float *output, const float *input, int N, int activation)
70 {
71    int i;
72    if (activation == ACTIVATION_SIGMOID) {
73 #ifdef HIGH_ACCURACY
74       for (int n=0; n<N; n++)
75       {
76          output[n] = 1.f  / (1 + exp(-input[n]));
77       }
78 #else
79       vec_sigmoid(output, input, N);
80 #endif
81    } else if (activation == ACTIVATION_TANH) {
82 #ifdef HIGH_ACCURACY
83       for (int n=0; n<N; n++)
84       {
85          output[n] = tanh(input[n]);
86       }
87 #else
88       vec_tanh(output, input, N);
89 #endif
90    } else if (activation == ACTIVATION_SWISH) {
91       vec_swish(output, input, N);
92    } else if (activation == ACTIVATION_RELU) {
93       for (i=0;i<N;i++)
94          output[i] = relu(input[i]);
95    } else if (activation == ACTIVATION_SOFTMAX) {
96 #ifdef SOFTMAX_HACK
97       OPUS_COPY(output, input, N);
98       /*for (i=0;i<N;i++)
99          output[i] = input[i];*/
100 #else
101       float sum = 0;
102       softmax(output, input, N);
103       for (i=0;i<N;i++) {
104          sum += output[i];
105       }
106       sum = 1.f/(sum+1e-30);
107       for (i=0;i<N;i++)
108          output[i] = sum*output[i];
109 #endif
110    } else {
111       celt_assert(activation == ACTIVATION_LINEAR);
112       if (input != output) {
113          for (i=0;i<N;i++)
114             output[i] = input[i];
115       }
116    }
117 }
118 
119 
/* Computes out = weights*in + bias for one linear (fully-connected) layer
   with M = nb_inputs and N = nb_outputs, dispatching to the matching
   matrix-vector kernel: float vs quantized (int8) weights, block-sparse
   (weights_idx != NULL) vs dense. */
void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const float *in)
{
   int i, M, N;
   const float *bias;
   /* The GEMV kernels below do not support in-place operation. */
   celt_assert(in != out);
   bias = linear->bias;
   M = linear->nb_inputs;
   N = linear->nb_outputs;
   if (linear->float_weights != NULL) {
     /* Float weights: block-sparse 8x4 kernel or dense GEMV. */
     if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
     else sgemv(out, linear->float_weights, N, M, N, in);
   } else if (linear->weights != NULL) {
     /* Quantized (int8) weights, with a per-layer scale factor. */
     if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
     else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
     /* Only use SU biases for integer matrices on SU archs. */
#ifdef USE_SU_BIAS
     bias = linear->subias;
#endif
   }
   /* No weights at all: output is just the bias (if any). */
   else OPUS_CLEAR(out, N);
   if (bias != NULL) {
      for (i=0;i<N;i++) out[i] += bias[i];
   }
   if (linear->diag) {
      /* Diag is only used for GRU recurrent weights: three diagonal
         blocks (update/reset/candidate gates), hence N == 3*M. */
      celt_assert(3*M == N);
      for (i=0;i<M;i++) {
         out[i] += linear->diag[i]*in[i];
         out[i+M] += linear->diag[i+M]*in[i];
         out[i+2*M] += linear->diag[i+2*M]*in[i];
      }
   }
}
153 
/* Computes non-padded convolution for input [ ksize1 x in_channels x (len2+ksize2) ],
   kernel [ out_channels x in_channels x ksize1 x ksize2 ],
   storing the output as [ out_channels x len2 ].
   We assume that the output dimension along the ksize1 axis is 1,
   i.e. processing one frame at a time. */
static void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int height, int hstride)
{
   int c_out;
   const int in_stride = height + kheight - 1;
   for (c_out=0;c_out<out_channels;c_out++) {
      int c_in;
      float *dst = &out[c_out*hstride];
      dst = dst; /* (kept explicit for readability below) */
      OPUS_CLEAR(dst, height);
      for (c_in=0;c_in<in_channels;c_in++) {
         int t;
         for (t=0;t<ktime;t++) {
            int h;
            /* Hoist the per-(c_out,c_in,t) weight row and input row. */
            const float *w_row = &weights[((c_out*in_channels + c_in)*ktime + t)*kheight];
            const float *in_row = &in[(t*in_channels + c_in)*in_stride];
            for (h=0;h<kheight;h++) {
               int j;
               const float w = w_row[h];
               /* Inner loop is written to auto-vectorize. */
               for (j=0;j<height;j++) {
                  dst[j] += w * in_row[j + h];
               }
            }
         }
      }
   }
}
182 
/* There are no intrinsics in this function (or the one above) because the gcc
   (and hopefully other compiler) auto-vectorizer is smart enough to produce
   the right code by itself based on the compile flags. */
static void conv2d_3x3_float(float *out, const float *weights, int in_channels, int out_channels, const float *in, int height, int hstride)
{
   int i;
   /* Specialized for a fixed 3 (time) x 3 (height) kernel. */
   const int ksize = 3;
   const int in_stride = height + ksize - 1;
   for (i=0;i<out_channels;i++) {
      int m;
      float *dst = &out[i*hstride];
      OPUS_CLEAR(dst, height);
      for (m=0;m<in_channels;m++) {
         int j;
         /* 9 kernel taps for this (output, input) channel pair, and the
            three input rows (one per time step) they apply to. */
         const float *w = &weights[(i*in_channels + m)*ksize*ksize];
         const float *x0 = &in[(0*in_channels + m)*in_stride];
         const float *x1 = &in[(1*in_channels + m)*in_stride];
         const float *x2 = &in[(2*in_channels + m)*in_stride];
         for (j=0;j<height;j++) {
            /* Same left-to-right accumulation order as the generic version. */
            dst[j] += w[0]*x0[j] + w[1]*x0[j+1] + w[2]*x0[j+2]
                    + w[3]*x1[j] + w[4]*x1[j+1] + w[5]*x1[j+2]
                    + w[6]*x2[j] + w[7]*x2[j+1] + w[8]*x2[j+2];
         }
      }
   }
}
212 
213 #define MAX_CONV2D_INPUTS 8192
214 
/* Runs one 2D convolution layer on a single new frame of input,
   maintaining a sliding window of the last ktime-1 frames in mem,
   then adds the bias and applies the requested activation per
   output channel.  out is laid out as [ out_channels x height ]
   with row stride hstride. */
void RTCD_SUF(compute_conv2d_)(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation)
{
   int i;
   const float *bias;
   float in_buf[MAX_CONV2D_INPUTS];
   int time_stride;
   celt_assert(in != out);
   /* Floats per time step: all input channels, padded by kheight-1 along height. */
   time_stride = conv->in_channels*(height+conv->kheight-1);
   celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS);
   /* Assemble the full window: ktime-1 history steps from mem, then the new frame. */
   OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride);
   OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride);
   /* Slide the history: drop the oldest step, keep the most recent ktime-1.
      (Reads from in_buf, so this is safe before the convolution below.) */
   OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride);
   bias = conv->bias;
   /* Fast path for the common 3x3 kernel; generic loop otherwise. */
   if (conv->kheight == 3 && conv->ktime == 3)
     conv2d_3x3_float(out, conv->float_weights, conv->in_channels, conv->out_channels, in_buf, height, hstride);
   else
     conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride);
   if (bias != NULL) {
     /* One bias value per output channel, added across the whole row. */
     for (i=0;i<conv->out_channels;i++) {
       int j;
       for (j=0;j<height;j++) out[i*hstride+j] += bias[i];
     }
   }
   /* Activation applied in place, one output-channel row at a time. */
   for (i=0;i<conv->out_channels;i++) {
     RTCD_SUF(compute_activation_)(&out[i*hstride], &out[i*hstride], height, activation);
   }
}
242 
243 #ifdef GCC_POP_OPTIONS
244 #pragma GCC pop_options
245 #endif
246 
247 #endif
248