/******************************************************************************
 *
 *  Copyright 2022 Google LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at:
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

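/*
 * Plain C fallback for the ARM NEON intrinsics used by the LC3 test code.
 * When the compiler defines __ARM_NEON, the real <arm_neon.h> intrinsics are
 * used; otherwise the vector types and operations below are emulated lane
 * by lane.
 */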
#if __ARM_NEON

#include <arm_neon.h>

#else
#define __ARM_NEON 1

#include <stdint.h>


/* ----------------------------------------------------------------------------
 *  Integer
 * -------------------------------------------------------------------------- */

typedef struct { int16_t e[4]; } int16x4_t;

typedef struct { int16_t e[8]; } int16x8_t;
typedef struct { int32_t e[4]; } int32x4_t;
typedef struct { int64_t e[2]; } int64x2_t;


/**
 * Load / Store
 */

__attribute__((unused))
static int16x4_t vld1_s16(const int16_t *p)
{
    return (int16x4_t){ { p[0], p[1], p[2], p[3] } };
}


/**
 * Arithmetic
 */

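/* Widening multiply: each int16 lane product is returned as an int32 lane */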
__attribute__((unused))
static int32x4_t vmull_s16(int16x4_t a, int16x4_t b)
{
    return (int32x4_t){ { a.e[0] * b.e[0], a.e[1] * b.e[1],
                          a.e[2] * b.e[2], a.e[3] * b.e[3] } };
}

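/* Widening multiply-accumulate: r += a * b, lane by lane, into int32 lanes */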
__attribute__((unused))
static int32x4_t vmlal_s16(int32x4_t r, int16x4_t a, int16x4_t b)
{
    return (int32x4_t){ {
        r.e[0] + a.e[0] * b.e[0], r.e[1] + a.e[1] * b.e[1],
        r.e[2] + a.e[2] * b.e[2], r.e[3] + a.e[3] * b.e[3] } };
}

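/* Pairwise add and accumulate long: adjacent int32 lanes are summed into an
 * int64 and added to the corresponding accumulator lane */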
__attribute__((unused))
static int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
{
    int64x2_t r;

    r.e[0] = a.e[0] + ((int64_t)b.e[0] + b.e[1]);
    r.e[1] = a.e[1] + ((int64_t)b.e[2] + b.e[3]);

    return r;
}


/**
 * Reduce
 */

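/* Across-lane reductions: sum all lanes of a vector into a scalar */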
__attribute__((unused))
static int32_t vaddvq_s32(int32x4_t v)
{
    return v.e[0] + v.e[1] + v.e[2] + v.e[3];
}

__attribute__((unused))
static int64_t vaddvq_s64(int64x2_t v)
{
    return v.e[0] + v.e[1];
}


/**
 * Manipulation
 */

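/* Extract a vector from the concatenation { a, b }, starting at lane n */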
__attribute__((unused))
static int16x4_t vext_s16(int16x4_t a, int16x4_t b, const int n)
{
    int16_t x[] = { a.e[0], a.e[1], a.e[2], a.e[3],
                    b.e[0], b.e[1], b.e[2], b.e[3] };

    return (int16x4_t){ { x[n], x[n+1], x[n+2], x[n+3] } };
}

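/* Broadcast a scalar value to every lane of the vector */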
__attribute__((unused))
static int32x4_t vmovq_n_s32(int32_t v)
{
    return (int32x4_t){ { v, v, v, v } };
}

__attribute__((unused))
static int64x2_t vmovq_n_s64(int64_t v)
{
    return (int64x2_t){ { v, v } };
}




/* ----------------------------------------------------------------------------
 *  Floating Point
 * -------------------------------------------------------------------------- */

typedef struct { float e[2]; } float32x2_t;
typedef struct { float e[4]; } float32x4_t;

typedef struct { float32x2_t val[2]; } float32x2x2_t;
typedef struct { float32x4_t val[2]; } float32x4x2_t;


/**
 * Load / Store
 */

__attribute__((unused))
static float32x2_t vld1_f32(const float *p)
{
    return (float32x2_t){ { p[0], p[1] } };
}

__attribute__((unused))
static float32x4_t vld1q_f32(const float *p)
{
    return (float32x4_t){ { p[0], p[1], p[2], p[3] } };
}

__attribute__((unused))
static float32x4_t vld1q_dup_f32(const float *p)
{
    return (float32x4_t){ { p[0], p[0], p[0], p[0] } };
}

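/* De-interleaving loads: even-indexed floats go to val[0], odd-indexed
 * floats to val[1] */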
__attribute__((unused))
static float32x2x2_t vld2_f32(const float *p)
{
    return (float32x2x2_t){ .val[0] = { { p[0], p[2] } },
                            .val[1] = { { p[1], p[3] } } };
}

__attribute__((unused))
static float32x4x2_t vld2q_f32(const float *p)
{
    return (float32x4x2_t){ .val[0] = { { p[0], p[2], p[4], p[6] } },
                            .val[1] = { { p[1], p[3], p[5], p[7] } } };
}

__attribute__((unused))
static void vst1_f32(float *p, float32x2_t v)
{
    p[0] = v.e[0], p[1] = v.e[1];
}

__attribute__((unused))
static void vst1q_f32(float *p, float32x4_t v)
{
    p[0] = v.e[0], p[1] = v.e[1], p[2] = v.e[2], p[3] = v.e[3];
}

/**
 * Arithmetic
 */

__attribute__((unused))
static float32x2_t vneg_f32(float32x2_t a)
{
    return (float32x2_t){ { -a.e[0], -a.e[1] } };
}

__attribute__((unused))
static float32x4_t vnegq_f32(float32x4_t a)
{
    return (float32x4_t){ { -a.e[0], -a.e[1], -a.e[2], -a.e[3] } };
}

__attribute__((unused))
static float32x4_t vaddq_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0] + b.e[0], a.e[1] + b.e[1],
                            a.e[2] + b.e[2], a.e[3] + b.e[3] } };
}

__attribute__((unused))
static float32x4_t vsubq_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0] - b.e[0], a.e[1] - b.e[1],
                            a.e[2] - b.e[2], a.e[3] - b.e[3] } };
}

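/* vfma / vfms emulations: computed as a +/- b * c with an ordinary multiply
 * and add, i.e. without the single rounding of a true fused multiply-add */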
__attribute__((unused))
static float32x2_t vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    return (float32x2_t){ {
        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1] } };
}

__attribute__((unused))
static float32x4_t vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return (float32x4_t){ {
        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1],
        a.e[2] + b.e[2] * c.e[2], a.e[3] + b.e[3] * c.e[3] } };
}

__attribute__((unused))
static float32x2_t vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    return (float32x2_t){ {
        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1] } };
}

__attribute__((unused))
static float32x4_t vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return (float32x4_t){ {
        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1],
        a.e[2] - b.e[2] * c.e[2], a.e[3] - b.e[3] * c.e[3] } };
}


/**
 * Manipulation
 */

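/* Reinterpret a 64-bit value as two packed floats; lane 0 comes from the
 * low-order 32 bits on little-endian hosts */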
__attribute__((unused))
static float32x2_t vcreate_f32(uint64_t u)
{
    /* Type-pun through a union instead of casting &u to a float pointer,
     * which would violate strict aliasing */
    union { uint64_t u; float f[2]; } x = { .u = u };
    return (float32x2_t){ { x.f[0], x.f[1] } };
}

__attribute__((unused))
static float32x4_t vcombine_f32(float32x2_t a, float32x2_t b)
{
    return (float32x4_t){ { a.e[0], a.e[1], b.e[0], b.e[1] } };
}

__attribute__((unused))
static float32x2_t vget_low_f32(float32x4_t a)
{
    return (float32x2_t){ { a.e[0], a.e[1] } };
}

__attribute__((unused))
static float32x2_t vget_high_f32(float32x4_t a)
{
    return (float32x2_t){ { a.e[2], a.e[3] } };
}

__attribute__((unused))
static float32x4_t vmovq_n_f32(float v)
{
    return (float32x4_t){ { v, v, v, v } };
}

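/* Reverse the element order within each 64-bit doubleword */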
__attribute__((unused))
static float32x2_t vrev64_f32(float32x2_t v)
{
    return (float32x2_t){ { v.e[1], v.e[0] } };
}

__attribute__((unused))
static float32x4_t vrev64q_f32(float32x4_t v)
{
    return (float32x4_t){ { v.e[1], v.e[0], v.e[3], v.e[2] } };
}

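/* Transpose primitives: vtrn1 interleaves the even-indexed lanes of a and b,
 * vtrn2 the odd-indexed lanes */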
__attribute__((unused))
static float32x2_t vtrn1_f32(float32x2_t a, float32x2_t b)
{
    return (float32x2_t){ { a.e[0], b.e[0] } };
}

__attribute__((unused))
static float32x2_t vtrn2_f32(float32x2_t a, float32x2_t b)
{
    return (float32x2_t){ { a.e[1], b.e[1] } };
}

__attribute__((unused))
static float32x4_t vtrn1q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0], b.e[0], a.e[2], b.e[2] } };
}

__attribute__((unused))
static float32x4_t vtrn2q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[1], b.e[1], a.e[3], b.e[3] } };
}

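/* Interleave the low (vzip1q) or high (vzip2q) halves of a and b */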
__attribute__((unused))
static float32x4_t vzip1q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0], b.e[0], a.e[1], b.e[1] } };
}

__attribute__((unused))
static float32x4_t vzip2q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[2], b.e[2], a.e[3], b.e[3] } };
}


#endif /* __ARM_NEON */