xref: /btstack/3rd-party/lc3-google/test/neon/neon.h (revision 4930cef6e21e6da2d7571b9259c7f0fb8bed3d01)
1*4930cef6SMatthias Ringwald /******************************************************************************
2*4930cef6SMatthias Ringwald  *
3*4930cef6SMatthias Ringwald  *  Copyright 2022 Google LLC
4*4930cef6SMatthias Ringwald  *
5*4930cef6SMatthias Ringwald  *  Licensed under the Apache License, Version 2.0 (the "License");
6*4930cef6SMatthias Ringwald  *  you may not use this file except in compliance with the License.
7*4930cef6SMatthias Ringwald  *  You may obtain a copy of the License at:
8*4930cef6SMatthias Ringwald  *
9*4930cef6SMatthias Ringwald  *  http://www.apache.org/licenses/LICENSE-2.0
10*4930cef6SMatthias Ringwald  *
11*4930cef6SMatthias Ringwald  *  Unless required by applicable law or agreed to in writing, software
12*4930cef6SMatthias Ringwald  *  distributed under the License is distributed on an "AS IS" BASIS,
13*4930cef6SMatthias Ringwald  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*4930cef6SMatthias Ringwald  *  See the License for the specific language governing permissions and
15*4930cef6SMatthias Ringwald  *  limitations under the License.
16*4930cef6SMatthias Ringwald  *
17*4930cef6SMatthias Ringwald  ******************************************************************************/
18*4930cef6SMatthias Ringwald 
19*4930cef6SMatthias Ringwald #if __ARM_NEON
20*4930cef6SMatthias Ringwald 
21*4930cef6SMatthias Ringwald #include <arm_neon.h>
22*4930cef6SMatthias Ringwald 
23*4930cef6SMatthias Ringwald #else
24*4930cef6SMatthias Ringwald #define __ARM_NEON 1
25*4930cef6SMatthias Ringwald 
26*4930cef6SMatthias Ringwald #include <stdint.h>
27*4930cef6SMatthias Ringwald 
28*4930cef6SMatthias Ringwald 
/* ----------------------------------------------------------------------------
 *  Integer
 * -------------------------------------------------------------------------- */

/* 64-bit vector: 4 lanes of signed 16-bit */
typedef struct { int16_t e[4]; } int16x4_t;

/* 128-bit vectors: 8 x s16, 4 x s32, 2 x s64 */
typedef struct { int16_t e[8]; } int16x8_t;
typedef struct { int32_t e[4]; } int32x4_t;
typedef struct { int64_t e[2]; } int64x2_t;


/**
 * Load / Store
 */

/* Load 4 consecutive s16 values from `p` (emulates NEON vld1_s16) */
__attribute__((unused))
static int16x4_t vld1_s16(const int16_t *p)
{
    return (int16x4_t){ { p[0], p[1], p[2], p[3] } };
}


/**
 * Arithmetic
 */

/* Per-lane widening multiply s16 x s16 -> s32 (vmull_s16).
 * Operands promote to `int`, so the 32-bit product cannot overflow. */
__attribute__((unused))
static int32x4_t vmull_s16(int16x4_t a, int16x4_t b)
{
    return (int32x4_t){ { a.e[0] * b.e[0], a.e[1] * b.e[1],
                          a.e[2] * b.e[2], a.e[3] * b.e[3]  } };
}

/* Per-lane widening multiply-accumulate r + a * b (vmlal_s16) */
__attribute__((unused))
static int32x4_t vmlal_s16(int32x4_t r, int16x4_t a, int16x4_t b)
{
    return (int32x4_t){ {
        r.e[0] + a.e[0] * b.e[0], r.e[1] + a.e[1] * b.e[1],
        r.e[2] + a.e[2] * b.e[2], r.e[3] + a.e[3] * b.e[3] } };
}

/* Pairwise add lanes of `b`, widened to s64, accumulated into `a`
 * (vpadalq_s32). Each s32 pair is summed in 64-bit to avoid overflow. */
__attribute__((unused))
static int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
{
    int64x2_t r;

    r.e[0] = a.e[0] + ((int64_t)b.e[0] + b.e[1]);
    r.e[1] = a.e[1] + ((int64_t)b.e[2] + b.e[3]);

    return r;
}


/**
 * Reduce
 */

/* Sum of all 4 lanes (vaddvq_s32) */
__attribute__((unused))
static int32_t vaddvq_s32(int32x4_t v)
{
    return v.e[0] + v.e[1] + v.e[2] + v.e[3];
}

/* Sum of both lanes (vaddvq_s64) */
__attribute__((unused))
static int64_t vaddvq_s64(int64x2_t v)
{
    return v.e[0] + v.e[1];
}


/**
 * Manipulation
 */

/* 4-lane window starting at lane `n` of the concatenation a:b
 * (vext_s16); `n` must be in [0,3]. */
__attribute__((unused))
static int16x4_t vext_s16(int16x4_t a, int16x4_t b, const int n)
{
    int16_t x[] = { a.e[0], a.e[1], a.e[2], a.e[3],
                    b.e[0], b.e[1], b.e[2], b.e[3] };

    return (int16x4_t){ { x[n], x[n+1], x[n+2], x[n+3] } };
}

/* Broadcast a scalar to all 4 lanes (vmovq_n_s32).
 * Fixed: parameter type is `int32_t`, matching the ARM intrinsic —
 * the previous `uint32_t` put negative arguments through an
 * implementation-defined unsigned-to-signed conversion. */
__attribute__((unused))
static int32x4_t vmovq_n_s32(int32_t v)
{
    return (int32x4_t){ { v, v, v, v } };
}

/* Broadcast a scalar to both lanes (vmovq_n_s64) */
__attribute__((unused))
static int64x2_t vmovq_n_s64(int64_t v)
{
    return (int64x2_t){ { v, v, } };
}
123*4930cef6SMatthias Ringwald 
124*4930cef6SMatthias Ringwald 
125*4930cef6SMatthias Ringwald 
/* ----------------------------------------------------------------------------
 *  Floating Point
 * -------------------------------------------------------------------------- */

/* 64-bit vector: 2 lanes of f32 / 128-bit vector: 4 lanes of f32 */
typedef struct { float e[2]; } float32x2_t;
typedef struct { float e[4]; } float32x4_t;

/* Pairs of vectors, as returned by the de-interleaving loads */
typedef struct { float32x2_t val[2]; } float32x2x2_t;
typedef struct { float32x4_t val[2]; } float32x4x2_t;


/**
 * Load / Store
 */

/* Load 2 consecutive f32 values from `p` (vld1_f32) */
__attribute__((unused))
static float32x2_t vld1_f32(const float *p)
{
    return (float32x2_t){ { p[0], p[1] } };
}

/* Load 4 consecutive f32 values from `p` (vld1q_f32) */
__attribute__((unused))
static float32x4_t vld1q_f32(const float *p)
{
    return (float32x4_t){ { p[0], p[1], p[2], p[3] } };
}

/* Broadcast `p[0]` to all 4 lanes (vld1q_dup_f32) */
__attribute__((unused))
static float32x4_t vld1q_dup_f32(const float *p)
{
    return (float32x4_t){ { p[0], p[0], p[0], p[0] } };
}

/* De-interleaving load of 4 values: even indices to val[0],
 * odd indices to val[1] (vld2_f32) */
__attribute__((unused))
static float32x2x2_t vld2_f32(const float *p)
{
    return (float32x2x2_t){ .val[0] = { { p[0], p[2] } },
                            .val[1] = { { p[1], p[3] } } };
}

/* De-interleaving load of 8 values: even indices to val[0],
 * odd indices to val[1] (vld2q_f32) */
__attribute__((unused))
static float32x4x2_t vld2q_f32(const float *p)
{
    return (float32x4x2_t){ .val[0] = { { p[0], p[2], p[4], p[6] } },
                            .val[1] = { { p[1], p[3], p[5], p[7] } } };
}

/* Store 2 lanes to consecutive memory (vst1_f32) */
__attribute__((unused))
static void vst1_f32(float *p, float32x2_t v)
{
    p[0] = v.e[0], p[1] = v.e[1];
}

/* Store 4 lanes to consecutive memory (vst1q_f32) */
__attribute__((unused))
static void vst1q_f32(float *p, float32x4_t v)
{
    p[0] = v.e[0], p[1] = v.e[1], p[2] = v.e[2], p[3] = v.e[3];
}

/**
 * Arithmetic
 */

/* Per-lane negation (vneg_f32) */
__attribute__((unused))
static float32x2_t vneg_f32(float32x2_t a)
{
    return (float32x2_t){ { -a.e[0], -a.e[1] } };
}

/* Per-lane negation (vnegq_f32) */
__attribute__((unused))
static float32x4_t vnegq_f32(float32x4_t a)
{
    return (float32x4_t){ { -a.e[0], -a.e[1], -a.e[2], -a.e[3] } };
}

/* Per-lane addition (vaddq_f32) */
__attribute__((unused))
static float32x4_t vaddq_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0] + b.e[0], a.e[1] + b.e[1],
                            a.e[2] + b.e[2], a.e[3] + b.e[3] } };
}

/* Per-lane subtraction (vsubq_f32) */
__attribute__((unused))
static float32x4_t vsubq_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0] - b.e[0], a.e[1] - b.e[1],
                            a.e[2] - b.e[2], a.e[3] - b.e[3] } };
}

/* Per-lane multiply-add a + b * c (vfma_f32).
 * NOTE(review): computed as separate multiply and add, not a fused
 * single-rounding FMA — may differ from hardware in the last ulp. */
__attribute__((unused))
static float32x2_t vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    return (float32x2_t){ {
        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1] } };
}

/* Per-lane multiply-add a + b * c (vfmaq_f32); see vfma_f32 note */
__attribute__((unused))
static float32x4_t vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return (float32x4_t){ {
        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1],
        a.e[2] + b.e[2] * c.e[2], a.e[3] + b.e[3] * c.e[3] } };
}

/* Per-lane multiply-subtract a - b * c (vfms_f32); see vfma_f32 note */
__attribute__((unused))
static float32x2_t vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    return (float32x2_t){ {
        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1] } };
}

/* Per-lane multiply-subtract a - b * c (vfmsq_f32); see vfma_f32 note */
__attribute__((unused))
static float32x4_t vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return (float32x4_t){ {
        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1],
        a.e[2] - b.e[2] * c.e[2], a.e[3] - b.e[3] * c.e[3] } };
}


/**
 * Manipulation
 */

/* Reinterpret a 64-bit scalar as 2 f32 lanes (vcreate_f32).
 * Fixed: type-pun through a union instead of dereferencing `u`
 * through a `float *`, which violated strict aliasing (UB). */
__attribute__((unused))
static float32x2_t vcreate_f32(uint64_t u)
{
    union { uint64_t u64; float f[2]; } x = { .u64 = u };
    return (float32x2_t){ { x.f[0], x.f[1] } };
}

/* Concatenate two 2-lane vectors into a 4-lane vector (vcombine_f32) */
__attribute__((unused))
static float32x4_t vcombine_f32(float32x2_t a, float32x2_t b)
{
    return (float32x4_t){ { a.e[0], a.e[1], b.e[0], b.e[1] } };
}

/* Lower half (lanes 0-1) of a 4-lane vector (vget_low_f32) */
__attribute__((unused))
static float32x2_t vget_low_f32(float32x4_t a)
{
    return (float32x2_t){ { a.e[0], a.e[1] } };
}

/* Upper half (lanes 2-3) of a 4-lane vector (vget_high_f32) */
__attribute__((unused))
static float32x2_t vget_high_f32(float32x4_t a)
{
    return (float32x2_t){ { a.e[2], a.e[3] } };
}

/* Broadcast a scalar to all 4 lanes (vmovq_n_f32) */
__attribute__((unused))
static float32x4_t vmovq_n_f32(float v)
{
    return (float32x4_t){ { v, v, v, v } };
}

/* Reverse the 2 lanes within the 64-bit vector (vrev64_f32) */
__attribute__((unused))
static float32x2_t vrev64_f32(float32x2_t v)
{
    return (float32x2_t){ { v.e[1], v.e[0] } };
}

/* Reverse lanes within each 64-bit half (vrev64q_f32) */
__attribute__((unused))
static float32x4_t vrev64q_f32(float32x4_t v)
{
    return (float32x4_t){ { v.e[1], v.e[0], v.e[3], v.e[2] } };
}

/* Transpose: even lanes of a and b (vtrn1_f32) */
__attribute__((unused))
static float32x2_t vtrn1_f32(float32x2_t a, float32x2_t b)
{
    return (float32x2_t){ { a.e[0], b.e[0] } };
}

/* Transpose: odd lanes of a and b (vtrn2_f32) */
__attribute__((unused))
static float32x2_t vtrn2_f32(float32x2_t a, float32x2_t b)
{
    return (float32x2_t){ { a.e[1], b.e[1] } };
}

/* Transpose: interleaved even lanes of a and b (vtrn1q_f32) */
__attribute__((unused))
static float32x4_t vtrn1q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0], b.e[0], a.e[2], b.e[2] } };
}

/* Transpose: interleaved odd lanes of a and b (vtrn2q_f32) */
__attribute__((unused))
static float32x4_t vtrn2q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[1], b.e[1], a.e[3], b.e[3] } };
}

/* Interleave the lower halves of a and b (vzip1q_f32) */
__attribute__((unused))
static float32x4_t vzip1q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0], b.e[0], a.e[1], b.e[1] } };
}

/* Interleave the upper halves of a and b (vzip2q_f32) */
__attribute__((unused))
static float32x4_t vzip2q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[2], b.e[2], a.e[3], b.e[3] } };
}
328*4930cef6SMatthias Ringwald 
329*4930cef6SMatthias Ringwald 
330*4930cef6SMatthias Ringwald #endif /* __ARM_NEON */
331