xref: /btstack/3rd-party/lc3-google/src/ltpf_neon.h (revision 4930cef6e21e6da2d7571b9259c7f0fb8bed3d01)
/******************************************************************************
 *
 *  Copyright 2022 Google LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at:
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/
18*4930cef6SMatthias Ringwald 
19*4930cef6SMatthias Ringwald #if __ARM_NEON && __ARM_ARCH_ISA_A64
20*4930cef6SMatthias Ringwald 
21*4930cef6SMatthias Ringwald #ifndef TEST_NEON
22*4930cef6SMatthias Ringwald #include <arm_neon.h>
23*4930cef6SMatthias Ringwald #endif /* TEST_NEON */
24*4930cef6SMatthias Ringwald 
25*4930cef6SMatthias Ringwald 
26*4930cef6SMatthias Ringwald /**
27*4930cef6SMatthias Ringwald  * Import
28*4930cef6SMatthias Ringwald  */
29*4930cef6SMatthias Ringwald 
30*4930cef6SMatthias Ringwald static inline int32_t filter_hp50(struct lc3_ltpf_hp50_state *, int32_t);
31*4930cef6SMatthias Ringwald 
32*4930cef6SMatthias Ringwald 
33*4930cef6SMatthias Ringwald /**
34*4930cef6SMatthias Ringwald  * Resample from 16 Khz to 12.8 KHz
35*4930cef6SMatthias Ringwald  */
36*4930cef6SMatthias Ringwald #ifndef resample_16k_12k8
37*4930cef6SMatthias Ringwald #define resample_16k_12k8 neon_resample_16k_12k8
38*4930cef6SMatthias Ringwald LC3_HOT static void neon_resample_16k_12k8(
39*4930cef6SMatthias Ringwald     struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n)
40*4930cef6SMatthias Ringwald {
41*4930cef6SMatthias Ringwald     static const int16_t h[4][20] = {
42*4930cef6SMatthias Ringwald 
43*4930cef6SMatthias Ringwald     {   -61,   214,  -398,   417,     0, -1052,  2686, -4529,  5997, 26233,
44*4930cef6SMatthias Ringwald        5997, -4529,  2686, -1052,     0,   417,  -398,   214,   -61,     0 },
45*4930cef6SMatthias Ringwald 
46*4930cef6SMatthias Ringwald     {   -79,   180,  -213,     0,   598, -1522,  2389, -2427,     0, 24506,
47*4930cef6SMatthias Ringwald       13068, -5289,  1873,     0,  -752,   763,  -457,   156,     0,   -28 },
48*4930cef6SMatthias Ringwald 
49*4930cef6SMatthias Ringwald     {   -61,    92,     0,  -323,   861, -1361,  1317,     0, -3885, 19741,
50*4930cef6SMatthias Ringwald       19741, -3885,     0,  1317, -1361,   861,  -323,     0,    92,   -61 },
51*4930cef6SMatthias Ringwald 
52*4930cef6SMatthias Ringwald     {   -28,     0,   156,  -457,   763,  -752,     0,  1873, -5289, 13068,
53*4930cef6SMatthias Ringwald       24506,     0, -2427,  2389, -1522,   598,     0,  -213,   180,   -79 },
54*4930cef6SMatthias Ringwald 
55*4930cef6SMatthias Ringwald     };
56*4930cef6SMatthias Ringwald 
57*4930cef6SMatthias Ringwald     x -= 20 - 1;
58*4930cef6SMatthias Ringwald 
59*4930cef6SMatthias Ringwald     for (int i = 0; i < 5*n; i += 5) {
60*4930cef6SMatthias Ringwald         const int16_t *hn = h[i & 3];
61*4930cef6SMatthias Ringwald         const int16_t *xn = x + (i >> 2);
62*4930cef6SMatthias Ringwald         int32x4_t un;
63*4930cef6SMatthias Ringwald 
64*4930cef6SMatthias Ringwald         un = vmull_s16(    vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
65*4930cef6SMatthias Ringwald         un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
66*4930cef6SMatthias Ringwald         un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
67*4930cef6SMatthias Ringwald         un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
68*4930cef6SMatthias Ringwald         un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
69*4930cef6SMatthias Ringwald 
70*4930cef6SMatthias Ringwald         int32_t yn = filter_hp50(hp50, vaddvq_s32(un));
71*4930cef6SMatthias Ringwald         *(y++) = (yn + (1 << 15)) >> 16;
72*4930cef6SMatthias Ringwald     }
73*4930cef6SMatthias Ringwald }
74*4930cef6SMatthias Ringwald #endif /* resample_16k_12k8 */
75*4930cef6SMatthias Ringwald 
76*4930cef6SMatthias Ringwald /**
77*4930cef6SMatthias Ringwald  * Resample from 32 Khz to 12.8 KHz
78*4930cef6SMatthias Ringwald  */
79*4930cef6SMatthias Ringwald #ifndef resample_32k_12k8
80*4930cef6SMatthias Ringwald #define resample_32k_12k8 neon_resample_32k_12k8
81*4930cef6SMatthias Ringwald LC3_HOT static void neon_resample_32k_12k8(
82*4930cef6SMatthias Ringwald     struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n)
83*4930cef6SMatthias Ringwald {
84*4930cef6SMatthias Ringwald     x -= 40 - 1;
85*4930cef6SMatthias Ringwald 
86*4930cef6SMatthias Ringwald     static const int16_t h[2][40] = {
87*4930cef6SMatthias Ringwald 
88*4930cef6SMatthias Ringwald     {   -30,   -31,    46,   107,     0,  -199,  -162,   209,   430,     0,
89*4930cef6SMatthias Ringwald        -681,  -526,   658,  1343,     0, -2264, -1943,  2999,  9871, 13116,
90*4930cef6SMatthias Ringwald        9871,  2999, -1943, -2264,     0,  1343,   658,  -526,  -681,     0,
91*4930cef6SMatthias Ringwald         430,   209,  -162,  -199,     0,   107,    46,   -31,   -30,     0 },
92*4930cef6SMatthias Ringwald 
93*4930cef6SMatthias Ringwald     {   -14,   -39,     0,    90,    78,  -106,  -229,     0,   382,   299,
94*4930cef6SMatthias Ringwald        -376,  -761,     0,  1194,   937, -1214, -2644,     0,  6534, 12253,
95*4930cef6SMatthias Ringwald       12253,  6534,     0, -2644, -1214,   937,  1194,     0,  -761,  -376,
96*4930cef6SMatthias Ringwald         299,   382,     0,  -229,  -106,    78,    90,     0,   -39,   -14 },
97*4930cef6SMatthias Ringwald 
98*4930cef6SMatthias Ringwald     };
99*4930cef6SMatthias Ringwald 
100*4930cef6SMatthias Ringwald     for (int i = 0; i < 5*n; i += 5) {
101*4930cef6SMatthias Ringwald         const int16_t *hn = h[i & 1];
102*4930cef6SMatthias Ringwald         const int16_t *xn = x + (i >> 1);
103*4930cef6SMatthias Ringwald 
104*4930cef6SMatthias Ringwald         int32x4_t un = vmull_s16(vld1_s16(xn), vld1_s16(hn));
105*4930cef6SMatthias Ringwald         xn += 4, hn += 4;
106*4930cef6SMatthias Ringwald 
107*4930cef6SMatthias Ringwald         for (int i = 1; i < 10; i++)
108*4930cef6SMatthias Ringwald             un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
109*4930cef6SMatthias Ringwald 
110*4930cef6SMatthias Ringwald         int32_t yn = filter_hp50(hp50, vaddvq_s32(un));
111*4930cef6SMatthias Ringwald         *(y++) = (yn + (1 << 15)) >> 16;
112*4930cef6SMatthias Ringwald     }
113*4930cef6SMatthias Ringwald }
114*4930cef6SMatthias Ringwald #endif /* resample_32k_12k8 */
115*4930cef6SMatthias Ringwald 
116*4930cef6SMatthias Ringwald /**
117*4930cef6SMatthias Ringwald  * Resample from 48 Khz to 12.8 KHz
118*4930cef6SMatthias Ringwald  */
119*4930cef6SMatthias Ringwald #ifndef resample_48k_12k8
120*4930cef6SMatthias Ringwald #define resample_48k_12k8 neon_resample_48k_12k8
121*4930cef6SMatthias Ringwald LC3_HOT static void neon_resample_48k_12k8(
122*4930cef6SMatthias Ringwald     struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n)
123*4930cef6SMatthias Ringwald {
124*4930cef6SMatthias Ringwald     static const int16_t alignas(16) h[4][64] = {
125*4930cef6SMatthias Ringwald 
126*4930cef6SMatthias Ringwald     {  -13,   -25,   -20,    10,    51,    71,    38,   -47,  -133,  -145,
127*4930cef6SMatthias Ringwald        -42,   139,   277,   242,     0,  -329,  -511,  -351,   144,   698,
128*4930cef6SMatthias Ringwald        895,   450,  -535, -1510, -1697,  -521,  1999,  5138,  7737,  8744,
129*4930cef6SMatthias Ringwald       7737,  5138,  1999,  -521, -1697, -1510,  -535,   450,   895,   698,
130*4930cef6SMatthias Ringwald        144,  -351,  -511,  -329,     0,   242,   277,   139,   -42,  -145,
131*4930cef6SMatthias Ringwald       -133,   -47,    38,    71,    51,    10,   -20,   -25,   -13,     0 },
132*4930cef6SMatthias Ringwald 
133*4930cef6SMatthias Ringwald     {   -9,   -23,   -24,     0,    41,    71,    52,   -23,  -115,  -152,
134*4930cef6SMatthias Ringwald        -78,    92,   254,   272,    76,  -251,  -493,  -427,     0,   576,
135*4930cef6SMatthias Ringwald        900,   624,  -262, -1309, -1763,  -954,  1272,  4356,  7203,  8679,
136*4930cef6SMatthias Ringwald       8169,  5886,  2767,     0, -1542, -1660,  -809,   240,   848,   796,
137*4930cef6SMatthias Ringwald        292,  -252,  -507,  -398,   -82,   199,   288,   183,     0,  -130,
138*4930cef6SMatthias Ringwald       -145,   -71,    20,    69,    60,    20,   -15,   -26,   -17,    -3 },
139*4930cef6SMatthias Ringwald 
140*4930cef6SMatthias Ringwald     {   -6,   -20,   -26,    -8,    31,    67,    62,     0,   -94,  -152,
141*4930cef6SMatthias Ringwald       -108,    45,   223,   287,   143,  -167,  -454,  -480,  -134,   439,
142*4930cef6SMatthias Ringwald        866,   758,     0, -1071, -1748, -1295,   601,  3559,  6580,  8485,
143*4930cef6SMatthias Ringwald       8485,  6580,  3559,   601, -1295, -1748, -1071,     0,   758,   866,
144*4930cef6SMatthias Ringwald        439,  -134,  -480,  -454,  -167,   143,   287,   223,    45,  -108,
145*4930cef6SMatthias Ringwald       -152,   -94,     0,    62,    67,    31,    -8,   -26,   -20,    -6 },
146*4930cef6SMatthias Ringwald 
147*4930cef6SMatthias Ringwald     {   -3,   -17,   -26,   -15,    20,    60,    69,    20,   -71,  -145,
148*4930cef6SMatthias Ringwald       -130,     0,   183,   288,   199,   -82,  -398,  -507,  -252,   292,
149*4930cef6SMatthias Ringwald        796,   848,   240,  -809, -1660, -1542,     0,  2767,  5886,  8169,
150*4930cef6SMatthias Ringwald       8679,  7203,  4356,  1272,  -954, -1763, -1309,  -262,   624,   900,
151*4930cef6SMatthias Ringwald        576,     0,  -427,  -493,  -251,    76,   272,   254,    92,   -78,
152*4930cef6SMatthias Ringwald       -152,  -115,   -23,    52,    71,    41,     0,   -24,   -23,    -9 },
153*4930cef6SMatthias Ringwald 
154*4930cef6SMatthias Ringwald     };
155*4930cef6SMatthias Ringwald 
156*4930cef6SMatthias Ringwald     x -= 60 - 1;
157*4930cef6SMatthias Ringwald 
158*4930cef6SMatthias Ringwald     for (int i = 0; i < 15*n; i += 15) {
159*4930cef6SMatthias Ringwald         const int16_t *hn = h[i & 3];
160*4930cef6SMatthias Ringwald         const int16_t *xn = x + (i >> 2);
161*4930cef6SMatthias Ringwald 
162*4930cef6SMatthias Ringwald         int32x4_t un = vmull_s16(vld1_s16(xn), vld1_s16(hn));
163*4930cef6SMatthias Ringwald         xn += 4, hn += 4;
164*4930cef6SMatthias Ringwald 
165*4930cef6SMatthias Ringwald         for (int i = 1; i < 15; i++)
166*4930cef6SMatthias Ringwald             un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
167*4930cef6SMatthias Ringwald 
168*4930cef6SMatthias Ringwald         int32_t yn = filter_hp50(hp50, vaddvq_s32(un));
169*4930cef6SMatthias Ringwald         *(y++) = (yn + (1 << 15)) >> 16;
170*4930cef6SMatthias Ringwald     }
171*4930cef6SMatthias Ringwald }
172*4930cef6SMatthias Ringwald #endif /* resample_48k_12k8 */
173*4930cef6SMatthias Ringwald 
174*4930cef6SMatthias Ringwald /**
175*4930cef6SMatthias Ringwald  * Return dot product of 2 vectors
176*4930cef6SMatthias Ringwald  */
177*4930cef6SMatthias Ringwald #ifndef dot
178*4930cef6SMatthias Ringwald #define dot neon_dot
179*4930cef6SMatthias Ringwald LC3_HOT static inline float neon_dot(const int16_t *a, const int16_t *b, int n)
180*4930cef6SMatthias Ringwald {
181*4930cef6SMatthias Ringwald     int64x2_t v = vmovq_n_s64(0);
182*4930cef6SMatthias Ringwald 
183*4930cef6SMatthias Ringwald     for (int i = 0; i < (n >> 4); i++) {
184*4930cef6SMatthias Ringwald         int32x4_t u;
185*4930cef6SMatthias Ringwald 
186*4930cef6SMatthias Ringwald         u = vmull_s16(   vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
187*4930cef6SMatthias Ringwald         u = vmlal_s16(u, vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
188*4930cef6SMatthias Ringwald         v = vpadalq_s32(v, u);
189*4930cef6SMatthias Ringwald 
190*4930cef6SMatthias Ringwald         u = vmull_s16(   vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
191*4930cef6SMatthias Ringwald         u = vmlal_s16(u, vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
192*4930cef6SMatthias Ringwald         v = vpadalq_s32(v, u);
193*4930cef6SMatthias Ringwald     }
194*4930cef6SMatthias Ringwald 
195*4930cef6SMatthias Ringwald     int32_t v32 = (vaddvq_s64(v) + (1 << 5)) >> 6;
196*4930cef6SMatthias Ringwald     return (float)v32;
197*4930cef6SMatthias Ringwald }
198*4930cef6SMatthias Ringwald #endif /* dot */
199*4930cef6SMatthias Ringwald 
200*4930cef6SMatthias Ringwald /**
201*4930cef6SMatthias Ringwald  * Return vector of correlations
202*4930cef6SMatthias Ringwald  */
203*4930cef6SMatthias Ringwald #ifndef correlate
204*4930cef6SMatthias Ringwald #define correlate neon_correlate
205*4930cef6SMatthias Ringwald LC3_HOT static void neon_correlate(
206*4930cef6SMatthias Ringwald     const int16_t *a, const int16_t *b, int n, float *y, int nc)
207*4930cef6SMatthias Ringwald {
208*4930cef6SMatthias Ringwald     for ( ; nc >= 4; nc -= 4, b -= 4) {
209*4930cef6SMatthias Ringwald         const int16_t *an = (const int16_t *)a;
210*4930cef6SMatthias Ringwald         const int16_t *bn = (const int16_t *)b;
211*4930cef6SMatthias Ringwald 
212*4930cef6SMatthias Ringwald         int64x2_t v0 = vmovq_n_s64(0), v1 = v0, v2 = v0, v3 = v0;
213*4930cef6SMatthias Ringwald         int16x4_t ax, b0, b1;
214*4930cef6SMatthias Ringwald 
215*4930cef6SMatthias Ringwald         b0 = vld1_s16(bn-4);
216*4930cef6SMatthias Ringwald 
217*4930cef6SMatthias Ringwald         for (int i=0; i < (n >> 4); i++ )
218*4930cef6SMatthias Ringwald             for (int j = 0; j < 2; j++) {
219*4930cef6SMatthias Ringwald                 int32x4_t u0, u1, u2, u3;
220*4930cef6SMatthias Ringwald 
221*4930cef6SMatthias Ringwald                 b1 = b0;
222*4930cef6SMatthias Ringwald                 b0 = vld1_s16(bn), bn += 4;
223*4930cef6SMatthias Ringwald                 ax = vld1_s16(an), an += 4;
224*4930cef6SMatthias Ringwald 
225*4930cef6SMatthias Ringwald                 u0 = vmull_s16(ax, b0);
226*4930cef6SMatthias Ringwald                 u1 = vmull_s16(ax, vext_s16(b1, b0, 3));
227*4930cef6SMatthias Ringwald                 u2 = vmull_s16(ax, vext_s16(b1, b0, 2));
228*4930cef6SMatthias Ringwald                 u3 = vmull_s16(ax, vext_s16(b1, b0, 1));
229*4930cef6SMatthias Ringwald 
230*4930cef6SMatthias Ringwald                 b1 = b0;
231*4930cef6SMatthias Ringwald                 b0 = vld1_s16(bn), bn += 4;
232*4930cef6SMatthias Ringwald                 ax = vld1_s16(an), an += 4;
233*4930cef6SMatthias Ringwald 
234*4930cef6SMatthias Ringwald                 u0 = vmlal_s16(u0, ax, b0);
235*4930cef6SMatthias Ringwald                 u1 = vmlal_s16(u1, ax, vext_s16(b1, b0, 3));
236*4930cef6SMatthias Ringwald                 u2 = vmlal_s16(u2, ax, vext_s16(b1, b0, 2));
237*4930cef6SMatthias Ringwald                 u3 = vmlal_s16(u3, ax, vext_s16(b1, b0, 1));
238*4930cef6SMatthias Ringwald 
239*4930cef6SMatthias Ringwald                 v0 = vpadalq_s32(v0, u0);
240*4930cef6SMatthias Ringwald                 v1 = vpadalq_s32(v1, u1);
241*4930cef6SMatthias Ringwald                 v2 = vpadalq_s32(v2, u2);
242*4930cef6SMatthias Ringwald                 v3 = vpadalq_s32(v3, u3);
243*4930cef6SMatthias Ringwald             }
244*4930cef6SMatthias Ringwald 
245*4930cef6SMatthias Ringwald         *(y++) = (float)((int32_t)((vaddvq_s64(v0) + (1 << 5)) >> 6));
246*4930cef6SMatthias Ringwald         *(y++) = (float)((int32_t)((vaddvq_s64(v1) + (1 << 5)) >> 6));
247*4930cef6SMatthias Ringwald         *(y++) = (float)((int32_t)((vaddvq_s64(v2) + (1 << 5)) >> 6));
248*4930cef6SMatthias Ringwald         *(y++) = (float)((int32_t)((vaddvq_s64(v3) + (1 << 5)) >> 6));
249*4930cef6SMatthias Ringwald     }
250*4930cef6SMatthias Ringwald 
251*4930cef6SMatthias Ringwald     for ( ; nc > 0; nc--)
252*4930cef6SMatthias Ringwald         *(y++) = neon_dot(a, b--, n);
253*4930cef6SMatthias Ringwald }
254*4930cef6SMatthias Ringwald #endif /* correlate */
255*4930cef6SMatthias Ringwald 
256*4930cef6SMatthias Ringwald #endif /* __ARM_NEON && __ARM_ARCH_ISA_A64 */
257