xref: /aosp_15_r20/external/liblc3/src/ltpf_neon.h (revision 49fe348c0058011ee60b6957cdd9d52742df84bc)
1*49fe348cSAndroid Build Coastguard Worker /******************************************************************************
2*49fe348cSAndroid Build Coastguard Worker  *
3*49fe348cSAndroid Build Coastguard Worker  *  Copyright 2022 Google LLC
4*49fe348cSAndroid Build Coastguard Worker  *
5*49fe348cSAndroid Build Coastguard Worker  *  Licensed under the Apache License, Version 2.0 (the "License");
6*49fe348cSAndroid Build Coastguard Worker  *  you may not use this file except in compliance with the License.
7*49fe348cSAndroid Build Coastguard Worker  *  You may obtain a copy of the License at:
8*49fe348cSAndroid Build Coastguard Worker  *
9*49fe348cSAndroid Build Coastguard Worker  *  http://www.apache.org/licenses/LICENSE-2.0
10*49fe348cSAndroid Build Coastguard Worker  *
11*49fe348cSAndroid Build Coastguard Worker  *  Unless required by applicable law or agreed to in writing, software
12*49fe348cSAndroid Build Coastguard Worker  *  distributed under the License is distributed on an "AS IS" BASIS,
13*49fe348cSAndroid Build Coastguard Worker  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*49fe348cSAndroid Build Coastguard Worker  *  See the License for the specific language governing permissions and
15*49fe348cSAndroid Build Coastguard Worker  *  limitations under the License.
16*49fe348cSAndroid Build Coastguard Worker  *
17*49fe348cSAndroid Build Coastguard Worker  ******************************************************************************/
18*49fe348cSAndroid Build Coastguard Worker 
19*49fe348cSAndroid Build Coastguard Worker #if __ARM_NEON && __ARM_ARCH_ISA_A64 && \
20*49fe348cSAndroid Build Coastguard Worker         !defined(TEST_ARM) || defined(TEST_NEON)
21*49fe348cSAndroid Build Coastguard Worker 
22*49fe348cSAndroid Build Coastguard Worker #ifndef TEST_NEON
23*49fe348cSAndroid Build Coastguard Worker #include <arm_neon.h>
24*49fe348cSAndroid Build Coastguard Worker #endif /* TEST_NEON */
25*49fe348cSAndroid Build Coastguard Worker 
26*49fe348cSAndroid Build Coastguard Worker 
27*49fe348cSAndroid Build Coastguard Worker /**
28*49fe348cSAndroid Build Coastguard Worker  * Import
29*49fe348cSAndroid Build Coastguard Worker  */
30*49fe348cSAndroid Build Coastguard Worker 
31*49fe348cSAndroid Build Coastguard Worker static inline int32_t filter_hp50(struct lc3_ltpf_hp50_state *, int32_t);
32*49fe348cSAndroid Build Coastguard Worker 
33*49fe348cSAndroid Build Coastguard Worker 
/**
 * Resample from 16 kHz to 12.8 kHz
 */
37*49fe348cSAndroid Build Coastguard Worker #ifndef resample_16k_12k8
38*49fe348cSAndroid Build Coastguard Worker 
neon_resample_16k_12k8(struct lc3_ltpf_hp50_state * hp50,const int16_t * x,int16_t * y,int n)39*49fe348cSAndroid Build Coastguard Worker LC3_HOT static void neon_resample_16k_12k8(
40*49fe348cSAndroid Build Coastguard Worker     struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n)
41*49fe348cSAndroid Build Coastguard Worker {
42*49fe348cSAndroid Build Coastguard Worker     static const int16_t h[4][20] = {
43*49fe348cSAndroid Build Coastguard Worker 
44*49fe348cSAndroid Build Coastguard Worker     {   -61,   214,  -398,   417,     0, -1052,  2686, -4529,  5997, 26233,
45*49fe348cSAndroid Build Coastguard Worker        5997, -4529,  2686, -1052,     0,   417,  -398,   214,   -61,     0 },
46*49fe348cSAndroid Build Coastguard Worker 
47*49fe348cSAndroid Build Coastguard Worker     {   -79,   180,  -213,     0,   598, -1522,  2389, -2427,     0, 24506,
48*49fe348cSAndroid Build Coastguard Worker       13068, -5289,  1873,     0,  -752,   763,  -457,   156,     0,   -28 },
49*49fe348cSAndroid Build Coastguard Worker 
50*49fe348cSAndroid Build Coastguard Worker     {   -61,    92,     0,  -323,   861, -1361,  1317,     0, -3885, 19741,
51*49fe348cSAndroid Build Coastguard Worker       19741, -3885,     0,  1317, -1361,   861,  -323,     0,    92,   -61 },
52*49fe348cSAndroid Build Coastguard Worker 
53*49fe348cSAndroid Build Coastguard Worker     {   -28,     0,   156,  -457,   763,  -752,     0,  1873, -5289, 13068,
54*49fe348cSAndroid Build Coastguard Worker       24506,     0, -2427,  2389, -1522,   598,     0,  -213,   180,   -79 },
55*49fe348cSAndroid Build Coastguard Worker 
56*49fe348cSAndroid Build Coastguard Worker     };
57*49fe348cSAndroid Build Coastguard Worker 
58*49fe348cSAndroid Build Coastguard Worker     x -= 20 - 1;
59*49fe348cSAndroid Build Coastguard Worker 
60*49fe348cSAndroid Build Coastguard Worker     for (int i = 0; i < 5*n; i += 5) {
61*49fe348cSAndroid Build Coastguard Worker         const int16_t *hn = h[i & 3];
62*49fe348cSAndroid Build Coastguard Worker         const int16_t *xn = x + (i >> 2);
63*49fe348cSAndroid Build Coastguard Worker         int32x4_t un;
64*49fe348cSAndroid Build Coastguard Worker 
65*49fe348cSAndroid Build Coastguard Worker         un = vmull_s16(    vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
66*49fe348cSAndroid Build Coastguard Worker         un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
67*49fe348cSAndroid Build Coastguard Worker         un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
68*49fe348cSAndroid Build Coastguard Worker         un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
69*49fe348cSAndroid Build Coastguard Worker         un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
70*49fe348cSAndroid Build Coastguard Worker 
71*49fe348cSAndroid Build Coastguard Worker         int32_t yn = filter_hp50(hp50, vaddvq_s32(un));
72*49fe348cSAndroid Build Coastguard Worker         *(y++) = (yn + (1 << 15)) >> 16;
73*49fe348cSAndroid Build Coastguard Worker     }
74*49fe348cSAndroid Build Coastguard Worker }
75*49fe348cSAndroid Build Coastguard Worker 
76*49fe348cSAndroid Build Coastguard Worker #ifndef TEST_NEON
77*49fe348cSAndroid Build Coastguard Worker #define resample_16k_12k8 neon_resample_16k_12k8
78*49fe348cSAndroid Build Coastguard Worker #endif
79*49fe348cSAndroid Build Coastguard Worker 
80*49fe348cSAndroid Build Coastguard Worker #endif /* resample_16k_12k8 */
81*49fe348cSAndroid Build Coastguard Worker 
/**
 * Resample from 32 kHz to 12.8 kHz
 */
85*49fe348cSAndroid Build Coastguard Worker #ifndef resample_32k_12k8
86*49fe348cSAndroid Build Coastguard Worker 
neon_resample_32k_12k8(struct lc3_ltpf_hp50_state * hp50,const int16_t * x,int16_t * y,int n)87*49fe348cSAndroid Build Coastguard Worker LC3_HOT static void neon_resample_32k_12k8(
88*49fe348cSAndroid Build Coastguard Worker     struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n)
89*49fe348cSAndroid Build Coastguard Worker {
90*49fe348cSAndroid Build Coastguard Worker     x -= 40 - 1;
91*49fe348cSAndroid Build Coastguard Worker 
92*49fe348cSAndroid Build Coastguard Worker     static const int16_t h[2][40] = {
93*49fe348cSAndroid Build Coastguard Worker 
94*49fe348cSAndroid Build Coastguard Worker     {   -30,   -31,    46,   107,     0,  -199,  -162,   209,   430,     0,
95*49fe348cSAndroid Build Coastguard Worker        -681,  -526,   658,  1343,     0, -2264, -1943,  2999,  9871, 13116,
96*49fe348cSAndroid Build Coastguard Worker        9871,  2999, -1943, -2264,     0,  1343,   658,  -526,  -681,     0,
97*49fe348cSAndroid Build Coastguard Worker         430,   209,  -162,  -199,     0,   107,    46,   -31,   -30,     0 },
98*49fe348cSAndroid Build Coastguard Worker 
99*49fe348cSAndroid Build Coastguard Worker     {   -14,   -39,     0,    90,    78,  -106,  -229,     0,   382,   299,
100*49fe348cSAndroid Build Coastguard Worker        -376,  -761,     0,  1194,   937, -1214, -2644,     0,  6534, 12253,
101*49fe348cSAndroid Build Coastguard Worker       12253,  6534,     0, -2644, -1214,   937,  1194,     0,  -761,  -376,
102*49fe348cSAndroid Build Coastguard Worker         299,   382,     0,  -229,  -106,    78,    90,     0,   -39,   -14 },
103*49fe348cSAndroid Build Coastguard Worker 
104*49fe348cSAndroid Build Coastguard Worker     };
105*49fe348cSAndroid Build Coastguard Worker 
106*49fe348cSAndroid Build Coastguard Worker     for (int i = 0; i < 5*n; i += 5) {
107*49fe348cSAndroid Build Coastguard Worker         const int16_t *hn = h[i & 1];
108*49fe348cSAndroid Build Coastguard Worker         const int16_t *xn = x + (i >> 1);
109*49fe348cSAndroid Build Coastguard Worker 
110*49fe348cSAndroid Build Coastguard Worker         int32x4_t un = vmull_s16(vld1_s16(xn), vld1_s16(hn));
111*49fe348cSAndroid Build Coastguard Worker         xn += 4, hn += 4;
112*49fe348cSAndroid Build Coastguard Worker 
113*49fe348cSAndroid Build Coastguard Worker         for (int i = 1; i < 10; i++)
114*49fe348cSAndroid Build Coastguard Worker             un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
115*49fe348cSAndroid Build Coastguard Worker 
116*49fe348cSAndroid Build Coastguard Worker         int32_t yn = filter_hp50(hp50, vaddvq_s32(un));
117*49fe348cSAndroid Build Coastguard Worker         *(y++) = (yn + (1 << 15)) >> 16;
118*49fe348cSAndroid Build Coastguard Worker     }
119*49fe348cSAndroid Build Coastguard Worker }
120*49fe348cSAndroid Build Coastguard Worker 
121*49fe348cSAndroid Build Coastguard Worker #ifndef TEST_NEON
122*49fe348cSAndroid Build Coastguard Worker #define resample_32k_12k8 neon_resample_32k_12k8
123*49fe348cSAndroid Build Coastguard Worker #endif
124*49fe348cSAndroid Build Coastguard Worker 
125*49fe348cSAndroid Build Coastguard Worker #endif /* resample_32k_12k8 */
126*49fe348cSAndroid Build Coastguard Worker 
/**
 * Resample from 48 kHz to 12.8 kHz
 */
130*49fe348cSAndroid Build Coastguard Worker #ifndef resample_48k_12k8
131*49fe348cSAndroid Build Coastguard Worker 
neon_resample_48k_12k8(struct lc3_ltpf_hp50_state * hp50,const int16_t * x,int16_t * y,int n)132*49fe348cSAndroid Build Coastguard Worker LC3_HOT static void neon_resample_48k_12k8(
133*49fe348cSAndroid Build Coastguard Worker     struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n)
134*49fe348cSAndroid Build Coastguard Worker {
135*49fe348cSAndroid Build Coastguard Worker     static const int16_t alignas(16) h[4][64] = {
136*49fe348cSAndroid Build Coastguard Worker 
137*49fe348cSAndroid Build Coastguard Worker     {  -13,   -25,   -20,    10,    51,    71,    38,   -47,  -133,  -145,
138*49fe348cSAndroid Build Coastguard Worker        -42,   139,   277,   242,     0,  -329,  -511,  -351,   144,   698,
139*49fe348cSAndroid Build Coastguard Worker        895,   450,  -535, -1510, -1697,  -521,  1999,  5138,  7737,  8744,
140*49fe348cSAndroid Build Coastguard Worker       7737,  5138,  1999,  -521, -1697, -1510,  -535,   450,   895,   698,
141*49fe348cSAndroid Build Coastguard Worker        144,  -351,  -511,  -329,     0,   242,   277,   139,   -42,  -145,
142*49fe348cSAndroid Build Coastguard Worker       -133,   -47,    38,    71,    51,    10,   -20,   -25,   -13,     0 },
143*49fe348cSAndroid Build Coastguard Worker 
144*49fe348cSAndroid Build Coastguard Worker     {   -9,   -23,   -24,     0,    41,    71,    52,   -23,  -115,  -152,
145*49fe348cSAndroid Build Coastguard Worker        -78,    92,   254,   272,    76,  -251,  -493,  -427,     0,   576,
146*49fe348cSAndroid Build Coastguard Worker        900,   624,  -262, -1309, -1763,  -954,  1272,  4356,  7203,  8679,
147*49fe348cSAndroid Build Coastguard Worker       8169,  5886,  2767,     0, -1542, -1660,  -809,   240,   848,   796,
148*49fe348cSAndroid Build Coastguard Worker        292,  -252,  -507,  -398,   -82,   199,   288,   183,     0,  -130,
149*49fe348cSAndroid Build Coastguard Worker       -145,   -71,    20,    69,    60,    20,   -15,   -26,   -17,    -3 },
150*49fe348cSAndroid Build Coastguard Worker 
151*49fe348cSAndroid Build Coastguard Worker     {   -6,   -20,   -26,    -8,    31,    67,    62,     0,   -94,  -152,
152*49fe348cSAndroid Build Coastguard Worker       -108,    45,   223,   287,   143,  -167,  -454,  -480,  -134,   439,
153*49fe348cSAndroid Build Coastguard Worker        866,   758,     0, -1071, -1748, -1295,   601,  3559,  6580,  8485,
154*49fe348cSAndroid Build Coastguard Worker       8485,  6580,  3559,   601, -1295, -1748, -1071,     0,   758,   866,
155*49fe348cSAndroid Build Coastguard Worker        439,  -134,  -480,  -454,  -167,   143,   287,   223,    45,  -108,
156*49fe348cSAndroid Build Coastguard Worker       -152,   -94,     0,    62,    67,    31,    -8,   -26,   -20,    -6 },
157*49fe348cSAndroid Build Coastguard Worker 
158*49fe348cSAndroid Build Coastguard Worker     {   -3,   -17,   -26,   -15,    20,    60,    69,    20,   -71,  -145,
159*49fe348cSAndroid Build Coastguard Worker       -130,     0,   183,   288,   199,   -82,  -398,  -507,  -252,   292,
160*49fe348cSAndroid Build Coastguard Worker        796,   848,   240,  -809, -1660, -1542,     0,  2767,  5886,  8169,
161*49fe348cSAndroid Build Coastguard Worker       8679,  7203,  4356,  1272,  -954, -1763, -1309,  -262,   624,   900,
162*49fe348cSAndroid Build Coastguard Worker        576,     0,  -427,  -493,  -251,    76,   272,   254,    92,   -78,
163*49fe348cSAndroid Build Coastguard Worker       -152,  -115,   -23,    52,    71,    41,     0,   -24,   -23,    -9 },
164*49fe348cSAndroid Build Coastguard Worker 
165*49fe348cSAndroid Build Coastguard Worker     };
166*49fe348cSAndroid Build Coastguard Worker 
167*49fe348cSAndroid Build Coastguard Worker     x -= 60 - 1;
168*49fe348cSAndroid Build Coastguard Worker 
169*49fe348cSAndroid Build Coastguard Worker     for (int i = 0; i < 15*n; i += 15) {
170*49fe348cSAndroid Build Coastguard Worker         const int16_t *hn = h[i & 3];
171*49fe348cSAndroid Build Coastguard Worker         const int16_t *xn = x + (i >> 2);
172*49fe348cSAndroid Build Coastguard Worker 
173*49fe348cSAndroid Build Coastguard Worker         int32x4_t un = vmull_s16(vld1_s16(xn), vld1_s16(hn));
174*49fe348cSAndroid Build Coastguard Worker         xn += 4, hn += 4;
175*49fe348cSAndroid Build Coastguard Worker 
176*49fe348cSAndroid Build Coastguard Worker         for (int i = 1; i < 15; i++)
177*49fe348cSAndroid Build Coastguard Worker             un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
178*49fe348cSAndroid Build Coastguard Worker 
179*49fe348cSAndroid Build Coastguard Worker         int32_t yn = filter_hp50(hp50, vaddvq_s32(un));
180*49fe348cSAndroid Build Coastguard Worker         *(y++) = (yn + (1 << 15)) >> 16;
181*49fe348cSAndroid Build Coastguard Worker     }
182*49fe348cSAndroid Build Coastguard Worker }
183*49fe348cSAndroid Build Coastguard Worker 
184*49fe348cSAndroid Build Coastguard Worker #ifndef TEST_NEON
185*49fe348cSAndroid Build Coastguard Worker #define resample_48k_12k8 neon_resample_48k_12k8
186*49fe348cSAndroid Build Coastguard Worker #endif
187*49fe348cSAndroid Build Coastguard Worker 
188*49fe348cSAndroid Build Coastguard Worker #endif /* resample_48k_12k8 */
189*49fe348cSAndroid Build Coastguard Worker 
190*49fe348cSAndroid Build Coastguard Worker /**
191*49fe348cSAndroid Build Coastguard Worker  * Return dot product of 2 vectors
192*49fe348cSAndroid Build Coastguard Worker  */
193*49fe348cSAndroid Build Coastguard Worker #ifndef dot
194*49fe348cSAndroid Build Coastguard Worker 
neon_dot(const int16_t * a,const int16_t * b,int n)195*49fe348cSAndroid Build Coastguard Worker LC3_HOT static inline float neon_dot(const int16_t *a, const int16_t *b, int n)
196*49fe348cSAndroid Build Coastguard Worker {
197*49fe348cSAndroid Build Coastguard Worker     int64x2_t v = vmovq_n_s64(0);
198*49fe348cSAndroid Build Coastguard Worker 
199*49fe348cSAndroid Build Coastguard Worker     for (int i = 0; i < (n >> 4); i++) {
200*49fe348cSAndroid Build Coastguard Worker         int32x4_t u;
201*49fe348cSAndroid Build Coastguard Worker 
202*49fe348cSAndroid Build Coastguard Worker         u = vmull_s16(   vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
203*49fe348cSAndroid Build Coastguard Worker         u = vmlal_s16(u, vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
204*49fe348cSAndroid Build Coastguard Worker         v = vpadalq_s32(v, u);
205*49fe348cSAndroid Build Coastguard Worker 
206*49fe348cSAndroid Build Coastguard Worker         u = vmull_s16(   vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
207*49fe348cSAndroid Build Coastguard Worker         u = vmlal_s16(u, vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
208*49fe348cSAndroid Build Coastguard Worker         v = vpadalq_s32(v, u);
209*49fe348cSAndroid Build Coastguard Worker     }
210*49fe348cSAndroid Build Coastguard Worker 
211*49fe348cSAndroid Build Coastguard Worker     int32_t v32 = (vaddvq_s64(v) + (1 << 5)) >> 6;
212*49fe348cSAndroid Build Coastguard Worker     return (float)v32;
213*49fe348cSAndroid Build Coastguard Worker }
214*49fe348cSAndroid Build Coastguard Worker 
215*49fe348cSAndroid Build Coastguard Worker #ifndef TEST_NEON
216*49fe348cSAndroid Build Coastguard Worker #define dot neon_dot
217*49fe348cSAndroid Build Coastguard Worker #endif
218*49fe348cSAndroid Build Coastguard Worker 
219*49fe348cSAndroid Build Coastguard Worker #endif /* dot */
220*49fe348cSAndroid Build Coastguard Worker 
221*49fe348cSAndroid Build Coastguard Worker /**
222*49fe348cSAndroid Build Coastguard Worker  * Return vector of correlations
223*49fe348cSAndroid Build Coastguard Worker  */
224*49fe348cSAndroid Build Coastguard Worker #ifndef correlate
225*49fe348cSAndroid Build Coastguard Worker 
neon_correlate(const int16_t * a,const int16_t * b,int n,float * y,int nc)226*49fe348cSAndroid Build Coastguard Worker LC3_HOT static void neon_correlate(
227*49fe348cSAndroid Build Coastguard Worker     const int16_t *a, const int16_t *b, int n, float *y, int nc)
228*49fe348cSAndroid Build Coastguard Worker {
229*49fe348cSAndroid Build Coastguard Worker     for ( ; nc >= 4; nc -= 4, b -= 4) {
230*49fe348cSAndroid Build Coastguard Worker         const int16_t *an = (const int16_t *)a;
231*49fe348cSAndroid Build Coastguard Worker         const int16_t *bn = (const int16_t *)b;
232*49fe348cSAndroid Build Coastguard Worker 
233*49fe348cSAndroid Build Coastguard Worker         int64x2_t v0 = vmovq_n_s64(0), v1 = v0, v2 = v0, v3 = v0;
234*49fe348cSAndroid Build Coastguard Worker         int16x4_t ax, b0, b1;
235*49fe348cSAndroid Build Coastguard Worker 
236*49fe348cSAndroid Build Coastguard Worker         b0 = vld1_s16(bn-4);
237*49fe348cSAndroid Build Coastguard Worker 
238*49fe348cSAndroid Build Coastguard Worker         for (int i=0; i < (n >> 4); i++ )
239*49fe348cSAndroid Build Coastguard Worker             for (int j = 0; j < 2; j++) {
240*49fe348cSAndroid Build Coastguard Worker                 int32x4_t u0, u1, u2, u3;
241*49fe348cSAndroid Build Coastguard Worker 
242*49fe348cSAndroid Build Coastguard Worker                 b1 = b0;
243*49fe348cSAndroid Build Coastguard Worker                 b0 = vld1_s16(bn), bn += 4;
244*49fe348cSAndroid Build Coastguard Worker                 ax = vld1_s16(an), an += 4;
245*49fe348cSAndroid Build Coastguard Worker 
246*49fe348cSAndroid Build Coastguard Worker                 u0 = vmull_s16(ax, b0);
247*49fe348cSAndroid Build Coastguard Worker                 u1 = vmull_s16(ax, vext_s16(b1, b0, 3));
248*49fe348cSAndroid Build Coastguard Worker                 u2 = vmull_s16(ax, vext_s16(b1, b0, 2));
249*49fe348cSAndroid Build Coastguard Worker                 u3 = vmull_s16(ax, vext_s16(b1, b0, 1));
250*49fe348cSAndroid Build Coastguard Worker 
251*49fe348cSAndroid Build Coastguard Worker                 b1 = b0;
252*49fe348cSAndroid Build Coastguard Worker                 b0 = vld1_s16(bn), bn += 4;
253*49fe348cSAndroid Build Coastguard Worker                 ax = vld1_s16(an), an += 4;
254*49fe348cSAndroid Build Coastguard Worker 
255*49fe348cSAndroid Build Coastguard Worker                 u0 = vmlal_s16(u0, ax, b0);
256*49fe348cSAndroid Build Coastguard Worker                 u1 = vmlal_s16(u1, ax, vext_s16(b1, b0, 3));
257*49fe348cSAndroid Build Coastguard Worker                 u2 = vmlal_s16(u2, ax, vext_s16(b1, b0, 2));
258*49fe348cSAndroid Build Coastguard Worker                 u3 = vmlal_s16(u3, ax, vext_s16(b1, b0, 1));
259*49fe348cSAndroid Build Coastguard Worker 
260*49fe348cSAndroid Build Coastguard Worker                 v0 = vpadalq_s32(v0, u0);
261*49fe348cSAndroid Build Coastguard Worker                 v1 = vpadalq_s32(v1, u1);
262*49fe348cSAndroid Build Coastguard Worker                 v2 = vpadalq_s32(v2, u2);
263*49fe348cSAndroid Build Coastguard Worker                 v3 = vpadalq_s32(v3, u3);
264*49fe348cSAndroid Build Coastguard Worker             }
265*49fe348cSAndroid Build Coastguard Worker 
266*49fe348cSAndroid Build Coastguard Worker         *(y++) = (float)((int32_t)((vaddvq_s64(v0) + (1 << 5)) >> 6));
267*49fe348cSAndroid Build Coastguard Worker         *(y++) = (float)((int32_t)((vaddvq_s64(v1) + (1 << 5)) >> 6));
268*49fe348cSAndroid Build Coastguard Worker         *(y++) = (float)((int32_t)((vaddvq_s64(v2) + (1 << 5)) >> 6));
269*49fe348cSAndroid Build Coastguard Worker         *(y++) = (float)((int32_t)((vaddvq_s64(v3) + (1 << 5)) >> 6));
270*49fe348cSAndroid Build Coastguard Worker     }
271*49fe348cSAndroid Build Coastguard Worker 
272*49fe348cSAndroid Build Coastguard Worker     for ( ; nc > 0; nc--)
273*49fe348cSAndroid Build Coastguard Worker         *(y++) = neon_dot(a, b--, n);
274*49fe348cSAndroid Build Coastguard Worker }
275*49fe348cSAndroid Build Coastguard Worker #endif /* correlate */
276*49fe348cSAndroid Build Coastguard Worker 
277*49fe348cSAndroid Build Coastguard Worker #ifndef TEST_NEON
278*49fe348cSAndroid Build Coastguard Worker #define correlate neon_correlate
279*49fe348cSAndroid Build Coastguard Worker #endif
280*49fe348cSAndroid Build Coastguard Worker 
281*49fe348cSAndroid Build Coastguard Worker #endif /* __ARM_NEON && __ARM_ARCH_ISA_A64 */
282