/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2023  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN
#include "private/lpc.h"
#include "FLAC/assert.h"
#include "FLAC/format.h"
#include "private/macros.h"
#include <arm_neon.h>

#if FLAC__HAS_A64NEONINTRIN
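/*
 * Note: the three lag-specific autocorrelation routines below share a single
 * NEON implementation. Each function redefines MAX_LAG and then includes the
 * common loop body from deduplication/lpc_compute_autocorrelation_intrin_neon.c,
 * so they differ only in the maximum lag they accumulate.
 */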
void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
#undef MAX_LAG
#define MAX_LAG 14
#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
}

void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
#undef MAX_LAG
#define MAX_LAG 10
#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
}

void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
#undef MAX_LAG
#define MAX_LAG 8
#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
}

#endif /* ifdef FLAC__HAS_A64NEONINTRIN */

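/*
 * Note on the helper macros below: the order-specialized loops in
 * FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon() compute 12
 * residuals per iteration, held in three int32x4_t accumulators
 * (summ_0/summ_1/summ_2). MUL_32_BIT_LOOP_UNROOL_3 initializes the three
 * accumulators from the oldest history vectors (tmp_vec[0], [4] and [8])
 * multiplied by one coefficient lane, and each MACC_32BIT_LOOP_UNROOL_3 call
 * folds in the next-younger history vectors with vmlaq_laneq_s32. The stores
 * then shift the predictions right by lp_quantization (vshlq_s32 with a
 * splatted -lp_quantization shifts right) and subtract them from the signal.
 */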
#define MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_vec, lane) \
                        summ_0 = vmulq_laneq_s32(tmp_vec[0], qlp_coeff_vec, lane); \
                        summ_1 = vmulq_laneq_s32(tmp_vec[4], qlp_coeff_vec, lane); \
                        summ_2 = vmulq_laneq_s32(tmp_vec[8], qlp_coeff_vec, lane);


#define MACC_32BIT_LOOP_UNROOL_3(tmp_vec_ind, qlp_coeff_vec, lane) \
                        summ_0 = vmlaq_laneq_s32(summ_0,tmp_vec[tmp_vec_ind] ,qlp_coeff_vec, lane); \
                        summ_1 = vmlaq_laneq_s32(summ_1,tmp_vec[tmp_vec_ind+4] ,qlp_coeff_vec, lane); \
                        summ_2 = vmlaq_laneq_s32(summ_2,tmp_vec[tmp_vec_ind+8] ,qlp_coeff_vec, lane);

void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
    int i;
    FLAC__int32 sum;
    int32x4_t tmp_vec[20];

    FLAC__ASSERT(order > 0);
    FLAC__ASSERT(order <= 32);

    // Reading before data[0] (prologue reads) is valid because the encoder calls this function as
    // encoder->private_->local_lpc_compute_residual_from_qlp_coefficients(signal+order, ...), so at
    // least `order` history samples precede data.
    if(order <= 12) {
        if(order > 8) {
            if(order > 10) {
                if (order == 12) {
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
                    int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], qlp_coeff[11]};

                    tmp_vec[0] = vld1q_s32(data - 12);
                    tmp_vec[1] = vld1q_s32(data - 11);
                    tmp_vec[2] = vld1q_s32(data - 10);
                    tmp_vec[3] = vld1q_s32(data - 9);
                    tmp_vec[4] = vld1q_s32(data - 8);
                    tmp_vec[5] = vld1q_s32(data - 7);
                    tmp_vec[6] = vld1q_s32(data - 6);
                    tmp_vec[7] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int32x4_t summ_0, summ_1, summ_2;

                        tmp_vec[8] = vld1q_s32(data + i - 4);
                        tmp_vec[9] = vld1q_s32(data+i-3);
                        tmp_vec[10] = vld1q_s32(data+i-2);
                        tmp_vec[11] = vld1q_s32(data+i-1);
                        tmp_vec[12] = vld1q_s32(data+i);
                        tmp_vec[13] = vld1q_s32(data+i+1);
                        tmp_vec[14] = vld1q_s32(data+i+2);
                        tmp_vec[15] = vld1q_s32(data+i+3);
                        tmp_vec[16] = vld1q_s32(data + i + 4);
                        tmp_vec[17] = vld1q_s32(data + i + 5);
                        tmp_vec[18] = vld1q_s32(data + i + 6);
                        tmp_vec[19] = vld1q_s32(data + i + 7);

                        MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_2, 3)
                        MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 2)
                        MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_2, 1)
                        MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_2, 0)
                        MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 3)
                        MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 2)
                        MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_1, 1)
                        MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_1, 0)
                        MACC_32BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 3)
                        MACC_32BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 2)
                        MACC_32BIT_LOOP_UNROOL_3(10, qlp_coeff_0, 1)
                        MACC_32BIT_LOOP_UNROOL_3(11, qlp_coeff_0, 0)

                        vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));

                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                        tmp_vec[2] = tmp_vec[14];
                        tmp_vec[3] = tmp_vec[15];
                        tmp_vec[4] = tmp_vec[16];
                        tmp_vec[5] = tmp_vec[17];
                        tmp_vec[6] = tmp_vec[18];
                        tmp_vec[7] = tmp_vec[19];
                    }
                }

                else { /* order == 11 */
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
                    int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], 0};

                    tmp_vec[0] = vld1q_s32(data - 11);
                    tmp_vec[1] = vld1q_s32(data - 10);
                    tmp_vec[2] = vld1q_s32(data - 9);
                    tmp_vec[3] = vld1q_s32(data - 8);
                    tmp_vec[4] = vld1q_s32(data - 7);
                    tmp_vec[5] = vld1q_s32(data - 6);
                    tmp_vec[6] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int32x4_t summ_0, summ_1, summ_2;
                        tmp_vec[7] = vld1q_s32(data + i - 4);
                        tmp_vec[8] = vld1q_s32(data + i - 3);
                        tmp_vec[9] = vld1q_s32(data + i - 2);
                        tmp_vec[10] = vld1q_s32(data + i - 1);
                        tmp_vec[11] = vld1q_s32(data + i - 0);
                        tmp_vec[12] = vld1q_s32(data + i + 1);
                        tmp_vec[13] = vld1q_s32(data + i + 2);
                        tmp_vec[14] = vld1q_s32(data + i + 3);
                        tmp_vec[15] = vld1q_s32(data + i + 4);
                        tmp_vec[16] = vld1q_s32(data + i + 5);
                        tmp_vec[17] = vld1q_s32(data + i + 6);
                        tmp_vec[18] = vld1q_s32(data + i + 7);


                        MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_2, 2)
                        MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 1)
                        MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_2, 0)
                        MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 3)
                        MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 2)
                        MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 1)
                        MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_1, 0)
                        MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 3)
                        MACC_32BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 2)
                        MACC_32BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 1)
                        MACC_32BIT_LOOP_UNROOL_3(10, qlp_coeff_0, 0)

                        vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));


                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                        tmp_vec[2] = tmp_vec[14];
                        tmp_vec[3] = tmp_vec[15];
                        tmp_vec[4] = tmp_vec[16];
                        tmp_vec[5] = tmp_vec[17];
                        tmp_vec[6] = tmp_vec[18];
                    }
                }
            }
            else {
                if(order == 10) {
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
                    int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], 0, 0};

                    tmp_vec[0] = vld1q_s32(data - 10);
                    tmp_vec[1] = vld1q_s32(data - 9);
                    tmp_vec[2] = vld1q_s32(data - 8);
                    tmp_vec[3] = vld1q_s32(data - 7);
                    tmp_vec[4] = vld1q_s32(data - 6);
                    tmp_vec[5] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int32x4_t summ_0, summ_1, summ_2;
                        tmp_vec[6] = vld1q_s32(data + i - 4);
                        tmp_vec[7] = vld1q_s32(data + i - 3);
                        tmp_vec[8] = vld1q_s32(data + i - 2);
                        tmp_vec[9] = vld1q_s32(data + i - 1);
                        tmp_vec[10] = vld1q_s32(data + i - 0);
                        tmp_vec[11] = vld1q_s32(data + i + 1);
                        tmp_vec[12] = vld1q_s32(data + i + 2);
                        tmp_vec[13] = vld1q_s32(data + i + 3);
                        tmp_vec[14] = vld1q_s32(data + i + 4);
                        tmp_vec[15] = vld1q_s32(data + i + 5);
                        tmp_vec[16] = vld1q_s32(data + i + 6);
                        tmp_vec[17] = vld1q_s32(data + i + 7);


                        MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_2, 1)
                        MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 0)
                        MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 3)
                        MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 2)
                        MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 1)
                        MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 0)
                        MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 3)
                        MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 2)
                        MACC_32BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 1)
                        MACC_32BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 0)

                        vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));


                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                        tmp_vec[2] = tmp_vec[14];
                        tmp_vec[3] = tmp_vec[15];
                        tmp_vec[4] = tmp_vec[16];
                        tmp_vec[5] = tmp_vec[17];
                    }
                }
                else { /* order == 9 */
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
                    int32x4_t qlp_coeff_2 = {qlp_coeff[8], 0, 0, 0};

                    tmp_vec[0] = vld1q_s32(data - 9);
                    tmp_vec[1] = vld1q_s32(data - 8);
                    tmp_vec[2] = vld1q_s32(data - 7);
                    tmp_vec[3] = vld1q_s32(data - 6);
                    tmp_vec[4] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int32x4_t summ_0, summ_1, summ_2;
                        tmp_vec[5] = vld1q_s32(data + i - 4);
                        tmp_vec[6] = vld1q_s32(data + i - 3);
                        tmp_vec[7] = vld1q_s32(data + i - 2);
                        tmp_vec[8] = vld1q_s32(data + i - 1);
                        tmp_vec[9] = vld1q_s32(data + i - 0);
                        tmp_vec[10] = vld1q_s32(data + i + 1);
                        tmp_vec[11] = vld1q_s32(data + i + 2);
                        tmp_vec[12] = vld1q_s32(data + i + 3);
                        tmp_vec[13] = vld1q_s32(data + i + 4);
                        tmp_vec[14] = vld1q_s32(data + i + 5);
                        tmp_vec[15] = vld1q_s32(data + i + 6);
                        tmp_vec[16] = vld1q_s32(data + i + 7);

                        MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_2, 0)
                        MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 3)
                        MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 2)
                        MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 1)
                        MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 0)
                        MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 3)
                        MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 2)
                        MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 1)
                        MACC_32BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 0)

                        vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));

                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                        tmp_vec[2] = tmp_vec[14];
                        tmp_vec[3] = tmp_vec[15];
                        tmp_vec[4] = tmp_vec[16];
                    }
                }
            }
        }
        else if(order > 4) {
            if(order > 6) {
                if(order == 8) {
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};

                    tmp_vec[0] = vld1q_s32(data - 8);
                    tmp_vec[1] = vld1q_s32(data - 7);
                    tmp_vec[2] = vld1q_s32(data - 6);
                    tmp_vec[3] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int32x4_t summ_0, summ_1, summ_2;
                        tmp_vec[4] = vld1q_s32(data + i - 4);
                        tmp_vec[5] = vld1q_s32(data + i - 3);
                        tmp_vec[6] = vld1q_s32(data + i - 2);
                        tmp_vec[7] = vld1q_s32(data + i - 1);
                        tmp_vec[8] = vld1q_s32(data + i - 0);
                        tmp_vec[9] = vld1q_s32(data + i + 1);
                        tmp_vec[10] = vld1q_s32(data + i + 2);
                        tmp_vec[11] = vld1q_s32(data + i + 3);
                        tmp_vec[12] = vld1q_s32(data + i + 4);
                        tmp_vec[13] = vld1q_s32(data + i + 5);
                        tmp_vec[14] = vld1q_s32(data + i + 6);
                        tmp_vec[15] = vld1q_s32(data + i + 7);

                        MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_1, 3)
                        MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 2)
                        MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 1)
                        MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 0)
                        MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 3)
                        MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 2)
                        MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 1)
                        MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 0)

                        vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));

                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                        tmp_vec[2] = tmp_vec[14];
                        tmp_vec[3] = tmp_vec[15];
                    }
                }
                else { /* order == 7 */
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0};

                    tmp_vec[0] = vld1q_s32(data - 7);
                    tmp_vec[1] = vld1q_s32(data - 6);
                    tmp_vec[2] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int32x4_t summ_0, summ_1, summ_2;
                        tmp_vec[3] = vld1q_s32(data + i - 4);
                        tmp_vec[4] = vld1q_s32(data + i - 3);
                        tmp_vec[5] = vld1q_s32(data + i - 2);
                        tmp_vec[6] = vld1q_s32(data + i - 1);
                        tmp_vec[7] = vld1q_s32(data + i - 0);
                        tmp_vec[8] = vld1q_s32(data + i + 1);
                        tmp_vec[9] = vld1q_s32(data + i + 2);
                        tmp_vec[10] = vld1q_s32(data + i + 3);
                        tmp_vec[11] = vld1q_s32(data + i + 4);
                        tmp_vec[12] = vld1q_s32(data + i + 5);
                        tmp_vec[13] = vld1q_s32(data + i + 6);
                        tmp_vec[14] = vld1q_s32(data + i + 7);

                        MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_1, 2)
                        MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 1)
                        MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 0)
                        MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 3)
                        MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 2)
                        MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 1)
                        MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 0)

                        vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));

                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                        tmp_vec[2] = tmp_vec[14];
                    }
                }
            }
            else {
                if(order == 6) {
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], 0, 0};

                    tmp_vec[0] = vld1q_s32(data - 6);
                    tmp_vec[1] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int32x4_t summ_0, summ_1, summ_2;
                        tmp_vec[2] = vld1q_s32(data + i - 4);
                        tmp_vec[3] = vld1q_s32(data + i - 3);
                        tmp_vec[4] = vld1q_s32(data + i - 2);
                        tmp_vec[5] = vld1q_s32(data + i - 1);
                        tmp_vec[6] = vld1q_s32(data + i - 0);
                        tmp_vec[7] = vld1q_s32(data + i + 1);
                        tmp_vec[8] = vld1q_s32(data + i + 2);
                        tmp_vec[9] = vld1q_s32(data + i + 3);
                        tmp_vec[10] = vld1q_s32(data + i + 4);
                        tmp_vec[11] = vld1q_s32(data + i + 5);
                        tmp_vec[12] = vld1q_s32(data + i + 6);
                        tmp_vec[13] = vld1q_s32(data + i + 7);

                        MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_1, 1)
                        MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 0)
                        MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 3)
                        MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 2)
                        MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 1)
                        MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 0)

                        vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));

                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                    }
                }
                else { /* order == 5 */
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], 0, 0, 0};

                    tmp_vec[0] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int32x4_t summ_0, summ_1, summ_2;

                        tmp_vec[1] = vld1q_s32(data + i - 4);
                        tmp_vec[2] = vld1q_s32(data + i - 3);
                        tmp_vec[3] = vld1q_s32(data + i - 2);
                        tmp_vec[4] = vld1q_s32(data + i - 1);
                        tmp_vec[5] = vld1q_s32(data + i - 0);
                        tmp_vec[6] = vld1q_s32(data + i + 1);
                        tmp_vec[7] = vld1q_s32(data + i + 2);
                        tmp_vec[8] = vld1q_s32(data + i + 3);
                        tmp_vec[9] = vld1q_s32(data + i + 4);
                        tmp_vec[10] = vld1q_s32(data + i + 5);
                        tmp_vec[11] = vld1q_s32(data + i + 6);
                        tmp_vec[12] = vld1q_s32(data + i + 7);

                        MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_1, 0)
                        MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 3)
                        MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 2)
                        MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 1)
                        MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 0)

                        vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));

                        tmp_vec[0] = tmp_vec[12];
                    }
                }
            }
        }
        else {
            if(order > 2) {
                if(order == 4) {
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int32x4_t summ_0, summ_1, summ_2;
                        tmp_vec[0] = vld1q_s32(data + i - 4);
                        tmp_vec[1] = vld1q_s32(data + i - 3);
                        tmp_vec[2] = vld1q_s32(data + i - 2);
                        tmp_vec[3] = vld1q_s32(data + i - 1);
                        tmp_vec[4] = vld1q_s32(data + i - 0);
                        tmp_vec[5] = vld1q_s32(data + i + 1);
                        tmp_vec[6] = vld1q_s32(data + i + 2);
                        tmp_vec[7] = vld1q_s32(data + i + 3);
                        tmp_vec[8] = vld1q_s32(data + i + 4);
                        tmp_vec[9] = vld1q_s32(data + i + 5);
                        tmp_vec[10] = vld1q_s32(data + i + 6);
                        tmp_vec[11] = vld1q_s32(data + i + 7);

                        MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_0, 3)
                        MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 2)
                        MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 1)
                        MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 0)

                        vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
                    }
                }
                else { /* order == 3 */
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0};

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int32x4_t summ_0, summ_1, summ_2;
                        tmp_vec[0] = vld1q_s32(data + i - 3);
                        tmp_vec[1] = vld1q_s32(data + i - 2);
                        tmp_vec[2] = vld1q_s32(data + i - 1);
                        tmp_vec[4] = vld1q_s32(data + i + 1);
                        tmp_vec[5] = vld1q_s32(data + i + 2);
                        tmp_vec[6] = vld1q_s32(data + i + 3);
                        tmp_vec[8] = vld1q_s32(data + i + 5);
                        tmp_vec[9] = vld1q_s32(data + i + 6);
                        tmp_vec[10] = vld1q_s32(data + i + 7);

                        MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_0, 2)
                        MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 1)
                        MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 0)

                        vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
                    }
                }
            }
            else {
                if(order == 2) {
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], 0, 0};

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int32x4_t summ_0, summ_1, summ_2;
                        tmp_vec[0] = vld1q_s32(data + i - 2);
                        tmp_vec[1] = vld1q_s32(data + i - 1);
                        tmp_vec[4] = vld1q_s32(data + i + 2);
                        tmp_vec[5] = vld1q_s32(data + i + 3);
                        tmp_vec[8] = vld1q_s32(data + i + 6);
                        tmp_vec[9] = vld1q_s32(data + i + 7);

                        MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_0, 1)
                        MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 0)

                        vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
                    }
                }
                else { /* order == 1 */
                    int32x4_t qlp_coeff_0 = vdupq_n_s32(qlp_coeff[0]);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int32x4_t summ_0, summ_1, summ_2;
                        tmp_vec[0] = vld1q_s32(data + i - 1);
                        tmp_vec[4] = vld1q_s32(data + i + 3);
                        tmp_vec[8] = vld1q_s32(data + i + 7);

                        summ_0 = vmulq_s32(tmp_vec[0], qlp_coeff_0);
                        summ_1 = vmulq_s32(tmp_vec[4], qlp_coeff_0);
                        summ_2 = vmulq_s32(tmp_vec[8], qlp_coeff_0);

                        vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
                        vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
                    }
                }
            }
        }
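        /*
         * Note: the scalar loop below finishes whatever the 12-at-a-time NEON
         * loops above left over (fewer than 12 trailing samples, or the whole
         * block when data_len < 12). The switch intentionally falls through so
         * that exactly `order` taps are accumulated per sample.
         */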
        for(; i < (int)data_len; i++) {
            sum = 0;
            switch(order) {
                case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
                case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
                case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
                case 9:  sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
                case 8:  sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
                case 7:  sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
                case 6:  sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
                case 5:  sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
                case 4:  sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
                case 3:  sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
                case 2:  sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
                case 1:  sum += qlp_coeff[ 0] * data[i- 1];
            }
            residual[i] = data[i] - (sum >> lp_quantization);
        }
    }
    else { /* order > 12 */
        for(i = 0; i < (int)data_len; i++) {
            sum = 0;
            switch(order) {
                case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
                case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
                case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
                case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
                case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
                case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
                case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
                case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
                case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
                case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
                case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
                case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
                case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
                case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
                case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
                case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
                case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
                case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
                case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
                case 13: sum += qlp_coeff[12] * data[i-13];
                         sum += qlp_coeff[11] * data[i-12];
                         sum += qlp_coeff[10] * data[i-11];
                         sum += qlp_coeff[ 9] * data[i-10];
                         sum += qlp_coeff[ 8] * data[i- 9];
                         sum += qlp_coeff[ 7] * data[i- 8];
                         sum += qlp_coeff[ 6] * data[i- 7];
                         sum += qlp_coeff[ 5] * data[i- 6];
                         sum += qlp_coeff[ 4] * data[i- 5];
                         sum += qlp_coeff[ 3] * data[i- 4];
                         sum += qlp_coeff[ 2] * data[i- 3];
                         sum += qlp_coeff[ 1] * data[i- 2];
                         sum += qlp_coeff[ 0] * data[i- 1];
            }
            residual[i] = data[i] - (sum >> lp_quantization);
        }
    }
}



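/*
 * Note: the wide (64-bit) variant below mirrors the 32-bit loops above, but
 * accumulates each product in int64x2_t pairs (one for the low and one for
 * the high half of every int32x4_t sample vector) so the prediction cannot
 * overflow 32 bits before quantization. SHIFT_SUMS_64BITS_AND_STORE_SUB then
 * shifts the 64-bit sums right by lp_quantization and uses vuzp1q_s32 to
 * gather the low 32 bits of each sum back into an int32x4_t before
 * subtracting from the signal.
 */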
#define MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_vec, lane) \
                        summ_l_0 = vmull_laneq_s32(vget_low_s32(tmp_vec[0]),qlp_coeff_vec, lane); \
                        summ_h_0 = vmull_high_laneq_s32(tmp_vec[0], qlp_coeff_vec, lane);\
                        summ_l_1 = vmull_laneq_s32(vget_low_s32(tmp_vec[4]),qlp_coeff_vec, lane); \
                        summ_h_1 = vmull_high_laneq_s32(tmp_vec[4], qlp_coeff_vec, lane);\
                        summ_l_2 = vmull_laneq_s32(vget_low_s32(tmp_vec[8]),qlp_coeff_vec, lane);\
                        summ_h_2 = vmull_high_laneq_s32(tmp_vec[8], qlp_coeff_vec, lane);


#define MACC_64_BIT_LOOP_UNROOL_3(tmp_vec_ind, qlp_coeff_vec, lane) \
                        summ_l_0 = vmlal_laneq_s32(summ_l_0,vget_low_s32(tmp_vec[tmp_vec_ind]),qlp_coeff_vec, lane); \
                        summ_h_0 = vmlal_high_laneq_s32(summ_h_0, tmp_vec[tmp_vec_ind], qlp_coeff_vec, lane); \
                        summ_l_1 = vmlal_laneq_s32(summ_l_1, vget_low_s32(tmp_vec[tmp_vec_ind+4]),qlp_coeff_vec, lane); \
                        summ_h_1 = vmlal_high_laneq_s32(summ_h_1, tmp_vec[tmp_vec_ind+4], qlp_coeff_vec, lane); \
                        summ_l_2 = vmlal_laneq_s32(summ_l_2, vget_low_s32(tmp_vec[tmp_vec_ind+8]),qlp_coeff_vec, lane);\
                        summ_h_2 = vmlal_high_laneq_s32(summ_h_2,tmp_vec[tmp_vec_ind+8], qlp_coeff_vec, lane);

#define SHIFT_SUMS_64BITS_AND_STORE_SUB() \
                        res0 = vuzp1q_s32(vreinterpretq_s32_s64(vshlq_s64(summ_l_0,lp_quantization_vec)), vreinterpretq_s32_s64(vshlq_s64(summ_h_0,lp_quantization_vec))); \
                        res1 = vuzp1q_s32(vreinterpretq_s32_s64(vshlq_s64(summ_l_1,lp_quantization_vec)), vreinterpretq_s32_s64(vshlq_s64(summ_h_1,lp_quantization_vec))); \
                        res2 = vuzp1q_s32(vreinterpretq_s32_s64(vshlq_s64(summ_l_2,lp_quantization_vec)), vreinterpretq_s32_s64(vshlq_s64(summ_h_2,lp_quantization_vec))); \
                        vst1q_s32(residual+i+0, vsubq_s32(vld1q_s32(data+i+0), res0));\
                        vst1q_s32(residual+i+4, vsubq_s32(vld1q_s32(data+i+4), res1));\
                        vst1q_s32(residual+i+8, vsubq_s32(vld1q_s32(data+i+8), res2));

void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]) {
    int i;
    FLAC__int64 sum;

    int32x4_t tmp_vec[20];
    int32x4_t res0, res1, res2;
    int64x2_t  lp_quantization_vec = vdupq_n_s64(-lp_quantization);

    FLAC__ASSERT(order > 0);
    FLAC__ASSERT(order <= 32);

    // Reading before data[0] (prologue reads) is valid because the encoder calls this function as
    // encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit(signal+order, ...), so at
    // least `order` history samples precede data.
    if(order <= 12) {
        if(order > 8) {
            if(order > 10) {
                if(order == 12) {
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4],qlp_coeff[5],qlp_coeff[6],qlp_coeff[7]};
                    int32x4_t qlp_coeff_2 = {qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],qlp_coeff[11]};

                    tmp_vec[0] = vld1q_s32(data - 12);
                    tmp_vec[1] = vld1q_s32(data - 11);
                    tmp_vec[2] = vld1q_s32(data - 10);
                    tmp_vec[3] = vld1q_s32(data - 9);
                    tmp_vec[4] = vld1q_s32(data - 8);
                    tmp_vec[5] = vld1q_s32(data - 7);
                    tmp_vec[6] = vld1q_s32(data - 6);
                    tmp_vec[7] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int64x2_t  summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;

                        tmp_vec[8] = vld1q_s32(data+i-4);
                        tmp_vec[9] = vld1q_s32(data+i-3);
                        tmp_vec[10] = vld1q_s32(data+i-2);
                        tmp_vec[11] = vld1q_s32(data+i-1);
                        tmp_vec[12] = vld1q_s32(data+i);
                        tmp_vec[13] = vld1q_s32(data+i+1);
                        tmp_vec[14] = vld1q_s32(data+i+2);
                        tmp_vec[15] = vld1q_s32(data+i+3);
                        tmp_vec[16] = vld1q_s32(data + i + 4);
                        tmp_vec[17] = vld1q_s32(data + i + 5);
                        tmp_vec[18] = vld1q_s32(data + i + 6);
                        tmp_vec[19] = vld1q_s32(data + i + 7);

                        MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_2, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_2, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_2, 0)
                        MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_1, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_1, 0)
                        MACC_64_BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(10,qlp_coeff_0, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(11,qlp_coeff_0, 0)

                        SHIFT_SUMS_64BITS_AND_STORE_SUB()

                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                        tmp_vec[2] = tmp_vec[14];
                        tmp_vec[3] = tmp_vec[15];
                        tmp_vec[4] = tmp_vec[16];
                        tmp_vec[5] = tmp_vec[17];
                        tmp_vec[6] = tmp_vec[18];
                        tmp_vec[7] = tmp_vec[19];
                    }
                }
                else { /* order == 11 */
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4],qlp_coeff[5],qlp_coeff[6],qlp_coeff[7]};
                    int32x4_t qlp_coeff_2 = {qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],0};

                    tmp_vec[0] = vld1q_s32(data - 11);
                    tmp_vec[1] = vld1q_s32(data - 10);
                    tmp_vec[2] = vld1q_s32(data - 9);
                    tmp_vec[3] = vld1q_s32(data - 8);
                    tmp_vec[4] = vld1q_s32(data - 7);
                    tmp_vec[5] = vld1q_s32(data - 6);
                    tmp_vec[6] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int64x2_t  summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;

                        tmp_vec[7] = vld1q_s32(data+i-4);
                        tmp_vec[8] = vld1q_s32(data+i-3);
                        tmp_vec[9] = vld1q_s32(data+i-2);
                        tmp_vec[10] = vld1q_s32(data+i-1);
                        tmp_vec[11] = vld1q_s32(data+i);
                        tmp_vec[12] = vld1q_s32(data+i+1);
                        tmp_vec[13] = vld1q_s32(data+i+2);
                        tmp_vec[14] = vld1q_s32(data+i+3);
                        tmp_vec[15] = vld1q_s32(data + i + 4);
                        tmp_vec[16] = vld1q_s32(data + i + 5);
                        tmp_vec[17] = vld1q_s32(data + i + 6);
                        tmp_vec[18] = vld1q_s32(data + i + 7);

                        MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_2, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_2, 0)
                        MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_1, 0)
                        MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(10,qlp_coeff_0, 0)

                        SHIFT_SUMS_64BITS_AND_STORE_SUB()

                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                        tmp_vec[2] = tmp_vec[14];
                        tmp_vec[3] = tmp_vec[15];
                        tmp_vec[4] = tmp_vec[16];
                        tmp_vec[5] = tmp_vec[17];
                        tmp_vec[6] = tmp_vec[18];
                    }
                }
            }
            else
            {
                if (order == 10) {
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
                    int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], 0, 0};

                    tmp_vec[0] = vld1q_s32(data - 10);
                    tmp_vec[1] = vld1q_s32(data - 9);
                    tmp_vec[2] = vld1q_s32(data - 8);
                    tmp_vec[3] = vld1q_s32(data - 7);
                    tmp_vec[4] = vld1q_s32(data - 6);
                    tmp_vec[5] = vld1q_s32(data - 5);


                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;

                        tmp_vec[6] = vld1q_s32(data + i - 4);
                        tmp_vec[7] = vld1q_s32(data + i - 3);
                        tmp_vec[8] = vld1q_s32(data + i - 2);
                        tmp_vec[9] = vld1q_s32(data + i - 1);
                        tmp_vec[10] = vld1q_s32(data + i - 0);
                        tmp_vec[11] = vld1q_s32(data + i + 1);
                        tmp_vec[12] = vld1q_s32(data + i + 2);
                        tmp_vec[13] = vld1q_s32(data + i + 3);
                        tmp_vec[14] = vld1q_s32(data + i + 4);
                        tmp_vec[15] = vld1q_s32(data + i + 5);
                        tmp_vec[16] = vld1q_s32(data + i + 6);
                        tmp_vec[17] = vld1q_s32(data + i + 7);

                        MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_2, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 0)
                        MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 0)
                        MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 0)

                        SHIFT_SUMS_64BITS_AND_STORE_SUB()

                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                        tmp_vec[2] = tmp_vec[14];
                        tmp_vec[3] = tmp_vec[15];
                        tmp_vec[4] = tmp_vec[16];
                        tmp_vec[5] = tmp_vec[17];
                    }
                }

                else /* order == 9 */ {
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
                    int32x4_t qlp_coeff_2 = {qlp_coeff[8], 0, 0, 0};

                    tmp_vec[0] = vld1q_s32(data - 9);
                    tmp_vec[1] = vld1q_s32(data - 8);
                    tmp_vec[2] = vld1q_s32(data - 7);
                    tmp_vec[3] = vld1q_s32(data - 6);
                    tmp_vec[4] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;

                        tmp_vec[5] = vld1q_s32(data + i - 4);
                        tmp_vec[6] = vld1q_s32(data + i - 3);
                        tmp_vec[7] = vld1q_s32(data + i - 2);
                        tmp_vec[8] = vld1q_s32(data + i - 1);
                        tmp_vec[9] = vld1q_s32(data + i - 0);
                        tmp_vec[10] = vld1q_s32(data + i + 1);
                        tmp_vec[11] = vld1q_s32(data + i + 2);
                        tmp_vec[12] = vld1q_s32(data + i + 3);
                        tmp_vec[13] = vld1q_s32(data + i + 4);
                        tmp_vec[14] = vld1q_s32(data + i + 5);
                        tmp_vec[15] = vld1q_s32(data + i + 6);
                        tmp_vec[16] = vld1q_s32(data + i + 7);

                        MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_2, 0)
                        MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 0)
                        MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 0)

                        SHIFT_SUMS_64BITS_AND_STORE_SUB()

                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                        tmp_vec[2] = tmp_vec[14];
                        tmp_vec[3] = tmp_vec[15];
                        tmp_vec[4] = tmp_vec[16];
                    }
                }
            }
        }
        else if (order > 4)
        {
            if (order > 6)
            {
                if (order == 8)
                {
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};

                    tmp_vec[0] = vld1q_s32(data - 8);
                    tmp_vec[1] = vld1q_s32(data - 7);
                    tmp_vec[2] = vld1q_s32(data - 6);
                    tmp_vec[3] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;

                        tmp_vec[4] = vld1q_s32(data + i - 4);
                        tmp_vec[5] = vld1q_s32(data + i - 3);
                        tmp_vec[6] = vld1q_s32(data + i - 2);
                        tmp_vec[7] = vld1q_s32(data + i - 1);
                        tmp_vec[8] = vld1q_s32(data + i - 0);
                        tmp_vec[9] = vld1q_s32(data + i + 1);
                        tmp_vec[10] = vld1q_s32(data + i + 2);
                        tmp_vec[11] = vld1q_s32(data + i + 3);
                        tmp_vec[12] = vld1q_s32(data + i + 4);
                        tmp_vec[13] = vld1q_s32(data + i + 5);
                        tmp_vec[14] = vld1q_s32(data + i + 6);
                        tmp_vec[15] = vld1q_s32(data + i + 7);


                        MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_1, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 0)
                        MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 0)

                        SHIFT_SUMS_64BITS_AND_STORE_SUB()

                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                        tmp_vec[2] = tmp_vec[14];
                        tmp_vec[3] = tmp_vec[15];
                    }
                }
                else /* order == 7 */
                {
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0};

                    tmp_vec[0] = vld1q_s32(data - 7);
                    tmp_vec[1] = vld1q_s32(data - 6);
                    tmp_vec[2] = vld1q_s32(data - 5);


                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
                        tmp_vec[3] = vld1q_s32(data +i - 4);
                        tmp_vec[4] = vld1q_s32(data + i - 3);
                        tmp_vec[5] = vld1q_s32(data + i - 2);
                        tmp_vec[6] = vld1q_s32(data + i - 1);
                        tmp_vec[7] = vld1q_s32(data + i - 0);
                        tmp_vec[8] = vld1q_s32(data + i + 1);
                        tmp_vec[9] = vld1q_s32(data + i + 2);
                        tmp_vec[10] = vld1q_s32(data + i + 3);
                        tmp_vec[11] = vld1q_s32(data + i + 4);
                        tmp_vec[12] = vld1q_s32(data + i + 5);
                        tmp_vec[13] = vld1q_s32(data + i + 6);
                        tmp_vec[14] = vld1q_s32(data + i + 7);


                        MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_1, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 0)
                        MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 0)

                        SHIFT_SUMS_64BITS_AND_STORE_SUB()

                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                        tmp_vec[2] = tmp_vec[14];
                    }
                }
            }
            else
            {
                if (order == 6) {
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], 0, 0};

                    tmp_vec[0] = vld1q_s32(data - 6);
                    tmp_vec[1] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;

                        tmp_vec[2] = vld1q_s32(data + i - 4);
                        tmp_vec[3] = vld1q_s32(data + i - 3);
                        tmp_vec[4] = vld1q_s32(data + i - 2);
                        tmp_vec[5] = vld1q_s32(data + i - 1);
                        tmp_vec[6] = vld1q_s32(data + i - 0);
                        tmp_vec[7] = vld1q_s32(data + i + 1);
                        tmp_vec[8] = vld1q_s32(data + i + 2);
                        tmp_vec[9] = vld1q_s32(data + i + 3);
                        tmp_vec[10] = vld1q_s32(data + i + 4);
                        tmp_vec[11] = vld1q_s32(data + i + 5);
                        tmp_vec[12] = vld1q_s32(data + i + 6);
                        tmp_vec[13] = vld1q_s32(data + i + 7);


                        MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_1, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 0)
                        MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 3)
                        MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 2)
                        MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 1)
                        MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 0)

                        SHIFT_SUMS_64BITS_AND_STORE_SUB()

                        tmp_vec[0] = tmp_vec[12];
                        tmp_vec[1] = tmp_vec[13];
                    }
                }

                else
                { /* order == 5 */
                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], 0, 0, 0};

                    tmp_vec[0] = vld1q_s32(data - 5);

                    for (i = 0; i < (int)data_len - 11; i += 12)
                    {
                        int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
                        tmp_vec[1] = vld1q_s32(data + i - 4);
                        tmp_vec[2] = vld1q_s32(data + i - 3);
                        tmp_vec[3] = vld1q_s32(data + i - 2);
                        tmp_vec[4] = vld1q_s32(data + i - 1);
1041                         tmp_vec[5] = vld1q_s32(data + i - 0);
1042                         tmp_vec[6] = vld1q_s32(data + i + 1);
1043                         tmp_vec[7] = vld1q_s32(data + i + 2);
1044                         tmp_vec[8] = vld1q_s32(data + i + 3);
1045                         tmp_vec[9] = vld1q_s32(data + i + 4);
1046                         tmp_vec[10] = vld1q_s32(data + i + 5);
1047                         tmp_vec[11] = vld1q_s32(data + i + 6);
1048                         tmp_vec[12] = vld1q_s32(data + i + 7);
1049 
1050                         MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_1, 0)
1051                         MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 3)
1052                         MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 2)
1053                         MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 1)
1054                         MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 0)
1055 
1056                         SHIFT_SUMS_64BITS_AND_STORE_SUB()
1057 
1058                         tmp_vec[0] = tmp_vec[12];
1059                     }
1060                 }
1061             }
1062         }
1063         else
1064         {
1065             if (order > 2)
1066             {
1067                 if (order == 4)
1068                 {
1069                     int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
1070 
1071                     for (i = 0; i < (int)data_len - 11; i += 12)
1072                     {
1073                         int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
1074                         tmp_vec[0] = vld1q_s32(data + i - 4);
1075                         tmp_vec[1] = vld1q_s32(data + i - 3);
1076                         tmp_vec[2] = vld1q_s32(data + i - 2);
1077                         tmp_vec[3] = vld1q_s32(data + i - 1);
1078                         tmp_vec[4] = vld1q_s32(data + i - 0);
1079                         tmp_vec[5] = vld1q_s32(data + i + 1);
1080                         tmp_vec[6] = vld1q_s32(data + i + 2);
1081                         tmp_vec[7] = vld1q_s32(data + i + 3);
1082                         tmp_vec[8] = vld1q_s32(data + i + 4);
1083                         tmp_vec[9] = vld1q_s32(data + i + 5);
1084                         tmp_vec[10] = vld1q_s32(data + i + 6);
1085                         tmp_vec[11] = vld1q_s32(data + i + 7);
1086 
1087                         MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_0, 3)
1088                         MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 2)
1089                         MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 1)
1090                         MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 0)
1091 
1092                         SHIFT_SUMS_64BITS_AND_STORE_SUB()
1093                     }
1094                 }
1095                 else
1096                 { /* order == 3 */
1097 
1098                     int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0};
1099 
1100                     for (i = 0; i < (int)data_len - 11; i += 12)
1101                     {
1102                         int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
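                        /* Only nine history vectors are needed for the 3-tap case: group j of
                           the unrolled accumulation reads tmp_vec[j], tmp_vec[j+4] and
                           tmp_vec[j+8], so slots 3, 7 and 11 are never loaded. */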
1103                         tmp_vec[0] = vld1q_s32(data + i - 3);
1104                         tmp_vec[1] = vld1q_s32(data + i - 2);
1105                         tmp_vec[2] = vld1q_s32(data + i - 1);
1106                         tmp_vec[4] = vld1q_s32(data + i + 1);
1107                         tmp_vec[5] = vld1q_s32(data + i + 2);
1108                         tmp_vec[6] = vld1q_s32(data + i + 3);
1109                         tmp_vec[8] = vld1q_s32(data + i + 5);
1110                         tmp_vec[9] = vld1q_s32(data + i + 6);
1111                         tmp_vec[10] = vld1q_s32(data + i + 7);
1112 
1113                         MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_0, 2)
1114                         MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 1)
1115                         MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 0)
1116 
1117                         SHIFT_SUMS_64BITS_AND_STORE_SUB()
1118                     }
1119                 }
1120             }
1121             else
1122             {
1123                 if (order == 2)
1124                 {
1125                     int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], 0, 0};
1126 
1127                     for (i = 0; i < (int)data_len - 11; i += 12)
1128                     {
1129                         int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
1130                         tmp_vec[0] = vld1q_s32(data + i - 2);
1131                         tmp_vec[1] = vld1q_s32(data + i - 1);
1132                         tmp_vec[4] = vld1q_s32(data + i + 2);
1133                         tmp_vec[5] = vld1q_s32(data + i + 3);
1134                         tmp_vec[8] = vld1q_s32(data + i + 6);
1135                         tmp_vec[9] = vld1q_s32(data + i + 7);
1136 
1137                         MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_0, 1)
1138                         MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 0)
1139 
1140                         SHIFT_SUMS_64BITS_AND_STORE_SUB()
1141                     }
1142                 }
1143 
1144                 else
1145                 { /* order == 1 */
1146 
1147                     int32x2_t qlp_coeff_0_2 = vdup_n_s32(qlp_coeff[0]);
1148                     int32x4_t qlp_coeff_0_4 = vdupq_n_s32(qlp_coeff[0]);
1149 
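                    /* A single tap needs no accumulation: each group is one widening multiply,
                       done separately on the low and high halves of the loaded vector. */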
1150                     for (i = 0; i < (int)data_len - 11; i += 12)
1151                     {
1152                         int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
1153                         tmp_vec[0] = vld1q_s32(data + i - 1);
1154                         tmp_vec[4] = vld1q_s32(data + i + 3);
1155                         tmp_vec[8] = vld1q_s32(data + i + 7);
1156 
1157                         summ_l_0 = vmull_s32(vget_low_s32(tmp_vec[0]), qlp_coeff_0_2);
1158                         summ_h_0 = vmull_high_s32(tmp_vec[0], qlp_coeff_0_4);
1159 
1160                         summ_l_1 = vmull_s32(vget_low_s32(tmp_vec[4]), qlp_coeff_0_2);
1161                         summ_h_1 = vmull_high_s32(tmp_vec[4], qlp_coeff_0_4);
1162 
1163                         summ_l_2 = vmull_s32(vget_low_s32(tmp_vec[8]), qlp_coeff_0_2);
1164                         summ_h_2 = vmull_high_s32(tmp_vec[8], qlp_coeff_0_4);
1165 
1166                         SHIFT_SUMS_64BITS_AND_STORE_SUB()
1167                     }
1168                 }
1169             }
1170         }
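        /* Compute the remaining samples (those that do not fill a whole 12-sample
           block) with plain 64-bit scalar arithmetic. */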
1171         for (; i < (int)data_len; i++)
1172         {
1173             sum = 0;
1174             switch (order)
1175             {
1176             case 12:
1177                 sum += qlp_coeff[11] * (FLAC__int64)data[i - 12]; /* Falls through. */
1178             case 11:
1179                 sum += qlp_coeff[10] * (FLAC__int64)data[i - 11]; /* Falls through. */
1180             case 10:
1181                 sum += qlp_coeff[9] * (FLAC__int64)data[i - 10]; /* Falls through. */
1182             case 9:
1183                 sum += qlp_coeff[8] * (FLAC__int64)data[i - 9]; /* Falls through. */
1184             case 8:
1185                 sum += qlp_coeff[7] * (FLAC__int64)data[i - 8]; /* Falls through. */
1186             case 7:
1187                 sum += qlp_coeff[6] * (FLAC__int64)data[i - 7]; /* Falls through. */
1188             case 6:
1189                 sum += qlp_coeff[5] * (FLAC__int64)data[i - 6]; /* Falls through. */
1190             case 5:
1191                 sum += qlp_coeff[4] * (FLAC__int64)data[i - 5]; /* Falls through. */
1192             case 4:
1193                 sum += qlp_coeff[3] * (FLAC__int64)data[i - 4]; /* Falls through. */
1194             case 3:
1195                 sum += qlp_coeff[2] * (FLAC__int64)data[i - 3]; /* Falls through. */
1196             case 2:
1197                 sum += qlp_coeff[1] * (FLAC__int64)data[i - 2]; /* Falls through. */
1198             case 1:
1199                 sum += qlp_coeff[0] * (FLAC__int64)data[i - 1];
1200             }
1201             residual[i] = data[i] - (sum >> lp_quantization);
1202         }
1203     }
1204     else
1205     { /* order > 12 */
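        /* No NEON path for orders above 12: every sample is predicted with the
           straightforward 64-bit scalar loop, falling through the switch from the
           highest tap down to tap 1. */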
1206         for (i = 0; i < (int)data_len; i++)
1207         {
1208             sum = 0;
1209             switch (order)
1210             {
1211             case 32:
1212                 sum += qlp_coeff[31] * (FLAC__int64)data[i - 32]; /* Falls through. */
1213             case 31:
1214                 sum += qlp_coeff[30] * (FLAC__int64)data[i - 31]; /* Falls through. */
1215             case 30:
1216                 sum += qlp_coeff[29] * (FLAC__int64)data[i - 30]; /* Falls through. */
1217             case 29:
1218                 sum += qlp_coeff[28] * (FLAC__int64)data[i - 29]; /* Falls through. */
1219             case 28:
1220                 sum += qlp_coeff[27] * (FLAC__int64)data[i - 28]; /* Falls through. */
1221             case 27:
1222                 sum += qlp_coeff[26] * (FLAC__int64)data[i - 27]; /* Falls through. */
1223             case 26:
1224                 sum += qlp_coeff[25] * (FLAC__int64)data[i - 26]; /* Falls through. */
1225             case 25:
1226                 sum += qlp_coeff[24] * (FLAC__int64)data[i - 25]; /* Falls through. */
1227             case 24:
1228                 sum += qlp_coeff[23] * (FLAC__int64)data[i - 24]; /* Falls through. */
1229             case 23:
1230                 sum += qlp_coeff[22] * (FLAC__int64)data[i - 23]; /* Falls through. */
1231             case 22:
1232                 sum += qlp_coeff[21] * (FLAC__int64)data[i - 22]; /* Falls through. */
1233             case 21:
1234                 sum += qlp_coeff[20] * (FLAC__int64)data[i - 21]; /* Falls through. */
1235             case 20:
1236                 sum += qlp_coeff[19] * (FLAC__int64)data[i - 20]; /* Falls through. */
1237             case 19:
1238                 sum += qlp_coeff[18] * (FLAC__int64)data[i - 19]; /* Falls through. */
1239             case 18:
1240                 sum += qlp_coeff[17] * (FLAC__int64)data[i - 18]; /* Falls through. */
1241             case 17:
1242                 sum += qlp_coeff[16] * (FLAC__int64)data[i - 17]; /* Falls through. */
1243             case 16:
1244                 sum += qlp_coeff[15] * (FLAC__int64)data[i - 16]; /* Falls through. */
1245             case 15:
1246                 sum += qlp_coeff[14] * (FLAC__int64)data[i - 15]; /* Falls through. */
1247             case 14:
1248                 sum += qlp_coeff[13] * (FLAC__int64)data[i - 14]; /* Falls through. */
1249             case 13:
1250                 sum += qlp_coeff[12] * (FLAC__int64)data[i - 13];
1251                 sum += qlp_coeff[11] * (FLAC__int64)data[i - 12];
1252                 sum += qlp_coeff[10] * (FLAC__int64)data[i - 11];
1253                 sum += qlp_coeff[9] * (FLAC__int64)data[i - 10];
1254                 sum += qlp_coeff[8] * (FLAC__int64)data[i - 9];
1255                 sum += qlp_coeff[7] * (FLAC__int64)data[i - 8];
1256                 sum += qlp_coeff[6] * (FLAC__int64)data[i - 7];
1257                 sum += qlp_coeff[5] * (FLAC__int64)data[i - 6];
1258                 sum += qlp_coeff[4] * (FLAC__int64)data[i - 5];
1259                 sum += qlp_coeff[3] * (FLAC__int64)data[i - 4];
1260                 sum += qlp_coeff[2] * (FLAC__int64)data[i - 3];
1261                 sum += qlp_coeff[1] * (FLAC__int64)data[i - 2];
1262                 sum += qlp_coeff[0] * (FLAC__int64)data[i - 1];
1263             }
1264             residual[i] = data[i] - (sum >> lp_quantization);
1265         }
1266     }
1267 
1268     return;
1269 }
1270 
1271 #endif /* FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN */
1272 #endif /* FLAC__NO_ASM */
1273 #endif /* FLAC__INTEGER_ONLY_LIBRARY */
1274